diff --git a/doc/configuration.txt b/doc/configuration.txt index 58aaaf062..ad6defb9f 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -10043,6 +10043,46 @@ ipmask() table entries and as such use the same server. The mask can be passed in dotted form (eg: 255.255.255.0) or in CIDR form (eg: 24). +json([<input-code>]) + Escapes the input string and produces an ASCII output string ready to use as a + JSON string. The converter tries to decode the input string according to the + <input-code> parameter. It can be "ascii", "utf8", "utf8s", "utf8p" or + "utf8ps". The "ascii" decoder never fails. The "utf8" decoder detects 3 types + of errors: + - bad UTF-8 sequence (lone continuation byte, bad number of continuation + bytes, ...), + - invalid range (the decoded value is within a UTF-8 prohibited range), + - code overlong (the value is encoded with more bytes than necessary). + + The UTF-8 JSON encoding can produce a "too long value" error when the UTF-8 + character is greater than 0xffff because the JSON string escape specification + only authorizes 4 hex digits for the value encoding. The UTF-8 decoder exists + in 4 variants designated by a combination of two suffix letters : "p" for + "permissive" and "s" for "silently ignore". The behaviors of the decoders + are : + - "ascii" : never fails ; + - "utf8" : fails on any detected errors ; + - "utf8s" : never fails, but removes characters corresponding to errors ; + - "utf8p" : accepts and fixes the overlong errors, but fails on any other + error ; + - "utf8ps" : never fails, accepts and fixes the overlong errors, but removes + characters corresponding to the other errors. + + This converter is particularly useful for building properly escaped JSON for + logging to servers which consume JSON-formatted traffic logs. 
+ + Example: + capture request header user-agent len 150 + capture request header Host len 15 + log-format {"ip":"%[src]","user-agent":"%[capture.req.hdr(1),json]"} + + Input request from client 127.0.0.1: + GET / HTTP/1.0 + User-Agent: Very "Ugly" UA 1/2 + + Output log: + {"ip":"127.0.0.1","user-agent":"Very \"Ugly\" UA 1\/2"} + language([,]) Returns the value with the highest q-factor from a list as extracted from the "accept-language" header using "req.fhdr". Values with no q-factor have a diff --git a/include/common/standard.h b/include/common/standard.h index 8811c6f91..e9900d54a 100644 --- a/include/common/standard.h +++ b/include/common/standard.h @@ -914,4 +914,22 @@ static inline unsigned long caddr_clr_flags(unsigned long caddr, unsigned int da return caddr & ~(unsigned long)(data & 3); } +/* UTF-8 decoder status flags: utf8_next() returns them in the 4 msb of its result, the 4 lsb hold the length read */ +#define UTF8_CODE_OK 0x00 +#define UTF8_CODE_OVERLONG 0x10 +#define UTF8_CODE_INVRANGE 0x20 +#define UTF8_CODE_BADSEQ 0x40 + +unsigned char utf8_next(const char *s, int len, unsigned int *c); + +static inline unsigned char utf8_return_code(unsigned int code) +{ + return code & 0xf0; /* keep only the status flags (UTF8_CODE_*) */ +} + +static inline unsigned char utf8_return_length(unsigned char code) +{ + return code & 0x0f; /* keep only the number of bytes read */ +} + #endif /* _COMMON_STANDARD_H */ diff --git a/src/sample.c b/src/sample.c index 33437395e..70f47bbd2 100644 --- a/src/sample.c +++ b/src/sample.c @@ -11,6 +11,7 @@ * */ +#include #include #include #include @@ -1386,6 +1387,188 @@ static int sample_conv_wt6(const struct arg *arg_p, struct sample *smp) return 1; } +/* This function escapes special json characters. The returned string can be + * safely set between two '"' and used as a json string. The json string is + * defined like this: + * + * any Unicode character except '"' or '\' or control character + * \", \\, \/, \b, \f, \n, \r, \t, \u + four-hex-digits + * + * The enum input_type contains all the allowed modes for decoding the input + * string. 
+ */ +enum input_type { + IT_ASCII = 0, /* each byte is taken as-is, never fails */ + IT_UTF8, /* UTF-8, fails on any decoding error */ + IT_UTF8S, /* UTF-8, silently skips characters in error */ + IT_UTF8P, /* UTF-8, accepts overlong encodings, fails on other errors */ + IT_UTF8PS, /* UTF-8, accepts overlong encodings, skips other errors */ +}; +static int sample_conv_json_check(struct arg *arg, struct sample_conv *conv, + const char *file, int line, char **err) +{ + if (!arg) { + memprintf(err, "Unexpected empty arg list"); + return 0; + } + + if (arg->type != ARGT_STR) { + memprintf(err, "Unexpected arg type"); + return 0; + } + + if (strcmp(arg->data.str.str, "") == 0) { /* empty keyword: same as "ascii" */ + arg->type = ARGT_UINT; /* the keyword is replaced in place by its enum input_type value */ + arg->data.uint = IT_ASCII; + return 1; + } + + else if (strcmp(arg->data.str.str, "ascii") == 0) { + arg->type = ARGT_UINT; + arg->data.uint = IT_ASCII; + return 1; + } + + else if (strcmp(arg->data.str.str, "utf8") == 0) { + arg->type = ARGT_UINT; + arg->data.uint = IT_UTF8; + return 1; + } + + else if (strcmp(arg->data.str.str, "utf8s") == 0) { + arg->type = ARGT_UINT; + arg->data.uint = IT_UTF8S; + return 1; + } + + else if (strcmp(arg->data.str.str, "utf8p") == 0) { + arg->type = ARGT_UINT; + arg->data.uint = IT_UTF8P; + return 1; + } + + else if (strcmp(arg->data.str.str, "utf8ps") == 0) { + arg->type = ARGT_UINT; + arg->data.uint = IT_UTF8PS; + return 1; + } + + memprintf(err, "Unexpected input code type at file '%s', line %d. " + "Allowed value are 'ascii', 'utf8', 'utf8s', 'utf8p' and 'utf8ps'", file, line); + return 0; /* unknown keyword: 0 is returned with the message stored in <err> */ +} + +static int sample_conv_json(const struct arg *arg_p, struct sample *smp) +{ + struct chunk *temp; + char _str[7]; /* \u + 4 hex digits + null char for snprintf. */ + const char *str; + int len; + enum input_type input_type = IT_ASCII; + unsigned int c; + unsigned int ret; + char *p; + + if (arg_p) + input_type = arg_p->data.uint; + + temp = get_trash_chunk(); + temp->len = 0; + + p = smp->data.str.str; + while (p < smp->data.str.str + smp->data.str.len) { + + if (input_type == IT_ASCII) { + /* Read input as ASCII. */ + c = *(unsigned char *)p; + p++; + } + else { + /* Read input as UTF8. 
*/ + ret = utf8_next(p, smp->data.str.len - ( p - smp->data.str.str ), &c); + p += utf8_return_length(ret); + + if (input_type == IT_UTF8 && utf8_return_code(ret) != UTF8_CODE_OK) + return 0; + if (input_type == IT_UTF8S && utf8_return_code(ret) != UTF8_CODE_OK) + continue; + if (input_type == IT_UTF8P && utf8_return_code(ret) & (UTF8_CODE_INVRANGE|UTF8_CODE_BADSEQ)) + return 0; + if (input_type == IT_UTF8PS && utf8_return_code(ret) & (UTF8_CODE_INVRANGE|UTF8_CODE_BADSEQ)) + continue; + + /* Values above 0xffff do not fit in a 4-hex-digit \u escape: fail in the "utf8" and "utf8p" modes, drop the character otherwise. */ + if ((unsigned int)c > 0xffff) { + if (input_type == IT_UTF8 || input_type == IT_UTF8P) + return 0; + continue; + } + } + + /* Convert character. */ + if (c == '"') { + len = 2; + str = "\\\""; + } + else if (c == '\\') { + len = 2; + str = "\\\\"; + } + else if (c == '/') { + len = 2; + str = "\\/"; + } + else if (c == '\b') { + len = 2; + str = "\\b"; + } + else if (c == '\f') { + len = 2; + str = "\\f"; + } + else if (c == '\r') { + len = 2; + str = "\\r"; + } + else if (c == '\n') { + len = 2; + str = "\\n"; + } + else if (c == '\t') { + len = 2; + str = "\\t"; + } + else if (c > 0xff || !isprint(c)) { + /* isprint() requires a value representable as an unsigned char (or + * EOF), so wider code points are escaped without calling it. */ + len = 6; + _str[0] = '\\'; + _str[1] = 'u'; + snprintf(&_str[2], 5, "%04x", (unsigned short)c); + str = _str; + } + else { + len = 1; + _str[0] = (char)c; /* copy the byte: pointing at &c would expose the wrong byte of the unsigned int on big-endian hosts */ + str = _str; + } + + /* Check length */ + if (temp->len + len > temp->size) + return 0; + + /* Copy string. 
*/ + memcpy(temp->str + temp->len, str, len); + temp->len += len; + } + + smp->flags &= ~SMP_F_CONST; + smp->data.str = *temp; + smp->type = SMP_T_STR; + + return 1; +} + /************************************************************************/ /* All supported sample fetch functions must be declared here */ /************************************************************************/ @@ -1493,6 +1676,7 @@ static struct sample_conv_kw_list sample_conv_kws = {ILH, { { "djb2", sample_conv_djb2, ARG1(0,UINT), NULL, SMP_T_BIN, SMP_T_UINT }, { "sdbm", sample_conv_sdbm, ARG1(0,UINT), NULL, SMP_T_BIN, SMP_T_UINT }, { "wt6", sample_conv_wt6, ARG1(0,UINT), NULL, SMP_T_BIN, SMP_T_UINT }, + { "json", sample_conv_json, ARG1(1,STR), sample_conv_json_check, SMP_T_STR, SMP_T_STR }, { NULL, NULL, 0, 0, 0 }, }}; diff --git a/src/standard.c b/src/standard.c index f57724c4a..00e672add 100644 --- a/src/standard.c +++ b/src/standard.c @@ -2533,6 +2533,126 @@ const char *strnistr(const char *str1, int len_str1, const char *str2, int len_s return NULL; } +/* This function reads the next UTF-8 character from a byte array. + * <s> is the byte array to be decoded, <len> is its length. + * The returned value is encoded like this: + * the 4 msb are the return code (UTF8_CODE_*), the 4 lsb + * are the length read. The decoded character is stored in <c>. + */ +unsigned char utf8_next(const char *s, int len, unsigned int *c) +{ + const unsigned char *p = (unsigned char *)s; + int dec; + unsigned char code = UTF8_CODE_OK; + + if (len < 1) + return UTF8_CODE_OK; /* nothing to read: length 0 is reported and <c> is left untouched */ + + /* Check the type of UTF8 sequence + * + * 0... .... 0x00 <= x <= 0x7f : 1 byte: ascii char + * 10.. .... 0x80 <= x <= 0xbf : invalid sequence + * 110. .... 0xc0 <= x <= 0xdf : 2 bytes + * 1110 .... 0xe0 <= x <= 0xef : 3 bytes + * 1111 0... 0xf0 <= x <= 0xf7 : 4 bytes + * 1111 10.. 0xf8 <= x <= 0xfb : 5 bytes + * 1111 110. 0xfc <= x <= 0xfd : 6 bytes + * 1111 111. 0xfe <= x <= 0xff : invalid sequence + */ + switch (*p) { + case 0x00 ... 
0x7f: + *c = *p; + return UTF8_CODE_OK | 1; + + case 0x80 ... 0xbf: + *c = *p; + return UTF8_CODE_BADSEQ | 1; + + case 0xc0 ... 0xdf: + if (len < 2) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x1f; + dec = 1; + break; + + case 0xe0 ... 0xef: + if (len < 3) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x0f; + dec = 2; + break; + + case 0xf0 ... 0xf7: + if (len < 4) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x07; + dec = 3; + break; + + case 0xf8 ... 0xfb: + if (len < 5) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x03; + dec = 4; + break; + + case 0xfc ... 0xfd: + if (len < 6) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x01; + dec = 5; + break; + + case 0xfe ... 0xff: + default: + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + + p++; + + while (dec > 0) { + + /* a continuation byte must start with the two bits 10; the length is masked to the 4 bits reserved for it */ + if ( ( *p & 0xc0 ) != 0x80 ) + return UTF8_CODE_BADSEQ | ((p-(unsigned char *)s)&0x0f); + + /* append the 6 payload bits to the decoded value */ + *c = ( *c << 6 ) | ( *p & 0x3f ); + + dec--; + p++; + } + + /* Check overlong encoding (payload bits per sequence length). + * 2 bytes : 5 + 6 : 11 : 0x80 ... 0x7ff + * 3 bytes : 4 + 6 + 6 : 16 : 0x800 ... 0xffff + * 4 bytes : 3 + 6 + 6 + 6 : 21 : 0x10000 ... 0x1fffff + */ + if ((*c <= 0x7f && (p-(unsigned char *)s) > 1) || + (*c >= 0x80 && *c <= 0x7ff && (p-(unsigned char *)s) > 2) || + (*c >= 0x800 && *c <= 0xffff && (p-(unsigned char *)s) > 3) || + (*c >= 0x10000 && *c <= 0x1fffff && (p-(unsigned char *)s) > 4)) + code |= UTF8_CODE_OVERLONG; + + /* Check invalid UTF8 range. */ + if ((*c >= 0xd800 && *c <= 0xdfff) || + (*c >= 0xfffe && *c <= 0xffff)) + code |= UTF8_CODE_INVRANGE; + + return code | ((p-(unsigned char *)s)&0x0f); +} + /* * Local variables: * c-indent-level: 8