haproxy/src/regex.c
Christian Ruppert de898712a0 MEDIUM: regex: Use pcre_study always when PCRE is used, regardless of JIT
pcre_study() was available long before JIT support was added. It also seems to
improve performance in some cases.

Below I've attached some test results. The test is based on
http://sljit.sourceforge.net/regex_perf.html (see bottom). It has been modified
to just test pcre_study vs. no pcre_study. Note: This test does not try to
match specific headers; instead, it is run over a larger text with more and less
complex patterns to make the differences clearer.

% ./runtest
'mark.txt' loaded. (Length: 19665221 bytes)
-----------------
Regex: 'Twain'
[pcre-nostudy] time:    14 ms (2388 matches)
[pcre-study] time:    21 ms (2388 matches)
-----------------
Regex: '^Twain'
[pcre-nostudy] time:   109 ms (100 matches)
[pcre-study] time:   109 ms (100 matches)
-----------------
Regex: 'Twain$'
[pcre-nostudy] time:    14 ms (127 matches)
[pcre-study] time:    16 ms (127 matches)
-----------------
Regex: 'Huck[a-zA-Z]+|Finn[a-zA-Z]+'
[pcre-nostudy] time:   695 ms (83 matches)
[pcre-study] time:    26 ms (83 matches)
-----------------
Regex: 'a[^x]{20}b'
[pcre-nostudy] time:    90 ms (12495 matches)
[pcre-study] time:    91 ms (12495 matches)
-----------------
Regex: 'Tom|Sawyer|Huckleberry|Finn'
[pcre-nostudy] time:  1236 ms (3015 matches)
[pcre-study] time:    34 ms (3015 matches)
-----------------
Regex: '.{0,3}(Tom|Sawyer|Huckleberry|Finn)'
[pcre-nostudy] time:  5696 ms (3015 matches)
[pcre-study] time:  5655 ms (3015 matches)
-----------------
Regex: '[a-zA-Z]+ing'
[pcre-nostudy] time:  1290 ms (95863 matches)
[pcre-study] time:  1167 ms (95863 matches)
-----------------
Regex: '^[a-zA-Z]{0,4}ing[^a-zA-Z]'
[pcre-nostudy] time:   136 ms (4507 matches)
[pcre-study] time:   134 ms (4507 matches)
-----------------
Regex: '[a-zA-Z]+ing$'
[pcre-nostudy] time:  1334 ms (5360 matches)
[pcre-study] time:  1214 ms (5360 matches)
-----------------
Regex: '^[a-zA-Z ]{5,}$'
[pcre-nostudy] time:   198 ms (26236 matches)
[pcre-study] time:   197 ms (26236 matches)
-----------------
Regex: '^.{16,20}$'
[pcre-nostudy] time:   173 ms (4902 matches)
[pcre-study] time:   175 ms (4902 matches)
-----------------
Regex: '([a-f](.[d-m].){0,2}[h-n]){2}'
[pcre-nostudy] time:  1242 ms (68621 matches)
[pcre-study] time:   690 ms (68621 matches)
-----------------
Regex: '([A-Za-z]awyer|[A-Za-z]inn)[^a-zA-Z]'
[pcre-nostudy] time:  1215 ms (675 matches)
[pcre-study] time:   952 ms (675 matches)
-----------------
Regex: '"[^"]{0,30}[?!\.]"'
[pcre-nostudy] time:    27 ms (5972 matches)
[pcre-study] time:    28 ms (5972 matches)
-----------------
Regex: 'Tom.{10,25}river|river.{10,25}Tom'
[pcre-nostudy] time:   705 ms (2 matches)
[pcre-study] time:    68 ms (2 matches)

In some cases it's more or less the same, but when it's faster, it is by a huge margin.
It always depends on the pattern, the string(s) to match against, etc.

Signed-off-by: Christian Ruppert <c.ruppert@babiel.com>
2014-11-18 13:26:18 +01:00

319 lines
7.5 KiB
C

/*
* Regex and string management functions.
*
* Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <common/config.h>
#include <common/defaults.h>
#include <common/regex.h>
#include <common/standard.h>
#include <proto/log.h>
/* Regex trash buffer used by various regex tests: a file-scope array of
 * MAX_MATCH match slots. Note that regex_exec_match() and regex_exec_match2()
 * in this file take a parameter that is also named <pmatch>; inside those
 * functions the parameter shadows this array.
 */
regmatch_t pmatch[MAX_MATCH]; /* rm_so, rm_eo for regular expressions */
/*
 * Expands the replacement template <str> into <dst>, which can hold at most
 * <dst_size> bytes including the trailing NUL.
 *
 * The template is copied verbatim except for backslash sequences:
 *   - "\N" (N = one decimal digit) is replaced by the part of <src> delimited
 *     by matches[N]; nothing is emitted when that group is unset (offset -1);
 *   - "\xHH" emits the byte whose value is given by the two hex digits HH
 *     (digits are not validated here, see check_replace_string());
 *   - "\c" for any other character emits <c> itself.
 *
 * Returns the number of bytes written (trailing NUL excluded), or -1 if the
 * template ends in the middle of an escape sequence or if the result does not
 * fit in <dst>.
 */
int exp_replace(char *dst, unsigned int dst_size, char *src, const char *str, const regmatch_t *matches)
{
	char *old_dst = dst;
	char *dst_end = dst + dst_size;

	while (*str) {
		if (*str != '\\') {
			/* plain character */
			if (dst >= dst_end)
				return -1;
			*dst++ = *str++;
			continue;
		}

		/* skip the backslash; it must be followed by something */
		str++;
		if (!*str)
			return -1;

		if (isdigit((unsigned char)*str)) {
			int num = *str - '0';

			str++;
			if (matches[num].rm_eo > -1 && matches[num].rm_so > -1) {
				int len = matches[num].rm_eo - matches[num].rm_so;

				/* Compare lengths rather than pointers: computing
				 * <dst + len> could form a pointer beyond the buffer,
				 * which is undefined. One byte is kept for the NUL.
				 */
				if (len < 0 || len >= dst_end - dst)
					return -1;

				memcpy(dst, src + matches[num].rm_so, len);
				dst += len;
			}
		} else if (*str == 'x') {
			unsigned char hex1, hex2;

			str++;
			if (!*str)
				return -1;
			/* <ctype.h> functions require an unsigned char value */
			hex1 = toupper((unsigned char)*str++) - '0';

			if (!*str)
				return -1;
			hex2 = toupper((unsigned char)*str++) - '0';

			/* map 'A'..'F' (17..22 after the subtraction) to 10..15 */
			if (hex1 > 9)
				hex1 -= 'A' - '9' - 1;
			if (hex2 > 9)
				hex2 -= 'A' - '9' - 1;

			if (dst >= dst_end)
				return -1;
			*dst++ = (hex1 << 4) + hex2;
		} else {
			/* any other escaped character is copied as-is */
			if (dst >= dst_end)
				return -1;
			*dst++ = *str++;
		}
	}

	/* room for the trailing NUL */
	if (dst >= dst_end)
		return -1;
	*dst = '\0';

	return dst - old_dst;
}
/* Validates the replacement string <str>. Returns NULL when it is valid, or
 * a pointer to the backslash that starts the first invalid escape sequence.
 * An escape is invalid when the backslash is the last character, or when
 * "\x" is not followed by two hexadecimal digits. A backslash in front of
 * anything other than a digit or 'x' is accepted but triggers a warning.
 */
const char *check_replace_string(const char *str)
{
	const char *p;

	for (p = str; *p != '\0'; p++) {
		const char *backslash;

		if (*p != '\\')
			continue;

		/* remember where the escape starts, then look at what follows */
		backslash = p;
		p++;

		if (*p == '\0')
			return backslash;

		if (isdigit((unsigned char)*p))
			continue;

		if (*p != 'x') {
			Warning("'\\%c' : deprecated use of a backslash before something not '\\','x' or a digit.\n", *p);
			continue;
		}

		/* "\x" needs two hex digits; the second one is only looked at
		 * when the first one is valid, so the end of string is never
		 * crossed.
		 */
		if (!ishex(p[1]))
			return backslash;
		if (!ishex(p[2]))
			return backslash;
		p += 2;
	}

	return NULL;
}
/* Appends a new expression built from <preg>, <action>, <replace> and <cond>
 * at the tail of the list <head>. When <replace> is not NULL it is validated
 * first with check_replace_string(); if it is invalid, nothing is added and
 * the pointer to the error inside <replace> is returned. Returns NULL on
 * success.
 *
 * NOTE(review): the result of calloc() is used without being checked, so an
 * allocation failure leads to a NULL dereference here.
 */
const char *chain_regex(struct hdr_exp **head, struct my_regex *preg,
			int action, const char *replace, void *cond)
{
	struct hdr_exp **tail;
	struct hdr_exp *node;
	const char *bad = replace ? check_replace_string(replace) : NULL;

	if (bad != NULL)
		return bad;

	/* walk to the link that terminates the list */
	for (tail = head; *tail != NULL; tail = &(*tail)->next)
		;

	node = calloc(1, sizeof(struct hdr_exp));
	node->cond = cond;
	node->action = action;
	node->replace = replace;
	node->preg = preg;

	*tail = node;
	return NULL;
}
/* Applies the compiled regex <preg> to the NUL-terminated string <subject>.
 * Returns 1 if the subject matches, otherwise 0.
 *
 * On a match, the first <nmatch> entries of <pmatch> receive the start/end
 * offsets of the whole match and of the captured groups; entries that were
 * not set are filled with -1.
 *
 * When built with PCRE (with or without JIT), strlen() is run on <subject>
 * and at most MAX_MATCH entries carry offsets; any extra entries requested
 * by the caller are set to -1.
 */
int regex_exec_match(const struct my_regex *preg, const char *subject,
                     size_t nmatch, regmatch_t pmatch[]) {
#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
	int ret;
	int matches[MAX_MATCH * 3];
	int enmatch;
	size_t i;

	/* Silently limit the number of pairs requested from PCRE to what the
	 * local vector can hold. The clamp is done before narrowing <nmatch>
	 * to int so that a large value cannot be truncated.
	 */
	enmatch = (nmatch > MAX_MATCH) ? MAX_MATCH : (int)nmatch;

	/* The value returned by pcre_exec() is one more than the highest
	 * numbered pair that has been set. For example, if two substrings have
	 * been captured, the returned value is 3. If there are no capturing
	 * subpatterns, the return value from a successful match is 1,
	 * indicating that just the first pair of offsets has been set.
	 *
	 * It returns 0 when the match succeeded but the vector was too small
	 * to hold every pair; in that case all <enmatch> pairs are in use.
	 * Negative values mean no match or an error.
	 */
	ret = pcre_exec(preg->reg, preg->extra, subject, strlen(subject), 0, 0, matches, enmatch * 3);
	if (ret < 0)
		return 0;

	if (ret == 0)
		ret = enmatch;

	for (i = 0; i < nmatch; i++) {
		if (i < (size_t)ret) {
			/* copy the offsets of pair <i> */
			pmatch[i].rm_so = matches[i * 2];
			pmatch[i].rm_eo = matches[i * 2 + 1];
		} else {
			/* flag the entry as unmatched */
			pmatch[i].rm_so = -1;
			pmatch[i].rm_eo = -1;
		}
	}
	return 1;
#else
	/* regexec() returns 0 only on a successful match. Any other value is
	 * reported as "no match": REG_NOMATCH of course, but also any error
	 * code an implementation may return, which must not be mistaken for
	 * a match.
	 */
	return regexec(&preg->regex, subject, nmatch, pmatch, 0) == 0;
#endif
}
/* Applies the compiled regex <preg> to the first <length> bytes of
 * <subject>. Returns 1 if the subject matches, otherwise 0.
 *
 * On a match, the first <nmatch> entries of <pmatch> receive the start/end
 * offsets of the whole match and of the captured groups; entries that were
 * not set are filled with -1.
 *
 * When built with PCRE (with or without JIT), <length> is passed directly to
 * the engine and <subject> is left untouched; at most MAX_MATCH entries
 * carry offsets and any extra entries requested by the caller are set to -1.
 *
 * When built with the standard POSIX regex, subject[length] is temporarily
 * replaced with a NUL character and restored before returning, so <subject>
 * must be writable and have a real size of at least <length> + 1.
 */
int regex_exec_match2(const struct my_regex *preg, char *subject, int length,
                      size_t nmatch, regmatch_t pmatch[]) {
#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
	int ret;
	int matches[MAX_MATCH * 3];
	int enmatch;
	size_t i;

	/* Silently limit the number of pairs requested from PCRE to what the
	 * local vector can hold. The clamp is done before narrowing <nmatch>
	 * to int so that a large value cannot be truncated.
	 */
	enmatch = (nmatch > MAX_MATCH) ? MAX_MATCH : (int)nmatch;

	/* The value returned by pcre_exec() is one more than the highest
	 * numbered pair that has been set. For example, if two substrings have
	 * been captured, the returned value is 3. If there are no capturing
	 * subpatterns, the return value from a successful match is 1,
	 * indicating that just the first pair of offsets has been set.
	 *
	 * It returns 0 when the match succeeded but the vector was too small
	 * to hold every pair; in that case all <enmatch> pairs are in use.
	 * Negative values mean no match or an error.
	 */
	ret = pcre_exec(preg->reg, preg->extra, subject, length, 0, 0, matches, enmatch * 3);
	if (ret < 0)
		return 0;

	if (ret == 0)
		ret = enmatch;

	for (i = 0; i < nmatch; i++) {
		if (i < (size_t)ret) {
			/* copy the offsets of pair <i> */
			pmatch[i].rm_so = matches[i * 2];
			pmatch[i].rm_eo = matches[i * 2 + 1];
		} else {
			/* flag the entry as unmatched */
			pmatch[i].rm_so = -1;
			pmatch[i].rm_eo = -1;
		}
	}
	return 1;
#else
	char old_char = subject[length];
	int match;

	/* regexec() needs a NUL-terminated string: cut the subject at
	 * <length> for the duration of the call, then restore the byte.
	 */
	subject[length] = 0;
	match = regexec(&preg->regex, subject, nmatch, pmatch, 0);
	subject[length] = old_char;

	/* regexec() returns 0 only on a successful match. Any other value is
	 * reported as "no match": REG_NOMATCH of course, but also any error
	 * code an implementation may return, which must not be mistaken for
	 * a match.
	 */
	return match == 0;
#endif
}
/* Compiles the pattern <str> into <regex>. Matching is case-sensitive when
 * <cs> is non-zero and case-insensitive otherwise; capture of sub-matches is
 * disabled when <cap> is zero. Returns 1 on success. On failure, returns 0
 * after formatting an error message into <err> through memprintf().
 *
 * With PCRE, the compiled pattern is also passed to pcre_study(); a NULL
 * result from pcre_study() is only treated as a failure when it also reports
 * an error string, in which case the compiled pattern is released.
 */
int regex_comp(const char *str, struct my_regex *regex, int cs, int cap, char **err)
{
#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
	const char *errmsg;
	int errpos;
	int opts = (cs ? 0 : PCRE_CASELESS) | (cap ? 0 : PCRE_NO_AUTO_CAPTURE);

	regex->reg = pcre_compile(str, opts, &errmsg, &errpos, NULL);
	if (regex->reg == NULL) {
		memprintf(err, "regex '%s' is invalid (error=%s, erroffset=%d)", str, errmsg, errpos);
		return 0;
	}

	regex->extra = pcre_study(regex->reg, PCRE_STUDY_JIT_COMPILE, &errmsg);
	if (regex->extra == NULL && errmsg != NULL) {
		pcre_free(regex->reg);
		memprintf(err, "failed to compile regex '%s' (error=%s)", str, errmsg);
		return 0;
	}

	return 1;
#else
	int opts = REG_EXTENDED | (cs ? 0 : REG_ICASE) | (cap ? 0 : REG_NOSUB);

	if (regcomp(&regex->regex, str, opts) != 0) {
		memprintf(err, "regex '%s' is invalid", str);
		return 0;
	}

	return 1;
#endif
}
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/