haproxy/src/regex.c
Willy Tarreau fda6dc9597 MINOR: regex: use a thread-local match pointer for pcre2
The pcre2 matching requires an array of matches for grouping, that is
allocated when executing the rule by pre-processing it, and that is
immediately freed after use. This is quite inefficient and results in
annoying patterns in "show profiling" that attribute the allocations
to libpcre2 and the releases to haproxy.

A good suggestion from Dragan is to pre-allocate these per thread,
since the entry is not specific to a regex. In addition we're already
limited to MAX_MATCH matches so we don't even have the problem of
having to grow it while parsing nor processing.

The current patch adds a per-thread pair of init/deinit functions to
allocate a thread-local entry for that, and gets rid of the dynamic
allocations. It will result in cleaner memory management patterns and
slightly higher performance (+2.5%) when using pcre2.
2025-10-13 16:56:43 +02:00

470 lines
12 KiB
C

/*
* Regex and string management functions.
*
* Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <haproxy/api.h>
#include <haproxy/errors.h>
#include <haproxy/global.h>
#include <haproxy/regex.h>
#include <haproxy/tools.h>
/* Thread-local scratch array of match offsets (rm_so/rm_eo pairs) that regex
 * users may pass to the matching functions. It holds MAX_MATCH entries.
 */
THREAD_LOCAL regmatch_t pmatch[MAX_MATCH]; /* rm_so, rm_eo for regular expressions */
#if defined(USE_PCRE2)
/* Thread-local PCRE2 match data block, reused by regex_exec_match() and
 * regex_exec_match2() so that no match data is allocated and freed on each
 * execution. It starts as NULL, is allocated by init_pcre2_per_thread() and
 * is released by deinit_pcre2_per_thread().
 */
THREAD_LOCAL pcre2_match_data *local_pcre2_match = NULL;
#endif
/*
 * Expands the replacement template <str> into <dst>, which can hold up to
 * <dst_size> bytes including the trailing NUL. The template is copied
 * verbatim except for backslash sequences:
 *   - "\<digit>" is replaced with the text of capture group <digit>, taken
 *     from <src> using the offsets in <matches>. A group whose offsets are
 *     negative (unset) expands to nothing. <matches> must therefore provide
 *     at least 10 entries.
 *   - "\xHH" emits the byte whose value is given by the two hex digits. The
 *     digits are not validated here (see check_replace_string()).
 *   - "\<other>" emits <other> as-is.
 *
 * Returns the length of the resulting string (excluding the trailing NUL),
 * or -1 if the template ends in the middle of an escape sequence, if a match
 * has an inverted range, or if the result does not fit into <dst>.
 */
int exp_replace(char *dst, unsigned int dst_size, char *src, const char *str, const regmatch_t *matches)
{
	char *old_dst = dst;
	char *dst_end = dst + dst_size;

	while (*str) {
		if (*str != '\\') {
			/* plain character */
			if (dst >= dst_end)
				return -1;
			*dst++ = *str++;
			continue;
		}

		/* skip the backslash; something must follow it */
		str++;
		if (!*str)
			return -1;

		if (isdigit((unsigned char)*str)) {
			int num = *str - '0';

			str++;
			if (matches[num].rm_eo > -1 && matches[num].rm_so > -1) {
				int len = matches[num].rm_eo - matches[num].rm_so;

				/* Compare lengths rather than pointers: forming
				 * <dst + len> beyond the end of the buffer is
				 * undefined behaviour. A negative length (inverted
				 * match) must be rejected too, otherwise it would
				 * reach memcpy() as a huge size. One byte is always
				 * kept for the trailing NUL.
				 */
				if (len < 0 || len >= dst_end - dst)
					return -1;

				memcpy(dst, src + matches[num].rm_so, len);
				dst += len;
			}
		}
		else if (*str == 'x') {
			unsigned char hex1, hex2;

			str++;
			if (!*str)
				return -1;
			hex1 = toupper((unsigned char)*str++) - '0';

			if (!*str)
				return -1;
			hex2 = toupper((unsigned char)*str++) - '0';

			/* map 'A'..'F' (now at offsets 17..22) down to 10..15 */
			if (hex1 > 9)
				hex1 -= 'A' - '9' - 1;
			if (hex2 > 9)
				hex2 -= 'A' - '9' - 1;

			if (dst >= dst_end)
				return -1;
			*dst++ = (hex1 << 4) + hex2;
		}
		else {
			/* any other escaped character is emitted unchanged */
			if (dst >= dst_end)
				return -1;
			*dst++ = *str++;
		}
	}

	/* room for the trailing NUL */
	if (dst >= dst_end)
		return -1;

	*dst = '\0';
	return dst - old_dst;
}
/* Validates the replacement template <str>. Returns NULL when it is
 * acceptable, otherwise a pointer to the backslash that starts the first
 * invalid escape sequence. A backslash is valid when followed by a digit or
 * by 'x' and two hex digits. A backslash followed by any other character is
 * accepted but triggers a deprecation warning. A trailing backslash is an
 * error.
 */
const char *check_replace_string(const char *str)
{
	const char *backslash;

	for (; *str; str++) {
		if (*str != '\\')
			continue;

		/* remember where the sequence starts, then look at what follows */
		backslash = str++;

		if (!*str)
			return backslash;

		if (isdigit((unsigned char)*str))
			continue;

		if (*str != 'x') {
			ha_warning("'\\%c' : deprecated use of a backslash before something not '\\','x' or a digit.\n", *str);
			continue;
		}

		/* "\x" needs two hex digits; the second one is only looked at
		 * when the first one is valid.
		 */
		if (!ishex(str[1]) || !ishex(str[2]))
			return backslash;

		/* leave <str> on the last hex digit, the loop steps past it */
		str += 2;
	}
	return NULL;
}
/* Executes the compiled regex <preg> on the NUL-terminated string <subject>.
 * Returns 1 on a match and 0 otherwise. Up to <nmatch> entries of <pmatch>
 * are filled with the offsets of the matched groups; entries for groups that
 * were not reported are set to -1/-1. With the PCRE and PCRE2 backends the
 * subject length is obtained with strlen(). With the POSIX backend, any
 * regexec() result other than REG_NOMATCH is reported as a match.
 * The only supported flag is REG_NOTBOL.
 */
int regex_exec_match(const struct my_regex *preg, const char *subject,
                     size_t nmatch, regmatch_t pmatch[], int flags) {
#if defined(USE_PCRE) || defined(USE_PCRE_JIT) || defined(USE_PCRE2) || defined(USE_PCRE2_JIT)
#ifdef USE_PCRE2
	PCRE2_SIZE *ovec;
#else
	int ovec[MAX_MATCH * 3];
#endif
	int limit;
	int found;
	int opts;
	int idx;

	/* Never consider more than MAX_MATCH offset pairs. */
	limit = nmatch;
	if (limit > MAX_MATCH)
		limit = MAX_MATCH;

#ifdef USE_PCRE2
	opts = (flags & REG_NOTBOL) ? PCRE2_NOTBOL : 0;
#else
	opts = (flags & REG_NOTBOL) ? PCRE_NOTBOL : 0;
#endif

	/* pcre_exec()/pcre2_match() return one more than the highest numbered
	 * pair that was set: 1 when only the whole match was stored, 3 when
	 * two substrings were captured, and so on. They seem to return 0 when
	 * there were more groups than room in the offsets vector. A negative
	 * value means no match or an error.
	 */
#ifdef USE_PCRE2
	/* relies on local_pcre2_match having been allocated by the per-thread init */
	found = preg->mfn(preg->reg, (PCRE2_SPTR)subject, (PCRE2_SIZE)strlen(subject), 0, opts, local_pcre2_match, NULL);
	if (found < 0)
		return 0;
	ovec = pcre2_get_ovector_pointer(local_pcre2_match);
#else
	found = pcre_exec(preg->reg, preg->extra, subject, strlen(subject), 0, opts, ovec, limit * 3);
	if (found < 0)
		return 0;
#endif

	/* vector too small: assume every pair up to the limit was filled */
	if (found == 0)
		found = limit;

	for (idx = 0; idx < nmatch; idx++) {
		if (idx >= found) {
			/* group not reported: mark it as unmatched */
			pmatch[idx].rm_so = -1;
			pmatch[idx].rm_eo = -1;
		}
		else {
			pmatch[idx].rm_so = ovec[2 * idx];
			pmatch[idx].rm_eo = ovec[2 * idx + 1];
		}
	}
	return 1;
#else
	int rc;

	rc = regexec(&preg->regex, subject, nmatch, pmatch, flags & REG_NOTBOL);
	return (rc == REG_NOMATCH) ? 0 : 1;
#endif
}
/* Executes the compiled regex <preg> on the <length> first bytes of
 * <subject>. Returns 1 on a match and 0 otherwise. Up to <nmatch> entries of
 * <pmatch> are filled with the offsets of the matched groups; entries for
 * groups that were not reported are set to -1/-1.
 *
 * With the POSIX backend, the byte at subject[length] is temporarily replaced
 * with a NUL and restored afterwards. <subject> must therefore be writable
 * and really hold <length> + 1 bytes. In that mode any regexec() result other
 * than REG_NOMATCH is reported as a match. The PCRE and PCRE2 backends
 * receive <length> directly and do not modify <subject>.
 * The only supported flag is REG_NOTBOL.
 */
int regex_exec_match2(const struct my_regex *preg, char *subject, int length,
                      size_t nmatch, regmatch_t pmatch[], int flags) {
#if defined(USE_PCRE) || defined(USE_PCRE_JIT) || defined(USE_PCRE2) || defined(USE_PCRE2_JIT)
#ifdef USE_PCRE2
	PCRE2_SIZE *ovec;
#else
	int ovec[MAX_MATCH * 3];
#endif
	int limit;
	int found;
	int opts;
	int idx;

	/* Never consider more than MAX_MATCH offset pairs. */
	limit = nmatch;
	if (limit > MAX_MATCH)
		limit = MAX_MATCH;

#ifdef USE_PCRE2
	opts = (flags & REG_NOTBOL) ? PCRE2_NOTBOL : 0;
#else
	opts = (flags & REG_NOTBOL) ? PCRE_NOTBOL : 0;
#endif

	/* pcre_exec()/pcre2_(jit_)match() return one more than the highest
	 * numbered pair that was set: 1 when only the whole match was stored,
	 * 3 when two substrings were captured, and so on. They seem to return
	 * 0 when there were more groups than room in the offsets vector. A
	 * negative value means no match or an error.
	 */
#ifdef USE_PCRE2
	/* relies on local_pcre2_match having been allocated by the per-thread init */
	found = preg->mfn(preg->reg, (PCRE2_SPTR)subject, (PCRE2_SIZE)length, 0, opts, local_pcre2_match, NULL);
	if (found < 0)
		return 0;
	ovec = pcre2_get_ovector_pointer(local_pcre2_match);
#else
	found = pcre_exec(preg->reg, preg->extra, subject, length, 0, opts, ovec, limit * 3);
	if (found < 0)
		return 0;
#endif

	/* vector too small: assume every pair up to the limit was filled */
	if (found == 0)
		found = limit;

	for (idx = 0; idx < nmatch; idx++) {
		if (idx >= found) {
			/* group not reported: mark it as unmatched */
			pmatch[idx].rm_so = -1;
			pmatch[idx].rm_eo = -1;
		}
		else {
			pmatch[idx].rm_so = ovec[2 * idx];
			pmatch[idx].rm_eo = ovec[2 * idx + 1];
		}
	}
	return 1;
#else
	char saved = subject[length];
	int rc;

	/* regexec() needs a NUL-terminated string: cut it, match, then restore */
	subject[length] = 0;
	rc = regexec(&preg->regex, subject, nmatch, pmatch, flags & REG_NOTBOL);
	subject[length] = saved;

	return (rc == REG_NOMATCH) ? 0 : 1;
#endif
}
/* Compiles the regular expression <str> with the backend selected at build
 * time (PCRE, PCRE2 or the libc's POSIX regex) and returns a newly allocated
 * struct my_regex, or NULL on failure. On failure an error message is
 * produced into <err> using memprintf().
 *
 * When <cs> is zero the regex is compiled case-insensitively. When <cap> is
 * zero the backend's no-capture option is set (PCRE_NO_AUTO_CAPTURE,
 * PCRE2_NO_AUTO_CAPTURE or REG_NOSUB).
 */
struct my_regex *regex_comp(const char *str, int cs, int cap, char **err)
{
	struct my_regex *re;
#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
	int opts = 0;
	const char *errmsg;
	int errpos;
#elif defined(USE_PCRE2) || defined(USE_PCRE2_JIT)
	int opts = 0;
	int errcode;
#if defined(USE_PCRE2_JIT)
	int jit_rc;
#endif
	PCRE2_UCHAR errmsg[256];
	PCRE2_SIZE errpos;
#else
	int opts = REG_EXTENDED;
#endif

	re = calloc(1, sizeof(*re));
	if (!re) {
		memprintf(err, "not enough memory to build regex");
		return NULL;
	}

#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
	if (!cs)
		opts |= PCRE_CASELESS;
	if (!cap)
		opts |= PCRE_NO_AUTO_CAPTURE;

	re->reg = pcre_compile(str, opts, &errmsg, &errpos, NULL);
	if (!re->reg) {
		memprintf(err, "regex '%s' is invalid (error=%s, erroffset=%d)", str, errmsg, errpos);
		goto fail;
	}

	/* a NULL result is only a failure when an error message was set */
	re->extra = pcre_study(re->reg, PCRE_STUDY_JIT_COMPILE, &errmsg);
	if (!re->extra && errmsg != NULL) {
		pcre_free(re->reg);
		memprintf(err, "failed to compile regex '%s' (error=%s)", str, errmsg);
		goto fail;
	}
#elif defined(USE_PCRE2) || defined(USE_PCRE2_JIT)
	if (!cs)
		opts |= PCRE2_CASELESS;
	if (!cap)
		opts |= PCRE2_NO_AUTO_CAPTURE;

	re->reg = pcre2_compile((PCRE2_SPTR)str, PCRE2_ZERO_TERMINATED, opts, &errcode, &errpos, NULL);
	if (!re->reg) {
		pcre2_get_error_message(errcode, errmsg, sizeof(errmsg));
		memprintf(err, "regex '%s' is invalid (error=%s, erroffset=%zu)", str, errmsg, errpos);
		goto fail;
	}

	/* interpreter by default, replaced below when JIT compilation works */
	re->mfn = &pcre2_match;
#if defined(USE_PCRE2_JIT)
	jit_rc = pcre2_jit_compile(re->reg, PCRE2_JIT_COMPLETE);
	switch (jit_rc) {
	case 0:
		re->mfn = &pcre2_jit_match;
		break;
	case PCRE2_ERROR_JIT_BADOPTION:
		/* reported when the library lacks JIT support: not fatal,
		 * keep using the interpreter.
		 */
		re->mfn = &pcre2_match;
		break;
	default:
		/* any other error is a real JIT compilation failure */
		pcre2_code_free(re->reg);
		memprintf(err, "regex '%s' jit compilation failed", str);
		goto fail;
	}
#endif
#else
	if (!cs)
		opts |= REG_ICASE;
	if (!cap)
		opts |= REG_NOSUB;

	if (regcomp(&re->regex, str, opts) != 0) {
		memprintf(err, "regex '%s' is invalid", str);
		goto fail;
	}
#endif
	return re;

 fail:
	free(re);
	return NULL;
}
/* Builds the text describing which regex backend this binary was built with
 * (PCRE, PCRE2 or the libc's regex) and passes it to
 * hap_register_build_opts(). The text is accumulated in <ptr> by successive
 * memprintf() calls, each one re-emitting the previous contents via "%s".
 */
static void regex_register_build_options(void)
{
char *ptr = NULL;
#ifdef USE_PCRE
/* Compile-time PCRE version. The first form is chosen when the string
 * produced by HAP_XSTRING(Z PCRE_PRERELEASE) is only one character long,
 * presumably meaning PCRE_PRERELEASE expands to nothing (assumes HAP_XSTRING
 * stringifies its argument after macro expansion -- TODO confirm).
 */
memprintf(&ptr, "Built with PCRE version : %s", (HAP_XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
HAP_XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :
HAP_XSTRING(PCRE_MAJOR.PCRE_MINOR) HAP_XSTRING(PCRE_PRERELEASE PCRE_DATE));
/* Version reported at run time by the library itself. */
memprintf(&ptr, "%s\nRunning on PCRE version : %s", ptr, pcre_version());
/* JIT availability: queried from the library when USE_PCRE_JIT is set,
 * using a statement expression that evaluates to the string to print.
 */
memprintf(&ptr, "%s\nPCRE library supports JIT : %s", ptr,
#ifdef USE_PCRE_JIT
({
int r;
pcre_config(PCRE_CONFIG_JIT, &r);
r ? "yes" : "no (libpcre build without JIT?)";
})
#else
"no (USE_PCRE_JIT not set)"
#endif
);
#endif /* USE_PCRE */
#ifdef USE_PCRE2
/* Same compile-time version reporting as above, using the PCRE2 macros. */
memprintf(&ptr, "Built with PCRE2 version : %s", (HAP_XSTRING(Z PCRE2_PRERELEASE)[1] == 0) ?
HAP_XSTRING(PCRE2_MAJOR.PCRE2_MINOR PCRE2_DATE) :
HAP_XSTRING(PCRE2_MAJOR.PCRE2_MINOR) HAP_XSTRING(PCRE2_PRERELEASE PCRE2_DATE));
/* JIT availability: queried from the library when USE_PCRE2_JIT is set. */
memprintf(&ptr, "%s\nPCRE2 library supports JIT : %s", ptr,
#ifdef USE_PCRE2_JIT
({
int r;
pcre2_config(PCRE2_CONFIG_JIT, &r);
r ? "yes" : "no (libpcre2 build without JIT?)";
})
#else
"no (USE_PCRE2_JIT not set)"
#endif
);
#endif /* USE_PCRE2 */
#if !defined(USE_PCRE) && !defined(USE_PCRE2)
/* Neither PCRE flavour was enabled at build time. */
memprintf(&ptr, "Built without PCRE or PCRE2 support (using libc's regex instead)");
#endif
/* NOTE(review): the second argument (1) presumably tells the registry that
 * <ptr> was dynamically allocated and must be freed by it -- confirm against
 * hap_register_build_opts().
 */
hap_register_build_opts(ptr, 1);
}
/* Registers regex_register_build_options() as an init call for the
 * STG_REGISTER stage (presumably run once at startup -- see INITCALL0).
 */
INITCALL0(STG_REGISTER, regex_register_build_options);
#ifdef USE_PCRE2
/* Per-thread init callback for PCRE2: allocates the thread-local match data
 * block used by regex_exec_match() and regex_exec_match2(). Returns 1 on
 * success, or 0 after emitting an alert if the allocation failed.
 */
static int init_pcre2_per_thread(void)
{
	/* The offsets vector must hold MAX_MATCH pairs. The matching functions
	 * copy up to MAX_MATCH pairs out of it when the library returns 0
	 * (more groups than room), so a smaller vector would be read past its
	 * end.
	 */
	local_pcre2_match = pcre2_match_data_create(MAX_MATCH, NULL);
	if (!local_pcre2_match) {
		ha_alert("Failed to allocate PCRE2 match data context for thread %u.\n", tid);
		return 0;
	}
	return 1;
}
/* Per-thread deinit callback for PCRE2: releases the thread-local match data
 * block allocated by init_pcre2_per_thread().
 */
static void deinit_pcre2_per_thread(void)
{
	pcre2_match_data_free(local_pcre2_match);
	/* do not leave a dangling pointer behind: a repeated deinit then
	 * becomes a harmless free of NULL instead of a double free.
	 */
	local_pcre2_match = NULL;
}
/* Register the allocation and release of the thread-local PCRE2 match data
 * as per-thread init and deinit callbacks.
 */
REGISTER_PER_THREAD_INIT(init_pcre2_per_thread);
REGISTER_PER_THREAD_DEINIT(deinit_pcre2_per_thread);
#endif
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/