MINOR: tools: add the ability to update a word fingerprint

Instead of making a new one from scratch, let's support updating an
existing fingerprint without wiping it first, and doing the same char by
char. The word-by-word one will still result in multiple beginnings and
ends, but that will accurately translate word boundaries. The char-based
one has more flexibility and requires that the caller maintains the
previous char to indicate the transition, which also allows inserting
delimiters, for example.
This commit is contained in:
Willy Tarreau 2021-03-12 18:59:31 +01:00
parent b736458bfa
commit e33c4b3c11
2 changed files with 42 additions and 3 deletions

View File

@ -865,6 +865,7 @@ int my_unsetenv(const char *name);
char *env_expand(char *in); char *env_expand(char *in);
uint32_t parse_line(char *in, char *out, size_t *outlen, char **args, int *nbargs, uint32_t opts, char **errptr); uint32_t parse_line(char *in, char *out, size_t *outlen, char **args, int *nbargs, uint32_t opts, char **errptr);
size_t sanitize_for_printing(char *line, size_t pos, size_t width); size_t sanitize_for_printing(char *line, size_t pos, size_t width);
void update_word_fingerprint(uint8_t *fp, const char *word);
void make_word_fingerprint(uint8_t *fp, const char *word); void make_word_fingerprint(uint8_t *fp, const char *word);
int word_fingerprint_distance(const uint8_t *fp1, const uint8_t *fp2); int word_fingerprint_distance(const uint8_t *fp1, const uint8_t *fp2);
@ -1072,5 +1073,33 @@ static inline unsigned int statistical_prng()
return statistical_prng_state = x; return statistical_prng_state = x;
} }
/* Update array <fp> with the character transition <prev> to <curr>. If <prev>
 * is zero, it's assumed that <curr> is the first character. If <curr> is zero
 * it's assumed to mark the end. Both may be zero. <fp> is a 1024-entries array
 * indexed as 32*from+to; the entry matching the transition is incremented by
 * one. Positions for 'from' and 'to' are:
 *   0..25=letter (case-insensitive), 26=digit, 27=other,
 *   28=begin (as 'from') or end (as 'to'), others unused.
 *
 * NOTE(review): this mapping was previously documented with 29=end while the
 * code stores the end marker at 28. The value is left untouched here so that
 * fingerprints remain comparable with those already produced; confirm against
 * update_word_fingerprint() which one is intended.
 */
static inline void update_char_fingerprint(uint8_t *fp, char prev, char curr)
{
	int from, to;

	switch (prev) {
	/* 28 is the "begin" row, as in update_word_fingerprint(); 26 would
	 * make a word start indistinguishable from a leading digit.
	 */
	case 0:         from = 28; break; // begin
	case 'a'...'z': from = prev - 'a'; break;
	case 'A'...'Z': from = tolower((unsigned char)prev) - 'a'; break;
	case '0'...'9': from = 26; break;
	default:        from = 27; break;
	}

	switch (curr) {
	case 0:         to = 28; break; // end
	case 'a'...'z': to = curr - 'a'; break;
	case 'A'...'Z': to = tolower((unsigned char)curr) - 'a'; break;
	case '0'...'9': to = 26; break;
	default:        to = 27; break;
	}

	fp[32 * from + to]++;
}
#endif /* _HAPROXY_TOOLS_H */ #endif /* _HAPROXY_TOOLS_H */

View File

@ -5369,18 +5369,17 @@ size_t sanitize_for_printing(char *line, size_t pos, size_t width)
return pos - shift; return pos - shift;
} }
/* Initialize array <fp> with the fingerprint of word <word> by counting the /* Update array <fp> with the fingerprint of word <word> by counting the
* transitions between characters. <fp> is a 1024-entries array indexed as * transitions between characters. <fp> is a 1024-entries array indexed as
* 32*from+to. Positions for 'from' and 'to' are: * 32*from+to. Positions for 'from' and 'to' are:
* 0..25=letter, 26=digit, 27=other, 28=begin, 29=end, others unused. * 0..25=letter, 26=digit, 27=other, 28=begin, 29=end, others unused.
*/ */
void make_word_fingerprint(uint8_t *fp, const char *word) void update_word_fingerprint(uint8_t *fp, const char *word)
{ {
const char *p; const char *p;
int from, to; int from, to;
int c; int c;
memset(fp, 0, 1024);
from = 28; // begin from = 28; // begin
for (p = word; *p; p++) { for (p = word; *p; p++) {
c = tolower(*p); c = tolower(*p);
@ -5397,6 +5396,17 @@ void make_word_fingerprint(uint8_t *fp, const char *word)
fp[32 * from + to]++; fp[32 * from + to]++;
} }
/* Build the fingerprint of word <word> from scratch into array <fp>: the
 * array is first cleared, then update_word_fingerprint() is used to count
 * the transitions between characters. <fp> is a 1024-entries array indexed
 * as 32*from+to, with the same 'from' and 'to' positions as those used by
 * update_word_fingerprint().
 */
void make_word_fingerprint(uint8_t *fp, const char *word)
{
	/* 32 'from' rows times 32 'to' columns, one byte per counter */
	memset(fp, 0, 32 * 32 * sizeof(*fp));

	update_word_fingerprint(fp, word);
}
/* Return the distance between two word fingerprints created by function /* Return the distance between two word fingerprints created by function
* make_word_fingerprint(). It's a positive integer calculated as the sum of * make_word_fingerprint(). It's a positive integer calculated as the sum of
* the squares of the differences between each location. * the squares of the differences between each location.