FastRegexMatcher: Extract longer prefixes

Given a regex like `a(b|c).*`, find ab and ac as prefixes.
Even though the original may have looked like `ab.*|ac.*`, Go extracts
the sub-prefixes, but we want the longer strings when the optimisation
to put them into a map applies.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>
This commit is contained in:
Bryan Boreham 2024-10-22 21:24:00 +01:00
parent 2182b83271
commit 6732873532
2 changed files with 86 additions and 31 deletions

View File

@ -14,6 +14,7 @@
package labels
import (
"math"
"slices"
"strings"
"unicode"
@ -76,10 +77,16 @@ func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
if parsed.Op == syntax.OpConcat {
m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
}
if matches, caseSensitive := findSetMatches(parsed); caseSensitive {
matches, prefixes, caseSensitive := findSetMatches(parsed)
switch {
case len(matches) == 0 && len(prefixes) > minEqualMultiStringMatcherMapThreshold:
m.stringMatcher = newMatcherFromPrefixMatchers(caseSensitive, prefixes)
case len(matches) > 0 && len(prefixes) == 0 && caseSensitive:
m.setMatches = matches
m.stringMatcher = stringMatcherFromRegexp(parsed)
default:
m.stringMatcher = stringMatcherFromRegexp(parsed)
}
m.stringMatcher = stringMatcherFromRegexp(parsed)
m.matchString = m.compileMatchStringFunction()
}
@ -127,27 +134,27 @@ func (m *FastRegexMatcher) IsOptimized() bool {
// findSetMatches extract equality matches from a regexp.
// Returns nil if we can't replace the regexp by only equality matchers or the regexp contains
// a mix of case sensitive and case insensitive matchers.
func findSetMatches(re *syntax.Regexp) (matches []string, caseSensitive bool) {
func findSetMatches(re *syntax.Regexp) (matches []string, prefixes []prefixMatcher, caseSensitive bool) {
clearBeginEndText(re)
return findSetMatchesInternal(re, "")
}
func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) {
func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, prefixes []prefixMatcher, matchesCaseSensitive bool) {
switch re.Op {
case syntax.OpBeginText:
// Correctly handling the begin text operator inside a regex is tricky,
// so in this case we fallback to the regex engine.
return nil, false
return nil, nil, false
case syntax.OpEndText:
// Correctly handling the end text operator inside a regex is tricky,
// so in this case we fallback to the regex engine.
return nil, false
return nil, nil, false
case syntax.OpLiteral:
return []string{base + string(re.Rune)}, isCaseSensitive(re)
return []string{base + string(re.Rune)}, nil, isCaseSensitive(re)
case syntax.OpEmptyMatch:
if base != "" {
return []string{base}, isCaseSensitive(re)
return []string{base}, nil, isCaseSensitive(re)
}
case syntax.OpAlternate:
return findSetMatchesFromAlternate(re, base)
@ -158,7 +165,7 @@ func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, c
return findSetMatchesFromConcat(re, base)
case syntax.OpCharClass:
if len(re.Rune)%2 != 0 {
return nil, false
return nil, nil, false
}
var matches []string
var totalSet int
@ -169,7 +176,7 @@ func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, c
// In some case like negation [^0-9] a lot of possibilities exists and that
// can create thousands of possible matches at which points we're better off using regexp.
if totalSet > maxSetMatches {
return nil, false
return nil, nil, false
}
for i := 0; i+1 < len(re.Rune); i += 2 {
lo, hi := re.Rune[i], re.Rune[i+1]
@ -177,30 +184,40 @@ func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, c
matches = append(matches, base+string(c))
}
}
return matches, isCaseSensitive(re)
return matches, nil, isCaseSensitive(re)
default:
return nil, false
return nil, nil, false
}
return nil, false
return nil, nil, false
}
func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, prefixes []prefixMatcher, matchesCaseSensitive bool) {
if len(re.Sub) == 0 {
return nil, false
return nil, nil, false
}
clearCapture(re.Sub...)
matches = []string{base}
for i := 0; i < len(re.Sub); i++ {
var newMatches []string
// An OpStar makes the matches so far into prefixes.
if i > 0 && re.Sub[i].Op == syntax.OpStar && len(matches) > 0 {
prefixes = make([]prefixMatcher, 0, len(matches))
for _, prefix := range matches {
right := stringMatcherFromRegexpInternal(re.Sub[i])
prefixes = append(prefixes, newLiteralPrefixStringMatcher(prefix, matchesCaseSensitive, right))
}
return []string{}, prefixes, matchesCaseSensitive
}
newMatches := []string{}
for j, b := range matches {
m, caseSensitive := findSetMatchesInternal(re.Sub[i], b)
m, p, caseSensitive := findSetMatchesInternal(re.Sub[i], b)
if m == nil {
return nil, false
return nil, nil, false
}
if tooManyMatches(newMatches, m...) {
return nil, false
return nil, nil, false
}
// All matches must have the same case sensitivity. If it's the first set of matches
@ -210,25 +227,29 @@ func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string,
matchesCaseSensitive = caseSensitive
}
if matchesCaseSensitive != caseSensitive {
return nil, false
return nil, nil, false
}
newMatches = append(newMatches, m...)
if i == len(re.Sub)-1 && len(p) > 0 {
prefixes = append(prefixes, p...)
}
}
matches = newMatches
}
return matches, matchesCaseSensitive
return matches, prefixes, matchesCaseSensitive
}
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, prefixes []prefixMatcher, matchesCaseSensitive bool) {
matches = []string{}
for i, sub := range re.Sub {
found, caseSensitive := findSetMatchesInternal(sub, base)
found, foundPrefixes, caseSensitive := findSetMatchesInternal(sub, base)
if found == nil {
return nil, false
return nil, nil, false
}
if tooManyMatches(matches, found...) {
return nil, false
return nil, nil, false
}
// All matches must have the same case sensitivity. If it's the first set of matches
@ -238,13 +259,14 @@ func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []stri
matchesCaseSensitive = caseSensitive
}
if matchesCaseSensitive != caseSensitive {
return nil, false
return nil, nil, false
}
matches = append(matches, found...)
prefixes = append(prefixes, foundPrefixes...)
}
return matches, matchesCaseSensitive
return matches, prefixes, matchesCaseSensitive
}
// clearCapture removes capture operation as they are not used for matching.
@ -406,6 +428,11 @@ type StringMatcher interface {
Matches(s string) bool
}
type prefixMatcher interface {
StringMatcher
Prefix() string
}
// stringMatcherFromRegexp attempts to replace a common regexp with a string matcher.
// It returns nil if the regexp is not supported.
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
@ -503,7 +530,11 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
re.Sub = re.Sub[:len(re.Sub)-1]
}
matches, matchesCaseSensitive := findSetMatchesInternal(re, "")
matches, prefixes, matchesCaseSensitive := findSetMatchesInternal(re, "")
if left == nil && right == nil && len(matches) == 0 && len(prefixes) > minEqualMultiStringMatcherMapThreshold {
return newMatcherFromPrefixMatchers(matchesCaseSensitive, prefixes)
}
if len(matches) == 0 && len(re.Sub) == 2 {
// We have not find fixed set matches. We look for other known cases that
@ -545,6 +576,9 @@ func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
caseSensitive: matchesCaseSensitive,
})
}
for _, prefix := range prefixes {
or = append(or, prefix)
}
return orStringMatcher(or)
// Right matcher with 1 fixed set match.
@ -627,7 +661,7 @@ func (m *containsStringMatcher) Matches(s string) bool {
return false
}
func newLiteralPrefixStringMatcher(prefix string, prefixCaseSensitive bool, right StringMatcher) StringMatcher {
func newLiteralPrefixStringMatcher(prefix string, prefixCaseSensitive bool, right StringMatcher) prefixMatcher {
if prefixCaseSensitive {
return &literalPrefixSensitiveStringMatcher{
prefix: prefix,
@ -658,6 +692,10 @@ func (m *literalPrefixSensitiveStringMatcher) Matches(s string) bool {
return m.right.Matches(s[len(m.prefix):])
}
func (m *literalPrefixSensitiveStringMatcher) Prefix() string {
return m.prefix
}
// literalPrefixInsensitiveStringMatcher matches a string with the given literal case-insensitive prefix and right side matcher.
type literalPrefixInsensitiveStringMatcher struct {
prefix string
@ -675,6 +713,10 @@ func (m *literalPrefixInsensitiveStringMatcher) Matches(s string) bool {
return m.right.Matches(s[len(m.prefix):])
}
func (m *literalPrefixInsensitiveStringMatcher) Prefix() string {
return m.prefix
}
// literalSuffixStringMatcher matches a string with the given literal suffix and left side matcher.
type literalSuffixStringMatcher struct {
// The matcher that must match the left side. Can be nil.
@ -729,6 +771,18 @@ func (m *equalStringMatcher) Matches(s string) bool {
return strings.EqualFold(m.s, s)
}
func newMatcherFromPrefixMatchers(caseSensitive bool, matchers []prefixMatcher) StringMatcher {
minPrefixLength := math.MaxInt
for _, m := range matchers {
minPrefixLength = min(minPrefixLength, len(m.Prefix()))
}
multiMatcher := newEqualMultiStringMatcher(caseSensitive, 0, len(matchers), minPrefixLength)
for _, m := range matchers {
multiMatcher.addPrefix(m.Prefix(), caseSensitive, m)
}
return multiMatcher
}
type multiStringMatcherBuilder interface {
StringMatcher
add(s string)

View File

@ -250,9 +250,10 @@ func TestFindSetMatches(t *testing.T) {
t.Parallel()
parsed, err := syntax.Parse(c.pattern, syntax.Perl|syntax.DotNL)
require.NoError(t, err)
matches, actualCaseSensitive := findSetMatches(parsed)
matches, prefixes, actualCaseSensitive := findSetMatches(parsed)
require.Equal(t, c.expMatches, matches)
require.Equal(t, c.expCaseSensitive, actualCaseSensitive)
require.Empty(t, prefixes)
if c.expCaseSensitive {
// When the regexp is case sensitive, we want to ensure that the
@ -393,7 +394,7 @@ func TestStringMatcherFromRegexp(t *testing.T) {
{".*foo.*bar.*", nil},
{`\d*`, nil},
{".", nil},
{"/|/bar.*", &literalPrefixSensitiveStringMatcher{prefix: "/", right: orStringMatcher{emptyStringMatcher{}, &literalPrefixSensitiveStringMatcher{prefix: "bar", right: trueMatcher{}}}}},
{"/|/bar.*", orStringMatcher{&equalStringMatcher{s: "/", caseSensitive: true}, &literalPrefixSensitiveStringMatcher{prefix: "/bar", right: trueMatcher{}}}},
// This one is not supported because `stringMatcherFromRegexp` is not reentrant for syntax.OpConcat.
// It would make the code too complex to handle it.
{"(.+)/(foo.*|bar$)", nil},