// ---- util/strutil/jarowinkler.go ----

// JaroWinklerMatcher holds one fixed search term in a form that is ready to be
// compared against many candidate strings. The ASCII check on the term is done
// once at construction time, and the term's rune slice is either built at
// construction (non-ASCII term) or on the first Score call that needs the rune
// path. Because Score may write termRunes, a matcher must not be shared
// between goroutines without external synchronization.
type JaroWinklerMatcher struct {
	term      string // the term exactly as given; compared byte-wise on the ASCII path
	termASCII bool   // true when every byte of term is below 0x80
	termRunes []rune // rune form of term; nil until the rune path is first needed
}

// NewJaroWinklerMatcher builds a matcher for term. A non-ASCII term has its
// rune slice computed immediately; an ASCII term defers that work until Score
// meets a non-ASCII candidate.
func NewJaroWinklerMatcher(term string) *JaroWinklerMatcher {
	m := &JaroWinklerMatcher{term: term}
	if isASCII(term) {
		m.termASCII = true
	} else {
		m.termRunes = []rune(term)
	}
	return m
}

// Score reports the Jaro-Winkler similarity of s to the matcher's term as a
// value in [0.0, 1.0]. Byte-identical strings (including two empty strings)
// score 1.0; if exactly one side is empty the score is 0.0.
func (m *JaroWinklerMatcher) Score(s string) float64 {
	switch {
	case s == m.term:
		return 1.0
	case s == "" || m.term == "":
		return 0.0
	case m.termASCII && isASCII(s):
		// Both sides are ASCII, so bytes and characters coincide.
		return jaroWinklerString(m.term, s)
	}

	// At least one side is non-ASCII: compare rune by rune. An ASCII term
	// reaches this point without a rune slice, so build it once and keep it.
	if m.termRunes == nil {
		m.termRunes = []rune(m.term)
	}
	return jaroWinklerRunes(m.termRunes, []rune(s))
}

// isASCII returns true when no byte of s has its high bit set. The empty
// string is considered ASCII.
func isASCII(s string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] > 0x7f {
			return false
		}
	}
	return true
}

// jaroWinklerString computes the Jaro-Winkler similarity of two strings by
// comparing them byte by byte. It is meant for ASCII input, where no rune
// conversion is needed.
func jaroWinklerString(s1, s2 string) float64 {
	// Work with the shorter string on the outside of the matching loop.
	short, long := s1, s2
	if len(short) > len(long) {
		short, long = long, short
	}
	nShort, nLong := len(short), len(long)

	// Two equal bytes only count as a match when their positions differ by at
	// most this much.
	window := max(nLong/2-1, 0)

	usedShort := make([]bool, nShort)
	usedLong := make([]bool, nLong)

	var matches float64
	for i := 0; i < nShort; i++ {
		lo := max(i-window, 0)
		hi := min(i+window+1, nLong)
		for j := lo; j < hi; j++ {
			if !usedLong[j] && short[i] == long[j] {
				usedShort[i], usedLong[j] = true, true
				matches++
				break
			}
		}
	}
	if matches == 0 {
		return 0.0
	}

	// Walk both match lists in order; every position where the matched bytes
	// disagree counts as half a transposition.
	var transpositions float64
	next := 0
	for i, used := range usedShort {
		if !used {
			continue
		}
		for !usedLong[next] {
			next++
		}
		if short[i] != long[next] {
			transpositions++
		}
		next++
	}

	// Multiply by reciprocals instead of dividing by each length.
	invShort := 1.0 / float64(nShort)
	invLong := 1.0 / float64(nLong)
	jaro := (matches*invShort + matches*invLong + (matches-transpositions*0.5)/matches) / 3.0

	// Winkler adjustment: reward a shared prefix of at most four bytes.
	limit := min(4, nShort, nLong)
	prefix := 0
	for prefix < limit && short[prefix] == long[prefix] {
		prefix++
	}

	const p = 0.1 // Standard Winkler prefix scaling factor; not intended to be user-configurable.
	return jaro + float64(prefix)*p*(1.0-jaro)
}

// jaroWinklerRunes computes the Jaro-Winkler similarity of two rune slices.
// It is the counterpart of jaroWinklerString for input that contains
// non-ASCII characters.
func jaroWinklerRunes(r1, r2 []rune) float64 {
	// Work with the shorter slice on the outside of the matching loop.
	short, long := r1, r2
	if len(short) > len(long) {
		short, long = long, short
	}
	nShort, nLong := len(short), len(long)

	// Two equal runes only count as a match when their positions differ by at
	// most this much.
	window := max(nLong/2-1, 0)

	usedShort := make([]bool, nShort)
	usedLong := make([]bool, nLong)

	var matches float64
	for i := 0; i < nShort; i++ {
		lo := max(i-window, 0)
		hi := min(i+window+1, nLong)
		for j := lo; j < hi; j++ {
			if !usedLong[j] && short[i] == long[j] {
				usedShort[i], usedLong[j] = true, true
				matches++
				break
			}
		}
	}
	if matches == 0 {
		return 0.0
	}

	// Walk both match lists in order; every position where the matched runes
	// disagree counts as half a transposition.
	var transpositions float64
	next := 0
	for i, used := range usedShort {
		if !used {
			continue
		}
		for !usedLong[next] {
			next++
		}
		if short[i] != long[next] {
			transpositions++
		}
		next++
	}

	// Multiply by reciprocals instead of dividing by each length.
	invShort := 1.0 / float64(nShort)
	invLong := 1.0 / float64(nLong)
	jaro := (matches*invShort + matches*invLong + (matches-transpositions*0.5)/matches) / 3.0

	// Winkler adjustment: reward a shared prefix of at most four runes.
	limit := min(4, nShort, nLong)
	prefix := 0
	for prefix < limit && short[prefix] == long[prefix] {
		prefix++
	}

	const p = 0.1 // Standard Winkler prefix scaling factor; not intended to be user-configurable.
	return jaro + float64(prefix)*p*(1.0-jaro)
}

// ---- util/strutil/jarowinkler_bench_test.go ----

// benchCases lists the term/candidate pairs measured by
// BenchmarkJaroWinklerMatcher.
var benchCases = []struct {
	name, s1, s2 string
}{
	{name: "identical_short", s1: "prometheus", s2: "prometheus"},
	{name: "similar_short", s1: "martha", s2: "marhta"},
	{name: "dissimilar_short", s1: "dixon", s2: "dicksonx"},
	{name: "long_ascii", s1: "http_requests_total_by_method_and_path", s2: "http_requests_count_by_method_and_path"},
	{name: "unicode", s1: "naïve", s2: "naive"},
}

// BenchmarkJaroWinklerMatcher times Score for each entry of benchCases. The
// matcher is constructed once per sub-benchmark, outside the timed loop body.
func BenchmarkJaroWinklerMatcher(b *testing.B) {
	for _, tc := range benchCases {
		b.Run(tc.name, func(b *testing.B) {
			matcher := NewJaroWinklerMatcher(tc.s1)
			for i := 0; i < b.N; i++ {
				matcher.Score(tc.s2)
			}
		})
	}
}
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package strutil + +import ( + "math" + "testing" +) + +func TestJaroWinklerMatcher(t *testing.T) { + tests := []struct { + s1, s2 string + min float64 + max float64 + }{ + // Identical strings. + {"prometheus", "prometheus", 1.0, 1.0}, + {"", "", 1.0, 1.0}, + + // Empty vs non-empty. + {"", "abc", 0.0, 0.0}, + {"abc", "", 0.0, 0.0}, + + // Completely different strings. + {"abc", "xyz", 0.0, 0.01}, + + // Similar strings. + {"mimir", "mimer", 0.90, 0.92}, + {"martha", "marhta", 0.96, 0.97}, + {"dwayne", "duane", 0.83, 0.85}, + {"dixon", "dicksonx", 0.81, 0.83}, + + // Single character strings. + {"a", "a", 1.0, 1.0}, + {"a", "b", 0.0, 0.0}, + + // Common prefix boost. + {"prefix_abc", "prefix_xyz", 0.80, 0.90}, + + // Unicode strings (exercises the rune path). + {"café", "cafe", 0.88, 0.89}, + {"naïve", "naive", 0.89, 0.90}, + {"résumé", "resume", 0.79, 0.81}, + // Identical Unicode strings. + {"café", "café", 1.0, 1.0}, + // Empty vs Unicode. + {"", "café", 0.0, 0.0}, + {"café", "", 0.0, 0.0}, + // Two Unicode strings compared to each other. + {"café", "cafè", 0.88, 0.89}, + // Common Unicode prefix (exercises Winkler boost on runes). + {"préfixe_abc", "préfixe_xyz", 0.80, 0.90}, + // Unicode strings with unequal rune lengths (exercises swap in rune path). + {"naïve_long", "naïve", 0.89, 0.91}, + // Completely different Unicode strings (exercises zero-matches in rune path). + {"äöü", "éèê", 0.0, 0.01}, + // Unicode transpositions (mirrors martha/marhta in rune path). 
+ {"màrthà", "màrhtà", 0.96, 0.97}, + } + + for _, tt := range tests { + t.Run(tt.s1+"_"+tt.s2, func(t *testing.T) { + score := NewJaroWinklerMatcher(tt.s1).Score(tt.s2) + if score < tt.min || score > tt.max { + t.Errorf("NewJaroWinklerMatcher(%q).Score(%q) = %f, want in [%f, %f]", tt.s1, tt.s2, score, tt.min, tt.max) + } + // Verify symmetry. + reverse := NewJaroWinklerMatcher(tt.s2).Score(tt.s1) + if math.Abs(score-reverse) > 1e-10 { + t.Errorf("NewJaroWinklerMatcher(%q).Score(%q) = %f, but NewJaroWinklerMatcher(%q).Score(%q) = %f (not symmetric)", tt.s1, tt.s2, score, tt.s2, tt.s1, reverse) + } + }) + } +}