mirror of
https://github.com/prometheus/prometheus.git
synced 2026-05-05 04:16:15 +02:00
util/strutil: add Jaro-Winkler similarity implementation (#18405)
* util/strutil: add Jaro-Winkler similarity implementation

  This is part of the implementation of prometheus/proposals#74

  Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>

* util/strutil: optimise JaroWinkler with string-native ASCII path

  Replace the generic jaroWinkler[T byte|rune] with two specialised functions: jaroWinklerString (ASCII path) operates directly on the string values and avoids the []byte conversion that previously caused two heap allocations per call; jaroWinklerRunes (Unicode path) is unchanged in algorithm but split out from the generic.

  Both paths replace the repeated float64 divisions in the Jaro formula with precomputed reciprocals (invL1, invL2).

  Result: short ASCII strings drop from 2 allocs/op to 0 allocs/op; long ASCII drops from 4 allocs/op to 2 allocs/op (bool match arrays only).

  Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>

* util/strutil: replace JaroWinkler with JaroWinklerMatcher

  Remove the free JaroWinkler function and replace it with a JaroWinklerMatcher struct. NewJaroWinklerMatcher pre-computes the ASCII check and rune conversion for the search term once; Score then runs the comparison against each candidate without repeating that work.

  This is the expected usage pattern in Prometheus: one fixed term scored against many label names or values.

  Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>

* Update util/strutil/jarowinkler.go and util/strutil/jarowinkler_test.go

  Co-authored-by: Arve Knudsen <arve.knudsen@gmail.com>
  Signed-off-by: Julien <291750+roidelapluie@users.noreply.github.com>
  Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>

---------

Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>
Signed-off-by: Julien <291750+roidelapluie@users.noreply.github.com>
Co-authored-by: Arve Knudsen <arve.knudsen@gmail.com>
This commit is contained in:
parent
0b067888c7
commit
1c449737e1
199
util/strutil/jarowinkler.go
Normal file
199
util/strutil/jarowinkler.go
Normal file
@ -0,0 +1,199 @@
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package strutil
|
||||
|
||||
// JaroWinklerMatcher scores one fixed search term against any number of
// candidate strings using Jaro-Winkler similarity. Work that depends only on
// the term is done at most once: the ASCII check happens at construction, and
// the term's rune slice is built either at construction (non-ASCII term) or
// lazily by the first Score call that receives a Unicode candidate. Because
// of that lazy write, a matcher is not safe for concurrent use.
type JaroWinklerMatcher struct {
	// term is the search term exactly as supplied; the ASCII path reads it
	// directly.
	term string
	// termASCII records whether term is pure ASCII.
	termASCII bool
	// termRunes holds term converted to runes. It is set when term is not
	// ASCII, or on the first Unicode candidate; it is nil until then.
	termRunes []rune
}
|
||||
|
||||
// NewJaroWinklerMatcher returns a matcher for the given term.
|
||||
func NewJaroWinklerMatcher(term string) *JaroWinklerMatcher {
|
||||
if isASCII(term) {
|
||||
return &JaroWinklerMatcher{term: term, termASCII: true}
|
||||
}
|
||||
return &JaroWinklerMatcher{term: term, termRunes: []rune(term)}
|
||||
}
|
||||
|
||||
// Score returns the Jaro-Winkler similarity between the matcher's term and s,
|
||||
// in [0.0, 1.0] where 1.0 means identical strings.
|
||||
func (m *JaroWinklerMatcher) Score(s string) float64 {
|
||||
if m.term == s {
|
||||
return 1.0
|
||||
}
|
||||
if m.term == "" || s == "" {
|
||||
return 0.0
|
||||
}
|
||||
if m.termASCII && isASCII(s) {
|
||||
return jaroWinklerString(m.term, s)
|
||||
}
|
||||
// Either the term or s is Unicode; use the rune path.
|
||||
if m.termRunes == nil {
|
||||
// term is ASCII but s is Unicode; convert and cache term runes.
|
||||
m.termRunes = []rune(m.term)
|
||||
}
|
||||
return jaroWinklerRunes(m.termRunes, []rune(s))
|
||||
}
|
||||
|
||||
// isASCII reports whether every byte of s is below 0x80, i.e. whether s
// consists solely of ASCII characters. The empty string counts as ASCII.
func isASCII(s string) bool {
	for i := 0; i < len(s); i++ {
		// Any byte with the high bit set is outside the ASCII range.
		if s[i]&0x80 != 0 {
			return false
		}
	}
	return true
}
|
||||
|
||||
// jaroWinklerString computes the Jaro-Winkler similarity of two strings by
// comparing them byte by byte, so no []rune conversion takes place. It is
// meant for ASCII input, where one byte is one character.
func jaroWinklerString(s1, s2 string) float64 {
	// Arrange the inputs as (short, long) so the outer loops walk the
	// shorter one.
	short, long := s1, s2
	if len(short) > len(long) {
		short, long = long, short
	}
	nShort, nLong := len(short), len(long)

	// Jaro match window: equal bytes only count as a match when their
	// positions are at most this far apart.
	window := nLong/2 - 1
	if window < 0 {
		window = 0
	}

	usedShort := make([]bool, nShort)
	usedLong := make([]bool, nLong)

	// Pair each byte of short with the first unused equal byte of long that
	// lies inside the window.
	matchCount := 0
	for i := 0; i < nShort; i++ {
		lo := max(0, i-window)
		hi := min(nLong, i+window+1)
		for j := lo; j < hi; j++ {
			if !usedLong[j] && short[i] == long[j] {
				usedShort[i], usedLong[j] = true, true
				matchCount++
				break
			}
		}
	}
	if matchCount == 0 {
		return 0.0
	}

	// Walk the matched bytes of both strings in order; every position where
	// the two sequences disagree counts as half a transposition.
	outOfOrder := 0
	next := 0
	for i := 0; i < nShort; i++ {
		if !usedShort[i] {
			continue
		}
		for !usedLong[next] {
			next++
		}
		if short[i] != long[next] {
			outOfOrder++
		}
		next++
	}

	matches := float64(matchCount)
	transpositions := float64(outOfOrder)

	// Multiply by reciprocals of the lengths instead of dividing repeatedly.
	invL1 := 1.0 / float64(nShort)
	invL2 := 1.0 / float64(nLong)
	jaro := (matches*invL1 + matches*invL2 + (matches-transpositions*0.5)/matches) / 3.0

	// Winkler adjustment: reward a shared prefix of up to 4 characters.
	maxPrefix := min(4, nShort, nLong)
	prefixLen := 0
	for prefixLen < maxPrefix && short[prefixLen] == long[prefixLen] {
		prefixLen++
	}

	const p = 0.1 // Standard Winkler prefix scaling factor; not intended to be user-configurable.
	return jaro + float64(prefixLen)*p*(1.0-jaro)
}
|
||||
|
||||
// jaroWinklerRunes computes the Jaro-Winkler similarity of two rune slices.
// It is the Unicode counterpart of the string-based path and expects its
// callers to have done the []rune conversion already.
func jaroWinklerRunes(r1, r2 []rune) float64 {
	// Arrange the inputs as (short, long) so the outer loops walk the
	// shorter one.
	short, long := r1, r2
	if len(short) > len(long) {
		short, long = long, short
	}
	nShort, nLong := len(short), len(long)

	// Jaro match window: equal runes only count as a match when their
	// positions are at most this far apart.
	window := nLong/2 - 1
	if window < 0 {
		window = 0
	}

	usedShort := make([]bool, nShort)
	usedLong := make([]bool, nLong)

	// Pair each rune of short with the first unused equal rune of long that
	// lies inside the window.
	matchCount := 0
	for i := 0; i < nShort; i++ {
		lo := max(0, i-window)
		hi := min(nLong, i+window+1)
		for j := lo; j < hi; j++ {
			if !usedLong[j] && short[i] == long[j] {
				usedShort[i], usedLong[j] = true, true
				matchCount++
				break
			}
		}
	}
	if matchCount == 0 {
		return 0.0
	}

	// Walk the matched runes of both slices in order; every position where
	// the two sequences disagree counts as half a transposition.
	outOfOrder := 0
	next := 0
	for i := 0; i < nShort; i++ {
		if !usedShort[i] {
			continue
		}
		for !usedLong[next] {
			next++
		}
		if short[i] != long[next] {
			outOfOrder++
		}
		next++
	}

	matches := float64(matchCount)
	transpositions := float64(outOfOrder)

	// Multiply by reciprocals of the lengths instead of dividing repeatedly.
	invL1 := 1.0 / float64(nShort)
	invL2 := 1.0 / float64(nLong)
	jaro := (matches*invL1 + matches*invL2 + (matches-transpositions*0.5)/matches) / 3.0

	// Winkler adjustment: reward a shared prefix of up to 4 characters.
	maxPrefix := min(4, nShort, nLong)
	prefixLen := 0
	for prefixLen < maxPrefix && short[prefixLen] == long[prefixLen] {
		prefixLen++
	}

	const p = 0.1 // Standard Winkler prefix scaling factor; not intended to be user-configurable.
	return jaro + float64(prefixLen)*p*(1.0-jaro)
}
|
||||
37
util/strutil/jarowinkler_bench_test.go
Normal file
37
util/strutil/jarowinkler_bench_test.go
Normal file
@ -0,0 +1,37 @@
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package strutil
|
||||
|
||||
import "testing"
|
||||
|
||||
// benchCases lists the term/candidate pairs that the matcher benchmark runs,
// one sub-benchmark per entry: s1 is used as the matcher term and s2 as the
// candidate being scored.
var benchCases = []struct {
	name, s1, s2 string
}{
	{name: "identical_short", s1: "prometheus", s2: "prometheus"},
	{name: "similar_short", s1: "martha", s2: "marhta"},
	{name: "dissimilar_short", s1: "dixon", s2: "dicksonx"},
	{
		name: "long_ascii",
		s1:   "http_requests_total_by_method_and_path",
		s2:   "http_requests_count_by_method_and_path",
	},
	{name: "unicode", s1: "naïve", s2: "naive"},
}
|
||||
|
||||
func BenchmarkJaroWinklerMatcher(b *testing.B) {
|
||||
for _, bc := range benchCases {
|
||||
b.Run(bc.name, func(b *testing.B) {
|
||||
m := NewJaroWinklerMatcher(bc.s1)
|
||||
for range b.N {
|
||||
m.Score(bc.s2)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
85
util/strutil/jarowinkler_test.go
Normal file
85
util/strutil/jarowinkler_test.go
Normal file
@ -0,0 +1,85 @@
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package strutil
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestJaroWinklerMatcher(t *testing.T) {
|
||||
tests := []struct {
|
||||
s1, s2 string
|
||||
min float64
|
||||
max float64
|
||||
}{
|
||||
// Identical strings.
|
||||
{"prometheus", "prometheus", 1.0, 1.0},
|
||||
{"", "", 1.0, 1.0},
|
||||
|
||||
// Empty vs non-empty.
|
||||
{"", "abc", 0.0, 0.0},
|
||||
{"abc", "", 0.0, 0.0},
|
||||
|
||||
// Completely different strings.
|
||||
{"abc", "xyz", 0.0, 0.01},
|
||||
|
||||
// Similar strings.
|
||||
{"mimir", "mimer", 0.90, 0.92},
|
||||
{"martha", "marhta", 0.96, 0.97},
|
||||
{"dwayne", "duane", 0.83, 0.85},
|
||||
{"dixon", "dicksonx", 0.81, 0.83},
|
||||
|
||||
// Single character strings.
|
||||
{"a", "a", 1.0, 1.0},
|
||||
{"a", "b", 0.0, 0.0},
|
||||
|
||||
// Common prefix boost.
|
||||
{"prefix_abc", "prefix_xyz", 0.80, 0.90},
|
||||
|
||||
// Unicode strings (exercises the rune path).
|
||||
{"café", "cafe", 0.88, 0.89},
|
||||
{"naïve", "naive", 0.89, 0.90},
|
||||
{"résumé", "resume", 0.79, 0.81},
|
||||
// Identical Unicode strings.
|
||||
{"café", "café", 1.0, 1.0},
|
||||
// Empty vs Unicode.
|
||||
{"", "café", 0.0, 0.0},
|
||||
{"café", "", 0.0, 0.0},
|
||||
// Two Unicode strings compared to each other.
|
||||
{"café", "cafè", 0.88, 0.89},
|
||||
// Common Unicode prefix (exercises Winkler boost on runes).
|
||||
{"préfixe_abc", "préfixe_xyz", 0.80, 0.90},
|
||||
// Unicode strings with unequal rune lengths (exercises swap in rune path).
|
||||
{"naïve_long", "naïve", 0.89, 0.91},
|
||||
// Completely different Unicode strings (exercises zero-matches in rune path).
|
||||
{"äöü", "éèê", 0.0, 0.01},
|
||||
// Unicode transpositions (mirrors martha/marhta in rune path).
|
||||
{"màrthà", "màrhtà", 0.96, 0.97},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.s1+"_"+tt.s2, func(t *testing.T) {
|
||||
score := NewJaroWinklerMatcher(tt.s1).Score(tt.s2)
|
||||
if score < tt.min || score > tt.max {
|
||||
t.Errorf("NewJaroWinklerMatcher(%q).Score(%q) = %f, want in [%f, %f]", tt.s1, tt.s2, score, tt.min, tt.max)
|
||||
}
|
||||
// Verify symmetry.
|
||||
reverse := NewJaroWinklerMatcher(tt.s2).Score(tt.s1)
|
||||
if math.Abs(score-reverse) > 1e-10 {
|
||||
t.Errorf("NewJaroWinklerMatcher(%q).Score(%q) = %f, but NewJaroWinklerMatcher(%q).Score(%q) = %f (not symmetric)", tt.s1, tt.s2, score, tt.s2, tt.s1, reverse)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user