util/strutil: add Jaro-Winkler similarity implementation (#18405)

* util/strutil: add Jaro-Winkler similarity implementation

This is part of the implementation of prometheus/proposals#74

Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>

* util/strutil: optimise JaroWinkler with string-native ASCII path

Replace the generic jaroWinkler[T byte|rune] with two specialised
functions: jaroWinklerString (ASCII path) operates directly on the
string values and avoids the []byte conversion that previously caused
two heap allocations per call; jaroWinklerRunes (Unicode path) keeps
the same algorithm but is split out of the generic function.

Both paths replace the repeated float64 divisions in the Jaro formula
with precomputed reciprocals (invL1, invL2).

Result: short ASCII strings drop from 2 allocs/op to 0 allocs/op;
long ASCII drops from 4 allocs/op to 2 allocs/op (bool match arrays
only).

Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>

* util/strutil: replace JaroWinkler with JaroWinklerMatcher

Remove the free JaroWinkler function and replace it with a
JaroWinklerMatcher struct. NewJaroWinklerMatcher pre-computes the
ASCII check and rune conversion for the search term once; Score then
runs the comparison against each candidate without repeating that work.

This is the expected usage pattern in Prometheus: one fixed term scored
against many label names or values.

Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>

* Update util/strutil/jarowinkler.go and util/strutil/jarowinkler_test.go

Co-authored-by: Arve Knudsen <arve.knudsen@gmail.com>
Signed-off-by: Julien <291750+roidelapluie@users.noreply.github.com>
Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>

---------

Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>
Signed-off-by: Julien <291750+roidelapluie@users.noreply.github.com>
Co-authored-by: Arve Knudsen <arve.knudsen@gmail.com>
This commit is contained in:
Julien 2026-04-14 16:58:46 +02:00 committed by GitHub
parent 0b067888c7
commit 1c449737e1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 321 additions and 0 deletions

199
util/strutil/jarowinkler.go Normal file
View File

@ -0,0 +1,199 @@
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package strutil
// JaroWinklerMatcher scores one fixed search term against any number of
// candidate strings using Jaro-Winkler similarity. The work that depends only
// on the term (the ASCII check and, for non-ASCII terms, the conversion to
// runes) is done when the matcher is built instead of on every Score call.
//
// For an ASCII term the rune form is only produced, and then cached on the
// matcher, the first time Score is given a non-ASCII candidate. Because of
// that lazy write a matcher is not safe for concurrent use.
type JaroWinklerMatcher struct {
	// term is the search term exactly as supplied; the ASCII path compares
	// against it directly.
	term string
	// termASCII is true when term consists only of ASCII bytes.
	termASCII bool
	// termRunes is term decoded to runes. It is filled in at construction
	// for a non-ASCII term, and on the first non-ASCII candidate otherwise.
	termRunes []rune
}
// NewJaroWinklerMatcher builds a matcher that scores candidates against term.
// A term containing non-ASCII bytes is decoded to runes immediately; an ASCII
// term is stored as-is and is only decoded if Score later needs the rune path.
func NewJaroWinklerMatcher(term string) *JaroWinklerMatcher {
	m := &JaroWinklerMatcher{term: term, termASCII: isASCII(term)}
	if !m.termASCII {
		m.termRunes = []rune(term)
	}
	return m
}
// Score reports how similar s is to the matcher's term as a Jaro-Winkler
// value in [0.0, 1.0]. Identical strings score 1.0; if exactly one of the two
// strings is empty the score is 0.0.
//
// When both the term and s are ASCII the comparison runs directly on the
// strings. Otherwise both sides are compared as rune slices, and an ASCII
// term is converted to runes once and remembered on the matcher for later
// calls.
func (m *JaroWinklerMatcher) Score(s string) float64 {
	switch {
	case s == m.term:
		return 1.0
	case len(m.term) == 0 || len(s) == 0:
		return 0.0
	case m.termASCII && isASCII(s):
		return jaroWinklerString(m.term, s)
	}

	// At least one side contains non-ASCII bytes, so compare rune by rune.
	if m.termRunes == nil {
		// Only reachable for an ASCII term meeting its first non-ASCII
		// candidate: decode the term now and keep the result.
		m.termRunes = []rune(m.term)
	}
	candidate := []rune(s)
	return jaroWinklerRunes(m.termRunes, candidate)
}
// isASCII reports whether every byte of s is an ASCII byte (below 0x80).
// The empty string is reported as ASCII.
func isASCII(s string) bool {
	const firstNonASCII = 0x80
	for i := 0; i < len(s); i++ {
		if s[i] >= firstNonASCII {
			return false
		}
	}
	return true
}
// jaroWinklerString computes the Jaro-Winkler similarity of s1 and s2 by
// comparing them byte by byte, without converting either string to runes.
// It is intended for ASCII input, where one byte is one character. The
// result does not depend on the argument order. It returns 0.0 when the two
// strings share no matching character, which includes the case where either
// string is empty.
func jaroWinklerString(s1, s2 string) float64 {
	// Work on a (short, long) pair so that short is never the longer string.
	short, long := s1, s2
	if len(short) > len(long) {
		short, long = long, short
	}
	nShort, nLong := len(short), len(long)

	// Two equal characters only count as a match when their positions are
	// at most window apart.
	window := max(nLong/2-1, 0)

	shortHit := make([]bool, nShort)
	longHit := make([]bool, nLong)

	// Pass 1: pair each character of short with the first still-unpaired
	// equal character of long inside the window.
	matched := 0
	for i := range shortHit {
		lo := max(i-window, 0)
		hi := min(i+window+1, nLong)
		for j := lo; j < hi; j++ {
			if !longHit[j] && short[i] == long[j] {
				shortHit[i], longHit[j] = true, true
				matched++
				break
			}
		}
	}
	if matched == 0 {
		return 0.0
	}

	// Pass 2: walk the paired characters of both strings in order and count
	// the positions where the two sequences disagree. Each swapped pair is
	// counted twice here, hence the halving in the formula below.
	outOfOrder := 0
	cursor := 0
	for i := range shortHit {
		if !shortHit[i] {
			continue
		}
		for !longHit[cursor] {
			cursor++
		}
		if short[i] != long[cursor] {
			outOfOrder++
		}
		cursor++
	}

	// Jaro similarity, using reciprocals of the lengths instead of dividing
	// by each length separately.
	matches := float64(matched)
	transpositions := float64(outOfOrder)
	invShort := 1.0 / float64(nShort)
	invLong := 1.0 / float64(nLong)
	jaro := (matches*invShort + matches*invLong + (matches-transpositions*0.5)/matches) / 3.0

	// Winkler adjustment: reward a shared prefix of up to four characters.
	// short is never longer than long, so its length bounds the prefix.
	prefixLimit := min(4, nShort)
	prefixLen := 0
	for prefixLen < prefixLimit && short[prefixLen] == long[prefixLen] {
		prefixLen++
	}

	const p = 0.1 // Standard Winkler prefix scaling factor; deliberately not configurable.
	return jaro + float64(prefixLen)*p*(1.0-jaro)
}
// jaroWinklerRunes computes the Jaro-Winkler similarity of two rune slices.
// It is the counterpart of the string-based ASCII routine for input that has
// already been decoded to runes. The result does not depend on the argument
// order. It returns 0.0 when the two slices share no matching rune, which
// includes the case where either slice is empty.
func jaroWinklerRunes(r1, r2 []rune) float64 {
	// Work on a (short, long) pair so that short is never the longer slice.
	short, long := r1, r2
	if len(short) > len(long) {
		short, long = long, short
	}
	nShort, nLong := len(short), len(long)

	// Two equal runes only count as a match when their positions are at
	// most window apart.
	window := max(nLong/2-1, 0)

	shortHit := make([]bool, nShort)
	longHit := make([]bool, nLong)

	// Pass 1: pair each rune of short with the first still-unpaired equal
	// rune of long inside the window.
	matched := 0
	for i := range shortHit {
		lo := max(i-window, 0)
		hi := min(i+window+1, nLong)
		for j := lo; j < hi; j++ {
			if !longHit[j] && short[i] == long[j] {
				shortHit[i], longHit[j] = true, true
				matched++
				break
			}
		}
	}
	if matched == 0 {
		return 0.0
	}

	// Pass 2: walk the paired runes of both slices in order and count the
	// positions where the two sequences disagree. Each swapped pair is
	// counted twice here, hence the halving in the formula below.
	outOfOrder := 0
	cursor := 0
	for i := range shortHit {
		if !shortHit[i] {
			continue
		}
		for !longHit[cursor] {
			cursor++
		}
		if short[i] != long[cursor] {
			outOfOrder++
		}
		cursor++
	}

	// Jaro similarity, using reciprocals of the lengths instead of dividing
	// by each length separately.
	matches := float64(matched)
	transpositions := float64(outOfOrder)
	invShort := 1.0 / float64(nShort)
	invLong := 1.0 / float64(nLong)
	jaro := (matches*invShort + matches*invLong + (matches-transpositions*0.5)/matches) / 3.0

	// Winkler adjustment: reward a shared prefix of up to four runes.
	// short is never longer than long, so its length bounds the prefix.
	prefixLimit := min(4, nShort)
	prefixLen := 0
	for prefixLen < prefixLimit && short[prefixLen] == long[prefixLen] {
		prefixLen++
	}

	const p = 0.1 // Standard Winkler prefix scaling factor; deliberately not configurable.
	return jaro + float64(prefixLen)*p*(1.0-jaro)
}

View File

@ -0,0 +1,37 @@
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package strutil
import "testing"
// benchCases is the input table for the matcher benchmark. For each entry the
// benchmark builds a matcher from s1 and repeatedly scores s2 against it;
// name becomes the sub-benchmark name.
var benchCases = []struct {
	name string
	s1   string
	s2   string
}{
	{
		name: "identical_short",
		s1:   "prometheus",
		s2:   "prometheus",
	},
	{
		name: "similar_short",
		s1:   "martha",
		s2:   "marhta",
	},
	{
		name: "dissimilar_short",
		s1:   "dixon",
		s2:   "dicksonx",
	},
	{
		name: "long_ascii",
		s1:   "http_requests_total_by_method_and_path",
		s2:   "http_requests_count_by_method_and_path",
	},
	{
		name: "unicode",
		s1:   "naïve",
		s2:   "naive",
	},
}
// BenchmarkJaroWinklerMatcher measures Score for every entry of benchCases,
// one sub-benchmark per entry. The matcher is built once per sub-benchmark,
// mirroring the intended usage of one fixed term scored against many
// candidates.
//
// Allocation statistics are always reported, so allocs/op is visible without
// passing -benchmem, and the timer is reset after the matcher is constructed
// so that only the Score calls are measured.
func BenchmarkJaroWinklerMatcher(b *testing.B) {
	for _, bc := range benchCases {
		b.Run(bc.name, func(b *testing.B) {
			m := NewJaroWinklerMatcher(bc.s1)
			b.ReportAllocs()
			b.ResetTimer()
			for range b.N {
				m.Score(bc.s2)
			}
		})
	}
}

View File

@ -0,0 +1,85 @@
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package strutil
import (
"math"
"testing"
)
// TestJaroWinklerMatcher checks, for a table of term/candidate pairs, that
// the score lies inside an expected closed interval and that swapping the
// roles of term and candidate yields the same score.
func TestJaroWinklerMatcher(t *testing.T) {
	// symmetryTolerance is the largest difference allowed between the score
	// of (term, candidate) and the score of (candidate, term).
	const symmetryTolerance = 1e-10

	cases := []struct {
		term, candidate string
		lo, hi          float64 // inclusive bounds on the expected score
	}{
		// Identical strings.
		{term: "prometheus", candidate: "prometheus", lo: 1.0, hi: 1.0},
		{term: "", candidate: "", lo: 1.0, hi: 1.0},
		// Empty vs non-empty.
		{term: "", candidate: "abc", lo: 0.0, hi: 0.0},
		{term: "abc", candidate: "", lo: 0.0, hi: 0.0},
		// Completely different strings.
		{term: "abc", candidate: "xyz", lo: 0.0, hi: 0.01},
		// Similar strings.
		{term: "mimir", candidate: "mimer", lo: 0.90, hi: 0.92},
		{term: "martha", candidate: "marhta", lo: 0.96, hi: 0.97},
		{term: "dwayne", candidate: "duane", lo: 0.83, hi: 0.85},
		{term: "dixon", candidate: "dicksonx", lo: 0.81, hi: 0.83},
		// Single character strings.
		{term: "a", candidate: "a", lo: 1.0, hi: 1.0},
		{term: "a", candidate: "b", lo: 0.0, hi: 0.0},
		// Common prefix boost.
		{term: "prefix_abc", candidate: "prefix_xyz", lo: 0.80, hi: 0.90},
		// Unicode strings (exercises the rune path).
		{term: "café", candidate: "cafe", lo: 0.88, hi: 0.89},
		{term: "naïve", candidate: "naive", lo: 0.89, hi: 0.90},
		{term: "résumé", candidate: "resume", lo: 0.79, hi: 0.81},
		// Identical Unicode strings.
		{term: "café", candidate: "café", lo: 1.0, hi: 1.0},
		// Empty vs Unicode.
		{term: "", candidate: "café", lo: 0.0, hi: 0.0},
		{term: "café", candidate: "", lo: 0.0, hi: 0.0},
		// Two Unicode strings compared to each other.
		{term: "café", candidate: "cafè", lo: 0.88, hi: 0.89},
		// Common Unicode prefix (exercises Winkler boost on runes).
		{term: "préfixe_abc", candidate: "préfixe_xyz", lo: 0.80, hi: 0.90},
		// Unicode strings with unequal rune lengths (exercises swap in rune path).
		{term: "naïve_long", candidate: "naïve", lo: 0.89, hi: 0.91},
		// Completely different Unicode strings (exercises zero-matches in rune path).
		{term: "äöü", candidate: "éèê", lo: 0.0, hi: 0.01},
		// Unicode transpositions (mirrors martha/marhta in rune path).
		{term: "màrthà", candidate: "màrhtà", lo: 0.96, hi: 0.97},
	}

	for _, tc := range cases {
		t.Run(tc.term+"_"+tc.candidate, func(t *testing.T) {
			// Each direction gets its own fresh matcher.
			forward := NewJaroWinklerMatcher(tc.term).Score(tc.candidate)
			backward := NewJaroWinklerMatcher(tc.candidate).Score(tc.term)

			// The score must fall inside the expected interval.
			if forward < tc.lo || forward > tc.hi {
				t.Errorf("NewJaroWinklerMatcher(%q).Score(%q) = %f, want in [%f, %f]", tc.term, tc.candidate, forward, tc.lo, tc.hi)
			}
			// Swapping term and candidate must not change the score.
			if math.Abs(forward-backward) > symmetryTolerance {
				t.Errorf("NewJaroWinklerMatcher(%q).Score(%q) = %f, but NewJaroWinklerMatcher(%q).Score(%q) = %f (not symmetric)", tc.term, tc.candidate, forward, tc.candidate, tc.term, backward)
			}
		})
	}
}