Skip to content

Commit 635622b

Browse files
committed
jsonschema: using hashing for uniqueItems
Determine if a slice contains unique items by hashing them. This should take linear time on average, better than the previous quadratic algorithm. Change-Id: I8dc95bb6f29d802bdbc64bc7f7e698c71eae5ce7 Reviewed-on: https://go-review.googlesource.com/c/tools/+/669455 Reviewed-by: Alan Donovan <adonovan@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
1 parent 0f6a53f commit 635622b

File tree

3 files changed

+162
-8
lines changed

3 files changed

+162
-8
lines changed

internal/mcp/internal/jsonschema/util.go

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@ package jsonschema
66

77
import (
88
"bytes"
9+
"cmp"
10+
"encoding/binary"
911
"encoding/json"
1012
"fmt"
13+
"hash/maphash"
1114
"math"
1215
"math/big"
1316
"reflect"
17+
"slices"
1418
)
1519

1620
// Equal reports whether two Go values representing JSON values are equal according
@@ -126,14 +130,93 @@ func equalValue(x, y reflect.Value) bool {
126130
return x.String() == y.String()
127131
case reflect.Bool:
128132
return x.Bool() == y.Bool()
129-
case reflect.Complex64, reflect.Complex128:
130-
return x.Complex() == y.Complex()
131133
// Ints, uints and floats handled in jsonNumber, at top of function.
132134
default:
133135
panic(fmt.Sprintf("unsupported kind: %s", x.Kind()))
134136
}
135137
}
136138

139+
// hashValue adds v to the data hashed by h. v must not have cycles.
140+
// hashValue panics if the value contains functions or channels, or maps whose
141+
// key type is not string.
142+
// It ignores unexported fields of structs.
143+
// Calls to hashValue with the equal values (in the sense
144+
// of [Equal]) result in the same sequence of values written to the hash.
145+
func hashValue(h *maphash.Hash, v reflect.Value) {
146+
// TODO: replace writes of basic types with WriteComparable in 1.24.
147+
148+
writeUint := func(u uint64) {
149+
var buf [8]byte
150+
binary.BigEndian.PutUint64(buf[:], u)
151+
h.Write(buf[:])
152+
}
153+
154+
var write func(reflect.Value)
155+
write = func(v reflect.Value) {
156+
if r, ok := jsonNumber(v); ok {
157+
// We want 1.0 and 1 to hash the same.
158+
// big.Rats are always normalized, so they will be.
159+
// We could do this more efficiently by handling the int and float cases
160+
// separately, but that's premature.
161+
writeUint(uint64(r.Sign() + 1))
162+
h.Write(r.Num().Bytes())
163+
h.Write(r.Denom().Bytes())
164+
return
165+
}
166+
switch v.Kind() {
167+
case reflect.Invalid:
168+
h.WriteByte(0)
169+
case reflect.String:
170+
h.WriteString(v.String())
171+
case reflect.Bool:
172+
if v.Bool() {
173+
h.WriteByte(1)
174+
} else {
175+
h.WriteByte(0)
176+
}
177+
case reflect.Complex64, reflect.Complex128:
178+
c := v.Complex()
179+
writeUint(math.Float64bits(real(c)))
180+
writeUint(math.Float64bits(imag(c)))
181+
case reflect.Array, reflect.Slice:
182+
// Although we could treat []byte more efficiently,
183+
// JSON values are unlikely to contain them.
184+
writeUint(uint64(v.Len()))
185+
for i := range v.Len() {
186+
write(v.Index(i))
187+
}
188+
case reflect.Interface, reflect.Pointer:
189+
write(v.Elem())
190+
case reflect.Struct:
191+
t := v.Type()
192+
for i := range t.NumField() {
193+
if sf := t.Field(i); sf.IsExported() {
194+
write(v.FieldByIndex(sf.Index))
195+
}
196+
}
197+
case reflect.Map:
198+
if v.Type().Key().Kind() != reflect.String {
199+
panic("map with non-string key")
200+
}
201+
// Sort the keys so the hash is deterministic.
202+
keys := v.MapKeys()
203+
// Write the length. That distinguishes between, say, two consecutive
204+
// maps with disjoint keys from one map that has the items of both.
205+
writeUint(uint64(len(keys)))
206+
slices.SortFunc(keys, func(x, y reflect.Value) int { return cmp.Compare(x.String(), y.String()) })
207+
for _, k := range keys {
208+
write(k)
209+
write(v.MapIndex(k))
210+
}
211+
// Ints, uints and floats handled in jsonNumber, at top of function.
212+
default:
213+
panic(fmt.Sprintf("unsupported kind: %s", v.Kind()))
214+
}
215+
}
216+
217+
write(v)
218+
}
219+
137220
// jsonNumber converts a numeric value or a json.Number to a [big.Rat].
138221
// If v is not a number, it returns nil, false.
139222
func jsonNumber(v reflect.Value) (*big.Rat, bool) {

internal/mcp/internal/jsonschema/util_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package jsonschema
66

77
import (
88
"encoding/json"
9+
"hash/maphash"
910
"reflect"
1011
"testing"
1112
)
@@ -71,3 +72,56 @@ func TestJSONType(t *testing.T) {
7172

7273
}
7374
}
75+
76+
func TestHash(t *testing.T) {
77+
x := map[string]any{
78+
"s": []any{1, "foo", nil, true},
79+
"f": 2.5,
80+
"m": map[string]any{
81+
"n": json.Number("123.456"),
82+
"schema": &Schema{Type: "integer", UniqueItems: true},
83+
},
84+
"c": 1.2 + 3.4i,
85+
"n": nil,
86+
}
87+
88+
seed := maphash.MakeSeed()
89+
90+
hash := func(x any) uint64 {
91+
var h maphash.Hash
92+
h.SetSeed(seed)
93+
hashValue(&h, reflect.ValueOf(x))
94+
return h.Sum64()
95+
}
96+
97+
want := hash(x)
98+
// Run several times to verify consistency.
99+
for range 10 {
100+
if got := hash(x); got != want {
101+
t.Errorf("hash values differ: %d vs. %d", got, want)
102+
}
103+
}
104+
105+
// Check mathematically equal values.
106+
nums := []any{
107+
5,
108+
uint(5),
109+
5.0,
110+
json.Number("5"),
111+
json.Number("5.00"),
112+
}
113+
for i, n := range nums {
114+
if i == 0 {
115+
want = hash(n)
116+
} else if got := hash(n); got != want {
117+
t.Errorf("hashes differ between %v (%[1]T) and %v (%[2]T)", nums[0], n)
118+
}
119+
}
120+
121+
// Check that a bare JSON `null` is OK.
122+
var null any
123+
if err := json.Unmarshal([]byte(`null`), &null); err != nil {
124+
t.Fatal(err)
125+
}
126+
_ = hash(null)
127+
}

internal/mcp/internal/jsonschema/validate.go

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package jsonschema
66

77
import (
88
"fmt"
9+
"hash/maphash"
910
"math"
1011
"math/big"
1112
"reflect"
@@ -297,16 +298,32 @@ func (st *state) validate(instance reflect.Value, schema *Schema, callerAnns *an
297298
}
298299
}
299300
if schema.UniqueItems {
300-
// Determine uniqueness with O(n²) comparisons.
301-
// TODO: optimize via hashing.
302-
for i := range instance.Len() {
303-
for j := i + 1; j < instance.Len(); j++ {
304-
if equalValue(instance.Index(i), instance.Index(j)) {
305-
return fmt.Errorf("uniqueItems: array items %d and %d are equal", i, j)
301+
if instance.Len() > 1 {
302+
// Hash each item and compare the hashes.
303+
// If two hashes differ, the items differ.
304+
// If two hashes are the same, compare the collisions for equality.
305+
// (The same logic as hash table lookup.)
306+
// TODO(jba): Use container/hash.Map when it becomes available (https://go.dev/issue/69559).
307+
hashes := map[uint64][]int{} // from hash to indices
308+
seed := maphash.MakeSeed()
309+
for i := range instance.Len() {
310+
item := instance.Index(i)
311+
var h maphash.Hash
312+
h.SetSeed(seed)
313+
hashValue(&h, item)
314+
hv := h.Sum64()
315+
if sames := hashes[hv]; len(sames) > 0 {
316+
for _, j := range sames {
317+
if equalValue(item, instance.Index(j)) {
318+
return fmt.Errorf("uniqueItems: array items %d and %d are equal", i, j)
319+
}
320+
}
306321
}
322+
hashes[hv] = append(hashes[hv], i)
307323
}
308324
}
309325
}
326+
310327
// https://json-schema.org/draft/2020-12/json-schema-core#section-11.2
311328
if schema.UnevaluatedItems != nil && !anns.allItems {
312329
// Apply this subschema to all items in the array that haven't been successfully validated.

0 commit comments

Comments
 (0)