Skip to content

Commit 73c056e

Browse files
committed
Use RLE on step hash to make arrays and ranges equal
Instead of hashing the values themselves, hash the first value and differences between subsequent elements using run-length encoding. This allows for O(1) hashing of ranges consistent with AbstractArrays, which means they can now compare equal. Elements for which the - operator is not defined are hashed directly. This assumes that types which can be used for ranges and sparse matrices implement -.
1 parent 807ec46 commit 73c056e

File tree

4 files changed

+139
-58
lines changed

4 files changed

+139
-58
lines changed

base/abstractarray.jl

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -912,9 +912,6 @@ function isequal(A::AbstractArray, B::AbstractArray)
912912
if size(A) != size(B)
913913
return false
914914
end
915-
if isa(A,Range) != isa(B,Range)
916-
return false
917-
end
918915
for (a, b) in zip(A, B)
919916
if !isequal(a, b)
920917
return false
@@ -935,9 +932,6 @@ function (==)(A::AbstractArray, B::AbstractArray)
935932
if size(A) != size(B)
936933
return false
937934
end
938-
if isa(A,Range) != isa(B,Range)
939-
return false
940-
end
941935
for (a, b) in zip(A, B)
942936
if !(a == b)
943937
return false
@@ -1151,38 +1145,3 @@ push!(A, a, b) = push!(push!(A, a), b)
11511145
push!(A, a, b, c...) = push!(push!(A, a, b), c...)
11521146
unshift!(A, a, b) = unshift!(unshift!(A, b), a)
11531147
unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b)
1154-
1155-
## hashing collections ##
1156-
1157-
const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
1158-
const hashrle_seed = UInt == UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
1159-
function hash(a::AbstractArray, h::UInt)
1160-
h += hashaa_seed
1161-
h += hash(size(a))
1162-
1163-
state = start(a)
1164-
done(a, state) && return h
1165-
x2, state = next(a, state)
1166-
done(a, state) && return hash(x2, h)
1167-
1168-
x1 = x2
1169-
while !done(a, state)
1170-
x1 = x2
1171-
x2, state = next(a, state)
1172-
if isequal(x2, x1)
1173-
# For repeated elements, use run length encoding
1174-
# This allows efficient hashing of sparse arrays
1175-
runlength = 2
1176-
while !done(a, state)
1177-
x2, state = next(a, state)
1178-
isequal(x1, x2) || break
1179-
runlength += 1
1180-
end
1181-
h += hashrle_seed
1182-
h = hash(runlength, h)
1183-
end
1184-
h = hash(x1, h)
1185-
end
1186-
!isequal(x2, x1) && (h = hash(x2, h))
1187-
return h
1188-
end

base/hashing.jl

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,79 @@ end
6464

6565
hash(x::QuoteNode, h::UInt) = hash(x.value, hash(QuoteNode, h))
6666

67-
# hashing ranges by component at worst leads to collisions for very similar ranges
68-
const hashr_seed = UInt === UInt64 ? 0x80707b6821b70087 : 0x21b70087
67+
## hashing collections ##
68+
69+
# Seeds mixed into collection hashes. The literal is picked by word size so that
# it has the same width as UInt. `hashaa_seed` is added once per array/range;
# `hashrle_seed` is added before each run-length-encoded run.
const hashaa_seed = UInt === UInt64 ? 0x7f53e68ceb575e76 : 0xeb575e76
const hashrle_seed = UInt === UInt64 ? 0x2aab8909bfea414c : 0xbfea414c
71+
# Hash an array by delegating to `_hash` with a step function chosen from the
# element type `T`:
#   * non-leaf `T`: decide per pair of elements whether `-` is applicable;
#   * leaf `T` with a `-(::T, ::T)` method: always use the difference;
#   * leaf `T` without such a method: use the second element itself.
function hash{T}(a::AbstractArray{T}, h::UInt)
    val = if !isleaftype(T)
        (x1, x2) -> applicable(-, x2, x1) ? x2 - x1 : x2
    elseif method_exists(-, (T, T))
        (x1, x2) -> x2 - x1
    else
        (x1, x2) -> x2
    end

    return _hash(a, h, val)
end
84+
85+
# Hash the elements of `a` into `h`.
#
# The hash is built from `hashaa_seed`, `hash(size(a))`, the first element, and
# then the sequence of "steps" `val(x1, x2)` computed for each pair of consecutive
# elements. Consecutive steps that are `isequal` are collapsed into a run:
# `hashrle_seed` is added, then the run length and the repeated step are hashed.
#
# Arguments:
#   a   -- the array to hash
#   h   -- the incoming hash value to mix into
#   val -- two-argument function `(x1, x2)` returning the value to hash for the
#          element `x2` that follows `x1` (a difference, or `x2` itself)
#
# Returns the updated hash as a `UInt`.
function _hash{T}(a::AbstractArray{T}, h::UInt, val::Function)
    h += hashaa_seed
    h += hash(size(a))

    state = start(a)
    done(a, state) && return h
    x1, state = next(a, state)
    # Always hash the first element
    h = hash(x1, h)
    done(a, state) && return h

    # Then hash the difference between two subsequent elements when - is supported,
    # or the elements themselves when not
    x2, state = next(a, state)
    v2 = val(x1, x2)
    done(a, state) && return hash(v2, h)

    v1 = v2
    while !done(a, state)
        x1 = x2
        x2, state = next(a, state)
        v1 = v2
        # Compute the step through `val`, like the other two call sites, so the
        # strategy selected by the caller is applied to every pair of elements
        v2 = val(x1, x2)
        if isequal(v2, v1)
            # For repeated elements, use run length encoding
            # This allows efficient hashing of sparse arrays
            runlength = 2
            while !done(a, state)
                x1 = x2
                x2, state = next(a, state)
                v2 = val(x1, x2)
                isequal(v1, v2) || break
                runlength += 1
            end
            h += hashrle_seed
            h = hash(runlength, h)
        end
        h = hash(v1, h)
    end
    # The last step has not been hashed yet if it differs from the previous one
    !isequal(v2, v1) && (h = hash(v2, h))
    return h
end
127+
128+
# hashaa_seed and hashrle_seed are defined above, next to the AbstractArray method
69129
function hash(r::Range, h::UInt)
70-
h += hashr_seed
130+
h += hashaa_seed
131+
h += hash(size(r))
132+
133+
length(r) == 0 && return h
134+
71135
h = hash(first(r), h)
72-
h = hash(step(r), h)
73-
h = hash(last(r), h)
136+
length(r) == 1 && return h
137+
length(r) == 2 && return hash(step(r), h)
138+
139+
h += hashrle_seed
140+
h = hash(length(r)-1, h)
141+
hash(step(r), h)
74142
end

base/sparse/sparsematrix.jl

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3292,6 +3292,7 @@ end
32923292

32933293
# End the run and return the current hash
32943294
@inline function hashrun(val, runlength::Int, h::UInt)
3295+
# @show val, runlength
32953296
if runlength == 0
32963297
return h
32973298
elseif runlength > 1
@@ -3301,7 +3302,16 @@ end
33013302
hash(val, h)
33023303
end
33033304

3304-
function hash{T}(A::SparseMatrixCSC{T}, h::UInt)
3305+
# Hash a sequence of zero entries, including the step before the first one.
#
# `val` is the value preceding the zeros, `runlength` the number of zero entries
# and `h` the incoming hash. Nothing is hashed when `runlength == 0`. Otherwise
# the step from `val` to zero (`zero(val) - val`) is hashed first, and the
# remaining `runlength - 1` entries are passed to `hashrun` with `zero(val)` as
# the run value. Returns the updated hash.
@inline function hashzeros(val, runlength::Int, h::UInt)
    runlength == 0 && return h
    # The first zero contributes the step that leads into it
    h = hash(zero(val) - val, h)
    return hashrun(zero(val), runlength - 1, h)
end
3312+
3313+
# hashaa_seed and hashrle_seed are defined in base/hashing.jl
3314+
function hashsp{T}(A::SparseMatrixCSC{T}, h::UInt)
33053315
h += Base.hashaa_seed
33063316
sz = size(A)
33073317
h += hash(sz)
@@ -3311,26 +3321,34 @@ function hash{T}(A::SparseMatrixCSC{T}, h::UInt)
33113321
nzval = A.nzval
33123322
lastidx = 0
33133323
runlength = 0
3324+
last = zero(T)
33143325
lastnz = zero(T)
3326+
lastdiff = zero(T)
33153327
@inbounds for col = 1:size(A, 2)
33163328
for j = colptr[col]:colptr[col+1]-1
33173329
nz = nzval[j]
33183330
isequal(nz, zero(T)) && continue
33193331
idx = sub2ind(sz, rowval[j], col)
3320-
if idx != lastidx+1 || !isequal(nz, lastnz) # Run is over
3321-
h = hashrun(lastnz, runlength, h) # Hash previous run
3322-
h = hashrun(0, idx-lastidx-1, h) # Hash intervening zeros
3323-
3324-
runlength = 1
3325-
lastnz = nz
3326-
else
3327-
runlength += 1
3332+
diff = nz - last
3333+
# @show col, j, nz, last, diff, lastdiff, idx, lastidx
3334+
if idx != lastidx+1 # There are zeros since the previous value
3335+
h = hashzeros(lastnz, idx-lastidx-1, h) # Hash intervening zeros
3336+
last = 0
3337+
runlength = 0
3338+
end
3339+
if !isequal(diff, lastdiff) # Run is over
3340+
h = hashrun(lastdiff, runlength, h) # Hash previous run
3341+
runlength = 0
33283342
end
3343+
runlength += 1
33293344
lastidx = idx
3345+
last = nz
3346+
lastnz = nz
3347+
lastdiff = diff
33303348
end
33313349
end
3332-
h = hashrun(lastnz, runlength, h) # Hash previous run
3333-
hashrun(0, length(A)-lastidx, h) # Hash zeros at end
3350+
h = hashrun(lastdiff, runlength, h) # Hash previous run
3351+
hashzeros(last, length(A)-lastidx, h) # Hash zeros at end
33343352
end
33353353

33363354
## Statistics

test/hashing.jl

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,49 @@ vals = Any[
7272
[], [1], [2], [1, 1], [1, 2], [1, 3], [2, 2], [1, 2, 2], [1, 3, 3],
7373
zeros(2, 2), spzeros(2, 2), eye(2, 2), speye(2, 2),
7474
sparse(ones(2, 2)), ones(2, 2), sparse([0 0; 1 0]), [0 0; 1 0],
75-
[-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.])
75+
[-0. 0; -0. 0.], SparseMatrixCSC(2, 2, [1, 3, 3], [1, 2], [-0., -0.]),
76+
# issue #16364
77+
1:4, 1:1:4, 1:-1:0, 1.0:4.0, 1.0:1.0:4.0, linspace(1, 4, 4),
78+
'a':'e', ['a', 'b', 'c', 'd', 'e'],
79+
# check that hash is still consistent with heterogeneous arrays for which - is defined
80+
# for some pairs and not others (no element must be ignored)
81+
["a", "b", 1, 2], ["a", 1, 2], ["a", "b", 2, 2], ["a", "a", 1, 2], ["a", "b", 2, 3]
7682
]
7783

7884
for a in vals, b in vals
7985
@test isequal(a,b) == (hash(a)==hash(b))
8086
end
8187

88+
vals = Any[
89+
Int[], Char[], String[],
90+
[0], [1], ['a'], ["a"],
91+
[0, 1], ['a', 'b'], ["a", "b"],
92+
[0, 1, 2], ['a', 'b', 'c'], ["a", "b", "c"],
93+
# test various sparsity patterns
94+
[0, 0], [0, 0, 0], [0, 1], [1, 0],
95+
[0, 0, 1], [0, 1, 0], [1, 0, 0],
96+
[0 0; 0 0], [1 0; 0 0], [0 1; 0 0], [0 0; 1 0], [0 0; 0 1],
97+
[5 1; 0 0], [1 0; 0 1], [0 2; 3 0], [0 4; 1 2], [4 0; 0 1],
98+
[0 0 0; 0 0 0], [1 0 0; 0 0 1], [0 0 2; 3 0 0], [0 0 7; 6 1 2], [4 0 0; 3 0 1]
99+
]
100+
101+
for a in vals
102+
# check that element type does not affect hash
103+
@test hash(convert(Array{Any}, a)) == hash(a)
104+
@test hash(convert(Array{supertype(eltype(a))}, a)) == hash(a)
105+
@test hash(sparse(a)) == hash(a)
106+
end
107+
108+
vals = Any[
109+
1:0, 1:1, 1:2, 1:3, 1.0:0.0, 1.0:1.0:1.0, 1.0:0.5:3.0,
110+
0:-1:1, 0.0:-1.0:1.0, -4:10, 'a':'e', 'b':'a',
111+
linspace(1, 1, 1), linspace(1, 10, 3)
112+
]
113+
114+
for a in vals
115+
@test hash(collect(a)) == hash(a)
116+
end
117+
82118
@test hash(SubString("--hello--",3,7)) == hash("hello")
83119
@test hash(:(X.x)) == hash(:(X.x))
84120
@test hash(:(X.x)) != hash(:(X.y))

0 commit comments

Comments
 (0)