Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pseudorandom probing for hash collision #13418

Merged
merged 19 commits into from
Feb 19, 2020
87 changes: 71 additions & 16 deletions lib/pure/collections/hashcommon.nim
Original file line number Diff line number Diff line change
Expand Up @@ -18,34 +18,87 @@ when not defined(nimHasDefault):
var v: T
v

const freeMarker = 0
const deletedMarker = -1

type UHash = uint

# hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These
# two procs retain clarity of that encoding without the space cost of an enum.
proc isEmpty(hcode: Hash): bool {.inline.} =
  ## A stored hcode of 0 marks a never-used slot; real keys are never
  ## allowed to keep hcode 0 (see `genHashImpl`), so this test is exact.
  hcode == 0
proc isFilledAndValid(hcode: Hash): bool {.inline.} =
  ## True only for slots holding a live key: neither free (hcode 0)
  ## nor a deletion tombstone (`deletedMarker`).
  # performance: we could use bit magic if needed
  not (hcode == freeMarker or hcode == deletedMarker)

proc isFilled(hcode: Hash): bool {.inline.} =
  ## True for any slot that has ever been used: live keys and
  ## deletion tombstones alike (everything except hcode 0).
  hcode != 0

proc nextTry(h, maxHash: Hash): Hash {.inline.} =
  ## Linear probing: advance to the adjacent slot, wrapping around via the
  ## `maxHash` bit mask.
  (h + 1) and maxHash

proc mustRehash(length, counter: int): bool {.inline.} =
  ## Grow the table when occupancy exceeds roughly two thirds of `length`
  ## or when fewer than 4 slots remain free.
  assert(length > counter)
  let free = length - counter
  result = free < 4 or length * 2 < counter * 3
proc translateBits(a: UHash, numBitsMask: int): UHash {.inline.} =
  ## Rotate `a` right by `numBitsMask` bits so that the bits which slot
  ## indexing would normally mask away end up in the low positions.
  let width = UHash.sizeof * 8
  result = (a shr numBitsMask) or (a shl (width - numBitsMask))

proc nextTry(h, maxHash: Hash, perturb: var UHash): Hash {.inline.} =
  # FACTOR between hashcommon.nextTry, intsets.nextTry
  ## Pseudorandom probing: the next slot mixes the current slot `h` with
  ## `perturb`, which is consumed (shifted away) a little on every probe.
  # an optimization would be to use `(h + 1) and maxHash` for a few iterations
  # and then switch to the formula below, to get "best of both worlds": good
  # cache locality, except when a collision cluster is detected (ie, large number
  # of iterations).
  const PERTURB_SHIFT = 5 # consider tying this to `numBitsMask = fastLog2(t.dataLen)`
  let mixed = 5'u * cast[uint](h) + 1'u + perturb
  result = cast[Hash](mixed and cast[uint](maxHash))
  perturb = perturb shr PERTURB_SHIFT

proc mustRehash[T](t: T): bool {.inline.} =
  # FACTOR between hashcommon.mustRehash, intsets.mustRehash
  ## Grow when occupied slots (live entries plus deletion tombstones) pass
  ## roughly two thirds of capacity, or fewer than 4 slots remain free.
  let occupied = t.counter + t.countDeleted
  let capacity = t.dataLen
  assert(capacity > occupied)
  result = capacity * 2 < occupied * 3 or
           capacity - occupied < 4 # synchronize with `rightSize`

proc rightSize*(count: Natural): int {.inline.} =
  ## Return the value of `initialSize` to support `count` items.
  ##
  ## If more items are expected to be added, simply add that
  ## expected extra amount to the parameter before calling this.
  ##
  ## Internally, we want `mustRehash(t) == false` for t that was just resized.
  # Make sure to synchronize with `mustRehash`
  let needed = count * 3 div 2 + 4
  result = nextPowerOfTwo(needed)

template getPerturb(t: typed, hc: Hash): UHash =
  ## Derive the initial probe perturbation from `hc`.
  # we can't use `fastLog2(dataLen(t))` because importing `bitops` would cause
  # codegen errors, so we use a practical value of half the bit width
  # (eg 64 / 2 = 32 on 64bit machines).
  # This makes a major difference for cases like #13393: bits that slot
  # indexing masked out are rotated to the front so they influence the
  # recursion in `nextTry` earlier rather than later.
  const halfWidth = sizeof(Hash) * 4 # ie, sizeof(Hash) * 8 / 2
  translateBits(cast[uint](hc), halfWidth)

template rawGetKnownHCImpl() {.dirty.} =
  ## Shared lookup body: find `key` with known hash `hc` in table `t`.
  ## Returns the slot index when present; otherwise a negative value
  ## encoding the insertion index as `-1 - result`, preferring the first
  ## tombstone encountered so deleted slots get reused.
  # NOTE(review): the scraped diff had GitHub review-UI text and superseded
  # pre-patch probe lines fused into this template; this is the coherent
  # post-patch form with tombstone ('deletedMarker') handling.
  if t.dataLen == 0:
    return -1
  var h: Hash = hc and maxHash(t) # start with real hash value
  var perturb = t.getPerturb(hc)
  var deletedIndex = -1
  while true:
    if isFilledAndValid(t.data[h].hcode):
      # Compare hc THEN key with boolean short circuit. This makes the common case
      # zero ==key's for missing (e.g. inserts) and exactly one ==key for present.
      # It does slow down succeeding lookups by one extra Hash cmp&and..usually
      # just a few clock cycles, generally worth it for any non-integer-like A.
      # performance: we optimize this: depending on type(key), skip hc comparison
      if t.data[h].hcode == hc and t.data[h].key == key:
        return h
      h = nextTry(h, maxHash(t), perturb)
    elif t.data[h].hcode == deletedMarker:
      if deletedIndex == -1:
        deletedIndex = h # remember the first tombstone for reuse
      h = nextTry(h, maxHash(t), perturb)
    else:
      break # free slot: key is absent
  if deletedIndex == -1:
    result = -1 - h # < 0 => MISSING; insert idx = -1 - result
  else:
    # we prefer returning a (in fact the 1st found) deleted index
    result = -1 - deletedIndex

proc rawGetKnownHC[X, A](t: X, key: A, hc: Hash): int {.inline.} =
  ## Thin generic wrapper over `rawGetKnownHCImpl`.
  rawGetKnownHCImpl()
Expand All @@ -54,6 +107,8 @@ template genHashImpl(key, hc: typed) =
hc = hash(key)
if hc == 0: # This almost never taken branch should be very predictable.
hc = 314159265 # Value doesn't matter; Any non-zero favorite is fine.
elif hc == deletedMarker:
hc = 214159261

template genHash(key: typed): Hash =
var res: Hash
Expand Down
36 changes: 25 additions & 11 deletions lib/pure/collections/intsets.nim
Original file line number Diff line number Diff line change
Expand Up @@ -46,30 +46,40 @@ type
IntSet* = object ## An efficient set of `int` implemented as a sparse bit set.
elems: int # only valid for small numbers
counter, max: int
countDeleted: int
head: PTrunk
data: TrunkSeq
a: array[0..33, int] # profiling shows that 34 elements are enough

proc mustRehash(length, counter: int): bool {.inline.} =
assert(length > counter)
result = (length * 2 < counter * 3) or (length - counter < 4)
proc mustRehash[T](t: T): bool {.inline.} =
  # FACTOR between hashcommon.mustRehash, intsets.mustRehash
  ## Grow when occupied slots (live entries plus deletion tombstones) pass
  ## roughly two thirds of capacity (`t.max + 1`), or fewer than 4 remain free.
  let occupied = t.counter + t.countDeleted
  let capacity = t.max + 1
  assert capacity > occupied
  result = capacity * 2 < occupied * 3 or capacity - occupied < 4

proc nextTry(h, maxHash: Hash): Hash {.inline.} =
result = ((5 * h) + 1) and maxHash
proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} =
  # FACTOR between hashcommon.nextTry, intsets.nextTry
  ## Pseudorandom probing: decay `perturb` first (unsigned shift), then mix
  ## it into the `5*h + 1` recurrence, masked to the table range.
  const PERTURB_SHIFT = 5
  perturb = cast[Hash](cast[uint](perturb) shr PERTURB_SHIFT)
  result = (5*h + 1 + perturb) and maxHash

proc intSetGet(t: IntSet, key: int): PTrunk =
  ## Look up the trunk for `key` by probing from its home slot;
  ## returns nil when no trunk with that key exists.
  var slot = key and t.max
  var perturb = key
  while t.data[slot] != nil:
    if t.data[slot].key == key:
      return t.data[slot]
    slot = nextTry(slot, t.max, perturb)
  result = nil

proc intSetRawInsert(t: IntSet, data: var TrunkSeq, desc: PTrunk) =
  ## Place `desc` into `data` at the first free slot found by probing from
  ## its home position. `desc` must not already be stored in `data`.
  var slot = desc.key and t.max
  var perturb = desc.key
  while data[slot] != nil:
    assert(data[slot] != desc)
    slot = nextTry(slot, t.max, perturb)
  assert(data[slot] == nil)
  data[slot] = desc

Expand All @@ -84,14 +94,16 @@ proc intSetEnlarge(t: var IntSet) =

proc intSetPut(t: var IntSet, key: int): PTrunk =
var h = key and t.max
var perturb = key
while t.data[h] != nil:
if t.data[h].key == key:
return t.data[h]
h = nextTry(h, t.max)
if mustRehash(t.max + 1, t.counter): intSetEnlarge(t)
h = nextTry(h, t.max, perturb)
if mustRehash(t): intSetEnlarge(t)
inc(t.counter)
h = key and t.max
while t.data[h] != nil: h = nextTry(h, t.max)
perturb = key
while t.data[h] != nil: h = nextTry(h, t.max, perturb)
assert(t.data[h] == nil)
new(result)
result.next = t.head
Expand All @@ -100,6 +112,7 @@ proc intSetPut(t: var IntSet, key: int): PTrunk =
t.data[h] = result

proc bitincl(s: var IntSet, key: int) {.inline.} =
var ret: PTrunk
var t = intSetPut(s, `shr`(key, TrunkShift))
var u = key and TrunkMask
t.bits[u shr IntShift] = t.bits[u shr IntShift] or
Expand Down Expand Up @@ -393,7 +406,8 @@ proc assign*(dest: var IntSet, src: IntSet) =
var it = src.head
while it != nil:
var h = it.key and dest.max
while dest.data[h] != nil: h = nextTry(h, dest.max)
var perturb = it.key
while dest.data[h] != nil: h = nextTry(h, dest.max, perturb)
assert(dest.data[h] == nil)
var n: PTrunk
new(n)
Expand Down
23 changes: 5 additions & 18 deletions lib/pure/collections/setimpl.nim
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ template inclImpl() {.dirty.} =
var hc: Hash
var index = rawGet(s, key, hc)
if index < 0:
if mustRehash(len(s.data), s.counter):
if mustRehash(s):
enlarge(s)
index = rawGetKnownHC(s, key, hc)
rawInsert(s, s.data, key, hc, -1 - index)
Expand All @@ -62,17 +62,12 @@ template containsOrInclImpl() {.dirty.} =
if index >= 0:
result = true
else:
if mustRehash(len(s.data), s.counter):
if mustRehash(s):
enlarge(s)
index = rawGetKnownHC(s, key, hc)
rawInsert(s, s.data, key, hc, -1 - index)
inc(s.counter)

template doWhile(a, b) =
  ## Execute `b` once, then keep repeating it while `a` evaluates to true
  ## (a do/while loop: the body always runs at least once).
  while true:
    b
    if not a:
      break

proc exclImpl[A](s: var HashSet[A], key: A): bool {.inline.} =
var hc: Hash
var i = rawGet(s, key, hc)
Expand All @@ -82,17 +77,9 @@ proc exclImpl[A](s: var HashSet[A], key: A): bool {.inline.} =
if i >= 0:
result = false
dec(s.counter)
while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1
var j = i # The correctness of this depends on (h+1) in nextTry,
var r = j # though may be adaptable to other simple sequences.
s.data[i].hcode = 0 # mark current EMPTY
s.data[i].key = default(type(s.data[i].key))
doWhile((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)):
i = (i + 1) and msk # increment mod table size
if isEmpty(s.data[i].hcode): # end of collision cluster; So all done
return
r = s.data[i].hcode and msk # "home" location of key@i
s.data[j] = move(s.data[i]) # data[i] will be marked EMPTY next loop
inc(s.countDeleted)
s.data[i].hcode = deletedMarker
s.data[i].key = default(type(s.data[i].key))

template dollarImpl() {.dirty.} =
result = "{"
Expand Down
40 changes: 19 additions & 21 deletions lib/pure/collections/sets.nim
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ type
## before calling other procs on it.
data: KeyValuePairSeq[A]
counter: int
countDeleted: int

type
OrderedKeyValuePair[A] = tuple[
Expand All @@ -80,15 +81,13 @@ type
## <#initOrderedSet,int>`_ before calling other procs on it.
data: OrderedKeyValuePairSeq[A]
counter, first, last: int
countDeleted: int

const
defaultInitialSize* = 64

include setimpl

proc rightSize*(count: Natural): int {.inline.}


# ---------------------------------------------------------------------
# ------------------------------ HashSet ------------------------------
# ---------------------------------------------------------------------
Expand Down Expand Up @@ -250,7 +249,7 @@ iterator items*[A](s: HashSet[A]): A =
## echo b
## # --> {(a: 1, b: 3), (a: 0, b: 4)}
for h in 0 .. high(s.data):
if isFilled(s.data[h].hcode): yield s.data[h].key
if isFilledAndValid(s.data[h].hcode): yield s.data[h].key

proc containsOrIncl*[A](s: var HashSet[A], key: A): bool =
## Includes `key` in the set `s` and tells if `key` was already in `s`.
Expand Down Expand Up @@ -342,7 +341,7 @@ proc pop*[A](s: var HashSet[A]): A =
doAssertRaises(KeyError, echo s.pop)

for h in 0 .. high(s.data):
if isFilled(s.data[h].hcode):
if isFilledAndValid(s.data[h].hcode):
result = s.data[h].key
excl(s, result)
return result
Expand Down Expand Up @@ -593,16 +592,6 @@ proc `$`*[A](s: HashSet[A]): string =
## # --> {no, esc'aping, is " provided}
dollarImpl()

proc rightSize*(count: Natural): int {.inline.} =
## Return the value of `initialSize` to support `count` items.
##
## If more items are expected to be added, simply add that
## expected extra amount to the parameter before calling this.
##
## Internally, we want `mustRehash(rightSize(x), x) == false`.
result = nextPowerOfTwo(count * 3 div 2 + 4)


proc initSet*[A](initialSize = defaultInitialSize): HashSet[A] {.deprecated:
"Deprecated since v0.20, use 'initHashSet'".} = initHashSet[A](initialSize)

Expand Down Expand Up @@ -634,7 +623,7 @@ template forAllOrderedPairs(yieldStmt: untyped) {.dirty.} =
var idx = 0
while h >= 0:
var nxt = s.data[h].next
if isFilled(s.data[h].hcode):
if isFilledAndValid(s.data[h].hcode):
yieldStmt
inc(idx)
h = nxt
Expand Down Expand Up @@ -868,7 +857,7 @@ proc `==`*[A](s, t: OrderedSet[A]): bool =
while h >= 0 and g >= 0:
var nxh = s.data[h].next
var nxg = t.data[g].next
if isFilled(s.data[h].hcode) and isFilled(t.data[g].hcode):
if isFilledAndValid(s.data[h].hcode) and isFilledAndValid(t.data[g].hcode):
if s.data[h].key == t.data[g].key:
inc compared
else:
Expand Down Expand Up @@ -1146,10 +1135,19 @@ when isMainModule and not defined(release):
b.incl(2)
assert b.len == 1

for i in 0 .. 32:
var s = rightSize(i)
if s <= i or mustRehash(s, i):
echo "performance issue: rightSize() will not elide enlarge() at ", i
block:
type FakeTable = object
dataLen: int
counter: int
countDeleted: int

var t: FakeTable
for i in 0 .. 32:
var s = rightSize(i)
t.dataLen = s
t.counter = i
doAssert s > i and not mustRehash(t),
"performance issue: rightSize() will not elide enlarge() at: " & $i

block missingOrExcl:
var s = toOrderedSet([2, 3, 6, 7])
Expand Down
6 changes: 4 additions & 2 deletions lib/pure/collections/sharedtables.nim
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ type
SharedTable*[A, B] = object ## generic hash SharedTable
data: KeyValuePairSeq[A, B]
counter, dataLen: int
countDeleted: int
lock: Lock

template maxHash(t): untyped =
  ## Highest valid slot index into `t.data` (one less than `t.dataLen`).
  t.dataLen - 1

include tableimpl

template st_maybeRehashPutImpl(enlarge) {.dirty.} =
if mustRehash(t.dataLen, t.counter):
if mustRehash(t):
enlarge(t)
index = rawGetKnownHC(t, key, hc)
index = -1 - index # important to transform for mgetOrPutImpl
Expand All @@ -49,9 +50,10 @@ proc enlarge[A, B](t: var SharedTable[A, B]) =
for i in 0..<oldSize:
let eh = n[i].hcode
if isFilled(eh):
var perturb = t.getPerturb(eh)
var j: Hash = eh and maxHash(t)
while isFilled(t.data[j].hcode):
j = nextTry(j, maxHash(t))
j = nextTry(j, maxHash(t), perturb)
rawInsert(t, t.data, n[i].key, n[i].val, eh, j)
deallocShared(n)

Expand Down
Loading