robinhood hashing
timotheecour committed Feb 20, 2020
1 parent 3385027 commit 96a2d49
Showing 3 changed files with 55 additions and 4 deletions.
47 changes: 46 additions & 1 deletion lib/pure/collections/hashcommon.nim
@@ -20,6 +20,8 @@ when not defined(nimHasDefault):
 
 const freeMarker = 0
 const deletedMarker = -1
+type CMode = enum krobin, kmix
+const cmode = krobin
 
 type UHash = uint
 
@@ -73,6 +75,37 @@ template getPerturb(t: typed, hc: Hash): UHash =
   # influence the recursion in nextTry earlier rather than later.
   translateBits(cast[uint](hc), numBitsMask)
 
+template robinHoodGet(t: typed, hc, mustNextTry, key0): int =
+  let m = maxHash(t)
+  var index: Hash = hc and m
+  while mustNextTry(t.data[index], index):
+    # TODO: can short circuit/optimize using depth
+    index = (index + 1) and m
+  index
+
+template robinHoodInsert(t: typed, hc, mustNextTry, key0, val0): int =
+  static: doAssert cmode == krobin
+  let m = maxHash(t)
+  var index: Hash = hc and m
+  var depth = 0
+  while mustNextTry(t.data[index], index):
+    depth.inc
+    let indexi = t.data[index].hcode and m
+    var depthi = index - indexi
+    if depthi < 0: depthi = depthi + m + 1
+    # debugecho (depth, depthi, depth > depthi)
+    # if depth > depthi and false:
+    if depth > depthi:
+      # steal!
+      when compiles(t.data[index].val): # eg: not for HashSet
+        swap(t.data[index].val, val0)
+      swap(t.data[index].key, key0)
+      swap(t.data[index].hcode, hc)
+      depth = depthi
+    index = (index + 1) and m
+    # debugecho (depth, index, key0, val0, hc)
+  index
+
 template findCell(t: typed, hc, mustNextTry): int =
   let m = maxHash(t)
   var index: Hash = hc and m
@@ -103,6 +136,7 @@ template findCell(t: typed, hc, mustNextTry): int =
 template rawGetKnownHCImpl() {.dirty.} =
   if t.dataLen == 0: return -1
   var deletedIndex = -1
+
   template mustNextTry(cell, index): bool =
     if isFilledAndValid(cell.hcode):
       # Compare hc THEN key with boolean short circuit. This makes the common case
@@ -121,7 +155,15 @@ template rawGetKnownHCImpl() {.dirty.} =
       true
     else: false
 
-  let index = findCell(t, hc, mustNextTry)
+  when compiles(t.data[0].val) and cmode == krobin:
+    # IMPROVE
+    when compiles(val):
+      let index = robinHoodInsert(t, hc, mustNextTry, key, val)
+    else:
+      let index = robinHoodGet(t, hc, mustNextTry, key)
+  else:
+    let index = findCell(t, hc, mustNextTry)
+
   if deletedIndex == -2:
     result = index
   elif deletedIndex == -1:
@@ -151,3 +193,6 @@ template rawGetImpl() {.dirty.} =
 
 proc rawGet[X, A](t: X, key: A, hc: var Hash): int {.inline.} =
   rawGetImpl()
+
+proc rawPutAux[X, A, B](t: var X, key: var A, hc: var Hash, val: var B): int {.inline.} =
+  rawGetKnownHCImpl()
5 changes: 4 additions & 1 deletion lib/pure/collections/tableimpl.nim
@@ -61,7 +61,10 @@ template maybeRehashPutImpl(enlarge) {.dirty.} =
 template putImpl(enlarge) {.dirty.} =
   checkIfInitialized()
   var hc: Hash
-  var index = rawGet(t, key, hc)
+  genHashImpl(key, hc)
+  var val = val
+  var key = key
+  var index = rawPutAux(t, key, hc, val)
   if index >= 0: t.data[index].val = val
   else: maybeRehashPutImpl(enlarge)
 
7 changes: 5 additions & 2 deletions lib/pure/collections/tables.nim
@@ -270,10 +270,13 @@ proc enlarge[A, B](t: var Table[A, B]) =
   swap(t.data, n)
   t.countDeleted = 0
   for i in countup(0, high(n)):
-    let eh = n[i].hcode
+    var eh = n[i].hcode
     if isFilledAndValid(eh):
       template mustNextTry(cell, index): bool = isFilled(cell.hcode)
-      let j = findCell(t, eh, mustNextTry)
+      when cmode == krobin:
+        let j = robinHoodInsert(t, eh, mustNextTry, n[i].key, n[i].val)
+      else:
+        let j = findCell(t, eh, mustNextTry)
       when defined(js):
         rawInsert(t, t.data, n[i].key, n[i].val, eh, j)
       else:

6 comments on commit 96a2d49

@c-blake

Well, this looks ok. What I was doing myself was something like this:

template rawInsertImpl() {.dirty.} =
  if data.len == 0:
    initImpl(s, defaultInitialSize)
  var i = h                       # Linear probe to first empty slot
  while isFilled(data[i].hcode):
    i = (i + 1) and data.high
  if i > h:                       # No table wrap around; just shift up
    pushUp data, h, i - h
  elif i < h:                     # i wrapped to low indices
    pushUp data, 0, i
    data[0] = data[data.high]
    pushUp data, h, data.high - h
  #else: i == h => already have space at h; nothing to do
  data[h].key = key
  data[h].hcode = hc

where pushUp is in hashcommon and is like:

proc pushUp[T](x: var seq[T], i, n: int) {.inline.} = # move n elts up 1 idx
  if n < 1: return
  when defined(rhhMoveMem):
    moveMem x[i + 1].addr, x[i].addr, n * T.sizeof
  else:
    for j in countdown(i + n - 1, i): x[j+1] = x[j]   # XXX `move`?

and hashcommon also changes to be like this:

proc depth(h, hcode, msk: Hash): Hash {.inline.} =
  (msk + 1 + h - hcode) and msk

template rawGetKnownHCImpl() {.dirty.} =
  if t.dataLen == 0:
    return -1 - (hc and (defaultInitialSize - 1))
  var h: Hash = hc and t.maxHash # start with real hash value
  var d: Hash = 0 # Running search depth
  while isFilled(t.data[h].hcode) and depth(h, t.data[h].hcode, t.maxHash) >= d:
    when type(key) is SomeInteger:
      when defined(hashStats):
        hashDepth.inc
      if t.data[h].key == key:
        return h
    else:
      when defined(hashStats):
        t.depth.inc
      if t.data[h].hcode == hc and t.data[h].key == key:
        return h
    h = (h + 1) and t.maxHash
    d.inc
  result = -1 - h   # < 0 => MISSING and insert idx = -1 - result
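
To make the encoding in that last line concrete, here is a tiny self-contained demonstration (helper names are hypothetical, purely illustrative):

proc packMissing(h: int): int = -1 - h      # encode "missing; insert at slot h"
proc decodeSlot(res: int): int = -1 - res   # recover h from a negative result

assert packMissing(5) == -6 and decodeSlot(-6) == 5
assert packMissing(0) == -1   # "missing, insert at 0" (-1) stays distinct from "found at 0" (0)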

@c-blake

I was just working with sets for simplicity while it seems you were working with tables as a POC. There is similar code for deletion with a flipped-around pushDown (sketched below). It's not a complex algorithm - mostly inserting and deleting from an ordered list, just ordered by probe depth and modulo the table size (which can actually be simplified to ordering by hash value with an overflow area at the end of the masked part of data[], IF you can assume a good hash function). That said, it won't really solve the problem jyapayne found with an identity hash and only about 2 distinct hash values, the way your Python-copied pseudo-random probe can, and there is a gotcha or two.
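
A minimal sketch of what such a pushDown could look like (hypothetical; just the mirror image of the pushUp above, reusing its rhhMoveMem define):

proc pushDown[T](x: var seq[T], i, n: int) {.inline.} = # move n elts down 1 idx
  if n < 1: return
  when defined(rhhMoveMem):
    moveMem x[i].addr, x[i + 1].addr, n * T.sizeof
  else:
    for j in countup(i, i + n - 1): x[j] = x[j + 1]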

Also, while code like the above was working fine (and very fast) with a HashSet pre-sized to the correct size, the "grow up by resizing" case was taking much longer - like 2x slower than your pseudo-random probing in-L3. Instrumentation showed search depth growing by a factor of 17x for me instead of 2.7x as with the current pseudo-random probing. I don't think it is quite this problem, but it may be related: https://accidentallyquadratic.tumblr.com/post/153545455987/rust-hash-iteration-reinsertion . (My test set has >200 k entries... so "only 17x" is not big enough to be truly 'quadratic'. I'm not sure what's wrong there yet, but I still assume I can solve it.) That link at least shows another gotcha about triggering resize, with the easy workaround of marking a table as to-be-resized on any overlong cluster, but only if <50% full (sketched below). My current puzzle is during resize from smaller to bigger, where you pick up an extra bit.
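
A sketch of that workaround (all names hypothetical, not from this commit): record a "resize on next insert" flag whenever a probe cluster gets overlong while the table is under half full, since clustering rather than load is then the problem.

type
  GrowState = object      # reduced stand-in for a table, for illustration
    dataLen: int          # allocated slots
    counter: int          # occupied slots
    pendingGrow: bool     # hypothetical to-be-resized flag

const overlongCluster = 8 # assumed probe-depth threshold

proc noteProbeDepth(t: var GrowState, depth: int) =
  # Overlong cluster while <50% full: schedule a grow for the next
  # insert instead of resizing immediately (avoids resize loops).
  if depth > overlongCluster and t.counter * 2 < t.dataLen:
    t.pendingGrow = true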

Also, the way Araq did OrderedSet and OrderedTable is totally disrupted by Robin Hood, because we move elements on insert while his next pointers are table indices. The choices there are to A) make things a doubly-linked list so we can update the pointers as we shift around (sketched below) or B) move to an entirely different impl for the Ordered* variants, neither of which is great. So, if I do solve the growth problem and we did do Robin Hood, we would still need to re-do Ordered* either to take up more space overhead or to re-do it entirely, e.g. the way Python did in the 3.6 series. That has a double DIMM hit kind of baked in (assuming large tables), but then regular, more-common-case Table/HashSet users would not need to suffer the double DIMM hit (often masked in "perfectly hot loops" - you kind of want some real program spending 25% of its time in hash operations, and to back out table performance by the way that % changes across impls).
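
For concreteness, option A amounts to a slot layout roughly like this (a sketch only; as noted above, the current impl keeps a single next index per slot):

import std/hashes

type
  OrderedKeyValuePairDL[A, B] = object
    hcode: Hash
    prev, next: int  # insertion-order links by slot index; with links in
                     # both directions, a Robin Hood shift can re-patch
                     # its neighbors as entries move
    key: A
    val: B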

Another entirely different approach (with or without PR probing, with or without thresholds) is to just use a less bad but almost as fast integer hash, like Fibonacci hashing. It's just one integer multiply and spreads the bits around, with the most mixed bits in the middle (kind of like Pascal's triangle for binomial powers, for similar reasons). A 32-bit*32-bit product is 64 bits, which gets reduced mod 2**32 into something where the top bits are the most mixed, while Nim tables reduce to table addresses with an and-mask instead of, say, a shr. We could rotate(mixingConstant*x, someBits) (a sketch follows). It's funny that it's called Fibonacci hashing - if you read that section in Knuth v3, he starts with the golden ratio, then tells you how it works for any irrational number, and then any actual use case is finite integer arithmetic. So, it's really just any sufficiently mixing multiplier in the end, but it gets labeled "Fibonacci" from the mathematics inspiring Knuth. It's not perfect, but it's a less fragile default hash.
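
A minimal sketch of that idea in Nim (the constant and the rotation amount are assumptions for illustration, not a settled proposal):

import std/bitops

proc fibHash(x: uint64, tableBits: int): uint64 {.inline.} =
  const k = 0x9E3779B97F4A7C15'u64  # ~2^64 / golden ratio
  # The high bits of x*k are the best mixed; rotate them down to the
  # bottom so the usual and-mask (as Nim tables use) lands on them.
  rotateRightBits(x * k, 64 - tableBits)

when isMainModule:
  let mask = (1'u64 shl 10) - 1           # 1024-slot table
  echo fibHash(12345'u64, 10) and mask    # a table address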

If Araq is really insistent on the identity hash instead of just "something fast" that might include something like Fibonacci, Robin Hood cannot really save naive users. Non-naive users could always have defined a better hash and not had a problem. So, it might be better to just have some pre-fetching generous depth threshold like 8, use tombstones, and declare that deletion-heavy workloads are the special case while crappy hash functions are the more common case. (To further expand on my tombstone complaint: besides lengthening collision clusters, they also lower memory utilization. You could easily have some array that was 33% tombstones and 33% occupied yet still performing like it was 66% occupied. So, it's almost a 2x hit on utilization.) Anyway, it's kind of an "unknown meta-distribution of use cases". I'm also fine doing some incubator thing under cligen/ for a deletion-heavy-workload-friendly variant to let us get experience with it. (Also, I sent you an email about cligen-1.0 coming up. Not sure if you have any backward-incompatible thoughts.)

I do think some optional local/total search depth instrumentation could go a long way here, maybe on by default in debug/non-release mode. Maybe even just a writable module attribute like var hashWarnings: File that a debugger could set to stderr. Not sure.

Anyway, thanks for your attention/work on this. It's a pretty complex design space that almost needs diagrams to really explain all the cases. There's clearly been a bit of bitrot in the Nim impl since I last dove deep (my first real contribution was removing the infinite loop caused by a bad delete algo) - at least a check for empty data returning a misleading -1, discordant factoring within sets & tables, and a big split up into several files. I'm less sure there's a one-size-fits-most solution to be had, after spending some time trying to switch to RHH and seeing the Ordered* mess it would make. While B-trees have much stronger guarantees, they are also usually several times slower in average cases with well-distributed hash functions. So, I doubt the hash table allure will ever go away.

@c-blake

Ok. My resizing bug was just a dumb numerator-denominator flip in my mustRehash. Oops.
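
(For context, the predicate in question looks roughly like the sketch below, modeled on the old stdlib version; swapping the two sides of the first comparison produces exactly this kind of flip.)

proc mustRehash(length, counter: int): bool {.inline.} =
  assert length > counter
  # grow when the table is about 2/3 full, or nearly out of free slots
  result = (length * 2 < counter * 3) or (length - counter < 4)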

With that fix, this RHH is 1.4x faster (overall program runtime, not just the hashing part) in-L3 than the random probing approach (without a limited linear probe) currently on HEAD in Nim devel, on my one test dataset (that anagram finder thing on the forum). The boost should be even bigger out-of-L3, but your (very good) linear probe limit idea can probably also rein that boost back in.

I can provide the 180 line prototype impl on request. It is mostly a very stripped down sets.nim + setimpl.nim + hashcommon.nim. Should I email that to you?

Anyway, all the other points mentioned above remain relevant. It definitely breaks OrderedSet, as any rearranging on insert must, but a specialized, optimized separate impl of that may be warranted/wanted anyway. It's more resilient to weak hashes - not nearly as impervious as that Python-esque pseudo-random probing, but more cache-friendly and friendlier to delete-heavy workloads. It has a gotcha in going to smaller tables in the same hash order that is fixable, maybe with a "table-level flag" to resize on the next insert. It's a big ol' bag of engineering trade-offs.

@timotheecour
Owner Author

@timotheecour commented on 96a2d49 Feb 24, 2020 via email

@c-blake

I know one test doesn't cut it... it was just an example, but I also don't think many tests cut it, and we agree it's very hard to define precisely. There are also space-speed tradeoffs. Your benchmarks don't measure memory usage (I like cgmemtime for this, btw, but it might need sysadmin intervention to run on CIs).

It's really hard to have an omnibus average score. Personally, I actually think many varieties should be available (at least limited-LP pseudo-random probing, RH, vanilla LP, maybe Cuckoo), but that's more work, and I'm sure Araq would prefer a B-tree to all of them. Maybe I should just port my B-tree to Nim.

@c-blake

(Of course, for this particular case memory is just data.len and so easy... but cgmemtime is nice if you didn't know about it.)

Please sign in to comment.