-
-
Notifications
You must be signed in to change notification settings - Fork 1.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
hashes: implement murmur3 #12022
Merged
Merged
hashes: implement murmur3 #12022
Changes from 3 commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
b273de2
hashes: implement murmur3
narimiran e0b4a33
resolved merge conflict
Araq 765c6f5
refactoring; there is only one murmurHash and it works at compile-tim…
Araq 0ae181d
fixes JS tests
Araq 1210edc
Merge branch 'devel' into miran-murmur3
Araq 48ef7af
makes toOpenArrayByte work with C++
Araq 95725ae
Merge branch 'devel' into miran-murmur3
Araq 8a7f181
Merge branch 'miran-murmur3' of github.com:nim-lang/Nim into miran-mu…
Araq ca3774d
bugfix
Araq 431deca
Merge branch 'devel' into miran-murmur3
Araq d6c4ec9
Merge branch 'miran-murmur3' of github.com:nim-lang/Nim into miran-mu…
Araq 683d027
attempt to make it bootstrap in C++ mode for 0.20
Araq File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,9 +49,6 @@ type | |
## always have a size of a power of two and can use the ``and`` | ||
## operator instead of ``mod`` for truncation of the hash value. | ||
|
||
const | ||
IntSize = sizeof(int) | ||
|
||
proc `!&`*(h: Hash, val: int): Hash {.inline.} = | ||
## Mixes a hash value `h` with `val` to produce a new hash value. | ||
## | ||
|
@@ -108,13 +105,12 @@ proc hash*(x: pointer): Hash {.inline.} = | |
else: | ||
result = cast[Hash](cast[uint](x) shr 3) # skip the alignment | ||
|
||
when not defined(booting): | ||
proc hash*[T: proc](x: T): Hash {.inline.} = | ||
## Efficient hashing of proc vars. Closures are supported too. | ||
when T is "closure": | ||
result = hash(rawProc(x)) !& hash(rawEnv(x)) | ||
else: | ||
result = hash(pointer(x)) | ||
proc hash*[T: proc](x: T): Hash {.inline.} = | ||
## Efficient hashing of proc vars. Closures are supported too. | ||
when T is "closure": | ||
result = hash(rawProc(x)) !& hash(rawEnv(x)) | ||
else: | ||
result = hash(pointer(x)) | ||
|
||
proc hash*(x: int): Hash {.inline.} = | ||
## Efficient hashing of integers. | ||
|
@@ -151,27 +147,87 @@ proc hash*(x: float): Hash {.inline.} = | |
proc hash*[A](x: openArray[A]): Hash | ||
proc hash*[A](x: set[A]): Hash | ||
|
||
template bytewiseHashing(result: Hash, x: typed, start, stop: int) = | ||
for i in start .. stop: | ||
result = result !& hash(x[i]) | ||
result = !$result | ||
|
||
template hashImpl(result: Hash, x: typed, start, stop: int) = | ||
when defined(JS): | ||
proc imul(a, b: uint32): uint32 = | ||
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math/imul | ||
let mask = 0xffff'u32 | ||
var | ||
aHi = (a shr 16) and mask | ||
aLo = a and mask | ||
bHi = (b shr 16) and mask | ||
bLo = b and mask | ||
result = (aLo * bLo) + (aHi * bLo + aLo * bHi) shl 16 | ||
else: | ||
template imul(a, b: uint32): untyped = a * b | ||
|
||
proc rotl32(x: uint32, r: int): uint32 {.inline.} = | ||
(x shl r) or (x shr (32 - r)) | ||
|
||
proc murmurHash(x: openArray[byte]): Hash = | ||
# https://github.com/PeterScott/murmur3/blob/master/murmur3.c | ||
const | ||
c1 = 0xcc9e2d51'u32 | ||
c2 = 0x1b873593'u32 | ||
n1 = 0xe6546b64'u32 | ||
m1 = 0x85ebca6b'u32 | ||
m2 = 0xc2b2ae35'u32 | ||
let | ||
elementSize = sizeof(x[start]) | ||
stepSize = IntSize div elementSize | ||
var i = start | ||
while i <= stop+1 - stepSize: | ||
var n = 0 | ||
size = len(x) | ||
stepSize = 4 # 32-bit | ||
n = size div stepSize | ||
var | ||
h1: uint32 | ||
i = 0 | ||
|
||
# body | ||
while i < n * stepSize: | ||
var k1: uint32 | ||
when nimvm: | ||
# we cannot cast in VM, so we do it manually | ||
for j in countdown(stepSize-1, 0): | ||
n = (n shl (8*elementSize)) or ord(x[i+j]) | ||
var j = stepSize | ||
while j > 0: | ||
dec j | ||
k1 = (k1 shl 8) or (ord(x[i+j])).uint32 | ||
else: | ||
n = cast[ptr Hash](unsafeAddr x[i])[] | ||
result = result !& n | ||
i += stepSize | ||
bytewiseHashing(result, x, i, stop) # hash the remaining elements and finish | ||
k1 = cast[ptr uint32](unsafeAddr x[i])[] | ||
inc i, stepSize | ||
|
||
k1 = imul(k1, c1) | ||
k1 = rotl32(k1, 15) | ||
k1 = imul(k1, c2) | ||
|
||
h1 = h1 xor k1 | ||
h1 = rotl32(h1, 13) | ||
h1 = h1*5 + n1 | ||
|
||
# tail | ||
var k1: uint32 | ||
var rem = size mod stepSize | ||
while rem > 0: | ||
dec rem | ||
k1 = (k1 shl 8) or (ord(x[i+rem])).uint32 | ||
k1 = imul(k1, c1) | ||
k1 = rotl32(k1, 15) | ||
k1 = imul(k1, c2) | ||
h1 = h1 xor k1 | ||
|
||
# finalization | ||
h1 = h1 xor size.uint32 | ||
h1 = h1 xor (h1 shr 16) | ||
h1 = imul(h1, m1) | ||
h1 = h1 xor (h1 shr 13) | ||
h1 = imul(h1, m2) | ||
h1 = h1 xor (h1 shr 16) | ||
return cast[Hash](h1) | ||
|
||
proc hashVmImpl(x: string, sPos, ePos: int): Hash = | ||
discard "look at compiler/vmops.nim" | ||
|
||
proc hashVmImplChar(x: openArray[char], sPos, ePos: int): Hash = | ||
discard "look at compiler/vmops.nim" | ||
|
||
proc hashVmImplByte(x: openArray[byte], sPos, ePos: int): Hash = | ||
discard "look at compiler/vmops.nim" | ||
|
||
proc hash*(x: string): Hash = | ||
## Efficient hashing of strings. | ||
|
@@ -182,7 +238,10 @@ proc hash*(x: string): Hash = | |
runnableExamples: | ||
doAssert hash("abracadabra") != hash("AbracadabrA") | ||
|
||
hashImpl(result, x, 0, high(x)) | ||
when nimvm: | ||
result = hashVmImpl(x, 0, high(x)) | ||
else: | ||
result = murmurHash(toOpenArrayByte(x, 0, high(x))) | ||
|
||
proc hash*(x: cstring): Hash = | ||
## Efficient hashing of null-terminated strings. | ||
|
@@ -191,7 +250,11 @@ proc hash*(x: cstring): Hash = | |
doAssert hash(cstring"AbracadabrA") == hash("AbracadabrA") | ||
doAssert hash(cstring"abracadabra") != hash(cstring"AbracadabrA") | ||
|
||
hashImpl(result, x, 0, high(x)) | ||
when not defined(JS) and defined(nimToOpenArrayCString): | ||
murmurHash(toOpenArrayByte(x, 0, x.high)) | ||
else: | ||
let xx = $x | ||
murmurHash(toOpenArrayByte(xx, 0, high(xx))) | ||
|
||
proc hash*(sBuf: string, sPos, ePos: int): Hash = | ||
## Efficient hashing of a string buffer, from starting | ||
|
@@ -202,7 +265,8 @@ proc hash*(sBuf: string, sPos, ePos: int): Hash = | |
var a = "abracadabra" | ||
doAssert hash(a, 0, 3) == hash(a, 7, 10) | ||
|
||
hashImpl(result, sBuf, sPos, ePos) | ||
murmurHash(toOpenArrayByte(sBuf, sPos, ePos)) | ||
|
||
|
||
proc hashIgnoreStyle*(x: string): Hash = | ||
## Efficient hashing of strings; style is ignored. | ||
|
@@ -300,12 +364,20 @@ proc hash*[T: tuple](x: T): Hash = | |
result = result !& hash(f) | ||
result = !$result | ||
|
||
|
||
proc hash*[A](x: openArray[A]): Hash = | ||
## Efficient hashing of arrays and sequences. | ||
when A is char|SomeInteger: | ||
hashImpl(result, x, 0, x.high) | ||
when A is byte: | ||
result = murmurHash(x) | ||
elif A is char: | ||
when nimvm: | ||
result = hashVmImplChar(x, 0, x.high) | ||
else: | ||
result = murmurHash(toOpenArrayByte(x, 0, x.high)) | ||
else: | ||
bytewiseHashing(result, x, 0, x.high) | ||
for a in x: | ||
result = result !& hash(a) | ||
result = !$result | ||
|
||
proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash = | ||
## Efficient hashing of portions of arrays and sequences, from starting | ||
|
@@ -316,10 +388,20 @@ proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash = | |
let a = [1, 2, 5, 1, 2, 6] | ||
doAssert hash(a, 0, 1) == hash(a, 3, 4) | ||
|
||
when A is char|SomeInteger: | ||
hashImpl(result, aBuf, sPos, ePos) | ||
when A is byte: | ||
when nimvm: | ||
result = hashVmImplByte(aBuf, 0, aBuf.high) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good point. :-) |
||
else: | ||
result = murmurHash(toOpenArray(aBuf, sPos, ePos)) | ||
elif A is char: | ||
when nimvm: | ||
result = hashVmImplChar(aBuf, 0, aBuf.high) | ||
else: | ||
result = murmurHash(toOpenArrayByte(aBuf, sPos, ePos)) | ||
else: | ||
bytewiseHashing(result, aBuf, sPos, ePos) | ||
for i in sPos .. ePos: | ||
result = result !& hash(aBuf[i]) | ||
result = !$result | ||
|
||
proc hash*[A](x: set[A]): Hash = | ||
## Efficient hashing of sets. | ||
|
@@ -334,26 +416,30 @@ when isMainModule: | |
a = "" | ||
b = newSeq[char]() | ||
c = newSeq[int]() | ||
d = cstring"" | ||
e = "abcd" | ||
doAssert hash(a) == 0 | ||
doAssert hash(b) == 0 | ||
doAssert hash(c) == 0 | ||
doAssert hash(d) == 0 | ||
doAssert hashIgnoreCase(a) == 0 | ||
doAssert hashIgnoreStyle(a) == 0 | ||
doAssert hash(e, 3, 2) == 0 | ||
block sameButDifferent: | ||
doAssert hash("aa bb aaaa1234") == hash("aa bb aaaa1234", 0, 13) | ||
doAssert hash("aa bb aaaa1234") == hash(cstring"aa bb aaaa1234") | ||
doAssert hashIgnoreCase("aA bb aAAa1234") == hashIgnoreCase("aa bb aaaa1234") | ||
doAssert hashIgnoreStyle("aa_bb_AAaa1234") == hashIgnoreCase("aaBBAAAa1234") | ||
block smallSize: # no multibyte hashing | ||
let | ||
xx = @['H','e','l','l','o'] | ||
ii = @[72'i8, 101, 108, 108, 111] | ||
ss = "Hello" | ||
xx = @['H','i'] | ||
ii = @[72'u8, 105] | ||
ss = "Hi" | ||
doAssert hash(xx) == hash(ii) | ||
doAssert hash(xx) == hash(ss) | ||
doAssert hash(xx) == hash(xx, 0, xx.high) | ||
doAssert hash(ss) == hash(ss, 0, ss.high) | ||
block largeSize: # longer than 8 characters, should trigger multibyte hashing | ||
block largeSize: # longer than 4 characters | ||
let | ||
xx = @['H','e','l','l','o'] | ||
xxl = @['H','e','l','l','o','w','e','e','n','s'] | ||
|
@@ -362,9 +448,6 @@ when isMainModule: | |
doAssert hash(xxl) == hash(xxl, 0, xxl.high) | ||
doAssert hash(ssl) == hash(ssl, 0, ssl.high) | ||
doAssert hash(xx) == hash(xxl, 0, 4) | ||
block misc: | ||
let | ||
a = [1'u8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4] | ||
b = [1'i8, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4] | ||
doAssert hash(a) == hash(b) | ||
doAssert hash(a, 2, 5) == hash(b, 2, 5) | ||
doAssert hash(xx) == hash(ssl, 0, 4) | ||
doAssert hash(xx, 0, 3) == hash(xxl, 0, 3) | ||
doAssert hash(xx, 0, 3) == hash(ssl, 0, 3) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about `int8`/`uint8`? At least `int8` was handled before this change, IIRC. Maybe:
when sizeof(A)==1 and A isnot char: ...
The VM supports casting between integers of the same size, so everything could be cast to one type (e.g. `byte`) without having to add overloads. Ideally (but out of scope for this PR), there are more things that the VM should allow to be cast safely.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't consider it important enough, but fair enough.