@@ -70,80 +70,100 @@ hash(x::UInt64, h::UInt) = hash_uint64(hash_mix_linear(x, h))
# Hash an `Int64` by reinterpreting its bits as a `UInt64` and forwarding to
# the `UInt64` method, so both widths share a single mixing path.
hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h)
# `Bool` and the narrower fixed-width integers are widened to `Int64` first and
# then hashed through the `Int64` method.
hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h)
7272
# IntegerCodeUnits exposes the magnitude of an integer as a vector of bytes,
# least-significant byte first (little-endian).
#
# `value` stores the integer exactly as it was passed in; `num_bytes` is the
# byte count derived from the bit length of `abs(value)` and is never below 1.
struct IntegerCodeUnits{T<:Integer} <: AbstractVector{UInt8}
    value::T
    num_bytes::Int

    function IntegerCodeUnits(x::T) where {T<:Integer}
        # Round the bit length up to whole bytes; zero still occupies one byte.
        magnitude = abs(x)
        nbytes = max(cld(top_set_bit(magnitude), 8), 1)
        return new{T}(x, nbytes)
    end
end
85+
# The vector is one-dimensional with `num_bytes` entries.
Base.size(units::IntegerCodeUnits) = (units.num_bytes,)
89+
# Number of bytes in the representation, as computed at construction time.
Base.length(units::IntegerCodeUnits) = units.num_bytes
93+
# Return byte `i` (1-based) of `abs(units.value)`, counting upwards from the
# least-significant byte. Bounds are checked unless elided by the caller.
function Base.getindex(units::IntegerCodeUnits, i::Int)
    @boundscheck checkbounds(units, i)
    # The magnitude is recomputed on every access; only `value` is stored.
    magnitude = abs(units.value)
    shift = 8 * (i - 1)
    # Logical shift, then mask so exactly one byte remains for the conversion.
    return UInt8((magnitude >>> shift) & 0xff)
end
100+
# Iterate the bytes in index order; `state` is the index of the next byte to
# produce, and iteration stops once it passes `num_bytes`.
function Base.iterate(units::IntegerCodeUnits, state::Int = 1)
    state > units.num_bytes && return nothing
    return (units[state], state + 1)
end
105+
# Main entry point for obtaining the little-endian byte view of an integer;
# simply wraps `x` in an `IntegerCodeUnits`.
codeunits(x::Integer) = IntegerCodeUnits(x)
108+
# UTF8Units wraps any AbstractString so that iterating the wrapper yields the
# string's UTF-8 bytes one at a time.
struct UTF8Units{T<:AbstractString}
    string::T
end

# No `length` method exists for this iterator, so its size is declared unknown.
# With the default `HasLength()` trait, generic consumers such as `collect`
# would call `length` and fail with a MethodError.
Base.IteratorSize(::Type{<:UTF8Units}) = Base.SizeUnknown()

# Iteration produces `UInt8` values.
Base.eltype(::Type{<:UTF8Units}) = UInt8
113+
# Strings whose code unit type is `UInt8` expose their bytes directly through
# `codeunits`; every other string type goes through the `UTF8Units` iterator.
# NOTE(review): this treats a `UInt8` code unit as implying UTF-8 encoding —
# confirm for custom string types with a different byte encoding.
utf8units(s::AbstractString) = codeunit(s) <: UInt8 ? codeunits(s) : UTF8Units(s)
115+
# Iterator state: (char_iter_state, remaining_utf8_bytes)
#
# Start iteration: fetch the first character of the wrapped string, emit its
# first UTF-8 byte, and carry any further bytes of that character in the state.
function Base.iterate(units::UTF8Units)
    next = iterate(units.string)
    next === nothing && return nothing
    char, char_state = next

    # Convert to `Char` before reinterpreting: the bit pattern read below is
    # `Char`'s own representation (the same trick `write` uses), so a custom
    # `AbstractChar` would otherwise contribute its private bit layout.
    u = bswap(reinterpret(UInt32, Char(char)))

    # Low byte is emitted now; the rest stays queued, lowest byte first.
    first_byte = u % UInt8
    remaining_bytes = u >> 8
    return first_byte, (char_state, remaining_bytes)
end
130+
# Continue iteration from `state = (char_iter_state, remaining_utf8_bytes)`.
# Queued bytes of the current character are drained first; once none are left,
# the next character is fetched from the wrapped string.
function Base.iterate(units::UTF8Units, state)
    char_state, pending = state

    # A non-zero queue means the current character still has bytes to emit.
    if pending != 0
        return pending % UInt8, (char_state, pending >> 8)
    end

    # Move to the next character.
    next = iterate(units.string, char_state)
    next === nothing && return nothing
    char, new_char_state = next

    # Convert to `Char` before reinterpreting: the bit pattern read below is
    # `Char`'s own representation (the same trick `write` uses), so a custom
    # `AbstractChar` would otherwise contribute its private bit layout.
    u = bswap(reinterpret(UInt32, Char(char)))

    # Low byte is emitted now; the rest stays queued, lowest byte first.
    first_byte = u % UInt8
    remaining_bytes = u >> 8
    return first_byte, (new_char_state, remaining_bytes)
end
154+
# Public-facing wrapper: widen the seed to `UInt64`, run the integer hash, and
# truncate the result back to the platform `UInt`.
hash_integer(x::Integer, h::UInt) = _hash_integer(x, UInt64(h)) % UInt
# Hash an arbitrary `Integer` by feeding the little-endian bytes of its
# magnitude to `hash_bytes`.
#
# `seed` is the incoming hash state and `secret` the four-word secret passed
# through to `hash_bytes` (defaults to `HASH_SECRET`).
function _hash_integer(
        x::Integer,
        seed::UInt64,
        secret::NTuple{4, UInt64} = HASH_SECRET
    )
    # Fold the sign into the seed, since only the magnitude is hashed below.
    seed ⊻= (x < 0)

    # n.b.: this hashes typemin(IntN) correctly even if abs fails
    u = abs(x)
    return hash_bytes(codeunits(u), seed, secret)
end
148168
149169
619639 return hash_mix (a ⊻ secret[4 ], b ⊻ secret[2 ] ⊻ bytes_chunk)
620640end
621641
# Generic string hash: run `hash_bytes` over the bytes produced by `utf8units`,
# seeded with `h`, and truncate the result to `UInt`.
hash(data::AbstractString, h::UInt) =
    hash_bytes(utf8units(data), UInt64(h), HASH_SECRET) % UInt
# `String` specialization: hash straight from the string's memory. The
# `GC.@preserve` keeps `data` rooted while its raw pointer is being read.
@assume_effects :total hash(data::String, h::UInt) =
    GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) % UInt
624646
0 commit comments