Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 45 additions & 19 deletions base/gmp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -864,21 +864,48 @@ if Limb === UInt64 === UInt

using .Base: HASH_SECRET, hash_bytes, hash_finalizer

# UnsafeLimbView exposes a window of a BigInt's limb storage as a read-only
# vector of bytes.
#
# Fields:
#   bigint     - the integer whose limbs are read (kept referenced by the view)
#   start_byte - 1-based position, counted in bytes from the start of the limb
#                data, of the first byte in the window
#   num_bytes  - number of bytes in the window
#
# Bytes are extracted arithmetically from each limb, least-significant byte
# first, assuming 8 bytes per limb. The constructor does not check that the
# window lies inside the limb data; callers must guarantee that.
struct UnsafeLimbView <: AbstractVector{UInt8}
    bigint::BigInt
    start_byte::Int
    num_bytes::Int
end

# The view is one-dimensional with `num_bytes` entries.
Base.size(view::UnsafeLimbView) = (view.num_bytes,)

Base.length(view::UnsafeLimbView) = view.num_bytes

# Return the `i`-th byte of the window (1-based).
function Base.getindex(view::UnsafeLimbView, i::Int)
    @boundscheck checkbounds(view, i)
    # Zero-based byte offset from the start of the limb data:
    # (start_byte - 1) + (i - 1).
    byte_offset = view.start_byte + i - 2
    limb_offset, byte_in_limb = divrem(byte_offset, 8)
    # Keep the view (and therefore the BigInt it holds) alive across the raw load.
    limb = GC.@preserve view unsafe_load(view.bigint.d, limb_offset + 1)
    return UInt8((limb >> (8 * byte_in_limb)) & 0xff)
end

# Walk the window front to back; `state` is the index of the next byte.
function Base.iterate(view::UnsafeLimbView, state::Int = 1)
    if state > view.num_bytes
        return nothing
    end
    byte = @inbounds view[state]
    return byte, state + 1
end

# Hash a BigInt from its limb data.
#
# Zero is delegated to the machine-integer method. Otherwise the sign is folded
# into the seed `h` and the magnitude is hashed as the bytes of its limbs,
# least-significant first, with the all-zero leading bytes of the top limb
# dropped.
#
# The sign must be XOR-ed into `h` exactly once: applying it twice cancels it
# and makes `n` and `-n` hash identically.
function hash_integer(n::BigInt, h::UInt)
    iszero(n) && return hash_integer(0, h)
    s = n.size
    h ⊻= (s < 0)

    us = abs(s)
    # Only the most significant limb can contribute leading zero bytes.
    top_limb = GC.@preserve n unsafe_load(n.d, us)
    leading_zero_bytes = div(leading_zeros(top_limb), 8)
    num_bytes = 8 * us - leading_zero_bytes

    # The view keeps `n` referenced while hash_bytes reads the limbs.
    limb_view = UnsafeLimbView(n, 1, num_bytes)
    return hash_bytes(limb_view, h, HASH_SECRET)
end

function hash(x::BigInt, h::UInt)
Expand Down Expand Up @@ -913,12 +940,11 @@ if Limb === UInt64 === UInt
h ⊻= (sz < 0)
leading_zero_bytes = div(leading_zeros(unsafe_load(x.d, asz)), 8)
trailing_zero_bytes = div(pow, 8)
return hash_bytes(
Ptr{UInt8}(x.d) + trailing_zero_bytes,
8 * asz - (leading_zero_bytes + trailing_zero_bytes),
h,
HASH_SECRET
)
num_bytes = 8 * asz - (leading_zero_bytes + trailing_zero_bytes)

# Use UnsafeLimbView for safe iterator-based access
limb_view = UnsafeLimbView(x, trailing_zero_bytes + 1, num_bytes)
return hash_bytes(limb_view, h, HASH_SECRET)
end
end
end
Expand Down
156 changes: 89 additions & 67 deletions base/hashing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,80 +70,100 @@ hash(x::UInt64, h::UInt) = hash_uint64(hash_mix_linear(x, h))
hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h)
hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h)

# IntegerCodeUnits presents the magnitude of an integer as a read-only vector
# of bytes, least-significant byte first.
#
# Fields:
#   value     - the integer as given (its sign is ignored when reading bytes)
#   num_bytes - number of bytes needed for `abs(value)`; never less than 1, so
#               zero is represented by a single 0x00 byte
struct IntegerCodeUnits{T<:Integer} <: AbstractVector{UInt8}
    value::T
    num_bytes::Int

    function IntegerCodeUnits(x::T) where {T<:Integer}
        # Round the bit length of the magnitude up to whole bytes.
        nbits = top_set_bit(abs(x))
        nbytes = max(1, cld(nbits, 8))
        return new{T}(x, nbytes)
    end
end

Base.size(units::IntegerCodeUnits) = (units.num_bytes,)

Base.length(units::IntegerCodeUnits) = units.num_bytes

# Return byte `i` (1-based) of the magnitude; byte 1 is the least significant.
function Base.getindex(units::IntegerCodeUnits, i::Int)
    @boundscheck checkbounds(units, i)
    magnitude = abs(units.value)
    shift = 8 * (i - 1)
    # `>>>` is a logical shift, so no sign bits are shifted in.
    return UInt8((magnitude >>> shift) & 0xff)
end

# Walk the bytes in index order; `state` is the index of the next byte.
function Base.iterate(units::IntegerCodeUnits, state::Int = 1)
    if state > units.num_bytes
        return nothing
    end
    return units[state], state + 1
end

# Entry point: the little-endian bytes of the magnitude of `x`.
function codeunits(x::Integer)
    return IntegerCodeUnits(x)
end

# UTF8Units iterates the UTF-8 bytes of any AbstractString, one character at a
# time, without building an intermediate String.
struct UTF8Units{T<:AbstractString}
    string::T
end

# The number of bytes is only known after walking the string, and no `length`
# method is defined, so advertise an unknown size instead of the default
# `HasLength` (which makes generic consumers such as `collect` fail).
Base.IteratorSize(::Type{<:UTF8Units}) = Base.SizeUnknown()
Base.eltype(::Type{<:UTF8Units}) = UInt8

# Strings whose code units are already bytes are used as-is through
# `codeunits`; every other string type is wrapped in the re-encoding iterator.
utf8units(s::AbstractString) = codeunit(s) <: UInt8 ? codeunits(s) : UTF8Units(s)

# Split one character into (first byte, remaining bytes).
#
# A `Char` stores its encoded bytes left-aligned in 32 bits; after `bswap` the
# first byte sits in the low 8 bits and the rest follow in order, with unused
# positions zero. Converting through `Char` first keeps this valid for
# AbstractChar types that are not `Char`.
function _utf8units_pack(char::AbstractChar)
    u = bswap(reinterpret(UInt32, Char(char)))
    return u % UInt8, u >> 8
end

# Iterator state: (char_iter_state, remaining_utf8_bytes)
function Base.iterate(units::UTF8Units)
    char_result = iterate(units.string)
    char_result === nothing && return nothing
    char, char_state = char_result
    first_byte, remaining_bytes = _utf8units_pack(char)
    return first_byte, (char_state, remaining_bytes)
end

function Base.iterate(units::UTF8Units, state)
    char_state, remaining_bytes = state
    # Drain the bytes still pending from the current character first.
    if remaining_bytes != 0
        return remaining_bytes % UInt8, (char_state, remaining_bytes >> 8)
    end

    # Current character is exhausted: advance to the next one, if any.
    char_result = iterate(units.string, char_state)
    char_result === nothing && return nothing
    char, new_char_state = char_result
    first_byte, pending = _utf8units_pack(char)
    return first_byte, (new_char_state, pending)
end

# Hash an arbitrary Integer with seed `h`; the 64-bit result of `_hash_integer`
# is reduced to the native word size.
hash_integer(x::Integer, h::UInt) = _hash_integer(x, UInt64(h)) % UInt

# Hash `x` as (sign, magnitude).
#
# The sign is XOR-ed into `seed`, and the magnitude is handed to `hash_bytes`
# as its little-endian byte sequence (see `codeunits(::Integer)`), so this
# method shares a single byte-hashing routine with the other callers of
# `hash_bytes` instead of carrying its own unrolled copy of the algorithm.
#
# Arguments:
#   x      - integer to hash
#   seed   - 64-bit seed
#   secret - hashing secret, `HASH_SECRET` by default
function _hash_integer(
        x::Integer,
        seed::UInt64,
        secret::NTuple{4, UInt64} = HASH_SECRET
    )
    # Handle sign by XOR-ing with seed
    seed ⊻= (x < 0)
    # Get little-endian byte representation of absolute value
    u = abs(x) # n.b.: this hashes typemin(IntN) correctly even if abs fails
    return hash_bytes(codeunits(u), seed, secret)
end


Expand Down Expand Up @@ -619,6 +639,8 @@ end
return hash_mix(a ⊻ secret[4], b ⊻ secret[2] ⊻ bytes_chunk)
end

# Hash any AbstractString through its UTF-8 byte sequence (see `utf8units`),
# seeded with `h`; the 64-bit digest is reduced to the native word size.
function hash(data::AbstractString, h::UInt)
    digest = hash_bytes(utf8units(data), UInt64(h), HASH_SECRET)
    return digest % UInt
end

# `String` hashes its own memory directly: `sizeof(data)` bytes starting at
# `pointer(data)`, with `data` kept alive for the duration of the raw read.
@assume_effects :total function hash(data::String, h::UInt)
    digest = GC.@preserve data hash_bytes(
        pointer(data), sizeof(data), UInt64(h), HASH_SECRET
    )
    return digest % UInt
end

Expand Down
4 changes: 0 additions & 4 deletions base/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -362,10 +362,6 @@ end

# Order two Symbols by the result of `cmp`: `a` sorts first when `cmp` is negative.
function isless(a::Symbol, b::Symbol)
    return cmp(a, b) < 0
end

# hashing

# Hash a generic string by converting it to a `String` and hashing that.
function hash(s::AbstractString, h::UInt)
    str = String(s)::String
    return hash(str, h)
end

## character index arithmetic ##

"""
Expand Down
3 changes: 2 additions & 1 deletion base/strings/lazy.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ iterate(s::LazyString, i::Integer) = iterate(String(s), i)
# String-interface methods for LazyString. Each one converts the lazy string to
# a `String` and forwards to the `String` method, except the code unit type
# query below.
isequal(a::LazyString, b::LazyString) = isequal(String(a), String(b))
==(a::LazyString, b::LazyString) = (String(a) == String(b))
ncodeunits(s::LazyString) = ncodeunits(String(s))
# The code unit type does not depend on the contents, so it is answered from an
# empty `String` without converting `s`. This is the only definition of the
# one-argument method; a second definition with the same signature would
# silently overwrite the first.
codeunit(s::LazyString) = codeunit("") # returns UInt8
codeunit(s::LazyString, i::Integer) = codeunit(String(s), i)
codeunits(s::LazyString) = codeunits(String(s))
isvalid(s::LazyString, i::Integer) = isvalid(String(s), i)
8 changes: 3 additions & 5 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1193,12 +1193,10 @@ end
apple_uint8 = Vector{UInt8}("Apple")
@test apple_uint8 == [0x41, 0x70, 0x70, 0x6c, 0x65]

apple_uint8 = Array{UInt8}("Apple")
@test apple_uint8 == [0x41, 0x70, 0x70, 0x6c, 0x65]

# The fixture's `String` conversion ("Test") deliberately differs from its code
# units ("Apple"): the hash of an AbstractString with UInt8 code units must
# follow the code units, not the result of converting to `String`.
Base.String(::tstStringType) = "Test"
Base.codeunit(::tstStringType) = UInt8
Base.codeunits(t::tstStringType) = t.data
abstract_apple = tstStringType(apple_uint8)
@test hash(abstract_apple, UInt(1)) == hash("Apple", UInt(1))

@test length("abc", 1, 3) == length("abc", UInt(1), UInt(3))

Expand Down