From 9e9fd16f6bbf320f1dbe7d136be6719a91c3d3aa Mon Sep 17 00:00:00 2001 From: J S <49557684+svilupp@users.noreply.github.com> Date: Sun, 19 May 2024 17:18:08 +0100 Subject: [PATCH] Add BitPacked embeddings for RAG retrieval (#152) --- CHANGELOG.md | 4 +- src/Experimental/RAGTools/preparation.jl | 64 +++++++++++++- src/Experimental/RAGTools/retrieval.jl | 102 ++++++++++++++++++---- src/Experimental/RAGTools/utils.jl | 97 ++++++++++++++++++++ test/Experimental/RAGTools/preparation.jl | 9 ++ test/Experimental/RAGTools/retrieval.jl | 102 ++++++++++++++++++++++ test/Experimental/RAGTools/utils.jl | 76 ++++++++++++++++ test/runtests.jl | 2 +- 8 files changed, 435 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 271eff83..4e4ba8cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,12 +16,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added new field `meta` to `TracerMessage` and `TracerMessageLike` to hold metadata in a simply dictionary. Change is backward-compatible. - Changed behaviour of `aitemplates(name::Symbol)` to look for the exact match on the template name, not just a partial match. This is a breaking change for the `aitemplates` function only. Motivation is that having multiple matches could have introduced subtle bugs when looking up valid placeholders for a template. - ### Added - Improved support for `aiclassify` with OpenAI models (you can now encode upto 40 choices). - Added a template for routing questions `:QuestionRouter` (to be used with `aiclassify`) -- Improved tracing by `TracerSchema` to automatically capture crucial metadata such as any LLM API kwargs (`api_kwargs`), use of prompt templates and its versions. Information is captured in `meta(tracer)` dictionary. See `?TracerSchema` for more information. +- Improved tracing by `TracerSchema` to automatically capture crucial metadata such as any LLM API kwargs (`api_kwargs`), use of prompt templates and its version. 
Information is captured in `meta(tracer)` dictionary. See `?TracerSchema` for more information. - New tracing schema `SaverSchema` allows to automatically serialize all conversations. It can be composed with other tracing schemas, eg, `TracerSchema` to automatically capture necessary metadata and serialize. See `?SaverSchema` for more information. +- Updated options for Binary embeddings (refer to release v0.18 for motivation). Adds utility functions `pack_bits` and `unpack_bits` to move between binary and UInt64 representations of embeddings. RAGTools adds the corresponding `BitPackedBatchEmbedder` and `BitPackedCosineSimilarity` for fast retrieval on these Bool<->UInt64 embeddings (credit to [**domluna's tinyRAG**](https://github.com/domluna/tinyRAG)). ### Fixed - Fixed a bug where `aiclassify` would not work when returning the full conversation for choices with extra descriptions diff --git a/src/Experimental/RAGTools/preparation.jl b/src/Experimental/RAGTools/preparation.jl index 7756c589..9b12b3c5 100644 --- a/src/Experimental/RAGTools/preparation.jl +++ b/src/Experimental/RAGTools/preparation.jl @@ -31,15 +31,27 @@ struct BatchEmbedder <: AbstractEmbedder end """ BinaryBatchEmbedder <: AbstractEmbedder -Same as `BatchEmbedder` but reduces the embeddings matrix to binary tool (eg, `BitMatrix`). +Same as `BatchEmbedder` but reduces the embeddings matrix to a binary form (eg, `BitMatrix`). Reference: [HuggingFace: Embedding Quantization](https://huggingface.co/blog/embedding-quantization#binary-quantization-in-vector-databases). """ struct BinaryBatchEmbedder <: AbstractEmbedder end +""" + BitPackedBatchEmbedder <: AbstractEmbedder + +Same as `BatchEmbedder` but reduces the embeddings matrix to a binary form packed in UInt64 (eg, `BitMatrix.chunks`). + +See also utilities `pack_bits` and `unpack_bits` to move between packed/non-packed binary forms. 
+ +Reference: [HuggingFace: Embedding Quantization](https://huggingface.co/blog/embedding-quantization#binary-quantization-in-vector-databases). +""" +struct BitPackedBatchEmbedder <: AbstractEmbedder end + EmbedderEltype(::T) where {T} = EmbedderEltype(T) EmbedderEltype(::Type{<:AbstractEmbedder}) = Float32 EmbedderEltype(::Type{BinaryBatchEmbedder}) = Bool +EmbedderEltype(::Type{BitPackedBatchEmbedder}) = UInt64 ### Tagging Types """ @@ -302,6 +314,56 @@ function get_embeddings( emb = (emb .> 0) |> x -> x isa return_type ? x : return_type(x) end +""" + get_embeddings(embedder::BitPackedBatchEmbedder, docs::AbstractVector{<:AbstractString}; + verbose::Bool = true, + model::AbstractString = PT.MODEL_EMBEDDING, + truncate_dimension::Union{Int, Nothing} = nothing, + cost_tracker = Threads.Atomic{Float64}(0.0), + target_batch_size_length::Int = 80_000, + ntasks::Int = 4 * Threads.nthreads(), + kwargs...) + + +Embeds a vector of `docs` using the provided model (kwarg `model`) in a batched manner and then returns the binary embeddings matrix represented in UInt64 (bit-packed) - `BitPackedBatchEmbedder`. + +`BitPackedBatchEmbedder` tries to batch embedding calls for roughly 80K characters per call (to avoid exceeding the API rate limit) to reduce network latency. + +The best option for FAST and MEMORY-EFFICIENT storage of embeddings; for retrieval, use `BitPackedCosineSimilarity`. The float embeddings are binarized (`> 0`) and passed to `pack_bits`, which requires the number of rows of the embedding matrix to be divisible by 64. + +# Notes +- `docs` are assumed to be already chunked to reasonable sizes that fit within the embedding context limit. +- If you get errors about exceeding input sizes, first check the `max_length` in your chunks. + If that does NOT resolve the issue, try reducing the `target_batch_size_length` parameter (eg, 10_000) and number of tasks `ntasks=1`. + Some providers cannot handle large batch sizes. + +# Arguments +- `docs`: A vector of strings to be embedded. +- `verbose`: A boolean flag for verbose output. Default is `true`. +- `model`: The model to use for embedding. 
Default is `PT.MODEL_EMBEDDING`. +- `truncate_dimension`: The dimensionality of the embeddings to truncate to. Default is `nothing`. +- `cost_tracker`: A `Threads.Atomic{Float64}` object to track the total cost of the API calls. Useful to pass the total cost to the parent call. +- `target_batch_size_length`: The target length (in characters) of each batch of document chunks sent for embedding. Default is 80_000 characters. Speeds up embedding process. +- `ntasks`: The number of tasks to use for asyncmap. Default is 4 * Threads.nthreads(). + +See also: `unpack_bits`, `pack_bits`, `BitPackedCosineSimilarity`. +""" +function get_embeddings( + embedder::BitPackedBatchEmbedder, docs::AbstractVector{<:AbstractString}; + verbose::Bool = true, + model::AbstractString = PT.MODEL_EMBEDDING, + truncate_dimension::Union{Int, Nothing} = nothing, + cost_tracker = Threads.Atomic{Float64}(0.0), + target_batch_size_length::Int = 80_000, + ntasks::Int = 4 * Threads.nthreads(), + kwargs...) + emb = get_embeddings(BatchEmbedder(), docs; verbose, model, truncate_dimension, + cost_tracker, target_batch_size_length, ntasks, kwargs...) + # This will return Matrix{UInt64} to save space (64 binary values per element) + # Use unpack_bits to convert back to a Bool matrix (`Matrix{Bool}`) + pack_bits(emb .> 0) +end + ### Tag Extraction function get_tags(tagger::AbstractTagger, docs::AbstractVector{<:AbstractString}; diff --git a/src/Experimental/RAGTools/retrieval.jl b/src/Experimental/RAGTools/retrieval.jl index 9cdefc8a..722020a7 100644 --- a/src/Experimental/RAGTools/retrieval.jl +++ b/src/Experimental/RAGTools/retrieval.jl @@ -46,6 +46,18 @@ Reference: [HuggingFace: Embedding Quantization](https://huggingface.co/blog/emb """ struct BinaryCosineSimilarity <: AbstractSimilarityFinder end +""" + BitPackedCosineSimilarity <: AbstractSimilarityFinder + +Finds the closest chunks to a query embedding by measuring the Hamming distance AND cosine similarity between the query and the chunks' embeddings in binary form. 
+ +The difference to `BinaryCosineSimilarity` is that the binary values are packed into UInt64, which is more efficient. + +Reference: [HuggingFace: Embedding Quantization](https://huggingface.co/blog/embedding-quantization#binary-quantization-in-vector-databases). +Implementation of `hamming_distance` is based on [TinyRAG](https://github.com/domluna/tinyrag/blob/main/README.md). +""" +struct BitPackedCosineSimilarity <: AbstractSimilarityFinder end + """ NoTagFilter <: AbstractTagFilter @@ -202,32 +214,46 @@ function find_closest( c -> find_closest(finder, index, c; top_k = top_k_, kwargs...), vcat, eachcol(query_emb)) end -## For binary embeddings +#### For binary embeddings +## Source: https://github.com/domluna/tinyrag/blob/main/README.md +## With minor modifications to the signatures + +@inline function hamming_distance(x1::T, x2::T)::Int where {T <: Integer} + return Int(count_ones(x1 ⊻ x2)) +end +@inline function hamming_distance(x1::T, x2::T)::Int where {T <: Bool} + return Int(x1 ⊻ x2) +end +@inline function hamming_distance( + x1::AbstractVector{T}, x2::AbstractVector{T})::Int where {T <: Integer} + s = 0 + @inbounds @simd for i in eachindex(x1, x2) + s += hamming_distance(x1[i], x2[i]) + end + s +end + """ - hamming_distance(mat::AbstractMatrix{<:Bool}, vect::AbstractVector{<:Bool}) + hamming_distance( + mat::AbstractMatrix{T}, query::AbstractVector{T})::Vector{Int} where {T <: Integer} Calculates the column-wise Hamming distance between a matrix of binary vectors `mat` and a single binary vector `vect`. This is the first-pass ranking for `BinaryCosineSimilarity` method. + +Implementation from [**domluna's tinyRAG**](https://github.com/domluna/tinyRAG). The vector argument is named `query` in the signature above; `mat` and `query` must share the same element type `T` (eg, both `Bool` or both bit-packed `UInt64`). Also used as the first pass of `BitPackedCosineSimilarity`. Throws `ArgumentError` if the number of rows of `mat` differs from the length of `query`. 
""" -function hamming_distance(mat::AbstractMatrix{<:Bool}, vect::AbstractVector{<:Bool}) +@inline function hamming_distance( + mat::AbstractMatrix{T}, query::AbstractVector{T})::Vector{Int} where {T <: Integer} # Check if the number of rows matches - if size(mat, 1) != length(vect) - throw(ArgumentError("Matrix must have the same number of rows as the length of the Vector (provided: $(size(mat, 1)) vs $(length(vect)))")) + if size(mat, 1) != length(query) + throw(ArgumentError("Matrix must have the same number of rows as the length of the Vector (provided: $(size(mat, 1)) vs $(length(query)))")) end - - # Calculate number of different bits, the smaller the number, the more similar they are. - distances = zeros(Int, size(mat, 2)) - @inbounds for j in axes(mat, 2) - cnt = 0 - v = @view(mat[:, j]) - @simd for i in eachindex(vect, v) - cnt += v[i] ⊻ vect[i] - end - distances[j] = cnt + dists = zeros(Int, size(mat, 2)) + for (k, col) in enumerate(eachcol(mat)) # 1-based `k` always matches `dists`, even if `mat` has offset axes + dists[k] = hamming_distance(col, query) end - - return distances + dists end """ @@ -272,6 +298,48 @@ function find_closest( return positions[new_positions], scores end +""" + find_closest( + finder::BitPackedCosineSimilarity, emb::AbstractMatrix{<:Integer}, + query_emb::AbstractVector{<:Real}; + top_k::Int = 100, rescore_multiplier::Int = 4, minimum_similarity::AbstractFloat = -1.0, kwargs...) + +Finds the indices of chunks (represented by embeddings in `emb`) that are closest to query embedding (`query_emb`) using bit-packed binary embeddings (in the index). + +This is a two-pass approach: +- First pass: Hamming distance in bit-packed binary form to get the `top_k * rescore_multiplier` (i.e., more than top_k) candidates. +- Second pass: Rescore the candidates (unpacked back to Bool) against the float query embedding with `CosineSimilarity` and return the top_k. + +Returns only `top_k` closest indices. 
+ +Reference: [HuggingFace: Embedding Quantization](https://huggingface.co/blog/embedding-quantization#binary-quantization-in-vector-databases). 
+ +# Examples +Convert any Float embeddings to bit-packed binary like this: +```julia +bitpacked_emb = pack_bits(emb.>0) +``` +""" +function find_closest( + finder::BitPackedCosineSimilarity, emb::AbstractMatrix{<:Integer}, + query_emb::AbstractVector{<:Real}; + top_k::Int = 100, rescore_multiplier::Int = 4, minimum_similarity::AbstractFloat = -1.0, kwargs...) + # emb is a bit-packed embedding matrix: the first dimension is the embedding dimension (64 binary values per row element); `unpack_bits` below only accepts UInt64 eltype + + ## First pass, both bit-packed, rank by Hamming distance (smaller = closer) and keep rescore_multiplier times top_k + bit_query_emb = pack_bits(query_emb .> 0) + scores = hamming_distance(emb, bit_query_emb) + positions = scores |> sortperm |> x -> first(x, top_k * rescore_multiplier) + + ## Second pass, rescore the unpacked (Bool) candidates against the float query embedding and return top_k + unpacked_emb = unpack_bits(@view(emb[:, positions])) + new_positions, scores = find_closest(CosineSimilarity(), unpacked_emb, + query_emb; top_k, minimum_similarity, kwargs...) + + ## translate to original indices + return positions[new_positions], scores +end + ## TODO: Implement for MultiIndex ## function find_closest(index::AbstractMultiIndex, ## query_emb::AbstractVector{<:Real}; diff --git a/src/Experimental/RAGTools/utils.jl b/src/Experimental/RAGTools/utils.jl index b640ac81..02473852 100644 --- a/src/Experimental/RAGTools/utils.jl +++ b/src/Experimental/RAGTools/utils.jl @@ -367,3 +367,100 @@ function merge_kwargs_nested(nt1::NamedTuple, nt2::NamedTuple) end return (; zip(keys(result), values(result))...) end + +### Support for binary embeddings + +function pack_bits(arr::AbstractArray{<:Number}) + throw(ArgumentError("Input must be of binary eltype (Bool vs provided $(eltype(arr))). 
Please convert your matrix to binary before packing.")) +end + +""" + pack_bits(arr::AbstractMatrix{<:Bool}) -> Matrix{UInt64} + pack_bits(vect::AbstractVector{<:Bool}) -> Vector{UInt64} + +Pack a matrix or vector of boolean values into a more compact representation using UInt64. 
+ +# Arguments (Input) +- `arr::AbstractMatrix{<:Bool}`: A matrix of boolean values where the number of rows must be divisible by 64. For the vector method, the length must be divisible by 64. An `AssertionError` is thrown otherwise. + +# Returns +- For `arr::AbstractMatrix{<:Bool}`: Returns a matrix of UInt64 where each element represents 64 boolean values from the original matrix. For the vector method, returns a `Vector{UInt64}`. + +# Examples + +For vectors: +```julia +bin = rand(Bool, 128) +binint = pack_bits(bin) +binx = unpack_bits(binint) +@assert bin == binx +``` + +For matrices: +```julia +bin = rand(Bool, 128, 10) +binint = pack_bits(bin) +binx = unpack_bits(binint) +@assert bin == binx +``` +""" +function pack_bits(arr::AbstractMatrix{<:Bool}) + rows, cols = size(arr) + rows % 64 == 0 || throw(AssertionError("Number of rows must be divisable by 64")) # explicit throw: `@assert` checks may be disabled + new_rows = rows ÷ 64 + reshape(BitArray(arr).chunks, new_rows, cols) +end +function pack_bits(vect::AbstractVector{<:Bool}) + len = length(vect) + len % 64 == 0 || throw(AssertionError("Length must be divisable by 64")) # explicit throw: `@assert` checks may be disabled + BitArray(vect).chunks +end + +function unpack_bits(arr::AbstractArray{<:Number}) + throw(ArgumentError("Input must be of UInt64 eltype (provided: $(eltype(arr))). Are you sure you've packed this array?")) +end + +""" + unpack_bits(packed_vector::AbstractVector{UInt64}) -> Vector{Bool} + unpack_bits(packed_matrix::AbstractMatrix{UInt64}) -> Matrix{Bool} + +Unpack a vector or matrix of UInt64 values into their original boolean representation. + +# Arguments (Input) +- `packed_matrix::AbstractMatrix{UInt64}`: A matrix of UInt64 values where each element represents 64 boolean values. The vector method accepts an `AbstractVector{UInt64}`. 
+ +# Returns +- For `packed_matrix::AbstractMatrix{UInt64}`: Returns a matrix of boolean values where the number of rows is 64 times the number of rows in the input matrix. The vector method returns a `Vector{Bool}` that is 64 times as long as the input. 
+ +# Examples + +For vectors: +```julia +bin = rand(Bool, 128) +binint = pack_bits(bin) +binx = unpack_bits(binint) +@assert bin == binx +``` + +For matrices: +```julia +bin = rand(Bool, 128, 10) +binint = pack_bits(bin) +binx = unpack_bits(binint) +@assert bin == binx +``` +""" +function unpack_bits(packed_vector::AbstractVector{UInt64}) + return Bool[((x >> i) & 1) == 1 for x in packed_vector for i in 0:63] # least-significant bit first, 64 Bools per UInt64 +end +function unpack_bits(packed_matrix::AbstractMatrix{UInt64}) + num_rows, num_cols = size(packed_matrix) + output_rows = num_rows * 64 + output_matrix = Matrix{Bool}(undef, output_rows, num_cols) + + for col in axes(packed_matrix, 2) + output_matrix[:, col] = unpack_bits(@view(packed_matrix[:, col])) + end + + return output_matrix +end diff --git a/test/Experimental/RAGTools/preparation.jl b/test/Experimental/RAGTools/preparation.jl index bdaedced..347da507 100644 --- a/test/Experimental/RAGTools/preparation.jl +++ b/test/Experimental/RAGTools/preparation.jl @@ -9,6 +9,7 @@ using PromptingTools.Experimental.RAGTools: build_tags, build_index, SimpleIndex get_tags, get_chunks, get_embeddings using PromptingTools.Experimental.RAGTools: build_tags, build_index using PromptingTools: TestEchoOpenAISchema +using PromptingTools.Experimental.RAGTools: pack_bits, BitPackedBatchEmbedder @testset "load_text" begin # from file @@ -80,9 +81,17 @@ end @test size(output) == (100, 2) @test eltype(output) == Bool + # BitPackedBatchEmbedder + output = get_embeddings( + BitPackedBatchEmbedder(), docs; model = "mock-emb") + @test size(output) == (2, 2) + @test eltype(output) == UInt64 + @test pack_bits(ones(Float32, 128, 2) .> 0) == fill(typemax(UInt64), 2, 2) + # EmbedderEltype @test EmbedderEltype(BinaryBatchEmbedder()) == Bool @test EmbedderEltype(BatchEmbedder()) == Float32 + @test EmbedderEltype(BitPackedBatchEmbedder()) == UInt64 end @testset "tags_extract" begin diff --git a/test/Experimental/RAGTools/retrieval.jl 
b/test/Experimental/RAGTools/retrieval.jl index 9dbebed4..9d1404ba 100644 --- 
a/test/Experimental/RAGTools/retrieval.jl +++ b/test/Experimental/RAGTools/retrieval.jl @@ -10,6 +10,8 @@ using PromptingTools.Experimental.RAGTools: find_closest, hamming_distance, find rerank, rephrase, retrieve using PromptingTools.Experimental.RAGTools: NoReranker, CohereReranker +using PromptingTools.Experimental.RAGTools: hamming_distance, BitPackedCosineSimilarity, + pack_bits, unpack_bits @testset "rephrase" begin # Test rephrase with NoRephraser, simple passthrough @@ -39,6 +41,8 @@ using PromptingTools.Experimental.RAGTools: NoReranker, CohereReranker end @testset "hamming_distance" begin + + ## ORIGINAL TESTS (Bool matrix vs Bool vector) # Test for matching number of rows @test_throws ArgumentError hamming_distance( [true false; false true], [true, false, true]) @@ -48,6 +52,81 @@ end @test hamming_distance([true false; false true], [false, true]) == [2, 0] @test hamming_distance([true false; false true], [true, true]) == [1, 1] @test hamming_distance([true false; false true], [false, false]) == [1, 1] + + ## NEW TESTS + # Test for Bool vectors + vec1 = Bool[1, 0, 1, 0, 1, 0, 1, 0] + vec2 = Bool[0, 1, 0, 1, 0, 1, 0, 1] + # Basic functionality + @test hamming_distance(vec1, vec2) == 8 + + # Edge cases + vec3 = Bool[1, 1, 1, 1, 1, 1, 1, 1] + vec4 = Bool[0, 0, 0, 0, 0, 0, 0, 0] + @test hamming_distance(vec3, vec4) == 8 + + vec5 = Bool[1, 1, 1, 1, 1, 1, 1, 1] + vec6 = Bool[1, 1, 1, 1, 1, 1, 1, 1] + @test hamming_distance(vec5, vec6) == 0 + + # Test for UInt64 (bitpacked) vectors (8 repeats of 8 Bools = 64 Bools, packed into one UInt64 each) + vec7 = pack_bits(repeat(vec1, 8)) + vec8 = pack_bits(repeat(vec2, 8)) + @test hamming_distance(vec7, vec8) == 64 + + vec9 = pack_bits(repeat(vec3, 8)) + vec10 = pack_bits(repeat(vec4, 8)) + @test hamming_distance(vec9, vec10) == 64 + + vec11 = pack_bits(repeat(vec5, 8)) + vec12 = pack_bits(repeat(vec6, 8)) + @test hamming_distance(vec11, vec12) == 0 + + # Test for Bool matrices + mat1 = [vec1 vec2] + mat2 = [vec3 vec4] + @test 
hamming_distance(mat1, vec2) == [8, 0] + @test hamming_distance(mat2, vec3) == [0, 
8] + + # Test for UInt64 (bitpacked) matrices + mat3 = pack_bits(repeat(mat1; outer = 8)) + mat4 = pack_bits(repeat(mat2; outer = 8)) + @test hamming_distance(mat3, vec8) == [64, 0] + @test hamming_distance(mat4, vec9) == [0, 64] + + # Test for mismatched dimensions + vec13 = Bool[1, 0, 1] + @test_throws ArgumentError hamming_distance(mat1, vec13) + + # Additional edge cases + # Empty vectors + vec_empty1 = Bool[] + vec_empty2 = Bool[] + @test hamming_distance(vec_empty1, vec_empty2) == 0 + + # Single element vectors + vec_single1 = Bool[1] + vec_single2 = Bool[0] + @test hamming_distance(vec_single1, vec_single2) == 1 + + # Longer vectors (32 elements) + vec_large1 = Bool[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] + vec_large2 = Bool[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + @test hamming_distance(vec_large1, vec_large2) == 32 + + # Longer vectors with bitpacking (repeated to 64 elements, i.e., one UInt64) + vec_large_packed1 = pack_bits(repeat(vec_large1, 2)) + vec_large_packed2 = pack_bits(repeat(vec_large2, 2)) + @test hamming_distance(vec_large_packed1, vec_large_packed2) == 64 + + ## Compare packed vs binary results (the Hamming distances must be identical) + mat_rand1 = rand(Bool, 128, 10) + q_rand2 = rand(Bool, 128) + hamming_dist_binary = hamming_distance(mat_rand1, q_rand2) + hamming_dist_packed = hamming_distance(pack_bits(mat_rand1), pack_bits(q_rand2)) + @test hamming_dist_binary == hamming_dist_packed end @testset "find_closest" begin @@ -152,6 +231,29 @@ end BinaryCosineSimilarity(), emb, query_emb; top_k = 1, minimum_similarity = 0.6) @test isempty(positions) @test isempty(scores) + + ### Sense check for approximate methods + + # Generate random embeddings as a sense check + Random.seed!(1234) # For reproducibility + emb = mapreduce(normalize, hcat, eachcol(randn(128, 1000))) + query_emb = randn(128) |> normalize # Normalize the query embedding + 
+ # Calculate positions and scores using normal CosineSimilarity + positions_cosine, 
scores_cosine = find_closest( + CosineSimilarity(), emb, query_emb; top_k = 10) + + # Calculate positions and scores using BinaryCosineSimilarity + binary_emb = map(>(0), emb) + positions_binary, scores_binary = find_closest( + BinaryCosineSimilarity(), binary_emb, query_emb; top_k = 10) + @test length(intersect(positions_cosine, positions_binary)) >= 1 + + # Calculate positions and scores using BitPackedCosineSimilarity + packed_emb = pack_bits(binary_emb) + positions_packed, scores_packed = find_closest( + BitPackedCosineSimilarity(), packed_emb, query_emb; top_k = 10) + @test length(intersect(positions_cosine, positions_packed)) >= 1 end @testset "find_tags" begin diff --git a/test/Experimental/RAGTools/utils.jl b/test/Experimental/RAGTools/utils.jl index 9453769c..bc122dde 100644 --- a/test/Experimental/RAGTools/utils.jl +++ b/test/Experimental/RAGTools/utils.jl @@ -6,6 +6,7 @@ using PromptingTools.Experimental.RAGTools: token_with_boundaries, text_to_trigr using PromptingTools.Experimental.RAGTools: split_into_code_and_sentences using PromptingTools.Experimental.RAGTools: getpropertynested, setpropertynested, merge_kwargs_nested +using PromptingTools.Experimental.RAGTools: pack_bits, unpack_bits @testset "_check_aiextract_capability" begin @test _check_aiextract_capability("gpt-3.5-turbo") == nothing @@ -366,3 +367,78 @@ end expected = (; a = 1, b = (; c = 2)) @test merge_kwargs_nested(nt1, nt2) == expected end + +@testset "pack_bits,unpack_bits" begin + ### Test for vectors + # Basic functionality + bin = rand(Bool, 128) + binint = pack_bits(bin) + binx = unpack_bits(binint) + @test bin == binx + + # Edge cases + # Test with all true values + bin = trues(128) + binint = pack_bits(bin) + binx = unpack_bits(binint) + @test bin == binx + + # Test with all false values + bin = falses(128) + binint = pack_bits(bin) + binx = unpack_bits(binint) + @test bin == binx + + # Test with alternating true and false values + bin = Bool[mod(i, 2) == 0 for i in 1:128] + 
binint = 
pack_bits(bin) + binx = unpack_bits(binint) + @test bin == binx + + # Empty vector (length 0 passes the divisible-by-64 check) + bin_empty = Bool[] + binint_empty = pack_bits(bin_empty) + binx_empty = unpack_bits(binint_empty) + @test bin_empty == binx_empty + + # Invalid input + # Test with length not divisible by 64 + bin = rand(Bool, 130) + @test_throws AssertionError pack_bits(bin) + @test_throws ArgumentError pack_bits(rand(Float32, 128)) + @test_throws ArgumentError unpack_bits(rand(Float32, 128)) + + ### Test for matrices + # Basic functionality + bin = rand(Bool, 128, 10) + binint = pack_bits(bin) + binx = unpack_bits(binint) + @test bin == binx + + # Edge cases + # Test with all true values + bin = trues(128, 10) + binint = pack_bits(bin) + binx = unpack_bits(binint) + @test bin == binx + + # Test with all false values + bin = falses(128, 10) + binint = pack_bits(bin) + binx = unpack_bits(binint) + @test bin == binx + + # Test with alternating true and false values + bin = Bool[mod(i, 2) == 0 for i in 1:128, j in 1:10] + binint = pack_bits(bin) + binx = unpack_bits(binint) + @test bin == binx + + # Invalid input + # Test with number of rows not divisible by 64 + bin = rand(Bool, 130, 10) + @test_throws AssertionError pack_bits(bin) + # Wrong number type + @test_throws ArgumentError pack_bits(rand(Float32, 128, 10)) + @test_throws ArgumentError unpack_bits(rand(Float32, 128, 10)) +end diff --git a/test/runtests.jl b/test/runtests.jl index eb8614f1..24b69e1a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,7 @@ using OpenAI, HTTP, JSON3 using SparseArrays, LinearAlgebra, Markdown using Statistics using Dates: now -using Test, Pkg +using Test, Pkg, Random const PT = PromptingTools using Aqua