@@ -848,6 +848,10 @@ fn KTHash(
848848 final_state : ? StateType , // Running TurboSHAKE state for final node
849849 num_leaves : usize , // Count of leaves processed (after first chunk)
850850
851+ // SIMD chunk batching
852+ pending_chunks : [8 * chunk_size ]u8 align (cache_line_size ), // Buffer for up to 8 chunks
853+ pending_count : usize , // Number of complete chunks in pending_chunks
854+
851855 /// Initialize a KangarooTwelve hashing context.
852856 /// The customization string is optional and used for domain separation.
853857 pub fn init (options : Options ) Self {
@@ -861,9 +865,48 @@ fn KTHash(
861865 .first_chunk = null ,
862866 .final_state = null ,
863867 .num_leaves = 0 ,
868+ .pending_chunks = undefined ,
869+ .pending_count = 0 ,
864870 };
865871 }
866872
/// Absorb every chunk queued in `pending_chunks` into `final_state` as leaf
/// chaining values, then empty the queue.
///
/// Chunks are consumed front to back. Each pass hashes the widest batch
/// (8, 4 or 2 leaves) that `optimal_vector_len` allows and that is still
/// fully available, using `processLeaves`. When no batch fits, exactly one
/// chunk is hashed through the scalar TurboSHAKE path (domain byte 0x0B).
///
/// On return `num_leaves` has grown by the number of chunks absorbed and
/// `pending_count` is zero. Calling this with an empty queue is a no-op.
///
/// Requires `final_state` to be non-null whenever `pending_count > 0`
/// (it is unwrapped with `.?`).
fn flushPendingChunks(self: *Self) void {
    const cv_size = Variant.cv_size;

    // Walk the queue with a cursor rather than shifting the leftover chunks
    // to the front after each batch. Nothing is copied, and no `@memcpy` is
    // issued over two ranges of the same buffer (which must never overlap).
    var next_chunk: usize = 0;
    while (next_chunk < self.pending_count) {
        const chunks_left = self.pending_count - next_chunk;
        const byte_offset = next_chunk * chunk_size;
        const cursor_before = next_chunk;

        // Try SIMD batches in decreasing size order; at most one runs per pass.
        inline for ([_]usize{ 8, 4, 2 }) |batch_size| {
            if (optimal_vector_len >= batch_size and chunks_left >= batch_size) {
                var leaf_cvs: [batch_size * cv_size]u8 align(cache_line_size) = undefined;
                processLeaves(
                    Variant,
                    batch_size,
                    self.pending_chunks[byte_offset..][0 .. batch_size * chunk_size],
                    &leaf_cvs,
                );
                self.final_state.?.update(&leaf_cvs);
                next_chunk += batch_size;
                break; // Re-evaluate the widest usable batch on the next pass
            }
        }

        // No SIMD batch ran on this pass: hash a single chunk with scalar
        // code. This is keyed on "no progress was made" rather than on the
        // queue length, so the loop always advances even when no vector
        // width is enabled and several chunks are queued.
        if (next_chunk == cursor_before) {
            var cv_buffer: [64]u8 = undefined; // Large enough for any CV size
            const cv_slice = MultiSliceView.init(
                self.pending_chunks[byte_offset..][0..chunk_size],
                &[_]u8{},
                &[_]u8{},
            );
            Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            self.final_state.?.update(cv_buffer[0..cv_size]);
            next_chunk += 1;
        }
    }

    // Every queued chunk has been absorbed exactly once.
    self.num_leaves += self.pending_count;
    self.pending_count = 0;
}
909+
867910 /// Absorb data into the hash state.
868911 /// Can be called multiple times to incrementally add data.
869912 pub fn update (self : * Self , data : []const u8 ) void {
@@ -895,15 +938,21 @@ fn KTHash(
895938 const padding = [_ ]u8 { 0x03 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 };
896939 self .final_state .? .update (& padding );
897940 } else {
898- // Subsequent chunks - process as leaf and absorb CV
899- const cv_size = Variant .cv_size ;
900- var cv_buffer : [64 ]u8 = undefined ; // Max CV size
901- const cv_slice = MultiSliceView .init (& self .buffer , &[_ ]u8 {}, &[_ ]u8 {});
902- Variant .turboSHAKEToBuffer (& cv_slice , 0x0B , cv_buffer [0.. cv_size ]);
903-
904- // Absorb CV into final state immediately
905- self .final_state .? .update (cv_buffer [0.. cv_size ]);
906- self .num_leaves += 1 ;
941+ // Add chunk to pending buffer for SIMD batch processing
942+ @memcpy (self .pending_chunks [self .pending_count * chunk_size .. ][0.. chunk_size ], & self .buffer );
943+ self .pending_count += 1 ;
944+
945+ // Flush when we have enough chunks for optimal SIMD batch
946+ // Determine best batch size for this architecture
947+ const optimal_batch_size = comptime blk : {
948+ if (optimal_vector_len >= 8 ) break :blk 8 ;
949+ if (optimal_vector_len >= 4 ) break :blk 4 ;
950+ if (optimal_vector_len >= 2 ) break :blk 2 ;
951+ break :blk 1 ;
952+ };
953+ if (self .pending_count >= optimal_batch_size ) {
954+ self .flushPendingChunks ();
955+ }
907956 }
908957 self .buffer_len = 0 ;
909958 }
@@ -931,24 +980,65 @@ fn KTHash(
931980 return ;
932981 }
933982
934- // Tree mode: we've already absorbed first_chunk + padding + intermediate CVs
935- // Now handle remaining buffer data
936- const remaining_with_custom_len = self .buffer_len + self .customization .len + self .custom_len_enc .len ;
983+ // Flush any pending chunks with SIMD
984+ self .flushPendingChunks ();
985+
986+ // Build view over remaining data (buffer + customization + encoding)
987+ const remaining_view = MultiSliceView .init (
988+ self .buffer [0.. self .buffer_len ],
989+ self .customization ,
990+ self .custom_len_enc .slice (),
991+ );
992+ const remaining_len = remaining_view .totalLen ();
993+
937994 var final_leaves = self .num_leaves ;
995+ var leaf_start : usize = 0 ;
996+
997+ // Tree mode: initialize if not already done (lazy initialization)
998+ if (self .final_state == null and remaining_len > 0 ) {
999+ self .final_state = StateType .init (.{});
1000+
1001+ // Absorb first chunk (up to chunk_size bytes from remaining data)
1002+ const first_chunk_len = @min (chunk_size , remaining_len );
1003+ if (remaining_view .tryGetSlice (0 , first_chunk_len )) | first_chunk | {
1004+ // Data is contiguous, use it directly
1005+ self .final_state .? .update (first_chunk );
1006+ } else {
1007+ // Data spans boundaries, copy to buffer
1008+ var first_chunk_buf : [chunk_size ]u8 = undefined ;
1009+ remaining_view .copyRange (0 , first_chunk_len , first_chunk_buf [0.. first_chunk_len ]);
1010+ self .final_state .? .update (first_chunk_buf [0.. first_chunk_len ]);
1011+ }
9381012
939- if (remaining_with_custom_len > 0 ) {
940- // Build final leaf data with customization
941- var final_leaf_buffer : [chunk_size + 256 ]u8 = undefined ; // Extra space for customization
942- @memcpy (final_leaf_buffer [0.. self .buffer_len ], self .buffer [0.. self .buffer_len ]);
943- @memcpy (final_leaf_buffer [self .buffer_len .. ][0.. self .customization .len ], self .customization );
944- @memcpy (final_leaf_buffer [self .buffer_len + self .customization .len .. ][0.. self .custom_len_enc .len ], self .custom_len_enc .slice ());
945-
946- // Generate CV for final leaf and absorb it
947- var cv_buffer : [64 ]u8 = undefined ; // Max CV size
948- const cv_slice = MultiSliceView .init (final_leaf_buffer [0.. remaining_with_custom_len ], &[_ ]u8 {}, &[_ ]u8 {});
949- Variant .turboSHAKEToBuffer (& cv_slice , 0x0B , cv_buffer [0.. cv_size ]);
1013+ // Absorb padding (8 bytes: 0x03 followed by 7 zeros)
1014+ const padding = [_ ]u8 { 0x03 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 };
1015+ self .final_state .? .update (& padding );
1016+
1017+ // Process remaining data as leaves
1018+ leaf_start = first_chunk_len ;
1019+ }
1020+
1021+ // Process all remaining data as leaves (starting from leaf_start)
1022+ var offset = leaf_start ;
1023+ while (offset < remaining_len ) {
1024+ const leaf_end = @min (offset + chunk_size , remaining_len );
1025+ const leaf_size = leaf_end - offset ;
1026+
1027+ var cv_buffer : [64 ]u8 = undefined ;
1028+ if (remaining_view .tryGetSlice (offset , leaf_end )) | leaf_data | {
1029+ // Data is contiguous, use it directly
1030+ const cv_slice = MultiSliceView .init (leaf_data , &[_ ]u8 {}, &[_ ]u8 {});
1031+ Variant .turboSHAKEToBuffer (& cv_slice , 0x0B , cv_buffer [0.. cv_size ]);
1032+ } else {
1033+ // Data spans boundaries, copy to buffer
1034+ var leaf_buf : [chunk_size ]u8 = undefined ;
1035+ remaining_view .copyRange (offset , leaf_end , leaf_buf [0.. leaf_size ]);
1036+ const cv_slice = MultiSliceView .init (leaf_buf [0.. leaf_size ], &[_ ]u8 {}, &[_ ]u8 {});
1037+ Variant .turboSHAKEToBuffer (& cv_slice , 0x0B , cv_buffer [0.. cv_size ]);
1038+ }
9501039 self .final_state .? .update (cv_buffer [0.. cv_size ]);
9511040 final_leaves += 1 ;
1041+ offset = leaf_end ;
9521042 }
9531043
9541044 // Absorb right_encode(num_leaves) and terminator
0 commit comments