
Commit bf90825

crypto.kt128: when using incremental hashing, use SIMD when possible (#25783)
Also add plain kt128 (without threading) to the benchmarks
1 parent 2f4bca4 commit bf90825
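
The change targets the incremental (streaming) API, where callers feed data in pieces rather than hashing one contiguous buffer. A minimal sketch of that usage pattern follows; only init and update appear in this diff, so the final name and the 32-byte output length are assumptions based on the usual std.crypto hash interface.

const std = @import("std");
const KT128 = std.crypto.hash.sha3.KT128;

// Streaming use: leaf chunks accumulate inside the hasher between calls,
// which is the path the SIMD batching in this commit speeds up.
// `final` and the 32-byte output are assumed, not shown in this diff.
pub fn main() void {
    const piece = [_]u8{0x42} ** 4096; // 4 KiB per update() call
    var st = KT128.init(.{});
    var i: usize = 0;
    while (i < 256) : (i += 1) st.update(&piece); // ~1 MiB total, fed incrementally
    var out: [32]u8 = undefined;
    st.final(&out);
    std.debug.print("{any}\n", .{out});
}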

2 files changed, 114 insertions(+), 23 deletions(-)

lib/std/crypto/benchmark.zig

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ const hashes = [_]Crypto{
     Crypto{ .ty = crypto.hash.sha3.Shake256, .name = "shake-256" },
     Crypto{ .ty = crypto.hash.sha3.TurboShake128(null), .name = "turboshake-128" },
     Crypto{ .ty = crypto.hash.sha3.TurboShake256(null), .name = "turboshake-256" },
+    Crypto{ .ty = crypto.hash.sha3.KT128, .name = "kt128" },
     Crypto{ .ty = crypto.hash.blake2.Blake2s256, .name = "blake2s" },
     Crypto{ .ty = crypto.hash.blake2.Blake2b512, .name = "blake2b" },
     Crypto{ .ty = crypto.hash.Blake3, .name = "blake3" },
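
The new entry plugs KT128 into the same generic loop the other hashes use. Roughly, the harness does something like the sketch below for each Crypto entry; the real code in benchmark.zig differs in its timing setup and reporting, so treat this as the shape of the measurement rather than the actual implementation.

const std = @import("std");

// Rough shape of a single-hash throughput measurement; `digest_length`,
// `init`, `update` and `final` are the standard std.crypto hash interface
// the benchmark relies on.
fn throughputMiBs(comptime Hash: type, input: []const u8) !f64 {
    var out: [Hash.digest_length]u8 = undefined;
    var timer = try std.time.Timer.start();
    var st = Hash.init(.{});
    st.update(input);
    st.final(&out);
    const elapsed_s = @as(f64, @floatFromInt(timer.read())) / std.time.ns_per_s;
    return @as(f64, @floatFromInt(input.len)) / (1024.0 * 1024.0) / elapsed_s;
}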

lib/std/crypto/kangarootwelve.zig

Lines changed: 113 additions & 23 deletions
@@ -848,6 +848,10 @@ fn KTHash(
         final_state: ?StateType, // Running TurboSHAKE state for final node
         num_leaves: usize, // Count of leaves processed (after first chunk)

+        // SIMD chunk batching
+        pending_chunks: [8 * chunk_size]u8 align(cache_line_size), // Buffer for up to 8 chunks
+        pending_count: usize, // Number of complete chunks in pending_chunks
+
         /// Initialize a KangarooTwelve hashing context.
         /// The customization string is optional and used for domain separation.
         pub fn init(options: Options) Self {
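
Aside on the new fields: buffering up to 8 chunks adds a fixed-size scratch area to every incremental hashing state. With the 8192-byte KangarooTwelve chunk size (the `chunk_size` constant is defined elsewhere in this file and assumed here), that is 64 KiB per state:

const std = @import("std");

// Size check for the new field: KangarooTwelve leaves are 8192-byte chunks
// (per the K12 spec; the actual `chunk_size` constant lives elsewhere in
// kangarootwelve.zig), so buffering up to 8 of them costs 64 KiB.
test "pending_chunks footprint" {
    const chunk_size = 8192;
    const pending_chunks_bytes = 8 * chunk_size;
    try std.testing.expectEqual(@as(usize, 64 * 1024), @as(usize, pending_chunks_bytes));
}
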
@@ -861,9 +865,48 @@ fn KTHash(
                 .first_chunk = null,
                 .final_state = null,
                 .num_leaves = 0,
+                .pending_chunks = undefined,
+                .pending_count = 0,
             };
         }

+        /// Flush all pending chunks using SIMD when possible
+        fn flushPendingChunks(self: *Self) void {
+            const cv_size = Variant.cv_size;
+
+            // Process all pending chunks using the largest SIMD batch sizes possible
+            while (self.pending_count > 0) {
+                // Try SIMD batches in decreasing size order
+                inline for ([_]usize{ 8, 4, 2 }) |batch_size| {
+                    if (optimal_vector_len >= batch_size and self.pending_count >= batch_size) {
+                        var leaf_cvs: [batch_size * cv_size]u8 align(cache_line_size) = undefined;
+                        processLeaves(Variant, batch_size, self.pending_chunks[0 .. batch_size * chunk_size], &leaf_cvs);
+                        self.final_state.?.update(&leaf_cvs);
+                        self.num_leaves += batch_size;
+                        self.pending_count -= batch_size;
+
+                        // Shift remaining chunks to the front
+                        if (self.pending_count > 0) {
+                            const remaining_bytes = self.pending_count * chunk_size;
+                            @memcpy(self.pending_chunks[0..remaining_bytes], self.pending_chunks[batch_size * chunk_size ..][0..remaining_bytes]);
+                        }
+                        break; // Continue outer loop to try next batch
+                    }
+                }
+
+                // If no SIMD batch was possible, process one chunk with scalar code
+                if (self.pending_count > 0 and self.pending_count < 2) {
+                    var cv_buffer: [64]u8 = undefined;
+                    const cv_slice = MultiSliceView.init(self.pending_chunks[0..chunk_size], &[_]u8{}, &[_]u8{});
+                    Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
+                    self.final_state.?.update(cv_buffer[0..cv_size]);
+                    self.num_leaves += 1;
+                    self.pending_count -= 1;
+                    break; // No more chunks to process
+                }
+            }
+        }
+
         /// Absorb data into the hash state.
         /// Can be called multiple times to incrementally add data.
         pub fn update(self: *Self, data: []const u8) void {
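
The flush is a greedy decomposition: take the widest batch that the target's vector length and the pending count allow (8, then 4, then 2), and fall back to the scalar TurboSHAKE path for the single trailing chunk that can remain after the batches (which is why the scalar branch checks pending_count < 2). The sketch below isolates that scheduling decision from the hashing itself; planBatches and simd_lanes are illustrative names, not identifiers from this file.

const std = @import("std");

// Illustrative only: the batch-scheduling idea behind flushPendingChunks,
// stripped of the actual TurboSHAKE work. `simd_lanes` stands in for
// `optimal_vector_len`; the returned slice lists the batch sizes that would
// be dispatched for a given number of pending chunks.
fn planBatches(simd_lanes: usize, pending: usize, out: []usize) []usize {
    var remaining = pending;
    var n: usize = 0;
    while (remaining > 0) {
        var picked: usize = 1; // scalar fallback
        for ([_]usize{ 8, 4, 2 }) |batch| {
            if (simd_lanes >= batch and remaining >= batch) {
                picked = batch;
                break;
            }
        }
        out[n] = picked;
        n += 1;
        remaining -= picked;
    }
    return out[0..n];
}

test "greedy batch plan" {
    var buf: [16]usize = undefined;
    // 8-lane SIMD, 7 pending chunks: one 4-wide batch, one 2-wide, one scalar.
    try std.testing.expectEqualSlices(usize, &[_]usize{ 4, 2, 1 }, planBatches(8, 7, &buf));
    // No usable SIMD: everything goes through the scalar path.
    try std.testing.expectEqualSlices(usize, &[_]usize{ 1, 1, 1 }, planBatches(1, 3, &buf));
}
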
@@ -895,15 +938,21 @@ fn KTHash(
                     const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
                     self.final_state.?.update(&padding);
                 } else {
-                    // Subsequent chunks - process as leaf and absorb CV
-                    const cv_size = Variant.cv_size;
-                    var cv_buffer: [64]u8 = undefined; // Max CV size
-                    const cv_slice = MultiSliceView.init(&self.buffer, &[_]u8{}, &[_]u8{});
-                    Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
-
-                    // Absorb CV into final state immediately
-                    self.final_state.?.update(cv_buffer[0..cv_size]);
-                    self.num_leaves += 1;
+                    // Add chunk to pending buffer for SIMD batch processing
+                    @memcpy(self.pending_chunks[self.pending_count * chunk_size ..][0..chunk_size], &self.buffer);
+                    self.pending_count += 1;
+
+                    // Flush when we have enough chunks for optimal SIMD batch
+                    // Determine best batch size for this architecture
+                    const optimal_batch_size = comptime blk: {
+                        if (optimal_vector_len >= 8) break :blk 8;
+                        if (optimal_vector_len >= 4) break :blk 4;
+                        if (optimal_vector_len >= 2) break :blk 2;
+                        break :blk 1;
+                    };
+                    if (self.pending_count >= optimal_batch_size) {
+                        self.flushPendingChunks();
+                    }
                 }
                 self.buffer_len = 0;
             }
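
One property this buffering must not break: the digest cannot depend on how a caller splits the input across update() calls, whether chunks are hashed immediately or held back and batched for SIMD. A test along these lines (not part of this commit; `final` and the 32-byte output length are again assumed from the standard hash interface) would pin that down:

const std = @import("std");
const KT128 = std.crypto.hash.sha3.KT128;

// Property the batching must preserve: the digest does not depend on how the
// caller splits the input across update() calls.
test "kt128 digest independent of update granularity" {
    var msg: [100 * 1024]u8 = undefined;
    var rng = std.Random.DefaultPrng.init(0);
    rng.random().bytes(&msg);

    var a = KT128.init(.{});
    a.update(&msg); // one big update
    var out_a: [32]u8 = undefined;
    a.final(&out_a);

    var b = KT128.init(.{});
    var i: usize = 0;
    while (i < msg.len) : (i += 257) { // odd-sized pieces straddle chunk boundaries
        b.update(msg[i..@min(i + 257, msg.len)]);
    }
    var out_b: [32]u8 = undefined;
    b.final(&out_b);

    try std.testing.expectEqualSlices(u8, &out_a, &out_b);
}
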
@@ -931,24 +980,65 @@ fn KTHash(
                 return;
             }

-            // Tree mode: we've already absorbed first_chunk + padding + intermediate CVs
-            // Now handle remaining buffer data
-            const remaining_with_custom_len = self.buffer_len + self.customization.len + self.custom_len_enc.len;
+            // Flush any pending chunks with SIMD
+            self.flushPendingChunks();
+
+            // Build view over remaining data (buffer + customization + encoding)
+            const remaining_view = MultiSliceView.init(
+                self.buffer[0..self.buffer_len],
+                self.customization,
+                self.custom_len_enc.slice(),
+            );
+            const remaining_len = remaining_view.totalLen();
+
             var final_leaves = self.num_leaves;
+            var leaf_start: usize = 0;
+
+            // Tree mode: initialize if not already done (lazy initialization)
+            if (self.final_state == null and remaining_len > 0) {
+                self.final_state = StateType.init(.{});
+
+                // Absorb first chunk (up to chunk_size bytes from remaining data)
+                const first_chunk_len = @min(chunk_size, remaining_len);
+                if (remaining_view.tryGetSlice(0, first_chunk_len)) |first_chunk| {
+                    // Data is contiguous, use it directly
+                    self.final_state.?.update(first_chunk);
+                } else {
+                    // Data spans boundaries, copy to buffer
+                    var first_chunk_buf: [chunk_size]u8 = undefined;
+                    remaining_view.copyRange(0, first_chunk_len, first_chunk_buf[0..first_chunk_len]);
+                    self.final_state.?.update(first_chunk_buf[0..first_chunk_len]);
+                }

-            if (remaining_with_custom_len > 0) {
-                // Build final leaf data with customization
-                var final_leaf_buffer: [chunk_size + 256]u8 = undefined; // Extra space for customization
-                @memcpy(final_leaf_buffer[0..self.buffer_len], self.buffer[0..self.buffer_len]);
-                @memcpy(final_leaf_buffer[self.buffer_len..][0..self.customization.len], self.customization);
-                @memcpy(final_leaf_buffer[self.buffer_len + self.customization.len ..][0..self.custom_len_enc.len], self.custom_len_enc.slice());
-
-                // Generate CV for final leaf and absorb it
-                var cv_buffer: [64]u8 = undefined; // Max CV size
-                const cv_slice = MultiSliceView.init(final_leaf_buffer[0..remaining_with_custom_len], &[_]u8{}, &[_]u8{});
-                Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
+                // Absorb padding (8 bytes: 0x03 followed by 7 zeros)
+                const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+                self.final_state.?.update(&padding);
+
+                // Process remaining data as leaves
+                leaf_start = first_chunk_len;
+            }
+
+            // Process all remaining data as leaves (starting from leaf_start)
+            var offset = leaf_start;
+            while (offset < remaining_len) {
+                const leaf_end = @min(offset + chunk_size, remaining_len);
+                const leaf_size = leaf_end - offset;
+
+                var cv_buffer: [64]u8 = undefined;
+                if (remaining_view.tryGetSlice(offset, leaf_end)) |leaf_data| {
+                    // Data is contiguous, use it directly
+                    const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
+                    Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
+                } else {
+                    // Data spans boundaries, copy to buffer
+                    var leaf_buf: [chunk_size]u8 = undefined;
+                    remaining_view.copyRange(offset, leaf_end, leaf_buf[0..leaf_size]);
+                    const cv_slice = MultiSliceView.init(leaf_buf[0..leaf_size], &[_]u8{}, &[_]u8{});
+                    Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
+                }
                 self.final_state.?.update(cv_buffer[0..cv_size]);
                 final_leaves += 1;
+                offset = leaf_end;
             }

             // Absorb right_encode(num_leaves) and terminator
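
The rewritten finalization reads the tail (buffer, customization string, length encoding) through MultiSliceView, borrowing a contiguous slice when the requested range lies inside one piece and copying into a scratch buffer only when a leaf straddles a boundary. MultiSliceView itself is internal to kangarootwelve.zig and not shown in this diff; the hypothetical two-part view below sketches the same borrow-or-copy idea.

const std = @import("std");

// Hypothetical miniature of the MultiSliceView idea used above: a view over
// two non-adjacent byte slices. getRange borrows directly when [start, end)
// falls inside one part and copies into `scratch` only when it straddles both.
const TwoPartView = struct {
    a: []const u8,
    b: []const u8,

    fn totalLen(v: TwoPartView) usize {
        return v.a.len + v.b.len;
    }

    fn getRange(v: TwoPartView, start: usize, end: usize, scratch: []u8) []const u8 {
        if (end <= v.a.len) return v.a[start..end]; // entirely in the first part
        if (start >= v.a.len) return v.b[start - v.a.len .. end - v.a.len]; // entirely in the second
        // Straddles the boundary: copy both pieces into the caller's scratch buffer.
        const from_a = v.a.len - start;
        @memcpy(scratch[0..from_a], v.a[start..]);
        @memcpy(scratch[from_a .. end - start], v.b[0 .. end - v.a.len]);
        return scratch[0 .. end - start];
    }
};

test "borrow when contiguous, copy when straddling" {
    const view = TwoPartView{ .a = "hello ", .b = "world" };
    var scratch: [16]u8 = undefined;
    try std.testing.expectEqualStrings("hello", view.getRange(0, 5, &scratch));
    try std.testing.expectEqualStrings("o wo", view.getRange(4, 8, &scratch));
    try std.testing.expectEqual(@as(usize, 11), view.totalLen());
}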
