Commit d5585bc

Implement threaded BLAKE3 (#25587)
Allows BLAKE3 to be computed using multiple threads.
1 parent 5a38dd2 commit d5585bc
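
For context, the new Blake3.hashParallel entry point takes the input and output buffers and an Options struct like Blake3.hash, plus an allocator (for chaining-value scratch buffers) and a std.Io instance used to schedule worker tasks. Below is a minimal usage sketch assembled from the benchmark code in this commit; the page_allocator choice, the 16 MiB buffer size, and the hex printing are illustrative assumptions, not part of the change itself.

// Usage sketch (assumptions noted above; not part of the commit).
const std = @import("std");
const Blake3 = std.crypto.hash.Blake3;

pub fn main() !void {
    const allocator = std.heap.page_allocator;

    // hashParallel schedules its workers through a std.Io instance;
    // std.Io.Threaded is the thread-pool-backed implementation used
    // by the benchmark in this commit.
    var io_threaded = std.Io.Threaded.init(allocator);
    defer io_threaded.deinit();
    const io = io_threaded.io();

    // Inputs below the 3 MiB parallel_threshold fall back to the
    // sequential hash, so use a large buffer to exercise the threads.
    const data = try allocator.alloc(u8, 16 * 1024 * 1024);
    defer allocator.free(data);
    @memset(data, 0xAB);

    var digest: [Blake3.digest_length]u8 = undefined;
    try Blake3.hashParallel(data, &digest, .{}, allocator, io);

    const hex = std.fmt.bytesToHex(digest, .lower);
    std.debug.print("blake3: {s}\n", .{&hex});
}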

File tree

2 files changed: +263 -3 lines changed

lib/std/crypto/benchmark.zig
lib/std/crypto/blake3.zig

lib/std/crypto/benchmark.zig

Lines changed: 35 additions & 0 deletions
@@ -35,6 +35,10 @@ const hashes = [_]Crypto{
     Crypto{ .ty = crypto.hash.Blake3, .name = "blake3" },
 };
 
+const parallel_hashes = [_]Crypto{
+    Crypto{ .ty = crypto.hash.Blake3, .name = "blake3-parallel" },
+};
+
 const block_size: usize = 8 * 8192;
 
 pub fn benchmarkHash(comptime Hash: anytype, comptime bytes: comptime_int) !u64 {
@@ -61,6 +65,25 @@ pub fn benchmarkHash(comptime Hash: anytype, comptime bytes: comptime_int) !u64
     return throughput;
 }
 
+pub fn benchmarkHashParallel(comptime Hash: anytype, comptime bytes: comptime_int, allocator: mem.Allocator, io: std.Io) !u64 {
+    const data: []u8 = try allocator.alloc(u8, bytes);
+    defer allocator.free(data);
+    random.bytes(data);
+
+    var timer = try Timer.start();
+    const start = timer.lap();
+    var final: [Hash.digest_length]u8 = undefined;
+    try Hash.hashParallel(data, &final, .{}, allocator, io);
+    std.mem.doNotOptimizeAway(final);
+
+    const end = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
+    const throughput = @as(u64, @intFromFloat(bytes / elapsed_s));
+
+    return throughput;
+}
+
 const macs = [_]Crypto{
     Crypto{ .ty = crypto.onetimeauth.Ghash, .name = "ghash" },
     Crypto{ .ty = crypto.onetimeauth.Polyval, .name = "polyval" },
@@ -512,6 +535,18 @@ pub fn main() !void {
         }
     }
 
+    var io_threaded = std.Io.Threaded.init(arena_allocator);
+    defer io_threaded.deinit();
+    const io = io_threaded.io();
+
+    inline for (parallel_hashes) |H| {
+        if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) {
+            const throughput = try benchmarkHashParallel(H.ty, mode(128 * MiB), arena_allocator, io);
+            try stdout.print("{s:>17}: {:10} MiB/s\n", .{ H.name, throughput / (1 * MiB) });
+            try stdout.flush();
+        }
+    }
+
     inline for (macs) |M| {
         if (filter == null or std.mem.indexOf(u8, M.name, filter.?) != null) {
             const throughput = try benchmarkMac(M.ty, mode(128 * MiB));

lib/std/crypto/blake3.zig

Lines changed: 228 additions & 3 deletions
@@ -2,6 +2,8 @@ const std = @import("std");
 const builtin = @import("builtin");
 const fmt = std.fmt;
 const mem = std.mem;
+const Io = std.Io;
+const Thread = std.Thread;
 
 const Vec4 = @Vector(4, u32);
 const Vec8 = @Vector(8, u32);
@@ -14,6 +16,11 @@ pub const simd_degree = std.simd.suggestVectorLength(u32) orelse 1;
 pub const max_simd_degree = simd_degree;
 const max_simd_degree_or_2 = if (max_simd_degree > 2) max_simd_degree else 2;
 
+/// Threshold for switching to parallel processing.
+/// Below this size, sequential hashing is used.
+/// Benchmarks generally show significant speedup starting at 3 MiB.
+const parallel_threshold = 3 * 1024 * 1024;
+
 const iv: [8]u32 = .{
     0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
     0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
@@ -666,6 +673,95 @@ fn leftSubtreeLen(input_len: usize) usize {
     return @intCast(roundDownToPowerOf2(full_chunks) * chunk_length);
 }
 
+const ChunkBatch = struct {
+    input: []const u8,
+    start_chunk: usize,
+    end_chunk: usize,
+    cvs: [][8]u32,
+    key: [8]u32,
+    flags: Flags,
+
+    fn process(ctx: ChunkBatch) void {
+        var cv_buffer: [max_simd_degree * Blake3.digest_length]u8 = undefined;
+        var chunk_idx = ctx.start_chunk;
+
+        while (chunk_idx < ctx.end_chunk) {
+            const remaining = ctx.end_chunk - chunk_idx;
+            const batch_size = @min(remaining, max_simd_degree);
+            const offset = chunk_idx * chunk_length;
+            const batch_len = @as(usize, batch_size) * chunk_length;
+
+            const num_cvs = compressChunksParallel(
+                ctx.input[offset..][0..batch_len],
+                ctx.key,
+                chunk_idx,
+                ctx.flags,
+                &cv_buffer,
+            );
+
+            for (0..num_cvs) |i| {
+                const cv_bytes = cv_buffer[i * Blake3.digest_length ..][0..Blake3.digest_length];
+                ctx.cvs[chunk_idx + i] = loadCvWords(cv_bytes.*);
+            }
+
+            chunk_idx += batch_size;
+        }
+    }
+};
+
+const ParentBatchContext = struct {
+    input_cvs: [][8]u32,
+    output_cvs: [][8]u32,
+    start_idx: usize,
+    end_idx: usize,
+    key: [8]u32,
+    flags: Flags,
+};
+
+fn processParentBatch(ctx: ParentBatchContext) void {
+    for (ctx.start_idx..ctx.end_idx) |i| {
+        const output = parentOutputFromCvs(ctx.input_cvs[i * 2], ctx.input_cvs[i * 2 + 1], ctx.key, ctx.flags);
+        ctx.output_cvs[i] = output.chainingValue();
+    }
+}
+
+fn buildMerkleTreeLayerParallel(
+    input_cvs: [][8]u32,
+    output_cvs: [][8]u32,
+    key: [8]u32,
+    flags: Flags,
+    io: Io,
+) void {
+    const num_parents = input_cvs.len / 2;
+
+    if (num_parents <= 16) {
+        for (0..num_parents) |i| {
+            const output = parentOutputFromCvs(input_cvs[i * 2], input_cvs[i * 2 + 1], key, flags);
+            output_cvs[i] = output.chainingValue();
+        }
+        return;
+    }
+
+    const num_workers = Thread.getCpuCount() catch 1;
+    const parents_per_worker = (num_parents + num_workers - 1) / num_workers;
+    var group: Io.Group = .init;
+
+    for (0..num_workers) |worker_id| {
+        const start_idx = worker_id * parents_per_worker;
+        if (start_idx >= num_parents) break;
+
+        group.async(io, processParentBatch, .{ParentBatchContext{
+            .input_cvs = input_cvs,
+            .output_cvs = output_cvs,
+            .start_idx = start_idx,
+            .end_idx = @min(start_idx + parents_per_worker, num_parents),
+            .key = key,
+            .flags = flags,
+        }});
+    }
+    group.wait(io);
+}
+
 fn parentOutput(parent_block: []const u8, key: [8]u32, flags: Flags) Output {
     var block: [Blake3.block_length]u8 = undefined;
     @memcpy(&block, parent_block[0..Blake3.block_length]);
@@ -705,7 +801,7 @@ const ChunkState = struct {
         return ChunkState{
             .cv = key,
             .chunk_counter = 0,
-            .buf = [_]u8{0} ** Blake3.block_length,
+            .buf = @splat(0),
            .buf_len = 0,
            .blocks_compressed = 0,
            .flags = flags,
@@ -716,7 +812,7 @@
         self.cv = key;
         self.chunk_counter = chunk_counter;
         self.blocks_compressed = 0;
-        self.buf = [_]u8{0} ** Blake3.block_length;
+        self.buf = @splat(0);
         self.buf_len = 0;
     }
 
@@ -742,7 +838,7 @@
             if (self.buf_len == Blake3.block_length) {
                 compressInPlace(&self.cv, &self.buf, Blake3.block_length, self.chunk_counter, self.flags.with(self.maybeStartFlag()));
                 self.blocks_compressed += 1;
-                self.buf = [_]u8{0} ** Blake3.block_length;
+                self.buf = @splat(0);
                 self.buf_len = 0;
             }
 
@@ -849,6 +945,90 @@ pub const Blake3 = struct {
         d.final(out);
     }
 
+    pub fn hashParallel(b: []const u8, out: []u8, options: Options, allocator: std.mem.Allocator, io: Io) !void {
+        if (b.len < parallel_threshold) {
+            return hash(b, out, options);
+        }
+
+        const key_words = if (options.key) |key| loadKeyWords(key) else iv;
+        const flags: Flags = if (options.key != null) .{ .keyed_hash = true } else .{};
+
+        const num_full_chunks = b.len / chunk_length;
+        const thread_count = Thread.getCpuCount() catch 1;
+        if (thread_count <= 1 or num_full_chunks == 0) {
+            return hash(b, out, options);
+        }
+
+        const cvs = try allocator.alloc([8]u32, num_full_chunks);
+        defer allocator.free(cvs);
+
+        // Process chunks in parallel
+        const num_workers = thread_count;
+        const chunks_per_worker = (num_full_chunks + num_workers - 1) / num_workers;
+        var group: Io.Group = .init;
+
+        for (0..num_workers) |worker_id| {
+            const start_chunk = worker_id * chunks_per_worker;
+            if (start_chunk >= num_full_chunks) break;
+
+            group.async(io, ChunkBatch.process, .{ChunkBatch{
+                .input = b,
+                .start_chunk = start_chunk,
+                .end_chunk = @min(start_chunk + chunks_per_worker, num_full_chunks),
+                .cvs = cvs,
+                .key = key_words,
+                .flags = flags,
+            }});
+        }
+        group.wait(io);
+
+        // Build Merkle tree in parallel layers using ping-pong buffers
+        const max_intermediate_size = (num_full_chunks + 1) / 2;
+        const buffer0 = try allocator.alloc([8]u32, max_intermediate_size);
+        defer allocator.free(buffer0);
+        const buffer1 = try allocator.alloc([8]u32, max_intermediate_size);
+        defer allocator.free(buffer1);
+
+        var current_level = cvs;
+        var next_level_buf = buffer0;
+        var toggle = false;
+
+        while (current_level.len > 8) {
+            const num_parents = current_level.len / 2;
+            const has_odd = current_level.len % 2 == 1;
+            const next_level_size = num_parents + @intFromBool(has_odd);
+
+            buildMerkleTreeLayerParallel(
+                current_level[0 .. num_parents * 2],
+                next_level_buf[0..num_parents],
+                key_words,
+                flags,
+                io,
+            );
+
+            if (has_odd) {
+                next_level_buf[num_parents] = current_level[current_level.len - 1];
+            }
+
+            current_level = next_level_buf[0..next_level_size];
+            next_level_buf = if (toggle) buffer0 else buffer1;
+            toggle = !toggle;
+        }
+
+        // Finalize remaining small tree sequentially
+        var hasher = init_internal(key_words, flags);
+        for (current_level, 0..) |cv, i| hasher.pushCv(cv, i);
+
+        hasher.chunk.chunk_counter = num_full_chunks;
+        const remaining_bytes = b.len % chunk_length;
+        if (remaining_bytes > 0) {
+            hasher.chunk.update(b[num_full_chunks * chunk_length ..]);
+            hasher.mergeCvStack(hasher.chunk.chunk_counter);
+        }
+
+        hasher.final(out);
+    }
+
     fn init_internal(key: [8]u32, flags: Flags) Blake3 {
         return Blake3{
             .key = key,
@@ -1182,3 +1362,48 @@ test "BLAKE3 reference test cases" {
         try testBlake3(derive_key, t.input_len, t.derive_key.*);
     }
 }
+
+test "BLAKE3 parallel vs sequential" {
+    const allocator = std.testing.allocator;
+    const io = std.testing.io;
+
+    // Test various sizes including those above the parallelization threshold
+    const test_sizes = [_]usize{
+        0, // Empty
+        64, // One block
+        1024, // One chunk
+        1024 * 10, // Multiple chunks
+        1024 * 100, // 100KB
+        1024 * 1000, // 1MB
+        1024 * 5000, // 5MB (above threshold)
+        1024 * 10000, // 10MB (above threshold)
+    };
+
+    for (test_sizes) |size| {
+        // Allocate and fill test data with a pattern
+        const input = try allocator.alloc(u8, size);
+        defer allocator.free(input);
+        for (input, 0..) |*byte, i| {
+            byte.* = @truncate(i);
+        }
+
+        // Test regular hash
+        var expected: [32]u8 = undefined;
+        Blake3.hash(input, &expected, .{});
+
+        var actual: [32]u8 = undefined;
+        try Blake3.hashParallel(input, &actual, .{}, allocator, io);
+
+        try std.testing.expectEqualSlices(u8, &expected, &actual);
+
+        // Test keyed hash
+        const key: [32]u8 = @splat(0x42);
+        var expected_keyed: [32]u8 = undefined;
+        Blake3.hash(input, &expected_keyed, .{ .key = key });
+
+        var actual_keyed: [32]u8 = undefined;
+        try Blake3.hashParallel(input, &actual_keyed, .{ .key = key }, allocator, io);
+
+        try std.testing.expectEqualSlices(u8, &expected_keyed, &actual_keyed);
+    }
+}
