From 8651d51922b7f8793b1d51454e1d9f3e89a35c35 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Mon, 10 Feb 2025 02:22:05 -0800 Subject: [PATCH] Update `//bench:convolution_bench` to properly flush the L2 caches between runs and use the `num_threads` and `benchmark_min_iters` flags. This also fixes an occasional segfault due to not aligning the `workspace` pointer. PiperOrigin-RevId: 725129452 --- bench/BUILD.bazel | 2 +- bench/convolution.cc | 581 ++++++++++++++++++++----------------------- 2 files changed, 267 insertions(+), 316 deletions(-) diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel index dfe00652a95..69a95edbba6 100644 --- a/bench/BUILD.bazel +++ b/bench/BUILD.bazel @@ -627,7 +627,7 @@ xnnpack_benchmark( srcs = ["convolution.cc"], copts = xnnpack_optional_tflite_copts(), tags = xnnpack_slow_benchmark_tags() + ["nowin32"], - deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(), + deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps() + ["@pthreadpool"], ) xnnpack_benchmark( diff --git a/bench/convolution.cc b/bench/convolution.cc index ae39d15a83d..5b24944c569 100644 --- a/bench/convolution.cc +++ b/bench/convolution.cc @@ -7,7 +7,8 @@ // LICENSE file in the root directory of this source tree. #include -#include +#include +#include #include #include #include @@ -15,21 +16,24 @@ #include #include +#include "utils.h" #include "xnnpack.h" +#include "xnnpack/buffer.h" #include "xnnpack/common.h" #include "xnnpack/math.h" - #include +#include "pthreadpool.h" + #ifdef BENCHMARK_TENSORFLOW_LITE -#include "flatbuffers/include/flatbuffers/flatbuffers.h" +#include "flatbuffers/include/flatbuffers/buffer.h" +#include "flatbuffers/include/flatbuffers/flatbuffer_builder.h" +#include "flatbuffers/include/flatbuffers/string.h" +#include "tensorflow/lite/core/interpreter_builder.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/register.h" -#include "tensorflow/lite/model.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" #endif // BENCHMARK_TENSORFLOW_LITE -#include "utils.h" -#include "xnnpack/buffer.h" void xnnpack_convolution_qu8(benchmark::State& state, const char* net) { const size_t batch_size = state.range(0); @@ -47,7 +51,8 @@ void xnnpack_convolution_qu8(benchmark::State& state, const char* net) { std::random_device random_device; auto rng = std::mt19937(random_device()); - auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); + auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), + std::ref(rng)); const size_t output_pixel_stride = groups * group_output_channels; const size_t input_pixel_stride = groups * group_input_channels; @@ -57,16 +62,24 @@ void xnnpack_convolution_qu8(benchmark::State& state, const char* net) { const size_t padding_top = padding_height / 2; const size_t padding_right = padding_width - padding_left; const size_t padding_bottom = padding_height - padding_top; - const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1; - const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1; - - xnnpack::Buffer input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint8_t)); + const size_t output_height = + (input_height + padding_height - effective_kernel_height) / subsampling + + 1; + const size_t output_width = + (input_width + padding_width - effective_kernel_width) / subsampling + 1; + + xnnpack::Buffer input(batch_size * input_height * input_width * + input_pixel_stride + + XNN_EXTRA_BYTES / sizeof(uint8_t)); xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng); - xnnpack::Buffer kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels); + xnnpack::Buffer kernel(groups * group_output_channels * + kernel_height * kernel_width * + group_input_channels); xnnpack::fill_uniform_random_bits(kernel.data(), kernel.size(), rng); xnnpack::Buffer bias(groups * group_output_channels); std::generate(bias.begin(), bias.end(), std::ref(i32rng)); - const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride; + xnnpack::Buffer output(batch_size * output_height * output_width * + output_pixel_stride); xnn_status status = xnn_initialize(nullptr /* allocator */); if (status != xnn_status_success) { @@ -74,87 +87,63 @@ void xnnpack_convolution_qu8(benchmark::State& state, const char* net) { return; } - const size_t num_buffers = 1 + - benchmark::utils::DivideRoundUp(benchmark::utils::GetMaxCacheSize(), - sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements); - xnnpack::Buffer output(output_elements * num_buffers); - - xnnpack::Buffer convolution_operators(num_buffers); - for (xnn_operator_t& convolution_op : convolution_operators) { - status = xnn_create_convolution2d_nhwc_qu8( - padding_top, padding_right, padding_bottom, padding_left, - kernel_height, kernel_width, - subsampling, subsampling, - dilation, dilation, - groups, group_input_channels, group_output_channels, - input_pixel_stride, output_pixel_stride, - 127, 0.5f, - 127, 0.5f, - kernel.data(), bias.data(), - 127, 0.5f, 0, 255, - 0 /* flags */, nullptr, nullptr, &convolution_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to create QUINT8 Convolution operator"); - return; - } + xnn_operator_t convolution_op; + status = xnn_create_convolution2d_nhwc_qu8( + padding_top, padding_right, padding_bottom, padding_left, kernel_height, + kernel_width, subsampling, subsampling, dilation, dilation, groups, + group_input_channels, group_output_channels, input_pixel_stride, + output_pixel_stride, 127, 0.5f, 127, 0.5f, kernel.data(), bias.data(), + 127, 0.5f, 0, 255, 0 /* flags */, nullptr, nullptr, &convolution_op); + if (status != xnn_status_success) { + state.SkipWithError("failed to create QUINT8 Convolution operator"); + return; } - size_t max_workspace_size = 0; + pthreadpool_t threadpool = pthreadpool_create(FLAGS_num_threads); - for (size_t i = 0; i < convolution_operators.size(); i++) { - size_t workspace_size = 0; - size_t workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qu8( - convolution_operators[i], - batch_size, input_height, input_width, - &workspace_size, &workspace_alignment, + size_t workspace_size = 0; + size_t workspace_alignment = 0; + status = xnn_reshape_convolution2d_nhwc_qu8( + convolution_op, batch_size, input_height, input_width, &workspace_size, + &workspace_alignment, /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/nullptr); + threadpool); - if (status != xnn_status_success) { - state.SkipWithError("failed to reshape QUINT8 Convolution operator"); - return; - } - - max_workspace_size = std::max(max_workspace_size, workspace_size); + if (status != xnn_status_success) { + state.SkipWithError("failed to reshape QUINT8 Convolution operator"); + return; } - xnnpack::Buffer workspace(max_workspace_size); + xnnpack::Buffer workspace(workspace_size); - for (size_t i = 0; i < convolution_operators.size(); i++) { - status = xnn_setup_convolution2d_nhwc_qu8( - convolution_operators[i], - workspace.data(), - input.data(), output.data() + i * output_elements); - if (status != xnn_status_success) { - state.SkipWithError("failed to setup QUINT8 Convolution operator"); - return; - } + status = xnn_setup_convolution2d_nhwc_qu8( + convolution_op, workspace.data(), input.data(), + output.data()); + if (status != xnn_status_success) { + state.SkipWithError("failed to setup QUINT8 Convolution operator"); + return; } - size_t buffer_index = 0; - for (auto _ : state) { - state.PauseTiming(); - benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t)); - buffer_index = (buffer_index + 1) % num_buffers; - state.ResumeTiming(); + int num_iters = FLAGS_benchmark_min_iters; + while (state.KeepRunningBatch(num_iters)) { + for (int iter = 0; iter < num_iters; iter++) { + benchmark::utils::WipePthreadpoolL2Caches(state, threadpool); - status = xnn_run_operator(convolution_operators[buffer_index], - /*threadpool=*/nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run QUINT8 Convolution operator"); - return; + status = xnn_run_operator(convolution_op, threadpool); + if (status != xnn_status_success) { + state.SkipWithError("failed to run the QUINT8 Convolution operator"); + return; + } } + num_iters = 1; } - for (xnn_operator_t& convolution_op : convolution_operators) { - status = xnn_delete_operator(convolution_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to delete QUINT8 Convolution operator"); - return; - } - convolution_op = nullptr; + status = xnn_delete_operator(convolution_op); + if (status != xnn_status_success) { + state.SkipWithError("failed to delete QUINT8 Convolution operator"); + return; } + convolution_op = nullptr; const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { @@ -162,11 +151,12 @@ void xnnpack_convolution_qu8(benchmark::State& state, const char* net) { } state.counters["OPS"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * - batch_size * output_height * output_width * - groups * group_input_channels * group_output_channels * - kernel_height * kernel_width, - benchmark::Counter::kIsRate); + static_cast(state.iterations()) * 2 * batch_size * + output_height * output_width * groups * group_input_channels * + group_output_channels * kernel_height * kernel_width, + benchmark::Counter::kIsRate); + + pthreadpool_destroy(threadpool); } void xnnpack_convolution_qs8(benchmark::State& state, const char* net) { @@ -185,7 +175,8 @@ void xnnpack_convolution_qs8(benchmark::State& state, const char* net) { std::random_device random_device; auto rng = std::mt19937(random_device()); - auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), std::ref(rng)); + auto i32rng = std::bind(std::uniform_int_distribution(-10000, 10000), + std::ref(rng)); const size_t output_pixel_stride = groups * group_output_channels; const size_t input_pixel_stride = groups * group_input_channels; @@ -195,16 +186,24 @@ void xnnpack_convolution_qs8(benchmark::State& state, const char* net) { const size_t padding_top = padding_height / 2; const size_t padding_right = padding_width - padding_left; const size_t padding_bottom = padding_height - padding_top; - const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1; - const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1; - - xnnpack::Buffer input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(int8_t)); + const size_t output_height = + (input_height + padding_height - effective_kernel_height) / subsampling + + 1; + const size_t output_width = + (input_width + padding_width - effective_kernel_width) / subsampling + 1; + + xnnpack::Buffer input(batch_size * input_height * input_width * + input_pixel_stride + + XNN_EXTRA_BYTES / sizeof(int8_t)); xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng); - xnnpack::Buffer kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels); + xnnpack::Buffer kernel(groups * group_output_channels * + kernel_height * kernel_width * + group_input_channels); xnnpack::fill_uniform_random_bits(kernel.data(), kernel.size(), rng); xnnpack::Buffer bias(groups * group_output_channels); std::generate(bias.begin(), bias.end(), std::ref(i32rng)); - const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride; + xnnpack::Buffer output(batch_size * output_height * output_width * + output_pixel_stride); xnn_status status = xnn_initialize(nullptr /* allocator */); if (status != xnn_status_success) { @@ -212,85 +211,59 @@ void xnnpack_convolution_qs8(benchmark::State& state, const char* net) { return; } - const size_t num_buffers = 1 + - benchmark::utils::DivideRoundUp(benchmark::utils::GetMaxCacheSize(), - sizeof(int8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(int8_t) * output_elements); - xnnpack::Buffer output(output_elements * num_buffers); - - xnnpack::Buffer convolution_operators(num_buffers); - for (xnn_operator_t& convolution_op : convolution_operators) { - status = xnn_create_convolution2d_nhwc_qs8( - padding_top, padding_right, padding_bottom, padding_left, - kernel_height, kernel_width, - subsampling, subsampling, - dilation, dilation, - groups, group_input_channels, group_output_channels, - input_pixel_stride, output_pixel_stride, - 127, 0.5f, 0.5f, - kernel.data(), bias.data(), - 127, 0.5f, -128, 127, - 0 /* flags */, nullptr, nullptr, &convolution_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to create QINT8 Convolution operator"); - return; - } + xnn_operator_t convolution_op; + status = xnn_create_convolution2d_nhwc_qs8( + padding_top, padding_right, padding_bottom, padding_left, kernel_height, + kernel_width, subsampling, subsampling, dilation, dilation, groups, + group_input_channels, group_output_channels, input_pixel_stride, + output_pixel_stride, 127, 0.5f, 0.5f, kernel.data(), bias.data(), 127, + 0.5f, -128, 127, 0 /* flags */, nullptr, nullptr, &convolution_op); + if (status != xnn_status_success) { + state.SkipWithError("failed to create QINT8 Convolution operator"); + return; } - size_t max_workspace_size = 0; - - for (size_t i = 0; i < convolution_operators.size(); i++) { - size_t workspace_size = 0; - size_t workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_qs8( - convolution_operators[i], - batch_size, input_height, input_width, - &workspace_size, &workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/nullptr); + pthreadpool_t threadpool = pthreadpool_create(FLAGS_num_threads); - if (status != xnn_status_success) { - state.SkipWithError("failed to reshape QINT8 Convolution operator"); - return; - } + size_t workspace_size = 0; + size_t workspace_alignment = 0; + status = xnn_reshape_convolution2d_nhwc_qs8( + convolution_op, batch_size, input_height, input_width, &workspace_size, + &workspace_alignment, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, threadpool); - max_workspace_size = std::max(max_workspace_size, workspace_size); + if (status != xnn_status_success) { + state.SkipWithError("failed to reshape QINT8 Convolution operator"); + return; } - xnnpack::Buffer workspace(max_workspace_size); + xnnpack::Buffer workspace(workspace_size); - for (size_t i = 0; i < convolution_operators.size(); i++) { - status = xnn_setup_convolution2d_nhwc_qs8( - convolution_operators[i], - workspace.data(), - input.data(), output.data() + i * output_elements); - if (status != xnn_status_success) { - state.SkipWithError("failed to setup QINT8 Convolution operator"); - return; - } + status = xnn_setup_convolution2d_nhwc_qs8(convolution_op, workspace.data(), + input.data(), output.data()); + if (status != xnn_status_success) { + state.SkipWithError("failed to setup QINT8 Convolution operator"); + return; } - size_t buffer_index = 0; - for (auto _ : state) { - state.PauseTiming(); - benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t)); - buffer_index = (buffer_index + 1) % num_buffers; - state.ResumeTiming(); + int num_iters = FLAGS_benchmark_min_iters; + while (state.KeepRunningBatch(num_iters)) { + for (int iter = 0; iter < num_iters; iter++) { + benchmark::utils::WipePthreadpoolL2Caches(state, threadpool); - status = xnn_run_operator(convolution_operators[buffer_index], - /*threadpool=*/nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run QINT8 Convolution operator"); - return; + status = xnn_run_operator(convolution_op, threadpool); + if (status != xnn_status_success) { + state.SkipWithError("failed to run QINT8 Convolution operator"); + return; + } } + num_iters = 1; } - for (xnn_operator_t& convolution_op : convolution_operators) { - status = xnn_delete_operator(convolution_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to delete QINT8 Convolution operator"); - return; - } - convolution_op = nullptr; + status = xnn_delete_operator(convolution_op); + if (status != xnn_status_success) { + state.SkipWithError("failed to delete QINT8 Convolution operator"); + return; } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); @@ -299,11 +272,12 @@ void xnnpack_convolution_qs8(benchmark::State& state, const char* net) { } state.counters["OPS"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * - batch_size * output_height * output_width * - groups * group_input_channels * group_output_channels * - kernel_height * kernel_width, - benchmark::Counter::kIsRate); + static_cast(state.iterations()) * 2 * batch_size * + output_height * output_width * groups * group_input_channels * + group_output_channels * kernel_height * kernel_width, + benchmark::Counter::kIsRate); + + pthreadpool_destroy(threadpool); } void xnnpack_convolution_f16(benchmark::State& state, const char* net) { @@ -322,8 +296,9 @@ void xnnpack_convolution_f16(benchmark::State& state, const char* net) { std::random_device random_device; auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(0.1f, 1.0f), std::ref(rng)); - + auto f32rng = std::bind(std::uniform_real_distribution(0.1f, 1.0f), + std::ref(rng)); + const size_t output_pixel_stride = groups * group_output_channels; const size_t input_pixel_stride = groups * group_input_channels; const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1; @@ -332,16 +307,24 @@ void xnnpack_convolution_f16(benchmark::State& state, const char* net) { const size_t padding_top = padding_height / 2; const size_t padding_right = padding_width - padding_left; const size_t padding_bottom = padding_height - padding_top; - const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1; - const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1; - - xnnpack::Buffer input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(xnn_float16)); + const size_t output_height = + (input_height + padding_height - effective_kernel_height) / subsampling + + 1; + const size_t output_width = + (input_width + padding_width - effective_kernel_width) / subsampling + 1; + + xnnpack::Buffer input(batch_size * input_height * input_width * + input_pixel_stride + + XNN_EXTRA_BYTES / sizeof(xnn_float16)); std::generate(input.begin(), input.end(), f32rng); - xnnpack::Buffer kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels); + xnnpack::Buffer kernel(groups * group_output_channels * + kernel_height * kernel_width * + group_input_channels); std::generate(kernel.begin(), kernel.end(), f32rng); xnnpack::Buffer bias(groups * group_output_channels); std::generate(bias.begin(), bias.end(), f32rng); - const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride; + xnnpack::Buffer output(batch_size * output_height * + output_width * output_pixel_stride); xnn_status status = xnn_initialize(nullptr /* allocator */); if (status != xnn_status_success) { @@ -349,84 +332,63 @@ void xnnpack_convolution_f16(benchmark::State& state, const char* net) { return; } - const size_t num_buffers = 1 + - benchmark::utils::DivideRoundUp(benchmark::utils::GetMaxCacheSize(), - sizeof(xnn_float16) * (kernel.size() + bias.size() + output_elements)); - xnnpack::Buffer output(output_elements * num_buffers); - - xnnpack::Buffer convolution_operators(num_buffers); - for (xnn_operator_t& convolution_op : convolution_operators) { - status = xnn_create_convolution2d_nhwc_f16( - padding_top, padding_right, padding_bottom, padding_left, - kernel_height, kernel_width, - subsampling, subsampling, - dilation, dilation, - groups, group_input_channels, group_output_channels, - input_pixel_stride, output_pixel_stride, - kernel.data(), bias.data(), - -std::numeric_limits::infinity(), +std::numeric_limits::infinity(), - 0 /* flags */, nullptr, nullptr, &convolution_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to create FP16 Convolution operator"); - return; - } + xnn_operator_t convolution_op; + status = xnn_create_convolution2d_nhwc_f16( + padding_top, padding_right, padding_bottom, padding_left, kernel_height, + kernel_width, subsampling, subsampling, dilation, dilation, groups, + group_input_channels, group_output_channels, input_pixel_stride, + output_pixel_stride, kernel.data(), bias.data(), + -std::numeric_limits::infinity(), + +std::numeric_limits::infinity(), 0 /* flags */, nullptr, nullptr, + &convolution_op); + if (status != xnn_status_success) { + state.SkipWithError("failed to create FP16 Convolution operator"); + return; } - size_t max_workspace_size = 0; + pthreadpool_t threadpool = pthreadpool_create(FLAGS_num_threads); - for (size_t i = 0; i < convolution_operators.size(); i++) { - size_t workspace_size = 0; - size_t workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f16( - convolution_operators[i], - batch_size, input_height, input_width, - &workspace_size, &workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/nullptr); - - if (status != xnn_status_success) { - state.SkipWithError("failed to reshape FP16 Convolution operator"); - return; - } + size_t workspace_size = 0; + size_t workspace_alignment = 0; + status = xnn_reshape_convolution2d_nhwc_f16( + convolution_op, batch_size, input_height, input_width, &workspace_size, + &workspace_alignment, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, threadpool); - max_workspace_size = std::max(max_workspace_size, workspace_size); + if (status != xnn_status_success) { + state.SkipWithError("failed to reshape FP16 Convolution operator"); + return; } - xnnpack::Buffer workspace(max_workspace_size); + xnnpack::Buffer workspace(workspace_size); - for (size_t i = 0; i < convolution_operators.size(); i++) { - status = xnn_setup_convolution2d_nhwc_f16( - convolution_operators[i], - workspace.data(), - input.data(), output.data() + i * output_elements); - if (status != xnn_status_success) { - state.SkipWithError("failed to setup FP16 Convolution operator"); - return; - } + status = xnn_setup_convolution2d_nhwc_f16(convolution_op, workspace.data(), + input.data(), output.data()); + if (status != xnn_status_success) { + state.SkipWithError("failed to setup FP16 Convolution operator"); + return; } - size_t buffer_index = 0; - for (auto _ : state) { - state.PauseTiming(); - benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(xnn_float16)); - buffer_index = (buffer_index + 1) % num_buffers; - state.ResumeTiming(); + int num_iters = FLAGS_benchmark_min_iters; + while (state.KeepRunningBatch(num_iters)) { + for (int iter = 0; iter < num_iters; iter++) { + benchmark::utils::WipePthreadpoolL2Caches(state, threadpool); - status = xnn_run_operator(convolution_operators[buffer_index], /*threadpool=*/nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run FP16 Convolution operator"); - return; + status = xnn_run_operator(convolution_op, threadpool); + if (status != xnn_status_success) { + state.SkipWithError("failed to run FP16 Convolution operator"); + return; + } } + num_iters = 1; } - for (xnn_operator_t& convolution_op : convolution_operators) { - status = xnn_delete_operator(convolution_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to delete FP16 Convolution operator"); - return; - } - convolution_op = nullptr; + status = xnn_delete_operator(convolution_op); + if (status != xnn_status_success) { + state.SkipWithError("failed to delete FP16 Convolution operator"); + return; } + convolution_op = nullptr; const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { @@ -434,11 +396,12 @@ void xnnpack_convolution_f16(benchmark::State& state, const char* net) { } state.counters["FLOPS"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * - batch_size * output_height * output_width * - groups * group_input_channels * group_output_channels * - kernel_height * kernel_width, - benchmark::Counter::kIsRate); + static_cast(state.iterations()) * 2 * batch_size * + output_height * output_width * groups * group_input_channels * + group_output_channels * kernel_height * kernel_width, + benchmark::Counter::kIsRate); + + pthreadpool_destroy(threadpool); } void xnnpack_convolution_f32(benchmark::State& state, const char* net) { @@ -457,7 +420,8 @@ void xnnpack_convolution_f32(benchmark::State& state, const char* net) { std::random_device random_device; auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(0.0f, 1.0f), std::ref(rng)); + auto f32rng = std::bind(std::uniform_real_distribution(0.0f, 1.0f), + std::ref(rng)); const size_t output_pixel_stride = groups * group_output_channels; const size_t input_pixel_stride = groups * group_input_channels; @@ -467,16 +431,23 @@ void xnnpack_convolution_f32(benchmark::State& state, const char* net) { const size_t padding_top = padding_height / 2; const size_t padding_right = padding_width - padding_left; const size_t padding_bottom = padding_height - padding_top; - const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1; - const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1; - - xnnpack::Buffer input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float)); + const size_t output_height = + (input_height + padding_height - effective_kernel_height) / subsampling + + 1; + const size_t output_width = + (input_width + padding_width - effective_kernel_width) / subsampling + 1; + + xnnpack::Buffer input(batch_size * input_height * input_width * + input_pixel_stride + + XNN_EXTRA_BYTES / sizeof(float)); std::generate(input.begin(), input.end(), std::ref(f32rng)); - xnnpack::Buffer kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels); + xnnpack::Buffer kernel(groups * group_output_channels * kernel_height * + kernel_width * group_input_channels); std::generate(kernel.begin(), kernel.end(), std::ref(f32rng)); xnnpack::Buffer bias(groups * group_output_channels); std::generate(bias.begin(), bias.end(), std::ref(f32rng)); - const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride; + xnnpack::Buffer output(batch_size * output_height * output_width * + output_pixel_stride); xnn_status status = xnn_initialize(nullptr /* allocator */); if (status != xnn_status_success) { @@ -484,84 +455,63 @@ void xnnpack_convolution_f32(benchmark::State& state, const char* net) { return; } - const size_t num_buffers = 1 + - benchmark::utils::DivideRoundUp(benchmark::utils::GetMaxCacheSize(), - sizeof(float) * (kernel.size() + bias.size() + output_elements)); - xnnpack::Buffer output(output_elements * num_buffers); - - xnnpack::Buffer convolution_operators(num_buffers); - for (xnn_operator_t& convolution_op : convolution_operators) { - status = xnn_create_convolution2d_nhwc_f32( - padding_top, padding_right, padding_bottom, padding_left, - kernel_height, kernel_width, - subsampling, subsampling, - dilation, dilation, - groups, group_input_channels, group_output_channels, - input_pixel_stride, output_pixel_stride, - kernel.data(), bias.data(), - -std::numeric_limits::infinity(), +std::numeric_limits::infinity(), - 0 /* flags */, nullptr, nullptr, &convolution_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to create FP32 Convolution operator"); - return; - } + xnn_operator_t convolution_op; + status = xnn_create_convolution2d_nhwc_f32( + padding_top, padding_right, padding_bottom, padding_left, kernel_height, + kernel_width, subsampling, subsampling, dilation, dilation, groups, + group_input_channels, group_output_channels, input_pixel_stride, + output_pixel_stride, kernel.data(), bias.data(), + -std::numeric_limits::infinity(), + +std::numeric_limits::infinity(), 0 /* flags */, nullptr, nullptr, + &convolution_op); + if (status != xnn_status_success) { + state.SkipWithError("failed to create FP32 Convolution operator"); + return; } - size_t max_workspace_size = 0; + pthreadpool_t threadpool = pthreadpool_create(FLAGS_num_threads); - for (size_t i = 0; i < convolution_operators.size(); i++) { - size_t workspace_size = 0; - size_t workspace_alignment = 0; - status = xnn_reshape_convolution2d_nhwc_f32( - convolution_operators[i], - batch_size, input_height, input_width, - &workspace_size, &workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - /*threadpool=*/nullptr); - - if (status != xnn_status_success) { - state.SkipWithError("failed to reshape FP32 Convolution operator"); - return; - } + size_t workspace_size = 0; + size_t workspace_alignment = 0; + status = xnn_reshape_convolution2d_nhwc_f32( + convolution_op, batch_size, input_height, input_width, &workspace_size, + &workspace_alignment, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, threadpool); - max_workspace_size = std::max(max_workspace_size, workspace_size); + if (status != xnn_status_success) { + state.SkipWithError("failed to reshape FP32 Convolution operator"); + return; } - xnnpack::Buffer workspace(max_workspace_size); + xnnpack::Buffer workspace(workspace_size); - for (size_t i = 0; i < convolution_operators.size(); i++) { - status = xnn_setup_convolution2d_nhwc_f32( - convolution_operators[i], - workspace.data(), - input.data(), output.data() + i * output_elements); - if (status != xnn_status_success) { - state.SkipWithError("failed to setup FP32 Convolution operator"); - return; - } + status = xnn_setup_convolution2d_nhwc_f32(convolution_op, workspace.data(), + input.data(), output.data()); + if (status != xnn_status_success) { + state.SkipWithError("failed to setup FP32 Convolution operator"); + return; } - size_t buffer_index = 0; - for (auto _ : state) { - state.PauseTiming(); - benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float)); - buffer_index = (buffer_index + 1) % num_buffers; - state.ResumeTiming(); + int num_iters = FLAGS_benchmark_min_iters; + while (state.KeepRunningBatch(num_iters)) { + for (int iter = 0; iter < num_iters; iter++) { + benchmark::utils::WipePthreadpoolL2Caches(state, threadpool); - status = xnn_run_operator(convolution_operators[buffer_index], /*threadpool=*/nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run FP32 Convolution operator"); - return; + status = xnn_run_operator(convolution_op, threadpool); + if (status != xnn_status_success) { + state.SkipWithError("failed to run FP32 Convolution operator"); + return; + } } + num_iters = 1; } - for (xnn_operator_t& convolution_op : convolution_operators) { - status = xnn_delete_operator(convolution_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to delete FP32 Convolution operator"); - return; - } - convolution_op = nullptr; + status = xnn_delete_operator(convolution_op); + if (status != xnn_status_success) { + state.SkipWithError("failed to delete FP32 Convolution operator"); + return; } + convolution_op = nullptr; const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); if (cpu_frequency != 0) { @@ -569,11 +519,12 @@ void xnnpack_convolution_f32(benchmark::State& state, const char* net) { } state.counters["FLOPS"] = benchmark::Counter( - uint64_t(state.iterations()) * 2 * - batch_size * output_height * output_width * - groups * group_input_channels * group_output_channels * - kernel_height * kernel_width, - benchmark::Counter::kIsRate); + static_cast(state.iterations()) * 2 * batch_size * + output_height * output_width * groups * group_input_channels * + group_output_channels * kernel_height * kernel_width, + benchmark::Counter::kIsRate); + + pthreadpool_destroy(threadpool); } #ifdef BENCHMARK_TENSORFLOW_LITE