diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 1f5cd3a2b4d5d..6bb9c0f6af2ca 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1005,13 +1005,8 @@ if("${MAKE}" STREQUAL "") endif() endif() -# Args for external projects using make -if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.28") - # Prevent 'bad file descriptor' error see #39517 #39628 - set(MAKE_BUILD_ARGS "-j1") -else() - set(MAKE_BUILD_ARGS "-j${NPROC}") -endif() +# Args for external projects using make. +set(MAKE_BUILD_ARGS "-j${NPROC}") include(FetchContent) set(FC_DECLARE_COMMON_OPTIONS) @@ -2639,7 +2634,7 @@ macro(build_bzip2) BUILD_IN_SOURCE 1 BUILD_COMMAND ${MAKE} libbz2.a ${MAKE_BUILD_ARGS} ${BZIP2_EXTRA_ARGS} - INSTALL_COMMAND ${MAKE} install -j1 PREFIX=${BZIP2_PREFIX} + INSTALL_COMMAND ${MAKE} install PREFIX=${BZIP2_PREFIX} ${BZIP2_EXTRA_ARGS} INSTALL_DIR ${BZIP2_PREFIX} URL ${ARROW_BZIP2_SOURCE_URL} diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc index a88ce389360f5..f49e201492c9b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc @@ -1510,7 +1510,7 @@ struct ISOCalendar { for (int i = 0; i < 3; i++) { field_builders.push_back( checked_cast(struct_builder->field_builder(i))); - RETURN_NOT_OK(field_builders[i]->Reserve(1)); + RETURN_NOT_OK(field_builders[i]->Reserve(in.length)); } auto visit_null = [&]() { return struct_builder->AppendNull(); }; std::function visit_value; diff --git a/cpp/src/arrow/csv/writer_benchmark.cc b/cpp/src/arrow/csv/writer_benchmark.cc index 54c0f50613754..9baa00d48a6d2 100644 --- a/cpp/src/arrow/csv/writer_benchmark.cc +++ b/cpp/src/arrow/csv/writer_benchmark.cc @@ -97,7 +97,7 @@ void BenchmarkWriteCsv(benchmark::State& state, const WriteOptions& options, const RecordBatch& batch) { int64_t total_size = 0; - while (state.KeepRunning()) { + for (auto _ : state) { auto out = io::BufferOutputStream::Create().ValueOrDie(); ABORT_NOT_OK(WriteCSV(batch, options, out.get())); auto buffer = out->Finish().ValueOrDie(); @@ -106,6 +106,7 @@ void BenchmarkWriteCsv(benchmark::State& state, const WriteOptions& options, // byte size of the generated csv dataset state.SetBytesProcessed(total_size); + state.SetItemsProcessed(state.iterations() * batch.num_columns() * batch.num_rows()); state.counters["null_percent"] = static_cast(state.range(0)); } diff --git a/cpp/src/arrow/dataset/file_benchmark.cc b/cpp/src/arrow/dataset/file_benchmark.cc index 8953cbd110643..8aa2ac5a6fa77 100644 --- a/cpp/src/arrow/dataset/file_benchmark.cc +++ b/cpp/src/arrow/dataset/file_benchmark.cc @@ -30,7 +30,12 @@ namespace arrow { namespace dataset { -static std::shared_ptr GetDataset() { +struct SampleDataset { + std::shared_ptr dataset; + int64_t num_fragments; +}; + +static SampleDataset GetDataset() { std::vector files; std::vector paths; for (int a = 0; a < 100; a++) { @@ -50,25 +55,35 @@ static std::shared_ptr GetDataset() { FinishOptions finish_options; finish_options.inspect_options.fragments = 0; EXPECT_OK_AND_ASSIGN(auto dataset, factory->Finish(finish_options)); - return dataset; + return {dataset, static_cast(paths.size())}; } // A benchmark of filtering fragments in a dataset. static void GetAllFragments(benchmark::State& state) { auto dataset = GetDataset(); for (auto _ : state) { - ASSERT_OK_AND_ASSIGN(auto fragments, dataset->GetFragments()); + ASSERT_OK_AND_ASSIGN(auto fragments, dataset.dataset->GetFragments()); ABORT_NOT_OK(fragments.Visit([](std::shared_ptr) { return Status::OK(); })); } + state.SetItemsProcessed(state.iterations() * dataset.num_fragments); + state.counters["num_fragments"] = static_cast(dataset.num_fragments); } static void GetFilteredFragments(benchmark::State& state, compute::Expression filter) { auto dataset = GetDataset(); - ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*dataset->schema())); + ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*dataset.dataset->schema())); + int64_t num_filtered_fragments = 0; for (auto _ : state) { - ASSERT_OK_AND_ASSIGN(auto fragments, dataset->GetFragments(filter)); - ABORT_NOT_OK(fragments.Visit([](std::shared_ptr) { return Status::OK(); })); + num_filtered_fragments = 0; + ASSERT_OK_AND_ASSIGN(auto fragments, dataset.dataset->GetFragments(filter)); + ABORT_NOT_OK(fragments.Visit([&](std::shared_ptr) { + ++num_filtered_fragments; + return Status::OK(); + })); } + state.SetItemsProcessed(state.iterations() * dataset.num_fragments); + state.counters["num_fragments"] = static_cast(dataset.num_fragments); + state.counters["num_filtered_fragments"] = static_cast(num_filtered_fragments); } using compute::field_ref; diff --git a/cpp/src/arrow/dataset/scanner_benchmark.cc b/cpp/src/arrow/dataset/scanner_benchmark.cc index be953b3555945..287d76418ff16 100644 --- a/cpp/src/arrow/dataset/scanner_benchmark.cc +++ b/cpp/src/arrow/dataset/scanner_benchmark.cc @@ -162,7 +162,11 @@ void ScanOnly( acero::DeclarationToTable(std::move(scan))); ASSERT_GT(collected->num_rows(), 0); - ASSERT_EQ(collected->num_columns(), 2); + if (factory_name == "scan") { + ASSERT_EQ(collected->num_columns(), 6); + } else if (factory_name == "scan2") { + ASSERT_EQ(collected->num_columns(), 2); + } } static constexpr int kScanIdx = 0; diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 4bc732ec24313..f70b3991473fa 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -26,7 +26,6 @@ #include #ifdef ARROW_HAVE_SSE4_2 -// Enable the SIMD for ByteStreamSplit Encoder/Decoder #define ARROW_HAVE_SIMD_SPLIT #endif // ARROW_HAVE_SSE4_2 @@ -37,17 +36,15 @@ namespace arrow::util::internal { // #if defined(ARROW_HAVE_SSE4_2) -template +template void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride, - T* out) { - constexpr size_t kNumStreams = sizeof(T); - static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U); + uint8_t* out) { + static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); + constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2); constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams; - const int64_t size = num_values * sizeof(T); + const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; - uint8_t* output_data = reinterpret_cast(out); // First handle suffix. // This helps catch if the simd-based processing overflows into the suffix @@ -55,11 +52,11 @@ void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; for (int64_t i = num_processed_elements; i < num_values; ++i) { uint8_t gathered_byte_data[kNumStreams]; - for (size_t b = 0; b < kNumStreams; ++b) { - const size_t byte_index = b * stride + i; + for (int b = 0; b < kNumStreams; ++b) { + const int64_t byte_index = b * stride + i; gathered_byte_data[b] = data[byte_index]; } - out[i] = arrow::util::SafeLoadAs(&gathered_byte_data[0]); + memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams); } // The blocks get processed hierarchically using the unpack intrinsics. @@ -67,53 +64,52 @@ void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t // Stage 1: AAAA BBBB CCCC DDDD // Stage 2: ACAC ACAC BDBD BDBD // Stage 3: ABCD ABCD ABCD ABCD - __m128i stage[kNumStreamsLog2 + 1U][kNumStreams]; - constexpr size_t kNumStreamsHalf = kNumStreams / 2U; + __m128i stage[kNumStreamsLog2 + 1][kNumStreams]; + constexpr int kNumStreamsHalf = kNumStreams / 2U; for (int64_t i = 0; i < num_blocks; ++i) { - for (size_t j = 0; j < kNumStreams; ++j) { + for (int j = 0; j < kNumStreams; ++j) { stage[0][j] = _mm_loadu_si128( reinterpret_cast(&data[i * sizeof(__m128i) + j * stride])); } - for (size_t step = 0; step < kNumStreamsLog2; ++step) { - for (size_t j = 0; j < kNumStreamsHalf; ++j) { + for (int step = 0; step < kNumStreamsLog2; ++step) { + for (int j = 0; j < kNumStreamsHalf; ++j) { stage[step + 1U][j * 2] = _mm_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); stage[step + 1U][j * 2 + 1U] = _mm_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); } } - for (size_t j = 0; j < kNumStreams; ++j) { - _mm_storeu_si128(reinterpret_cast<__m128i*>( - &output_data[(i * kNumStreams + j) * sizeof(__m128i)]), - stage[kNumStreamsLog2][j]); + for (int j = 0; j < kNumStreams; ++j) { + _mm_storeu_si128( + reinterpret_cast<__m128i*>(out + (i * kNumStreams + j) * sizeof(__m128i)), + stage[kNumStreamsLog2][j]); } } } -template -void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_values, +template +void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const int64_t num_values, uint8_t* output_buffer_raw) { - constexpr size_t kNumStreams = sizeof(T); - static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams; + static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); + constexpr int kBlockSize = sizeof(__m128i) * kNumStreams; __m128i stage[3][kNumStreams]; __m128i final_result[kNumStreams]; - const size_t size = num_values * sizeof(T); - const size_t num_blocks = size / kBlockSize; + const int64_t size = num_values * kNumStreams; + const int64_t num_blocks = size / kBlockSize; const __m128i* raw_values_sse = reinterpret_cast(raw_values); __m128i* output_buffer_streams[kNumStreams]; - for (size_t i = 0; i < kNumStreams; ++i) { + for (int i = 0; i < kNumStreams; ++i) { output_buffer_streams[i] = reinterpret_cast<__m128i*>(&output_buffer_raw[num_values * i]); } // First handle suffix. - const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T); - for (size_t i = num_processed_elements; i < num_values; ++i) { - for (size_t j = 0U; j < kNumStreams; ++j) { + const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; + for (int64_t i = num_processed_elements; i < num_values; ++i) { + for (int j = 0; j < kNumStreams; ++j) { const uint8_t byte_in_value = raw_values[i * kNumStreams + j]; output_buffer_raw[j * num_values + i] = byte_in_value; } @@ -131,48 +127,47 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_value // 0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ... // Step 4: __mm_unpacklo_epi64 and _mm_unpackhi_epi64: // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ... - for (size_t block_index = 0; block_index < num_blocks; ++block_index) { + for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { // First copy the data to stage 0. - for (size_t i = 0; i < kNumStreams; ++i) { + for (int i = 0; i < kNumStreams; ++i) { stage[0][i] = _mm_loadu_si128(&raw_values_sse[block_index * kNumStreams + i]); } // The shuffling of bytes is performed through the unpack intrinsics. // In my measurements this gives better performance then an implementation // which uses the shuffle intrinsics. - for (size_t stage_lvl = 0; stage_lvl < 2U; ++stage_lvl) { - for (size_t i = 0; i < kNumStreams / 2U; ++i) { + for (int stage_lvl = 0; stage_lvl < 2; ++stage_lvl) { + for (int i = 0; i < kNumStreams / 2; ++i) { stage[stage_lvl + 1][i * 2] = _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); stage[stage_lvl + 1][i * 2 + 1] = _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); } } - if constexpr (kNumStreams == 8U) { + if constexpr (kNumStreams == 8) { // This is the path for double. __m128i tmp[8]; - for (size_t i = 0; i < 4; ++i) { + for (int i = 0; i < 4; ++i) { tmp[i * 2] = _mm_unpacklo_epi32(stage[2][i], stage[2][i + 4]); tmp[i * 2 + 1] = _mm_unpackhi_epi32(stage[2][i], stage[2][i + 4]); } - - for (size_t i = 0; i < 4; ++i) { + for (int i = 0; i < 4; ++i) { final_result[i * 2] = _mm_unpacklo_epi32(tmp[i], tmp[i + 4]); final_result[i * 2 + 1] = _mm_unpackhi_epi32(tmp[i], tmp[i + 4]); } } else { // this is the path for float. __m128i tmp[4]; - for (size_t i = 0; i < 2; ++i) { + for (int i = 0; i < 2; ++i) { tmp[i * 2] = _mm_unpacklo_epi8(stage[2][i * 2], stage[2][i * 2 + 1]); tmp[i * 2 + 1] = _mm_unpackhi_epi8(stage[2][i * 2], stage[2][i * 2 + 1]); } - for (size_t i = 0; i < 2; ++i) { + for (int i = 0; i < 2; ++i) { final_result[i * 2] = _mm_unpacklo_epi64(tmp[i], tmp[i + 2]); final_result[i * 2 + 1] = _mm_unpackhi_epi64(tmp[i], tmp[i + 2]); } } - for (size_t i = 0; i < kNumStreams; ++i) { + for (int i = 0; i < kNumStreams; ++i) { _mm_storeu_si128(&output_buffer_streams[i][block_index], final_result[i]); } } @@ -180,52 +175,50 @@ void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_value #endif // ARROW_HAVE_SSE4_2 #if defined(ARROW_HAVE_AVX2) -template +template void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t stride, - T* out) { - constexpr size_t kNumStreams = sizeof(T); - static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U); + uint8_t* out) { + static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); + constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2); constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams; - const int64_t size = num_values * sizeof(T); + const int64_t size = num_values * kNumStreams; if (size < kBlockSize) // Back to SSE for small size - return ByteStreamSplitDecodeSse2(data, num_values, stride, out); + return ByteStreamSplitDecodeSse2(data, num_values, stride, out); const int64_t num_blocks = size / kBlockSize; - uint8_t* output_data = reinterpret_cast(out); // First handle suffix. const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; for (int64_t i = num_processed_elements; i < num_values; ++i) { uint8_t gathered_byte_data[kNumStreams]; - for (size_t b = 0; b < kNumStreams; ++b) { - const size_t byte_index = b * stride + i; + for (int b = 0; b < kNumStreams; ++b) { + const int64_t byte_index = b * stride + i; gathered_byte_data[b] = data[byte_index]; } - out[i] = arrow::util::SafeLoadAs(&gathered_byte_data[0]); + memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams); } // Processed hierarchically using unpack intrinsics, then permute intrinsics. - __m256i stage[kNumStreamsLog2 + 1U][kNumStreams]; + __m256i stage[kNumStreamsLog2 + 1][kNumStreams]; __m256i final_result[kNumStreams]; - constexpr size_t kNumStreamsHalf = kNumStreams / 2U; + constexpr int kNumStreamsHalf = kNumStreams / 2; for (int64_t i = 0; i < num_blocks; ++i) { - for (size_t j = 0; j < kNumStreams; ++j) { + for (int j = 0; j < kNumStreams; ++j) { stage[0][j] = _mm256_loadu_si256( reinterpret_cast(&data[i * sizeof(__m256i) + j * stride])); } - for (size_t step = 0; step < kNumStreamsLog2; ++step) { - for (size_t j = 0; j < kNumStreamsHalf; ++j) { - stage[step + 1U][j * 2] = + for (int step = 0; step < kNumStreamsLog2; ++step) { + for (int j = 0; j < kNumStreamsHalf; ++j) { + stage[step + 1][j * 2] = _mm256_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); - stage[step + 1U][j * 2 + 1U] = + stage[step + 1][j * 2 + 1] = _mm256_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); } } - if constexpr (kNumStreams == 8U) { + if constexpr (kNumStreams == 8) { // path for double, 128i index: // {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B}, // {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F}, @@ -258,40 +251,41 @@ void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t stage[kNumStreamsLog2][3], 0b00110001); } - for (size_t j = 0; j < kNumStreams; ++j) { - _mm256_storeu_si256(reinterpret_cast<__m256i*>( - &output_data[(i * kNumStreams + j) * sizeof(__m256i)]), - final_result[j]); + for (int j = 0; j < kNumStreams; ++j) { + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(out + (i * kNumStreams + j) * sizeof(__m256i)), + final_result[j]); } } } -template -void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_values, +template +void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const int64_t num_values, uint8_t* output_buffer_raw) { - constexpr size_t kNumStreams = sizeof(T); - static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams; + static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); + constexpr int kBlockSize = sizeof(__m256i) * kNumStreams; - if constexpr (kNumStreams == 8U) // Back to SSE, currently no path for double. - return ByteStreamSplitEncodeSse2(raw_values, num_values, output_buffer_raw); + if constexpr (kNumStreams == 8) // Back to SSE, currently no path for double. + return ByteStreamSplitEncodeSse2(raw_values, num_values, + output_buffer_raw); - const size_t size = num_values * sizeof(T); + const int64_t size = num_values * kNumStreams; if (size < kBlockSize) // Back to SSE for small size - return ByteStreamSplitEncodeSse2(raw_values, num_values, output_buffer_raw); - const size_t num_blocks = size / kBlockSize; + return ByteStreamSplitEncodeSse2(raw_values, num_values, + output_buffer_raw); + const int64_t num_blocks = size / kBlockSize; const __m256i* raw_values_simd = reinterpret_cast(raw_values); __m256i* output_buffer_streams[kNumStreams]; - for (size_t i = 0; i < kNumStreams; ++i) { + for (int i = 0; i < kNumStreams; ++i) { output_buffer_streams[i] = reinterpret_cast<__m256i*>(&output_buffer_raw[num_values * i]); } // First handle suffix. - const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T); - for (size_t i = num_processed_elements; i < num_values; ++i) { - for (size_t j = 0U; j < kNumStreams; ++j) { + const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; + for (int64_t i = num_processed_elements; i < num_values; ++i) { + for (int j = 0; j < kNumStreams; ++j) { const uint8_t byte_in_value = raw_values[i * kNumStreams + j]; output_buffer_raw[j * num_values + i] = byte_in_value; } @@ -301,20 +295,20 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_value // 1. Processed hierarchically to 32i block using the unpack intrinsics. // 2. Pack 128i block using _mm256_permutevar8x32_epi32. // 3. Pack final 256i block with _mm256_permute2x128_si256. - constexpr size_t kNumUnpack = 3U; + constexpr int kNumUnpack = 3; __m256i stage[kNumUnpack + 1][kNumStreams]; static const __m256i kPermuteMask = _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); __m256i permute[kNumStreams]; __m256i final_result[kNumStreams]; - for (size_t block_index = 0; block_index < num_blocks; ++block_index) { - for (size_t i = 0; i < kNumStreams; ++i) { + for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { + for (int i = 0; i < kNumStreams; ++i) { stage[0][i] = _mm256_loadu_si256(&raw_values_simd[block_index * kNumStreams + i]); } - for (size_t stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) { - for (size_t i = 0; i < kNumStreams / 2U; ++i) { + for (int stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) { + for (int i = 0; i < kNumStreams / 2; ++i) { stage[stage_lvl + 1][i * 2] = _mm256_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); stage[stage_lvl + 1][i * 2 + 1] = @@ -322,7 +316,7 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_value } } - for (size_t i = 0; i < kNumStreams; ++i) { + for (int i = 0; i < kNumStreams; ++i) { permute[i] = _mm256_permutevar8x32_epi32(stage[kNumUnpack][i], kPermuteMask); } @@ -331,7 +325,7 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_value final_result[2] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00100000); final_result[3] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00110001); - for (size_t i = 0; i < kNumStreams; ++i) { + for (int i = 0; i < kNumStreams; ++i) { _mm256_storeu_si256(&output_buffer_streams[i][block_index], final_result[i]); } } @@ -339,53 +333,51 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_value #endif // ARROW_HAVE_AVX2 #if defined(ARROW_HAVE_AVX512) -template +template void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride, - T* out) { - constexpr size_t kNumStreams = sizeof(T); - static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U); + uint8_t* out) { + static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); + constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2); constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams; - const int64_t size = num_values * sizeof(T); + const int64_t size = num_values * kNumStreams; if (size < kBlockSize) // Back to AVX2 for small size - return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); + return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); const int64_t num_blocks = size / kBlockSize; - uint8_t* output_data = reinterpret_cast(out); // First handle suffix. const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; for (int64_t i = num_processed_elements; i < num_values; ++i) { uint8_t gathered_byte_data[kNumStreams]; - for (size_t b = 0; b < kNumStreams; ++b) { - const size_t byte_index = b * stride + i; + for (int b = 0; b < kNumStreams; ++b) { + const int64_t byte_index = b * stride + i; gathered_byte_data[b] = data[byte_index]; } - out[i] = arrow::util::SafeLoadAs(&gathered_byte_data[0]); + memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams); } // Processed hierarchically using the unpack, then two shuffles. - __m512i stage[kNumStreamsLog2 + 1U][kNumStreams]; + __m512i stage[kNumStreamsLog2 + 1][kNumStreams]; __m512i shuffle[kNumStreams]; __m512i final_result[kNumStreams]; - constexpr size_t kNumStreamsHalf = kNumStreams / 2U; + constexpr int kNumStreamsHalf = kNumStreams / 2U; for (int64_t i = 0; i < num_blocks; ++i) { - for (size_t j = 0; j < kNumStreams; ++j) { + for (int j = 0; j < kNumStreams; ++j) { stage[0][j] = _mm512_loadu_si512( reinterpret_cast(&data[i * sizeof(__m512i) + j * stride])); } - for (size_t step = 0; step < kNumStreamsLog2; ++step) { - for (size_t j = 0; j < kNumStreamsHalf; ++j) { - stage[step + 1U][j * 2] = + for (int step = 0; step < kNumStreamsLog2; ++step) { + for (int j = 0; j < kNumStreamsHalf; ++j) { + stage[step + 1][j * 2] = _mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); - stage[step + 1U][j * 2 + 1U] = + stage[step + 1][j * 2 + 1] = _mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); } } - if constexpr (kNumStreams == 8U) { + if constexpr (kNumStreams == 8) { // path for double, 128i index: // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C}, // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D}, @@ -435,49 +427,49 @@ void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101); } - for (size_t j = 0; j < kNumStreams; ++j) { - _mm512_storeu_si512(reinterpret_cast<__m512i*>( - &output_data[(i * kNumStreams + j) * sizeof(__m512i)]), - final_result[j]); + for (int j = 0; j < kNumStreams; ++j) { + _mm512_storeu_si512( + reinterpret_cast<__m512i*>(out + (i * kNumStreams + j) * sizeof(__m512i)), + final_result[j]); } } } -template -void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_values, +template +void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const int64_t num_values, uint8_t* output_buffer_raw) { - constexpr size_t kNumStreams = sizeof(T); - static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams."); - constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams; + static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); + constexpr int kBlockSize = sizeof(__m512i) * kNumStreams; - const size_t size = num_values * sizeof(T); + const int64_t size = num_values * kNumStreams; if (size < kBlockSize) // Back to AVX2 for small size - return ByteStreamSplitEncodeAvx2(raw_values, num_values, output_buffer_raw); + return ByteStreamSplitEncodeAvx2(raw_values, num_values, + output_buffer_raw); - const size_t num_blocks = size / kBlockSize; + const int64_t num_blocks = size / kBlockSize; const __m512i* raw_values_simd = reinterpret_cast(raw_values); __m512i* output_buffer_streams[kNumStreams]; - for (size_t i = 0; i < kNumStreams; ++i) { + for (int i = 0; i < kNumStreams; ++i) { output_buffer_streams[i] = reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]); } // First handle suffix. - const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T); - for (size_t i = num_processed_elements; i < num_values; ++i) { - for (size_t j = 0U; j < kNumStreams; ++j) { + const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; + for (int64_t i = num_processed_elements; i < num_values; ++i) { + for (int j = 0; j < kNumStreams; ++j) { const uint8_t byte_in_value = raw_values[i * kNumStreams + j]; output_buffer_raw[j * num_values + i] = byte_in_value; } } - constexpr size_t KNumUnpack = (kNumStreams == 8U) ? 2U : 3U; + constexpr int KNumUnpack = (kNumStreams == 8) ? 2 : 3; __m512i final_result[kNumStreams]; __m512i unpack[KNumUnpack + 1][kNumStreams]; __m512i permutex[kNumStreams]; __m512i permutex_mask; - if constexpr (kNumStreams == 8U) { + if constexpr (kNumStreams == 8) { // use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version. permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006, 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004, @@ -488,13 +480,13 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val 0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00); } - for (size_t block_index = 0; block_index < num_blocks; ++block_index) { - for (size_t i = 0; i < kNumStreams; ++i) { + for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { + for (int i = 0; i < kNumStreams; ++i) { unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]); } - for (size_t unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) { - for (size_t i = 0; i < kNumStreams / 2U; ++i) { + for (int unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) { + for (int i = 0; i < kNumStreams / 2; ++i) { unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8( unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]); unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8( @@ -502,7 +494,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val } } - if constexpr (kNumStreams == 8U) { + if constexpr (kNumStreams == 8) { // path for double // 1. unpack to epi16 block // 2. permutexvar_epi16 to 128i block @@ -511,7 +503,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D}, // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E}, // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F}, - for (size_t i = 0; i < kNumStreams; ++i) + for (int i = 0; i < kNumStreams; ++i) permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]); __m512i shuffle[kNumStreams]; @@ -537,7 +529,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val // 1. Processed hierarchically to 32i block using the unpack intrinsics. // 2. Pack 128i block using _mm256_permutevar8x32_epi32. // 3. Pack final 256i block with _mm256_permute2x128_si256. - for (size_t i = 0; i < kNumStreams; ++i) + for (int i = 0; i < kNumStreams; ++i) permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]); final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100); @@ -546,7 +538,7 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110); } - for (size_t i = 0; i < kNumStreams; ++i) { + for (int i = 0; i < kNumStreams; ++i) { _mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]); } } @@ -554,32 +546,32 @@ void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_val #endif // ARROW_HAVE_AVX512 #if defined(ARROW_HAVE_SIMD_SPLIT) -template +template void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values, - int64_t stride, T* out) { + int64_t stride, uint8_t* out) { #if defined(ARROW_HAVE_AVX512) - return ByteStreamSplitDecodeAvx512(data, num_values, stride, out); + return ByteStreamSplitDecodeAvx512(data, num_values, stride, out); #elif defined(ARROW_HAVE_AVX2) - return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); + return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); #elif defined(ARROW_HAVE_SSE4_2) - return ByteStreamSplitDecodeSse2(data, num_values, stride, out); + return ByteStreamSplitDecodeSse2(data, num_values, stride, out); #else #error "ByteStreamSplitDecodeSimd not implemented" #endif } -template +template void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const int64_t num_values, uint8_t* output_buffer_raw) { #if defined(ARROW_HAVE_AVX512) - return ByteStreamSplitEncodeAvx512(raw_values, static_cast(num_values), - output_buffer_raw); + return ByteStreamSplitEncodeAvx512(raw_values, num_values, + output_buffer_raw); #elif defined(ARROW_HAVE_AVX2) - return ByteStreamSplitEncodeAvx2(raw_values, static_cast(num_values), - output_buffer_raw); + return ByteStreamSplitEncodeAvx2(raw_values, num_values, + output_buffer_raw); #elif defined(ARROW_HAVE_SSE4_2) - return ByteStreamSplitEncodeSse2(raw_values, static_cast(num_values), - output_buffer_raw); + return ByteStreamSplitEncodeSse2(raw_values, num_values, + output_buffer_raw); #else #error "ByteStreamSplitEncodeSimd not implemented" #endif @@ -678,10 +670,9 @@ inline void DoMergeStreams(const uint8_t** src_streams, int width, int64_t nvalu } } -template +template void ByteStreamSplitEncodeScalar(const uint8_t* raw_values, const int64_t num_values, uint8_t* output_buffer_raw) { - constexpr int kNumStreams = static_cast(sizeof(T)); std::array dest_streams; for (int stream = 0; stream < kNumStreams; ++stream) { dest_streams[stream] = &output_buffer_raw[stream * num_values]; @@ -689,35 +680,35 @@ void ByteStreamSplitEncodeScalar(const uint8_t* raw_values, const int64_t num_va DoSplitStreams(raw_values, kNumStreams, num_values, dest_streams.data()); } -template +template void ByteStreamSplitDecodeScalar(const uint8_t* data, int64_t num_values, int64_t stride, - T* out) { - constexpr int kNumStreams = static_cast(sizeof(T)); + uint8_t* out) { std::array src_streams; for (int stream = 0; stream < kNumStreams; ++stream) { src_streams[stream] = &data[stream * stride]; } - DoMergeStreams(src_streams.data(), kNumStreams, num_values, - reinterpret_cast(out)); + DoMergeStreams(src_streams.data(), kNumStreams, num_values, out); } -template +template void inline ByteStreamSplitEncode(const uint8_t* raw_values, const int64_t num_values, uint8_t* output_buffer_raw) { #if defined(ARROW_HAVE_SIMD_SPLIT) - return ByteStreamSplitEncodeSimd(raw_values, num_values, output_buffer_raw); + return ByteStreamSplitEncodeSimd(raw_values, num_values, + output_buffer_raw); #else - return ByteStreamSplitEncodeScalar(raw_values, num_values, output_buffer_raw); + return ByteStreamSplitEncodeScalar(raw_values, num_values, + output_buffer_raw); #endif } -template +template void inline ByteStreamSplitDecode(const uint8_t* data, int64_t num_values, int64_t stride, - T* out) { + uint8_t* out) { #if defined(ARROW_HAVE_SIMD_SPLIT) - return ByteStreamSplitDecodeSimd(data, num_values, stride, out); + return ByteStreamSplitDecodeSimd(data, num_values, stride, out); #else - return ByteStreamSplitDecodeScalar(data, num_values, stride, out); + return ByteStreamSplitDecodeScalar(data, num_values, stride, out); #endif } diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc index c98f0a086738b..71c6063179ea6 100644 --- a/cpp/src/arrow/util/byte_stream_split_test.cc +++ b/cpp/src/arrow/util/byte_stream_split_test.cc @@ -61,18 +61,30 @@ void ReferenceByteStreamSplitEncode(const uint8_t* src, int width, template class TestByteStreamSplitSpecialized : public ::testing::Test { public: - using EncodeFunc = NamedFunc)>>; - using DecodeFunc = NamedFunc)>>; - static constexpr int kWidth = static_cast(sizeof(T)); + using EncodeFunc = NamedFunc)>>; + using DecodeFunc = NamedFunc)>>; + void SetUp() override { encode_funcs_.push_back({"reference", &ReferenceEncode}); - encode_funcs_.push_back({"scalar", &ByteStreamSplitEncodeScalar}); - decode_funcs_.push_back({"scalar", &ByteStreamSplitDecodeScalar}); + encode_funcs_.push_back({"scalar", &ByteStreamSplitEncodeScalar}); + decode_funcs_.push_back({"scalar", &ByteStreamSplitDecodeScalar}); #if defined(ARROW_HAVE_SIMD_SPLIT) - encode_funcs_.push_back({"simd", &ByteStreamSplitEncodeSimd}); - decode_funcs_.push_back({"simd", &ByteStreamSplitDecodeSimd}); + encode_funcs_.push_back({"simd", &ByteStreamSplitEncodeSimd}); + decode_funcs_.push_back({"simd", &ByteStreamSplitDecodeSimd}); +#endif +#if defined(ARROW_HAVE_SSE4_2) + encode_funcs_.push_back({"sse2", &ByteStreamSplitEncodeSse2}); + decode_funcs_.push_back({"sse2", &ByteStreamSplitDecodeSse2}); +#endif +#if defined(ARROW_HAVE_AVX2) + encode_funcs_.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); + decode_funcs_.push_back({"avx2", &ByteStreamSplitDecodeAvx2}); +#endif +#if defined(ARROW_HAVE_AVX512) + encode_funcs_.push_back({"avx512", &ByteStreamSplitEncodeAvx512}); + decode_funcs_.push_back({"avx512", &ByteStreamSplitDecodeAvx512}); #endif } @@ -92,7 +104,7 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { ARROW_SCOPED_TRACE("decode_func = ", decode_func); decoded.assign(decoded.size(), T{}); decode_func.func(encoded.data(), num_values, /*stride=*/num_values, - decoded.data()); + reinterpret_cast(decoded.data())); ASSERT_EQ(decoded, input); } } @@ -118,7 +130,7 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { while (offset < num_values) { auto chunk_size = std::min(num_values - offset, chunk_size_dist(gen)); decode_func.func(encoded.data() + offset, chunk_size, /*stride=*/num_values, - decoded.data() + offset); + reinterpret_cast(decoded.data() + offset)); offset += chunk_size; } ASSERT_EQ(offset, num_values); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b07ad6c9fb062..b801b5ab11bb9 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -850,8 +850,8 @@ std::shared_ptr ByteStreamSplitEncoder::FlushValues() { AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize()); uint8_t* output_buffer_raw = output_buffer->mutable_data(); const uint8_t* raw_values = sink_.data(); - ::arrow::util::internal::ByteStreamSplitEncode(raw_values, num_values_in_buffer_, - output_buffer_raw); + ::arrow::util::internal::ByteStreamSplitEncode( + raw_values, num_values_in_buffer_, output_buffer_raw); sink_.Reset(); num_values_in_buffer_ = 0; return std::move(output_buffer); @@ -3577,7 +3577,7 @@ class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder decode_buffer_; - static constexpr size_t kNumStreams = sizeof(T); + static constexpr int kNumStreams = sizeof(T); }; template @@ -3607,8 +3607,8 @@ int ByteStreamSplitDecoder::Decode(T* buffer, int max_values) { const int num_decoded_previously = num_values_in_buffer_ - num_values_; const uint8_t* data = data_ + num_decoded_previously; - ::arrow::util::internal::ByteStreamSplitDecode(data, values_to_decode, - num_values_in_buffer_, buffer); + ::arrow::util::internal::ByteStreamSplitDecode( + data, values_to_decode, num_values_in_buffer_, reinterpret_cast(buffer)); num_values_ -= values_to_decode; len_ -= sizeof(T) * values_to_decode; return values_to_decode; @@ -3618,7 +3618,7 @@ template int ByteStreamSplitDecoder::DecodeArrow( int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, typename EncodingTraits::Accumulator* builder) { - constexpr int value_size = static_cast(kNumStreams); + constexpr int value_size = kNumStreams; int values_decoded = num_values - null_count; if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) { ParquetException::EofException(); @@ -3634,8 +3634,9 @@ int ByteStreamSplitDecoder::DecodeArrow( // Use fast decoding into intermediate buffer. This will also decode // some null values, but it's fast enough that we don't care. T* decode_out = EnsureDecodeBuffer(values_decoded); - ::arrow::util::internal::ByteStreamSplitDecode(data, values_decoded, - num_values_in_buffer_, decode_out); + ::arrow::util::internal::ByteStreamSplitDecode( + data, values_decoded, num_values_in_buffer_, + reinterpret_cast(decode_out)); // XXX If null_count is 0, we could even append in bulk or decode directly into // builder @@ -3648,12 +3649,13 @@ int ByteStreamSplitDecoder::DecodeArrow( [&]() { builder->UnsafeAppendNull(); }); #else + // XXX should operate over runs of 0s / 1s VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { uint8_t gathered_byte_data[kNumStreams]; - for (size_t b = 0; b < kNumStreams; ++b) { - const size_t byte_index = b * num_values_in_buffer_ + offset; + for (int b = 0; b < kNumStreams; ++b) { + const int64_t byte_index = b * num_values_in_buffer_ + offset; gathered_byte_data[b] = data[byte_index]; } builder->UnsafeAppend(SafeLoadAs(&gathered_byte_data[0])); diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index b5b6cc8d93e03..76c411244b22d 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -369,7 +369,8 @@ static void BM_ByteStreamSplitDecode(benchmark::State& state, DecodeFunc&& decod for (auto _ : state) { decode_func(values_raw, static_cast(values.size()), - static_cast(values.size()), output.data()); + static_cast(values.size()), + reinterpret_cast(output.data())); benchmark::ClobberMemory(); } state.SetBytesProcessed(state.iterations() * values.size() * sizeof(T)); @@ -390,22 +391,22 @@ static void BM_ByteStreamSplitEncode(benchmark::State& state, EncodeFunc&& encod static void BM_ByteStreamSplitDecode_Float_Scalar(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeScalar); + state, ::arrow::util::internal::ByteStreamSplitDecodeScalar); } static void BM_ByteStreamSplitDecode_Double_Scalar(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeScalar); + state, ::arrow::util::internal::ByteStreamSplitDecodeScalar); } static void BM_ByteStreamSplitEncode_Float_Scalar(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeScalar); + state, ::arrow::util::internal::ByteStreamSplitEncodeScalar); } static void BM_ByteStreamSplitEncode_Double_Scalar(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeScalar); + state, ::arrow::util::internal::ByteStreamSplitEncodeScalar); } BENCHMARK(BM_ByteStreamSplitDecode_Float_Scalar)->Range(MIN_RANGE, MAX_RANGE); @@ -416,22 +417,22 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE); #if defined(ARROW_HAVE_SSE4_2) static void BM_ByteStreamSplitDecode_Float_Sse2(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeSse2); + state, ::arrow::util::internal::ByteStreamSplitDecodeSse2); } static void BM_ByteStreamSplitDecode_Double_Sse2(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeSse2); + state, ::arrow::util::internal::ByteStreamSplitDecodeSse2); } static void BM_ByteStreamSplitEncode_Float_Sse2(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeSse2); + state, ::arrow::util::internal::ByteStreamSplitEncodeSse2); } static void BM_ByteStreamSplitEncode_Double_Sse2(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeSse2); + state, ::arrow::util::internal::ByteStreamSplitEncodeSse2); } BENCHMARK(BM_ByteStreamSplitDecode_Float_Sse2)->Range(MIN_RANGE, MAX_RANGE); @@ -443,22 +444,22 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Range(MIN_RANGE, MAX_RANGE); #if defined(ARROW_HAVE_AVX2) static void BM_ByteStreamSplitDecode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); + state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } static void BM_ByteStreamSplitDecode_Double_Avx2(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); + state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } static void BM_ByteStreamSplitEncode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); + state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } static void BM_ByteStreamSplitEncode_Double_Avx2(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); + state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx2)->Range(MIN_RANGE, MAX_RANGE); @@ -470,22 +471,22 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE); #if defined(ARROW_HAVE_AVX512) static void BM_ByteStreamSplitDecode_Float_Avx512(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); + state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); } static void BM_ByteStreamSplitDecode_Double_Avx512(benchmark::State& state) { BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); + state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); } static void BM_ByteStreamSplitEncode_Float_Avx512(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); + state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); } static void BM_ByteStreamSplitEncode_Double_Avx512(benchmark::State& state) { BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); + state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); } BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE); diff --git a/dev/release/post-11-bump-versions-test.rb b/dev/release/post-11-bump-versions-test.rb index 4b6933d6102a9..78d9320bfb312 100644 --- a/dev/release/post-11-bump-versions-test.rb +++ b/dev/release/post-11-bump-versions-test.rb @@ -197,6 +197,15 @@ def test_version_post_tag ] if release_type == :major expected_changes += [ + { + path: "docs/source/index.rst", + hunks: [ + [ + "- Go ", + "+ Go ", + ], + ], + }, { path: "r/pkgdown/assets/versions.json", hunks: [ @@ -212,6 +221,15 @@ def test_version_post_tag ], ], }, + { + path: "r/_pkgdown.yml", + hunks: [ + [ + "- [Go](https://pkg.go.dev/github.com/apache/arrow/go/v#{@snapshot_major_version})
", + "+ [Go](https://pkg.go.dev/github.com/apache/arrow/go/v#{@next_major_version})
", + ], + ], + }, ] else expected_changes += [ diff --git a/dev/release/utils-prepare.sh b/dev/release/utils-prepare.sh index 8e4c8a84ae8fd..51367087228a4 100644 --- a/dev/release/utils-prepare.sh +++ b/dev/release/utils-prepare.sh @@ -127,6 +127,7 @@ update_versions() { DESCRIPTION rm -f DESCRIPTION.bak git add DESCRIPTION + # Replace dev version with release version sed -i.bak -E -e \ "/^