Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/src/arrow/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ add_arrow_test(bit-utility-test
SOURCES
bit_block_counter_test.cc
bit_util_test.cc
bpacking_test.cc
rle_encoding_test.cc)

add_arrow_test(threading-utility-test
Expand All @@ -117,6 +118,7 @@ add_arrow_test(crc32-test

add_arrow_benchmark(bit_block_counter_benchmark)
add_arrow_benchmark(bit_util_benchmark)
add_arrow_benchmark(bpacking_benchmark)
add_arrow_benchmark(bitmap_reader_benchmark)
add_arrow_benchmark(cache_benchmark)
add_arrow_benchmark(compression_benchmark)
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/arrow/util/bit_stream_utils_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include "arrow/util/bit_util.h"
#include "arrow/util/bpacking_internal.h"
#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
Expand Down Expand Up @@ -339,8 +340,8 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {

if (sizeof(T) == 4) {
int num_unpacked =
internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
internal::unpack32(buffer + byte_offset, reinterpret_cast<uint32_t*>(v + i),
batch_size - i, num_bits);
i += num_unpacked;
byte_offset += num_unpacked * num_bits / 8;
} else if (sizeof(T) == 8 && num_bits > 32) {
Expand All @@ -360,8 +361,7 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
while (i < batch_size) {
int unpack_size = std::min(buffer_size, batch_size - i);
int num_unpacked =
internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
unpack_buffer, unpack_size, num_bits);
internal::unpack32(buffer + byte_offset, unpack_buffer, unpack_size, num_bits);
if (num_unpacked == 0) {
break;
}
Expand Down
20 changes: 9 additions & 11 deletions cpp/src/arrow/util/bpacking.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@
namespace arrow {
namespace internal {

namespace {
int unpack32_scalar(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) {
const uint32_t* in = reinterpret_cast<const uint32_t*>(in_);

int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
batch_size = batch_size / 32 * 32;
int num_loops = batch_size / 32;

Expand Down Expand Up @@ -149,11 +149,13 @@ int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_
return batch_size;
}

namespace {

struct Unpack32DynamicFunction {
using FunctionType = decltype(&unpack32_default);
using FunctionType = decltype(&unpack32_scalar);

static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
return {{DispatchLevel::NONE, unpack32_default}
return {{DispatchLevel::NONE, unpack32_scalar}
#if defined(ARROW_HAVE_RUNTIME_AVX2)
,
{DispatchLevel::AVX2, unpack32_avx2}
Expand All @@ -168,7 +170,7 @@ struct Unpack32DynamicFunction {

} // namespace

int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
#if defined(ARROW_HAVE_NEON)
return unpack32_neon(in, out, batch_size, num_bits);
#else
Expand All @@ -177,9 +179,7 @@ int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
#endif
}

namespace {

int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
batch_size = batch_size / 32 * 32;
int num_loops = batch_size / 32;

Expand Down Expand Up @@ -386,11 +386,9 @@ int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_b
return batch_size;
}

} // namespace

int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) {
// TODO: unpack64_neon, unpack64_avx2 and unpack64_avx512
return unpack64_default(in, out, batch_size, num_bits);
return unpack64_scalar(in, out, batch_size, num_bits);
}

} // namespace internal
Expand Down
8 changes: 3 additions & 5 deletions cpp/src/arrow/util/bpacking64_default_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,10 @@

#pragma once

#include "arrow/util/bit_util.h"
#include "arrow/util/endian.h"
#include "arrow/util/ubsan.h"

namespace arrow {
namespace internal {
namespace arrow::internal {

inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) {
for (int k = 0; k < 32; k += 1) {
Expand Down Expand Up @@ -5638,5 +5637,4 @@ inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out) {
return in;
}

} // namespace internal
} // namespace arrow
} // namespace arrow::internal
12 changes: 5 additions & 7 deletions cpp/src/arrow/util/bpacking_avx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,11 @@
#include "arrow/util/bpacking_simd256_generated_internal.h"
#include "arrow/util/bpacking_simd_internal.h"

namespace arrow {
namespace internal {
namespace arrow::internal {

int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
return unpack32_specialized<UnpackBits256<DispatchLevel::AVX2>>(in, out, batch_size,
num_bits);
int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
return unpack32_specialized<UnpackBits256<DispatchLevel::AVX2>>(
reinterpret_cast<const uint32_t*>(in), out, batch_size, num_bits);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can also change the signature of the generated functions, what do you think? The current signature assumes the SIMD functions will load the input in 32-bit chunks, but they might make different choices.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I intended to keep that for a future PR, and keep this one focused on the tests and benchmark of the "public" functions. There are a few things we can change internally.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough!

}

} // namespace internal
} // namespace arrow
} // namespace arrow::internal
13 changes: 7 additions & 6 deletions cpp/src/arrow/util/bpacking_avx2_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@

#pragma once

#include <stdint.h>
#include "arrow/util/visibility.h"

namespace arrow {
namespace internal {
#include <cstdint>

int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits);
namespace arrow::internal {

} // namespace internal
} // namespace arrow
ARROW_EXPORT int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size,
int num_bits);

} // namespace arrow::internal
12 changes: 5 additions & 7 deletions cpp/src/arrow/util/bpacking_avx512.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,11 @@
#include "arrow/util/bpacking_simd512_generated_internal.h"
#include "arrow/util/bpacking_simd_internal.h"

namespace arrow {
namespace internal {
namespace arrow::internal {

int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
return unpack32_specialized<UnpackBits512<DispatchLevel::AVX512>>(in, out, batch_size,
num_bits);
int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) {
return unpack32_specialized<UnpackBits512<DispatchLevel::AVX512>>(
reinterpret_cast<const uint32_t*>(in), out, batch_size, num_bits);
}

} // namespace internal
} // namespace arrow
} // namespace arrow::internal
13 changes: 7 additions & 6 deletions cpp/src/arrow/util/bpacking_avx512_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@

#pragma once

#include <stdint.h>
#include "arrow/util/visibility.h"

namespace arrow {
namespace internal {
#include <cstdint>

int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits);
namespace arrow::internal {

} // namespace internal
} // namespace arrow
ARROW_EXPORT int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size,
int num_bits);

} // namespace arrow::internal
162 changes: 162 additions & 0 deletions cpp/src/arrow/util/bpacking_benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <stdexcept>
#include <vector>

#include <benchmark/benchmark.h>

#include "arrow/testing/util.h"
#include "arrow/util/bpacking_internal.h"

#if defined(ARROW_HAVE_RUNTIME_AVX2)
# include "arrow/util/bpacking_avx2_internal.h"
# include "arrow/util/cpu_info.h"
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX512)
# include "arrow/util/bpacking_avx512_internal.h"
#endif
#if defined(ARROW_HAVE_NEON)
# include "arrow/util/bpacking_neon_internal.h"
#endif

namespace arrow::internal {
namespace {

template <typename Int>
using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);

/// Get the number of bytes associated with a packing.
///
/// @param num_values Number of packed integers.
/// @param bit_width Number of bits used by each packed integer.
/// @return `num_values * bit_width / 8`.
/// @throws std::invalid_argument if `num_values * bit_width` is not a multiple of 8.
constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
  const int32_t total_bits = num_values * bit_width;
  const bool fills_whole_bytes = (total_bits % 8) == 0;
  // A throw-expression is allowed in a constexpr function as long as it is not
  // evaluated during constant evaluation.
  return fills_whole_bytes
             ? total_bits / 8
             : throw std::invalid_argument("Must pack a multiple of 8 bits.");
}

/// Generate random bytes to be interpreted as packed integers.
///
/// The returned buffer holds `GetNumBytes(num_values, bit_width)` bytes, filled by
/// `random_bytes` using a fixed seed.
std::vector<uint8_t> GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) {
  const auto byte_count = GetNumBytes(num_values, bit_width);
  constexpr uint32_t kSeed = 3214;

  std::vector<uint8_t> packed(byte_count);
  random_bytes(byte_count, kSeed, packed.data());
  return packed;
}

/// Return the first address at or after `ptr` that is a multiple of `alignment`.
///
/// `ptr` itself is returned when it is already aligned. `alignment` must be non-zero.
const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) {
  const auto address = reinterpret_cast<std::uintptr_t>(ptr);
  const auto misalignment = address % alignment;
  // No padding when already aligned, otherwise the distance to the next multiple.
  const auto padding = (misalignment == 0) ? 0 : alignment - misalignment;
  return ptr + padding;
}

/// Benchmark one unpacking function on randomly generated packed input.
///
/// @tparam Int Unsigned integer type the values are unpacked into.
/// @param state Benchmark state; `range(0)` is the bit width of the packed values and
///   `range(1)` the number of values unpacked per iteration.
/// @param aligned If true, the packed input starts on a `sizeof(Int)`-aligned address;
///   otherwise it starts one byte past such an address.
/// @param unpack The unpacking function under test.
/// @param skip If true, the benchmark is reported as skipped and nothing is run.
/// @param skip_msg Message reported when the benchmark is skipped.
template <typename Int>
void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bool skip,
               std::string skip_msg) {
  if (skip) {
    state.SkipWithMessage(skip_msg);
    // Return right away: no need to generate input and allocate output for a
    // benchmark that will not run.
    return;
  }

  const auto bit_width = static_cast<int32_t>(state.range(0));
  const auto num_values = static_cast<int32_t>(state.range(1));

  // Assume std::vector allocation is likely to be aligned to more than a byte.
  // So we allocate more values than necessary and skip to the next byte with the
  // desired (non) alignment to test the proper condition.
  // The extra values occupy `sizeof(Int) * bit_width` bytes, which covers the largest
  // possible shift of `sizeof(Int)` bytes for any `bit_width >= 1`.
  constexpr int32_t kExtraValues = sizeof(Int) * 8;
  const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width);
  const uint8_t* packed_ptr =
      GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1);

  std::vector<Int> unpacked(num_values, 0);

  for (auto _ : state) {
    unpack(packed_ptr, unpacked.data(), num_values, bit_width);
    // Keep the compiler from optimizing away the writes to `unpacked`.
    benchmark::ClobberMemory();
  }
  state.SetItemsProcessed(num_values * state.iterations());
}

// Bounds of the "number of values" benchmark argument.
constexpr int32_t kMinRange = 64;
constexpr int32_t kMaxRange = 32768;
// Bit widths benchmarked when unpacking into 32-bit and 64-bit integers respectively.
constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};
// Argument lists handed to `ArgsProduct`: the first list is read as `state.range(0)`
// (bit width), the second as `state.range(1)` (number of values).
// The numbers of values are produced by `benchmark::CreateRange` between kMinRange
// and kMaxRange with a multiplier of 32.
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues32 = {
kBitWidths32,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
};
// Same as above, for the 64-bit bit widths.
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues64 = {
kBitWidths64,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
};

/// Non-template wrapper forwarding to `BM_Unpack<uint32_t>`.
///
/// Nudge for MSVC, which needs help with a template inside the BENCHMARK_CAPTURE macro.
/// `skip` and `skip_msg` are optional so that unconditional benchmarks can omit them.
void BM_UnpackUint32(benchmark::State& state, bool aligned, UnpackFunc<uint32_t> unpack,
                     bool skip = false, std::string skip_msg = "") {
  BM_Unpack<uint32_t>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Non-template wrapper forwarding to `BM_Unpack<uint64_t>`.
///
/// Nudge for MSVC, which needs help with a template inside the BENCHMARK_CAPTURE macro.
/// `skip` and `skip_msg` are optional so that unconditional benchmarks can omit them.
void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t> unpack,
                     bool skip = false, std::string skip_msg = "") {
  BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
}

// Scalar implementations, benchmarked on unaligned input.
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_scalar)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_scalar)
->ArgsProduct(kBitWidthsNumValues64);

// AVX2 implementation: compiled in only with runtime AVX2 support, and skipped at
// run time when the CPU does not report AVX2.
#if defined(ARROW_HAVE_RUNTIME_AVX2)
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, unpack32_avx2,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues32);
#endif

// AVX512 implementation: compiled in only with runtime AVX512 support, and skipped at
// run time when the CPU does not report AVX512.
#if defined(ARROW_HAVE_RUNTIME_AVX512)
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, unpack32_avx512,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues32);
#endif

// NEON implementation: registered without a run-time skip condition.
#if defined(ARROW_HAVE_NEON)
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, unpack32_neon)
->ArgsProduct(kBitWidthsNumValues32);
#endif

// Public `unpack32` entry point, on both aligned and unaligned input.
BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, unpack32)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, unpack32)
->ArgsProduct(kBitWidthsNumValues32);

// Public `unpack64` entry point, on both aligned and unaligned input.
BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, unpack64)
->ArgsProduct(kBitWidthsNumValues64);
BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, unpack64)
->ArgsProduct(kBitWidthsNumValues64);

} // namespace
} // namespace arrow::internal
19 changes: 12 additions & 7 deletions cpp/src/arrow/util/bpacking_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,23 @@

#pragma once

#include "arrow/util/endian.h"
#include "arrow/util/visibility.h"

#include <stdint.h>
#include <cstdint>

namespace arrow {
namespace internal {
namespace arrow::internal {

/// The scalar 32 bit unpacking.
ARROW_EXPORT int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size,
int num_bits);

/// The scalar 64 bit unpacking.
ARROW_EXPORT int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size,
int num_bits);

ARROW_EXPORT
int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits);
int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits);
ARROW_EXPORT
int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits);

} // namespace internal
} // namespace arrow
} // namespace arrow::internal
Loading
Loading