Skip to content

Commit b2cf62f

Browse files
kiszk authored and pitrou committed
ARROW-8486: [C++] Fix BitArray failures on big-endian platforms
This PR adds big-endian platform support to bit operations such as Bitmap, BitReader, BitWriter, and others. This PR fixes the `BitArray.TestBool` and `BitArray.TestValue` [failures](https://travis-ci.org/github/apache/arrow/builds/684931696) in arrow-utility-test. This PR consists of two parts. 1. Convert the data layout to native-endian in order to process the data. This PR basically inserts this conversion before and after `memcpy`. This is because `memcpy` transfers data from/to a variable using the memory layout in the buffer. This PR still assumes that the data layout in the buffer is little-endian. 2. Add `BitUtil::ByteSwap` for uint8. Closes apache#7136 from kiszk/ARROW-8486 Authored-by: Kazuaki Ishizaki <ishizaki@jp.ibm.com> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent e215e89 commit b2cf62f

File tree

2 files changed

+37
-17
lines changed

2 files changed

+37
-17
lines changed

cpp/src/arrow/util/bit_stream_utils.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ class BitWriter {
6363
/// Writes v to the next aligned byte using num_bytes. If T is larger than
6464
/// num_bytes, the extra high-order bytes will be ignored. Returns false if
6565
/// there was not enough space.
66+
/// Assumes that v is stored in buffer_ in little-endian format.
6667
template <typename T>
6768
bool PutAligned(T v, int num_bytes);
6869

@@ -107,6 +108,7 @@ class BitReader {
107108
: buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
108109
int num_bytes = std::min(8, max_bytes_ - byte_offset_);
109110
memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
111+
buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
110112
}
111113

112114
BitReader()
@@ -123,6 +125,7 @@ class BitReader {
123125
bit_offset_ = 0;
124126
int num_bytes = std::min(8, max_bytes_ - byte_offset_);
125127
memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
128+
buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
126129
}
127130

128131
/// Gets the next value from the buffer. Returns true if 'v' could be read or false if
@@ -139,6 +142,7 @@ class BitReader {
139142
/// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
140143
/// be advanced to the start of the next byte before 'v' is read. Returns
141144
/// false if there are not enough bytes left.
145+
/// Assumes that v was stored in buffer_ in little-endian format.
142146
template <typename T>
143147
bool GetAligned(int num_bytes, T* v);
144148

@@ -185,6 +189,7 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
185189

186190
if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) {
187191
// Flush buffered_values_ and write out bits of v that did not fit
192+
buffered_values_ = arrow::BitUtil::ToLittleEndian(buffered_values_);
188193
memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
189194
buffered_values_ = 0;
190195
byte_offset_ += 8;
@@ -198,7 +203,8 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
198203
inline void BitWriter::Flush(bool align) {
199204
int num_bytes = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
200205
DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
201-
memcpy(buffer_ + byte_offset_, &buffered_values_, num_bytes);
206+
auto buffered_values = arrow::BitUtil::ToLittleEndian(buffered_values_);
207+
memcpy(buffer_ + byte_offset_, &buffered_values, num_bytes);
202208

203209
if (align) {
204210
buffered_values_ = 0;
@@ -220,6 +226,7 @@ template <typename T>
220226
inline bool BitWriter::PutAligned(T val, int num_bytes) {
221227
uint8_t* ptr = GetNextBytePtr(num_bytes);
222228
if (ptr == NULL) return false;
229+
val = arrow::BitUtil::ToLittleEndian(val);
223230
memcpy(ptr, &val, num_bytes);
224231
return true;
225232
}
@@ -249,6 +256,7 @@ inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
249256
} else {
250257
memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
251258
}
259+
*buffered_values = arrow::BitUtil::FromLittleEndian(*buffered_values);
252260
#ifdef _MSC_VER
253261
#pragma warning(push)
254262
#pragma warning(disable : 4800 4805)
@@ -335,6 +343,7 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
335343
} else {
336344
memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
337345
}
346+
buffered_values = arrow::BitUtil::FromLittleEndian(buffered_values);
338347

339348
for (; i < batch_size; ++i) {
340349
detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
@@ -362,6 +371,7 @@ inline bool BitReader::GetAligned(int num_bytes, T* v) {
362371
// Advance byte_offset to next unread byte and read num_bytes
363372
byte_offset_ += bytes_read;
364373
memcpy(v, buffer_ + byte_offset_, num_bytes);
374+
*v = arrow::BitUtil::FromLittleEndian(*v);
365375
byte_offset_ += num_bytes;
366376

367377
// Reset buffered_values_
@@ -372,6 +382,7 @@ inline bool BitReader::GetAligned(int num_bytes, T* v) {
372382
} else {
373383
memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
374384
}
385+
buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
375386
return true;
376387
}
377388

cpp/src/arrow/util/bit_util.h

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ static inline int16_t ByteSwap(int16_t value) {
329329
static inline uint16_t ByteSwap(uint16_t value) {
330330
return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
331331
}
332+
static inline uint8_t ByteSwap(uint8_t value) { return value; }
332333

333334
// Write the swapped bytes into dst. Src and dst cannot overlap.
334335
static inline void ByteSwap(void* dst, const void* src, int len) {
@@ -358,53 +359,61 @@ static inline void ByteSwap(void* dst, const void* src, int len) {
358359

359360
// Convert to little/big endian format from the machine's native endian format.
360361
#if ARROW_LITTLE_ENDIAN
361-
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
362-
uint32_t, int16_t, uint16_t>>
362+
template <typename T,
363+
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
364+
int16_t, uint16_t, uint8_t>>
363365
static inline T ToBigEndian(T value) {
364366
return ByteSwap(value);
365367
}
366368

367-
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
368-
uint32_t, int16_t, uint16_t>>
369+
template <typename T,
370+
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
371+
int16_t, uint16_t, uint8_t>>
369372
static inline T ToLittleEndian(T value) {
370373
return value;
371374
}
372375
#else
373-
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
374-
uint32_t, int16_t, uint16_t>>
376+
template <typename T,
377+
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
378+
int16_t, uint16_t, uint8_t>>
375379
static inline T ToBigEndian(T value) {
376380
return value;
377381
}
378382

379-
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
380-
uint32_t, int16_t, uint16_t>>
383+
template <typename T,
384+
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
385+
int16_t, uint16_t, uint8_t>>
381386
static inline T ToLittleEndian(T value) {
382387
return ByteSwap(value);
383388
}
384389
#endif
385390

386391
// Convert from big/little endian format to the machine's native endian format.
387392
#if ARROW_LITTLE_ENDIAN
388-
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
389-
uint32_t, int16_t, uint16_t>>
393+
template <typename T,
394+
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
395+
int16_t, uint16_t, uint8_t>>
390396
static inline T FromBigEndian(T value) {
391397
return ByteSwap(value);
392398
}
393399

394-
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
395-
uint32_t, int16_t, uint16_t>>
400+
template <typename T,
401+
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
402+
int16_t, uint16_t, uint8_t>>
396403
static inline T FromLittleEndian(T value) {
397404
return value;
398405
}
399406
#else
400-
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
401-
uint32_t, int16_t, uint16_t>>
407+
template <typename T,
408+
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
409+
int16_t, uint16_t, uint8_t>>
402410
static inline T FromBigEndian(T value) {
403411
return value;
404412
}
405413

406-
template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
407-
uint32_t, int16_t, uint16_t>>
414+
template <typename T,
415+
typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t,
416+
int16_t, uint16_t, uint8_t>>
408417
static inline T FromLittleEndian(T value) {
409418
return ByteSwap(value);
410419
}

0 commit comments

Comments
 (0)