Skip to content

ARROW-9133: [C++] Add utf8_upper and utf8_lower #7449

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 32 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
b144149
ARROW-8961: [C++] Add utf8proc library to toolchain
xhochy Jun 16, 2020
934a66a
Preserve alphabetical ordering
xhochy Jun 18, 2020
8a5add3
ARROW-9133: [C++] Add utf8_upper and utf8_lower
maartenbreddels Jun 16, 2020
6c4da7f
perf: transform using the ascii version if possible
maartenbreddels Jun 16, 2020
683ffa4
cleanups
maartenbreddels Jun 16, 2020
4cd3d72
rebase
maartenbreddels Jun 16, 2020
46168a1
utf benchmarks also touch the offsets
maartenbreddels Jun 16, 2020
9e33377
expose kernels to Python
maartenbreddels Jun 16, 2020
a4992cc
implement comments by Wes
maartenbreddels Jun 17, 2020
d27d3ba
faster case algo with a lookup table and custom encoder/decoder
maartenbreddels Jun 17, 2020
054692b
ascii version of utf8 transform not required anymore
maartenbreddels Jun 17, 2020
2829446
lint
maartenbreddels Jun 17, 2020
5b251bb
invalid data is replace by ?
maartenbreddels Jun 22, 2020
0249a12
raise error when string capacity is not enough for 32bit
maartenbreddels Jun 22, 2020
3e7856f
support sliced arrays
maartenbreddels Jun 22, 2020
d1108a4
use similar naming
maartenbreddels Jun 22, 2020
c682974
mistake, it appears that codepoints above 0xffff can change their cas…
maartenbreddels Jun 22, 2020
a81a0c7
don't ignore return value
maartenbreddels Jun 22, 2020
58cd0a6
-Wconversion fixes
maartenbreddels Jun 22, 2020
6dc038a
implement comments by antoine
maartenbreddels Jun 29, 2020
bd43f9b
non-gcc compiler complaints
maartenbreddels Jun 29, 2020
07bb6fd
implement comments by antoine, part2
maartenbreddels Jun 29, 2020
17657a6
Resize buffer at end instead of creating it anew
pitrou Jun 29, 2020
e6c800c
Try to fix R on Windows builds
pitrou Jun 29, 2020
6b2f07b
Add -lutf8proc for R Windows linking
pitrou Jun 29, 2020
d47ea4a
Error out on invalid UTF8 instead of succeeding
pitrou Jun 29, 2020
9a9079d
Also tweak r_windows_build.sh ...
pitrou Jun 29, 2020
11c342a
check capacity for scalar
maartenbreddels Jun 30, 2020
85a169e
Merge branch 'master' into ARROW-9133
pitrou Jun 30, 2020
592eb10
Try to fix RTools build again...
pitrou Jun 30, 2020
05c2f38
Merge branch 'ARROW-9133' of https://github.com/maartenbreddels/arrow…
pitrou Jun 30, 2020
ff3df58
Last one
pitrou Jun 30, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions ci/scripts/PKGBUILD
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ pkgdesc="Apache Arrow is a cross-language development platform for in-memory dat
arch=("any")
url="https://arrow.apache.org/"
license=("Apache-2.0")
depends=("${MINGW_PACKAGE_PREFIX}-thrift"
depends=("${MINGW_PACKAGE_PREFIX}-libutf8proc"
"${MINGW_PACKAGE_PREFIX}-thrift"
"${MINGW_PACKAGE_PREFIX}-snappy"
"${MINGW_PACKAGE_PREFIX}-zlib"
"${MINGW_PACKAGE_PREFIX}-lz4"
Expand Down Expand Up @@ -62,13 +63,17 @@ build() {
mkdir -p ${cpp_build_dir}
pushd ${cpp_build_dir}

# The Rtools libutf8proc is a static lib, but Findutf8proc.cmake doesn't
# set the appropriate compiler definition.
export CPPFLAGS="-DUTF8PROC_STATIC"

# This is the difference between rtools-packages and rtools-backports
# Remove this when submitting to rtools-packages
if [ "$RTOOLS_VERSION" = "35" ]; then
export CC="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin/gcc"
export CXX="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin/g++"
export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH"
export CPPFLAGS="-I${MINGW_PREFIX}/include"
export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include"
export LIBS="-L${MINGW_PREFIX}/libs"
fi

Expand All @@ -93,11 +98,13 @@ build() {
-DARROW_WITH_SNAPPY=ON \
-DARROW_WITH_ZLIB=ON \
-DARROW_WITH_ZSTD=ON \
-DARROW_CXXFLAGS="${CPPFLAGS}" \
-DCMAKE_BUILD_TYPE="release" \
-DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} \
-DCMAKE_UNITY_BUILD=ON
-DCMAKE_UNITY_BUILD=ON \
-DCMAKE_VERBOSE_MAKEFILE=ON

make -j2
make -j3
popd
}

Expand Down
4 changes: 2 additions & 2 deletions ci/scripts/r_windows_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x6
cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386

# These are from https://dl.bintray.com/rtools/mingw{32,64}/
cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto}.a $DST_DIR/lib/x64
cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto}.a $DST_DIR/lib/i386
cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto,utf8proc}.a $DST_DIR/lib/x64
cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto,utf8proc}.a $DST_DIR/lib/i386

# Create build artifact
zip -r ${DST_DIR}.zip $DST_DIR
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/arrow/compute/kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,17 @@ namespace compute {
// ----------------------------------------------------------------------
// KernelContext

Result<std::shared_ptr<Buffer>> KernelContext::Allocate(int64_t nbytes) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> result,
AllocateBuffer(nbytes, exec_ctx_->memory_pool()));
Result<std::shared_ptr<ResizableBuffer>> KernelContext::Allocate(int64_t nbytes) {
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> result,
AllocateResizableBuffer(nbytes, exec_ctx_->memory_pool()));
result->ZeroPadding();
return result;
}

Result<std::shared_ptr<Buffer>> KernelContext::AllocateBitmap(int64_t num_bits) {
Result<std::shared_ptr<ResizableBuffer>> KernelContext::AllocateBitmap(int64_t num_bits) {
const int64_t nbytes = BitUtil::BytesForBits(num_bits);
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> result,
AllocateBuffer(nbytes, exec_ctx_->memory_pool()));
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> result,
AllocateResizableBuffer(nbytes, exec_ctx_->memory_pool()));
// Some utility methods access the last byte before it might be
// initialized this makes valgrind/asan unhappy, so we proactively
// zero it.
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,12 @@ class ARROW_EXPORT KernelContext {

/// \brief Allocate buffer from the context's memory pool. The contents are
/// not initialized.
Result<std::shared_ptr<Buffer>> Allocate(int64_t nbytes);
Result<std::shared_ptr<ResizableBuffer>> Allocate(int64_t nbytes);

/// \brief Allocate buffer for bitmap from the context's memory pool. Like
/// Allocate, the contents of the buffer are not initialized but the last
/// byte is preemptively zeroed to help avoid ASAN or valgrind issues.
Result<std::shared_ptr<Buffer>> AllocateBitmap(int64_t num_bits);
Result<std::shared_ptr<ResizableBuffer>> AllocateBitmap(int64_t num_bits);

/// \brief Indicate that an error has occurred, to be checked by a exec caller
/// \param[in] status a Status instance.
Expand Down
11 changes: 11 additions & 0 deletions cpp/src/arrow/compute/kernels/codegen_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,17 @@ namespace internal {

#endif // ARROW_EXTRA_ERROR_CONTEXT

// Implementation detail of KERNEL_ASSIGN_OR_RAISE.
// Evaluates `rexpr` exactly once into the temporary `result_name`, passes its
// status (together with `ctx`) to KERNEL_RETURN_IF_ERROR, and then moves the
// contained value into `lhs`.
// KERNEL_RETURN_IF_ERROR is defined outside this view; presumably it records
// the error on `ctx` and returns from the enclosing function -- TODO confirm.
// NOTE: this expands to several statements that are not wrapped in a block,
// which lets `lhs` be a declaration (e.g. `auto x`) but means the macro must
// not be used as the sole body of an unbraced `if`/`for`.
#define KERNEL_ASSIGN_OR_RAISE_IMPL(result_name, lhs, ctx, rexpr) \
auto result_name = (rexpr); \
KERNEL_RETURN_IF_ERROR(ctx, (result_name).status()); \
lhs = std::move(result_name).MoveValueUnsafe();

// Builds the identifier of the temporary by concatenating `x` and `y` through
// ARROW_CONCAT.
#define KERNEL_ASSIGN_OR_RAISE_NAME(x, y) ARROW_CONCAT(x, y)

// Kernel-side assign-or-raise: assigns the value produced by `rexpr` to `lhs`,
// or hands the error status to KERNEL_RETURN_IF_ERROR along with `ctx`.
// __COUNTER__ gives every expansion a distinctly named temporary, so the macro
// can appear more than once in the same scope.
#define KERNEL_ASSIGN_OR_RAISE(lhs, ctx, rexpr) \
KERNEL_ASSIGN_OR_RAISE_IMPL(KERNEL_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
lhs, ctx, rexpr);

/// KernelState adapter for the common case of kernels whose only
/// state is an instance of a subclass of FunctionOptions.
/// Default FunctionOptions are *not* handled here.
Expand Down
187 changes: 170 additions & 17 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,12 @@
#include <cctype>
#include <string>

#include <utf8proc.h>

#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_string_internal.h"
#include "arrow/util/utf8.h"
#include "arrow/util/value_parsing.h"

namespace arrow {
Expand All @@ -30,6 +33,39 @@ namespace internal {

namespace {

// Case-mapping lookup tables, indexed by codepoint. Both vectors are declared
// empty here and are populated by EnsureLookupTablesFilled().
constexpr uint32_t kMaxCodepointLookup =
0xffff; // up to this codepoint is in a lookup table
// lut_upper_codepoint[c] holds utf8proc_toupper(c) once the tables are filled.
std::vector<uint32_t> lut_upper_codepoint;
// lut_lower_codepoint[c] holds utf8proc_tolower(c) once the tables are filled.
std::vector<uint32_t> lut_lower_codepoint;
// Guards the one-time initialization of the two tables above.
std::once_flag flag_case_luts;

// Fills the upper- and lower-case lookup tables with the utf8proc mapping of
// every codepoint in [0, kMaxCodepointLookup]. The fill is routed through
// std::call_once on flag_case_luts, so it happens a single time regardless of
// how often (or from how many threads) this function is called.
void EnsureLookupTablesFilled() {
  const auto fill_tables = []() {
    const auto table_size = kMaxCodepointLookup + 1;
    lut_upper_codepoint.reserve(table_size);
    lut_lower_codepoint.reserve(table_size);

    uint32_t codepoint = 0;
    while (codepoint <= kMaxCodepointLookup) {
      lut_upper_codepoint.push_back(utf8proc_toupper(codepoint));
      lut_lower_codepoint.push_back(utf8proc_tolower(codepoint));
      ++codepoint;
    }
  };
  std::call_once(flag_case_luts, fill_tables);
}

// Code units in the range [a-z] can only be the encoding of an ASCII
// character/codepoint, never the 2nd, 3rd or 4th code unit (byte) of a different
// codepoint. This is guaranteed by the non-overlap design of the Unicode standard
// (see section 2.5 of the Unicode Standard Core Specification v13.0).

// Maps an ASCII upper-case letter ('A'..'Z') to its lower-case counterpart;
// every other byte value is returned unchanged.
static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) {
  const bool is_upper = (utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z');
  if (!is_upper) {
    return utf8_code_unit;
  }
  // 'a' - 'A' == 32: distance between the two ASCII letter ranges.
  return static_cast<uint8_t>(utf8_code_unit + ('a' - 'A'));
}

// Maps an ASCII lower-case letter ('a'..'z') to its upper-case counterpart;
// every other byte value is returned unchanged.
static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
  const bool is_lower = (utf8_code_unit >= 'a') && (utf8_code_unit <= 'z');
  if (!is_lower) {
    return utf8_code_unit;
  }
  // 'a' - 'A' == 32: distance between the two ASCII letter ranges.
  return static_cast<uint8_t>(utf8_code_unit - ('a' - 'A'));
}

// TODO: optional ascii validation

struct AsciiLength {
Expand All @@ -39,6 +75,126 @@ struct AsciiLength {
}
};

template <typename Type, typename Derived>
struct Utf8Transform {
using offset_type = typename Type::offset_type;
using ArrayType = typename TypeTraits<Type>::ArrayType;

static bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
uint8_t* output, offset_type* output_written) {
uint8_t* output_start = output;
if (ARROW_PREDICT_FALSE(
!arrow::util::Utf8Transform(input, input + input_string_ncodeunits, &output,
Derived::TransformCodepoint))) {
return false;
}
*output_written = static_cast<offset_type>(output - output_start);
return true;
}

static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
EnsureLookupTablesFilled();
const ArrayData& input = *batch[0].array();
ArrayType input_boxed(batch[0].array());
ArrayData* output = out->mutable_array();

offset_type input_ncodeunits = input_boxed.total_values_length();
offset_type input_nstrings = static_cast<offset_type>(input.length);

// Section 5.18 of the Unicode spec claim that the number of codepoints for case
// mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes
// However, since we don't support all casings (SpecialCasing.txt) the growth
// is actually only at max 3/2 (as covered by the unittest).
// Note that rounding down the 3/2 is ok, since only codepoints encoded by
// two code units (even) can grow to 3 code units.

int64_t output_ncodeunits_max = static_cast<int64_t>(input_ncodeunits) * 3 / 2;
if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
ctx->SetStatus(Status::CapacityError(
"Result might not fit in a 32bit utf8 array, convert to large_utf8"));
return;
}

KERNEL_ASSIGN_OR_RAISE(auto values_buffer, ctx,
ctx->Allocate(output_ncodeunits_max));
output->buffers[2] = values_buffer;

// We could reuse the indices if the data is all ascii, benchmarking showed this
// not to matter.
// output->buffers[1] = input.buffers[1];
KERNEL_ASSIGN_OR_RAISE(output->buffers[1], ctx,
ctx->Allocate((input_nstrings + 1) * sizeof(offset_type)));
uint8_t* output_str = output->buffers[2]->mutable_data();
offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
offset_type output_ncodeunits = 0;

output_string_offsets[0] = 0;
for (int64_t i = 0; i < input_nstrings; i++) {
offset_type input_string_ncodeunits;
const uint8_t* input_string = input_boxed.GetValue(i, &input_string_ncodeunits);
offset_type encoded_nbytes;
if (ARROW_PREDICT_FALSE(!Derived::Transform(input_string, input_string_ncodeunits,
output_str + output_ncodeunits,
&encoded_nbytes))) {
ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
return;
}
output_ncodeunits += encoded_nbytes;
output_string_offsets[i + 1] = output_ncodeunits;
}

// Trim the codepoint buffer, since we allocated too much
KERNEL_RETURN_IF_ERROR(
ctx, values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true));
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice way to make code more readable.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:-)

} else {
const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
auto result = checked_pointer_cast<BaseBinaryScalar>(MakeNullScalar(out->type()));
if (input.is_valid) {
result->is_valid = true;
offset_type data_nbytes = static_cast<offset_type>(input.value->size());

// See note above in the Array version explaining the 3 / 2
int64_t output_ncodeunits_max = static_cast<int64_t>(data_nbytes) * 3 / 2;
if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
ctx->SetStatus(Status::CapacityError(
"Result might not fit in a 32bit utf8 array, convert to large_utf8"));
return;
}
KERNEL_ASSIGN_OR_RAISE(auto value_buffer, ctx,
ctx->Allocate(output_ncodeunits_max));
result->value = value_buffer;
offset_type encoded_nbytes;
if (ARROW_PREDICT_FALSE(!Derived::Transform(input.value->data(), data_nbytes,
value_buffer->mutable_data(),
&encoded_nbytes))) {
ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
return;
}
KERNEL_RETURN_IF_ERROR(
ctx, value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true));
}
out->value = result;
}
}
};

// UTF-8 upper-casing kernel: plugs the upper-case codepoint mapping into
// Utf8Transform.
template <typename Type>
struct Utf8Upper : Utf8Transform<Type, Utf8Upper<Type>> {
  // Returns the upper-case mapping of `codepoint`. Codepoints up to
  // kMaxCodepointLookup are served from lut_upper_codepoint (which must already
  // have been filled by EnsureLookupTablesFilled()); larger ones go straight
  // to utf8proc.
  inline static uint32_t TransformCodepoint(uint32_t codepoint) {
    if (codepoint > kMaxCodepointLookup) {
      return static_cast<uint32_t>(utf8proc_toupper(codepoint));
    }
    return lut_upper_codepoint[codepoint];
  }
};

// UTF-8 lower-casing kernel: plugs the lower-case codepoint mapping into
// Utf8Transform.
template <typename Type>
struct Utf8Lower : Utf8Transform<Type, Utf8Lower<Type>> {
  // Returns the lower-case mapping of `codepoint`. Codepoints up to
  // kMaxCodepointLookup are served from lut_lower_codepoint (which must already
  // have been filled by EnsureLookupTablesFilled()); larger ones go straight
  // to utf8proc.
  static uint32_t TransformCodepoint(uint32_t codepoint) {
    if (codepoint > kMaxCodepointLookup) {
      return static_cast<uint32_t>(utf8proc_tolower(codepoint));
    }
    return lut_lower_codepoint[codepoint];
  }
};

using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;

// Transform a buffer of offsets to one which begins with 0 and has same
Expand Down Expand Up @@ -103,16 +259,7 @@ void StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
}

void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) {
for (int64_t i = 0; i < length; ++i) {
const uint8_t utf8_code_unit = *input++;
// Code units in the range [a-z] can only be an encoding of an ascii
// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an different
// codepoint. This guaranteed by non-overlap design of the unicode standard. (see
// section 2.5 of Unicode Standard Core Specification v13.0)
*output++ = ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z'))
? (utf8_code_unit - 32)
: utf8_code_unit;
}
std::transform(input, input + length, output, ascii_toupper);
}

template <typename Type>
Expand All @@ -123,13 +270,7 @@ struct AsciiUpper {
};

void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output) {
for (int64_t i = 0; i < length; ++i) {
// As with TransformAsciiUpper, the same guarantee holds for the range [A-Z]
const uint8_t utf8_code_unit = *input++;
*output++ = ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z'))
? (utf8_code_unit + 32)
: utf8_code_unit;
}
std::transform(input, input + length, output, ascii_tolower);
}

template <typename Type>
Expand Down Expand Up @@ -206,11 +347,23 @@ void MakeUnaryStringBatchKernel(std::string name, FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
}

template <template <typename> class Transformer>
void MakeUnaryStringUtf8TransformKernel(std::string name, FunctionRegistry* registry) {
auto func = std::make_shared<ScalarFunction>(name, Arity::Unary());
ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_32));
DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), exec_64));
DCHECK_OK(registry->AddFunction(std::move(func)));
}

} // namespace

// Registers the string functions built in this file with `registry`.
// NOTE(review): despite the "Ascii" in the name, this also registers the
// utf8_upper / utf8_lower kernels.
void RegisterScalarStringAscii(FunctionRegistry* registry) {
// Byte-wise ASCII case conversion.
MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry);
MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry);
// Codepoint-wise UTF-8 case conversion.
MakeUnaryStringUtf8TransformKernel<Utf8Upper>("utf8_upper", registry);
MakeUnaryStringUtf8TransformKernel<Utf8Lower>("utf8_lower", registry);
AddAsciiLength(registry);
AddStrptime(registry);
}
Expand Down
14 changes: 14 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,12 @@ static void UnaryStringBenchmark(benchmark::State& state, const std::string& fun
const double null_probability = 0.01;
random::RandomArrayGenerator rng(kSeed);

// NOTE: this produces only-Ascii data
auto values =
rng.String(array_length, value_min_size, value_max_size, null_probability);
// Make sure lookup tables are initialized before measuring
ABORT_NOT_OK(CallFunction(func_name, {values}));

for (auto _ : state) {
ABORT_NOT_OK(CallFunction(func_name, {values}));
}
Expand All @@ -52,8 +56,18 @@ static void AsciiUpper(benchmark::State& state) {
UnaryStringBenchmark(state, "ascii_upper");
}

// Benchmarks the "utf8_upper" compute function through UnaryStringBenchmark.
// Per the note in that helper, the generated input is ASCII-only, so only the
// ASCII path of the kernel is measured.
static void Utf8Upper(benchmark::State& state) {
UnaryStringBenchmark(state, "utf8_upper");
}

// Benchmarks the "utf8_lower" compute function through UnaryStringBenchmark.
// Per the note in that helper, the generated input is ASCII-only, so only the
// ASCII path of the kernel is measured.
static void Utf8Lower(benchmark::State& state) {
UnaryStringBenchmark(state, "utf8_lower");
}

BENCHMARK(AsciiLower);
BENCHMARK(AsciiUpper);
BENCHMARK(Utf8Lower);
BENCHMARK(Utf8Upper);

} // namespace compute
} // namespace arrow
Loading