apache · maartenbreddels · Jun 16, 2020 · Jun 18, 2020 · Jun 16, 2020 · Jun 16, 2020
diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD
@@ -24,7 +24,8 @@ pkgdesc="Apache Arrow is a cross-language development platform for in-memory dat
 arch=("any")
 url="https://arrow.apache.org/"
 license=("Apache-2.0")
-depends=("${MINGW_PACKAGE_PREFIX}-thrift"
+depends=("${MINGW_PACKAGE_PREFIX}-libutf8proc"
+         "${MINGW_PACKAGE_PREFIX}-thrift"
          "${MINGW_PACKAGE_PREFIX}-snappy"
          "${MINGW_PACKAGE_PREFIX}-zlib"
          "${MINGW_PACKAGE_PREFIX}-lz4"
@@ -62,13 +63,17 @@ build() {
   mkdir -p ${cpp_build_dir}
   pushd ${cpp_build_dir}
 
+  # The Rtools libutf8proc is a static lib, but Findutf8proc.cmake doesn't
+  # set the appropriate compiler definition.
+  export CPPFLAGS="-DUTF8PROC_STATIC"
+
   # This is the difference between rtools-packages and rtools-backports
   # Remove this when submitting to rtools-packages
   if [ "$RTOOLS_VERSION" = "35" ]; then
     export CC="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin/gcc"
     export CXX="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin/g++"
     export PATH="/C/Rtools${MINGW_PREFIX/mingw/mingw_}/bin:$PATH"
-    export CPPFLAGS="-I${MINGW_PREFIX}/include"
+    export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include"
     export LIBS="-L${MINGW_PREFIX}/libs"
   fi
 
@@ -93,11 +98,13 @@ build() {
     -DARROW_WITH_SNAPPY=ON \
     -DARROW_WITH_ZLIB=ON \
     -DARROW_WITH_ZSTD=ON \
+    -DARROW_CXXFLAGS="${CPPFLAGS}" \
     -DCMAKE_BUILD_TYPE="release" \
     -DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} \
-    -DCMAKE_UNITY_BUILD=ON
+    -DCMAKE_UNITY_BUILD=ON \
+    -DCMAKE_VERBOSE_MAKEFILE=ON
 
-  make -j2
+  make -j3
   popd
 }
 

diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh
@@ -81,8 +81,8 @@ cp $MSYS_LIB_DIR/mingw64/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/x6
 cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i386
 
 # These are from https://dl.bintray.com/rtools/mingw{32,64}/
-cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto}.a $DST_DIR/lib/x64
-cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto}.a $DST_DIR/lib/i386
+cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto,utf8proc}.a $DST_DIR/lib/x64
+cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto,utf8proc}.a $DST_DIR/lib/i386
 
 # Create build artifact
 zip -r ${DST_DIR}.zip $DST_DIR

diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc
@@ -45,17 +45,17 @@ namespace compute {
 // ----------------------------------------------------------------------
 // KernelContext
 
-Result<std::shared_ptr<Buffer>> KernelContext::Allocate(int64_t nbytes) {
-  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> result,
-                        AllocateBuffer(nbytes, exec_ctx_->memory_pool()));
+Result<std::shared_ptr<ResizableBuffer>> KernelContext::Allocate(int64_t nbytes) {
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> result,
+                        AllocateResizableBuffer(nbytes, exec_ctx_->memory_pool()));
   result->ZeroPadding();
   return result;
 }
 
-Result<std::shared_ptr<Buffer>> KernelContext::AllocateBitmap(int64_t num_bits) {
+Result<std::shared_ptr<ResizableBuffer>> KernelContext::AllocateBitmap(int64_t num_bits) {
   const int64_t nbytes = BitUtil::BytesForBits(num_bits);
-  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> result,
-                        AllocateBuffer(nbytes, exec_ctx_->memory_pool()));
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> result,
+                        AllocateResizableBuffer(nbytes, exec_ctx_->memory_pool()));
   // Some utility methods access the last byte before it might be
   // initialized this makes valgrind/asan unhappy, so we proactively
   // zero it.

diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h
@@ -56,12 +56,12 @@ class ARROW_EXPORT KernelContext {
 
   /// \brief Allocate buffer from the context's memory pool. The contents are
   /// not initialized.
-  Result<std::shared_ptr<Buffer>> Allocate(int64_t nbytes);
+  Result<std::shared_ptr<ResizableBuffer>> Allocate(int64_t nbytes);
 
   /// \brief Allocate buffer for bitmap from the context's memory pool. Like
   /// Allocate, the contents of the buffer are not initialized but the last
   /// byte is preemptively zeroed to help avoid ASAN or valgrind issues.
-  Result<std::shared_ptr<Buffer>> AllocateBitmap(int64_t num_bits);
+  Result<std::shared_ptr<ResizableBuffer>> AllocateBitmap(int64_t num_bits);
 
   /// \brief Indicate that an error has occurred, to be checked by a exec caller
   /// \param[in] status a Status instance.

diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -83,6 +83,17 @@ namespace internal {
 
 #endif  // ARROW_EXTRA_ERROR_CONTEXT
 
+// Implementation detail of KERNEL_ASSIGN_OR_RAISE: evaluates `rexpr` into the
+// temporary `result_name`, hands that temporary's status to KERNEL_RETURN_IF_ERROR
+// together with `ctx`, and then moves the contained value into `lhs`.
+// NOTE: this expands to several statements, so it must not be used as the body of
+// an unbraced `if`/`for`.
+// NOTE(review): presumably KERNEL_RETURN_IF_ERROR returns from the enclosing
+// function on error, so the final move only runs on success -- confirm against its
+// definition.
+#define KERNEL_ASSIGN_OR_RAISE_IMPL(result_name, lhs, ctx, rexpr) \
+  auto result_name = (rexpr);                                     \
+  KERNEL_RETURN_IF_ERROR(ctx, (result_name).status());            \
+  lhs = std::move(result_name).MoveValueUnsafe();
+
+// Builds the name of the temporary from two tokens via ARROW_CONCAT.
+#define KERNEL_ASSIGN_OR_RAISE_NAME(x, y) ARROW_CONCAT(x, y)
+
+// Assigns the value produced by `rexpr` to `lhs`; `lhs` may itself be a declaration
+// such as `auto buffer`. The status of `rexpr` is passed to KERNEL_RETURN_IF_ERROR
+// with `ctx`. __COUNTER__ gives the temporary a distinct name on every expansion, so
+// the macro can be used several times within one scope.
+#define KERNEL_ASSIGN_OR_RAISE(lhs, ctx, rexpr)                                          \
+  KERNEL_ASSIGN_OR_RAISE_IMPL(KERNEL_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+                              lhs, ctx, rexpr);
+
 /// KernelState adapter for the common case of kernels whose only
 /// state is an instance of a subclass of FunctionOptions.
 /// Default FunctionOptions are *not* handled here.

diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -19,9 +19,12 @@
 #include <cctype>
 #include <string>
 
+#include <utf8proc.h>
+
 #include "arrow/compute/api_scalar.h"
 #include "arrow/compute/kernels/common.h"
 #include "arrow/compute/kernels/scalar_string_internal.h"
+#include "arrow/util/utf8.h"
 #include "arrow/util/value_parsing.h"
 
 namespace arrow {
@@ -30,6 +33,39 @@ namespace internal {
 
 namespace {
 
+// Case-mapping lookup tables, indexed by codepoint. They cover every codepoint up
+// to and including kMaxCodepointLookup and stay empty until
+// EnsureLookupTablesFilled() has run.
+constexpr uint32_t kMaxCodepointLookup = 0xffff;
+std::vector<uint32_t> lut_upper_codepoint;
+std::vector<uint32_t> lut_lower_codepoint;
+std::once_flag flag_case_luts;
+
+// Fills both tables with the utf8proc upper/lower mapping of every codepoint in
+// [0, kMaxCodepointLookup]. The work is wrapped in std::call_once, so only the first
+// call populates the tables and later calls do nothing.
+void EnsureLookupTablesFilled() {
+  const auto fill_tables = [] {
+    lut_upper_codepoint.reserve(kMaxCodepointLookup + 1);
+    lut_lower_codepoint.reserve(kMaxCodepointLookup + 1);
+    for (uint32_t codepoint = 0; codepoint <= kMaxCodepointLookup; ++codepoint) {
+      lut_upper_codepoint.push_back(utf8proc_toupper(codepoint));
+      lut_lower_codepoint.push_back(utf8proc_tolower(codepoint));
+    }
+  };
+  std::call_once(flag_case_luts, fill_tables);
+}
+
+// Code units in the ranges [a-z] and [A-Z] can only be an encoding of an ASCII
+// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of a different
+// codepoint. This is guaranteed by the non-overlap design of the Unicode standard
+// (see section 2.5 of the Unicode Standard Core Specification v13.0).
+
+// Maps an ASCII uppercase code unit ('A'..'Z') to its lowercase counterpart; every
+// other byte value is returned unchanged.
+static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) {
+  const bool is_ascii_upper = (utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z');
+  if (!is_ascii_upper) {
+    return utf8_code_unit;
+  }
+  return static_cast<uint8_t>(utf8_code_unit + 32);
+}
+
+// Maps an ASCII lowercase code unit ('a'..'z') to its uppercase counterpart; every
+// other byte value is returned unchanged.
+static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
+  const bool is_ascii_lower = (utf8_code_unit >= 'a') && (utf8_code_unit <= 'z');
+  if (!is_ascii_lower) {
+    return utf8_code_unit;
+  }
+  return static_cast<uint8_t>(utf8_code_unit - 32);
+}
+
 // TODO: optional ascii validation
 
 struct AsciiLength {
@@ -39,6 +75,126 @@ struct AsciiLength {
   }
 };
 
+// CRTP base shared by the utf8_upper/utf8_lower kernels. `Derived` provides
+// `static uint32_t TransformCodepoint(uint32_t)`; this class applies it to every
+// codepoint of a string array or a string scalar.
+template <typename Type, typename Derived>
+struct Utf8Transform {
+  using offset_type = typename Type::offset_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  // Transforms the `input_string_ncodeunits` bytes starting at `input` into `output`
+  // and stores the number of bytes written in `*output_written`. Returns false,
+  // leaving `*output_written` untouched, when arrow::util::Utf8Transform reports
+  // failure (Exec surfaces that as "Invalid UTF8 sequence in input").
+  static bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                        uint8_t* output, offset_type* output_written) {
+    uint8_t* output_start = output;
+    if (ARROW_PREDICT_FALSE(
+            !arrow::util::Utf8Transform(input, input + input_string_ncodeunits, &output,
+                                        Derived::TransformCodepoint))) {
+      return false;
+    }
+    *output_written = static_cast<offset_type>(output - output_start);
+    return true;
+  }
+
+  // Kernel entry point: writes the transformed copy of batch[0] (array or scalar)
+  // to `out`. Capacity, allocation and invalid-UTF8 errors are reported via `ctx`.
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // Derived::TransformCodepoint indexes the lookup tables on both the array and
+    // the scalar path, so they have to be filled before either branch runs. Filling
+    // them only on the array path would let a scalar call that comes first index
+    // into still-empty vectors.
+    EnsureLookupTablesFilled();
+    if (batch[0].kind() == Datum::ARRAY) {
+      const ArrayData& input = *batch[0].array();
+      ArrayType input_boxed(batch[0].array());
+      ArrayData* output = out->mutable_array();
+
+      offset_type input_ncodeunits = input_boxed.total_values_length();
+      offset_type input_nstrings = static_cast<offset_type>(input.length);
+
+      // Section 5.18 of the Unicode spec claims that the number of codepoints for case
+      // mapping can grow by a factor of 3. This means growth by a factor of 3 in bytes.
+      // However, since we don't support all casings (SpecialCasing.txt) the growth
+      // is actually only at max 3/2 (as covered by the unittest).
+      // Note that rounding down the 3/2 is ok, since only codepoints encoded by
+      // two code units (even) can grow to 3 code units.
+
+      int64_t output_ncodeunits_max = static_cast<int64_t>(input_ncodeunits) * 3 / 2;
+      if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+        ctx->SetStatus(Status::CapacityError(
+            "Result might not fit in a 32bit utf8 array, convert to large_utf8"));
+        return;
+      }
+
+      KERNEL_ASSIGN_OR_RAISE(auto values_buffer, ctx,
+                             ctx->Allocate(output_ncodeunits_max));
+      output->buffers[2] = values_buffer;
+
+      // We could reuse the indices if the data is all ascii, benchmarking showed this
+      // not to matter.
+      //   output->buffers[1] = input.buffers[1];
+      KERNEL_ASSIGN_OR_RAISE(output->buffers[1], ctx,
+                             ctx->Allocate((input_nstrings + 1) * sizeof(offset_type)));
+      uint8_t* output_str = output->buffers[2]->mutable_data();
+      offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
+      offset_type output_ncodeunits = 0;
+
+      output_string_offsets[0] = 0;
+      for (int64_t i = 0; i < input_nstrings; i++) {
+        offset_type input_string_ncodeunits;
+        const uint8_t* input_string = input_boxed.GetValue(i, &input_string_ncodeunits);
+        offset_type encoded_nbytes;
+        if (ARROW_PREDICT_FALSE(!Derived::Transform(input_string, input_string_ncodeunits,
+                                                    output_str + output_ncodeunits,
+                                                    &encoded_nbytes))) {
+          ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
+          return;
+        }
+        output_ncodeunits += encoded_nbytes;
+        output_string_offsets[i + 1] = output_ncodeunits;
+      }
+
+      // Trim the codepoint buffer, since we allocated too much
+      KERNEL_RETURN_IF_ERROR(
+          ctx, values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true));
+    } else {
+      const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
+      auto result = checked_pointer_cast<BaseBinaryScalar>(MakeNullScalar(out->type()));
+      if (input.is_valid) {
+        result->is_valid = true;
+        offset_type data_nbytes = static_cast<offset_type>(input.value->size());
+
+        // See note above in the Array version explaining the 3 / 2
+        int64_t output_ncodeunits_max = static_cast<int64_t>(data_nbytes) * 3 / 2;
+        if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+          ctx->SetStatus(Status::CapacityError(
+              "Result might not fit in a 32bit utf8 array, convert to large_utf8"));
+          return;
+        }
+        KERNEL_ASSIGN_OR_RAISE(auto value_buffer, ctx,
+                               ctx->Allocate(output_ncodeunits_max));
+        result->value = value_buffer;
+        offset_type encoded_nbytes;
+        if (ARROW_PREDICT_FALSE(!Derived::Transform(input.value->data(), data_nbytes,
+                                                    value_buffer->mutable_data(),
+                                                    &encoded_nbytes))) {
+          ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
+          return;
+        }
+        // Trim the value buffer down to the bytes actually written.
+        KERNEL_RETURN_IF_ERROR(
+            ctx, value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true));
+      }
+      out->value = result;
+    }
+  }
+};
+
+// Upper-casing policy for Utf8Transform. Codepoints up to kMaxCodepointLookup are
+// read from lut_upper_codepoint (which must already be filled, see
+// EnsureLookupTablesFilled); larger codepoints are passed to utf8proc_toupper.
+template <typename Type>
+struct Utf8Upper : Utf8Transform<Type, Utf8Upper<Type>> {
+  inline static uint32_t TransformCodepoint(uint32_t codepoint) {
+    if (codepoint > kMaxCodepointLookup) {
+      return static_cast<uint32_t>(utf8proc_toupper(codepoint));
+    }
+    return lut_upper_codepoint[codepoint];
+  }
+};
+
+// Lower-casing policy for Utf8Transform. Codepoints up to kMaxCodepointLookup are
+// read from lut_lower_codepoint (which must already be filled, see
+// EnsureLookupTablesFilled); larger codepoints are passed to utf8proc_tolower.
+template <typename Type>
+struct Utf8Lower : Utf8Transform<Type, Utf8Lower<Type>> {
+  static uint32_t TransformCodepoint(uint32_t codepoint) {
+    if (codepoint > kMaxCodepointLookup) {
+      return static_cast<uint32_t>(utf8proc_tolower(codepoint));
+    }
+    return lut_lower_codepoint[codepoint];
+  }
+};
+
 using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;
 
 // Transform a buffer of offsets to one which begins with 0 and has same
@@ -103,16 +259,7 @@ void StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
 }
 
 void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) {
-  for (int64_t i = 0; i < length; ++i) {
-    const uint8_t utf8_code_unit = *input++;
-    // Code units in the range [a-z] can only be an encoding of an ascii
-    // character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an different
-    // codepoint. This guaranteed by non-overlap design of the unicode standard. (see
-    // section 2.5 of Unicode Standard Core Specification v13.0)
-    *output++ = ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z'))
-                    ? (utf8_code_unit - 32)
-                    : utf8_code_unit;
-  }
+  std::transform(input, input + length, output, ascii_toupper);
 }
 
 template <typename Type>
@@ -123,13 +270,7 @@ struct AsciiUpper {
 };
 
 void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output) {
-  for (int64_t i = 0; i < length; ++i) {
-    // As with TransformAsciiUpper, the same guarantee holds for the range [A-Z]
-    const uint8_t utf8_code_unit = *input++;
-    *output++ = ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z'))
-                    ? (utf8_code_unit + 32)
-                    : utf8_code_unit;
-  }
+  std::transform(input, input + length, output, ascii_tolower);
 }
 
 template <typename Type>
@@ -206,11 +347,23 @@ void MakeUnaryStringBatchKernel(std::string name, FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Registers `name` in `registry` as a unary scalar function with two kernels:
+// utf8 -> utf8 using Transformer<StringType>::Exec and large_utf8 -> large_utf8
+// using Transformer<LargeStringType>::Exec.
+template <template <typename> class Transformer>
+void MakeUnaryStringUtf8TransformKernel(std::string name, FunctionRegistry* registry) {
+  ArrayKernelExec string_exec = Transformer<StringType>::Exec;
+  ArrayKernelExec large_string_exec = Transformer<LargeStringType>::Exec;
+
+  auto function = std::make_shared<ScalarFunction>(name, Arity::Unary());
+  DCHECK_OK(function->AddKernel({utf8()}, utf8(), string_exec));
+  DCHECK_OK(function->AddKernel({large_utf8()}, large_utf8(), large_string_exec));
+  DCHECK_OK(registry->AddFunction(std::move(function)));
+}
+
 }  // namespace
 
 void RegisterScalarStringAscii(FunctionRegistry* registry) {
   MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry);
   MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry);
+  MakeUnaryStringUtf8TransformKernel<Utf8Upper>("utf8_upper", registry);
+  MakeUnaryStringUtf8TransformKernel<Utf8Lower>("utf8_lower", registry);
   AddAsciiLength(registry);
   AddStrptime(registry);
 }

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
@@ -35,8 +35,12 @@ static void UnaryStringBenchmark(benchmark::State& state, const std::string& fun
   const double null_probability = 0.01;
   random::RandomArrayGenerator rng(kSeed);
 
+  // NOTE: this produces only-Ascii data
   auto values =
       rng.String(array_length, value_min_size, value_max_size, null_probability);
+  // Make sure lookup tables are initialized before measuring
+  ABORT_NOT_OK(CallFunction(func_name, {values}));
+
   for (auto _ : state) {
     ABORT_NOT_OK(CallFunction(func_name, {values}));
   }
@@ -52,8 +56,18 @@ static void AsciiUpper(benchmark::State& state) {
   UnaryStringBenchmark(state, "ascii_upper");
 }
 
+// Benchmark of the "utf8_upper" compute function, driven by UnaryStringBenchmark.
+// Note that the input generated there is ASCII-only.
+static void Utf8Upper(benchmark::State& state) {
+  UnaryStringBenchmark(state, "utf8_upper");
+}
+
+// Benchmark of the "utf8_lower" compute function, driven by UnaryStringBenchmark.
+// Note that the input generated there is ASCII-only.
+static void Utf8Lower(benchmark::State& state) {
+  UnaryStringBenchmark(state, "utf8_lower");
+}
+
 BENCHMARK(AsciiLower);
 BENCHMARK(AsciiUpper);
+BENCHMARK(Utf8Lower);
+BENCHMARK(Utf8Upper);
 
 }  // namespace compute
 }  // namespace arrow