Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,22 @@ struct BinaryLength {
}
};

// Kernel functor that counts the UTF8 characters (codepoints) of a string value.
struct Utf8Length {
  // Returns the number of bytes in `val` that are NOT UTF8 continuation bytes.
  //
  // A continuation byte has the bit pattern 0b10xxxxxx; every encoded
  // codepoint contributes exactly one byte that does not match that pattern,
  // so counting those bytes counts codepoints. The input is not validated:
  // bytes are classified purely by their top two bits.
  template <typename OutValue, typename Arg0Value = util::string_view>
  static OutValue Call(KernelContext*, Arg0Value val) {
    auto cursor = reinterpret_cast<const uint8_t*>(val.data());

    OutValue num_chars = 0;
    for (auto remaining = val.size(); remaining > 0; --remaining, ++cursor) {
      const bool is_continuation_byte = (*cursor & 0xc0) == 0x80;
      if (!is_continuation_byte) {
        ++num_chars;
      }
    }
    return num_chars;
  }
};

#ifdef ARROW_WITH_UTF8PROC

// Direct lookup tables for unicode properties
Expand Down Expand Up @@ -1569,9 +1585,14 @@ const FunctionDoc strptime_doc(

const FunctionDoc binary_length_doc(
"Compute string lengths",
("For each string in `strings`, emit its length. Null values emit null."),
("For each string in `strings`, emit the number of bytes. Null values emit null."),
{"strings"});

// User-facing documentation attached to the "utf8_length" function:
// a one-line summary, a longer description, and the argument name list.
const FunctionDoc utf8_length_doc(
    "Compute UTF8 string lengths",
    ("For each string in `strings`, emit the number of UTF8 characters. "
     "Null values emit null."),
    {"strings"});

void AddStrptime(FunctionRegistry* registry) {
auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve),
Expand All @@ -1597,6 +1618,21 @@ void AddBinaryLength(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
}

// Builds the unary "utf8_length" scalar function and adds it to `registry`.
//
// Two kernels are attached, one per string offset width:
//   utf8       -> int32
//   large_utf8 -> int64
// Both run the Utf8Length functor through ScalarUnaryNotNull.
void AddUtf8Length(FunctionRegistry* registry) {
  ArrayKernelExec string_exec =
      applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
  ArrayKernelExec large_string_exec =
      applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;

  auto utf8_length_func =
      std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);

  DCHECK_OK(utf8_length_func->AddKernel({utf8()}, int32(), std::move(string_exec)));
  DCHECK_OK(
      utf8_length_func->AddKernel({large_utf8()}, int64(), std::move(large_string_exec)));

  DCHECK_OK(registry->AddFunction(std::move(utf8_length_func)));
}

template <template <typename> class ExecFunctor>
void MakeUnaryStringBatchKernel(
std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
Expand Down Expand Up @@ -1866,6 +1902,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {

AddSplit(registry);
AddBinaryLength(registry);
AddUtf8Length(registry);
AddMatchSubstring(registry);
AddStrptime(registry);
}
Expand Down
10 changes: 8 additions & 2 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ class TestBinaryKernels : public BaseTestStringKernels<TestType> {};
TYPED_TEST_SUITE(TestBinaryKernels, BinaryTypes);

TYPED_TEST(TestBinaryKernels, BinaryLength) {
this->CheckUnary("binary_length", R"(["aaa", null, "", "b"])", this->offset_type(),
"[3, null, 0, 1]");
this->CheckUnary("binary_length", R"(["aaa", null, "áéíóú", "", "b"])",
this->offset_type(), "[3, null, 10, 0, 1]");
}

template <typename TestType>
Expand Down Expand Up @@ -101,6 +101,12 @@ TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) {
CallFunction("utf8_upper", {scalar}, options));
}

// utf8_length must count characters rather than bytes: the input mixes plain
// ASCII with characters that need two, three and four bytes in UTF8, plus a
// null and an empty string.
TYPED_TEST(TestStringKernels, Utf8Length) {
  const char* input_json = R"(["aaa", null, "áéíóú", "ɑɽⱤoW😀", "áéí 0😀", "", "b"])";
  const char* expected_json = "[3, null, 5, 6, 6, 0, 1]";

  this->CheckUnary("utf8_length", input_json, this->offset_type(), expected_json);
}

#ifdef ARROW_WITH_UTF8PROC

TYPED_TEST(TestStringKernels, Utf8Upper) {
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/util/utf8_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ class UTF8Test : public ::testing::Test {
static std::vector<std::string> invalid_sequences_ascii;
};

std::vector<std::string> UTF8Test::valid_sequences_1 = {"a", "\x7f"};
std::vector<std::string> UTF8Test::valid_sequences_1 = {"a", "\x7f",
std::string("\0", 1)};
// Well-formed two-byte UTF8 sequences.
std::vector<std::string> UTF8Test::valid_sequences_2 = {
    "\xc2\x80",  // U+0080
    "\xc3\xbf",  // U+00FF
    "\xdf\xbf",  // U+07FF
};
std::vector<std::string> UTF8Test::valid_sequences_3 = {"\xe0\xa0\x80", "\xe8\x9d\xa5",
Expand Down
11 changes: 8 additions & 3 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -435,9 +435,11 @@ String transforms
+--------------------------+------------+-------------------------+---------------------+---------+
| binary_length | Unary | Binary- or String-like | Int32 or Int64 | \(2) |
+--------------------------+------------+-------------------------+---------------------+---------+
| utf8_lower | Unary | String-like | String-like | \(3) |
| utf8_length | Unary | String-like | Int32 or Int64 | \(3) |
+--------------------------+------------+-------------------------+---------------------+---------+
| utf8_upper | Unary | String-like | String-like | \(3) |
| utf8_lower | Unary | String-like | String-like | \(4) |
+--------------------------+------------+-------------------------+---------------------+---------+
| utf8_upper | Unary | String-like | String-like | \(4) |
+--------------------------+------------+-------------------------+---------------------+---------+


Expand All @@ -447,7 +449,10 @@ String transforms
* \(2) Output is the physical length in bytes of each input element. Output
type is Int32 for Binary / String, Int64 for LargeBinary / LargeString.

* \(3) Each UTF8-encoded character in the input is converted to lowercase or
* \(3) Output is the number of UTF8 characters (codepoints, not bytes) of each
input element. Output type is Int32 for String, Int64 for LargeString.

* \(4) Each UTF8-encoded character in the input is converted to lowercase or
uppercase.


Expand Down