Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,20 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
utf8(), kResultNullIfNull, "substr_utf8_int64",
NativeFunction::kNeedsContext),

// lpad(utf8 text, int32 length, utf8 fill): bound to the precompiled function
// "lpad_utf8_int32_utf8".
NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
kResultNullIfNull, "lpad_utf8_int32_utf8",
NativeFunction::kNeedsContext),

// lpad(utf8 text, int32 length): two-argument overload without a fill text, bound to
// the precompiled function "lpad_utf8_int32".
NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
kResultNullIfNull, "lpad_utf8_int32", NativeFunction::kNeedsContext),

// rpad(utf8 text, int32 length, utf8 fill): bound to the precompiled function
// "rpad_utf8_int32_utf8".
NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
kResultNullIfNull, "rpad_utf8_int32_utf8",
NativeFunction::kNeedsContext),

// rpad(utf8 text, int32 length): two-argument overload without a fill text, bound to
// the precompiled function "rpad_utf8_int32".
NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
kResultNullIfNull, "rpad_utf8_int32", NativeFunction::kNeedsContext),

NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(),
kResultNullIfNull, "concatOperator_utf8_utf8",
NativeFunction::kNeedsContext),
Expand Down
156 changes: 156 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,27 @@ gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
return count;
}

// Count the number of utf8 characters in `data` without ever failing on malformed
// input.
//
// - A byte that is not a valid head byte, or a glyph whose declared length would run
//   past the end of the buffer, is counted as one character of size 1.
// - Inside a glyph, every trailing byte that is not a continuation byte (10xxxxxx)
//   stretches the glyph by one byte; the stretched glyph still counts as a single
//   character.
//
// data     : pointer to the bytes to inspect (does not need to be null-terminated)
// data_len : number of bytes available in `data`; values <= 0 yield 0
// returns  : the number of characters found
FORCE_INLINE
gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) {
  int char_len = 0;
  int count = 0;
  for (int i = 0; i < data_len; i += char_len) {
    char_len = utf8_char_length(data[i]);
    if (char_len == 0 || i + char_len > data_len) {  // invalid byte or incomplete glyph
      // if invalid byte or incomplete glyph, treat it as a one-byte character
      char_len = 1;
    }
    // Stretching the glyph can push its end beyond the buffer, so the scan is bounded
    // by data_len as well: no byte at or after data + data_len is ever read.
    for (int j = 1; j < char_len && i + j < data_len; ++j) {
      if ((data[i + j] & 0xC0) != 0x80) {  // not a continuation byte
        char_len += 1;
      }
    }
    ++count;
  }
  return count;
}

// Get the byte position corresponding to a character position for a non-empty utf8
// sequence
FORCE_INLINE
Expand Down Expand Up @@ -1422,6 +1443,141 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
out_len);
}

// Left-pad `text` with `fill_text` so that the result holds `return_length` utf8
// characters.
//
// context       : execution context used for arena allocation and error reporting
// text          : input string (not null-terminated), `text_len` bytes long
// return_length : number of utf8 CHARACTERS (not bytes) wanted in the result
// fill_text     : string repeated on the left, `fill_text_len` bytes long; its last
//                 repetition is cut at a character boundary when it does not fit
// out_len       : receives the BYTE length of the returned string
//
// Returns:
//   - ""                  when text is empty or return_length <= 0
//   - text (unchanged)    when it already has return_length characters, or when
//                         padding is needed but fill_text is empty
//   - a prefix of text    when text has more than return_length characters
//   - an arena buffer     holding the padding followed by text otherwise
// On allocation failure or when the result would not fit in an int32 byte length,
// an error is set on the context and "" is returned.
FORCE_INLINE
const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length, const char* fill_text,
                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
  // if the text length or the defined return length (number of characters to return)
  // is <=0, then return an empty string.
  if (text_len <= 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters on text, ignoring invalid bytes
  int text_char_count = utf8_length_ignore_invalid(text, text_len);

  if (return_length == text_char_count ||
      (return_length > text_char_count && fill_text_len <= 0)) {
    // the text already has the requested length, or it should be padded but there is
    // nothing to pad with: return the text directly.
    *out_len = text_len;
    return text;
  }
  if (return_length < text_char_count) {
    // the text is longer than requested: truncate it at the character boundary.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  }

  // From here on return_length > text_char_count and fill_text is not empty.
  const int32_t pad_char_count = return_length - text_char_count;

  // Measure fill_text in characters. Invalid head bytes count as one-byte characters
  // and a glyph truncated by the end of fill_text is clamped to the bytes that exist,
  // so fill_text is never read past fill_text_len.
  int32_t fill_char_count = 0;
  for (int32_t pos = 0; pos < fill_text_len; ++fill_char_count) {
    int32_t char_len = utf8_char_length(fill_text[pos]);
    if (char_len == 0) char_len = 1;
    if (char_len > fill_text_len - pos) char_len = fill_text_len - pos;
    pos += char_len;
  }

  // The padding is `full_repeats` whole copies of fill_text followed by its first
  // `partial_chars` characters.
  const int32_t full_repeats = pad_char_count / fill_char_count;
  const int32_t partial_chars = pad_char_count % fill_char_count;

  // Byte size of the leading `partial_chars` characters. None of them is the last
  // character of fill_text, so their declared lengths stay within fill_text.
  int32_t partial_bytes = 0;
  for (int32_t n = 0; n < partial_chars; ++n) {
    int32_t char_len = utf8_char_length(fill_text[partial_bytes]);
    if (char_len == 0) char_len = 1;
    partial_bytes += char_len;
  }

  // The output size must be computed in BYTES: multi-byte characters make it larger
  // than return_length. Use 64-bit arithmetic to detect int32 overflow.
  const int64_t total_bytes =
      static_cast<int64_t>(full_repeats) * fill_text_len + partial_bytes + text_len;
  if (total_bytes > INT32_MAX) {
    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
    *out_len = 0;
    return "";
  }

  char* ret = reinterpret_cast<gdv_binary>(
      gdv_fn_context_arena_malloc(context, static_cast<gdv_int32>(total_bytes)));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context,
                                 "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // write the padding first ...
  char* cursor = ret;
  for (int32_t n = 0; n < full_repeats; ++n) {
    memcpy(cursor, fill_text, fill_text_len);
    cursor += fill_text_len;
  }
  memcpy(cursor, fill_text, partial_bytes);
  cursor += partial_bytes;
  // ... and the main string after it
  memcpy(cursor, text, text_len);

  *out_len = static_cast<gdv_int32>(total_bytes);
  return ret;
}

// Right-pad `text` with `fill_text` so that the result holds `return_length` utf8
// characters.
//
// context       : execution context used for arena allocation and error reporting
// text          : input string (not null-terminated), `text_len` bytes long
// return_length : number of utf8 CHARACTERS (not bytes) wanted in the result
// fill_text     : string repeated on the right, `fill_text_len` bytes long; its last
//                 repetition is cut at a character boundary when it does not fit
// out_len       : receives the BYTE length of the returned string
//
// Returns:
//   - ""                  when text is empty or return_length <= 0
//   - text (unchanged)    when it already has return_length characters, or when
//                         padding is needed but fill_text is empty
//   - a prefix of text    when text has more than return_length characters
//   - an arena buffer     holding text followed by the padding otherwise
// On allocation failure or when the result would not fit in an int32 byte length,
// an error is set on the context and "" is returned.
FORCE_INLINE
const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length, const char* fill_text,
                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
  // if the text length or the defined return length (number of characters to return)
  // is <=0, then return an empty string.
  if (text_len <= 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters on text, ignoring invalid bytes
  int text_char_count = utf8_length_ignore_invalid(text, text_len);

  if (return_length == text_char_count ||
      (return_length > text_char_count && fill_text_len <= 0)) {
    // the text already has the requested length, or it should be padded but there is
    // nothing to pad with: return the text directly.
    *out_len = text_len;
    return text;
  }
  if (return_length < text_char_count) {
    // the text is longer than requested: truncate it at the character boundary.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  }

  // From here on return_length > text_char_count and fill_text is not empty.
  const int32_t pad_char_count = return_length - text_char_count;

  // Measure fill_text in characters. Invalid head bytes count as one-byte characters
  // and a glyph truncated by the end of fill_text is clamped to the bytes that exist,
  // so fill_text is never read past fill_text_len.
  int32_t fill_char_count = 0;
  for (int32_t pos = 0; pos < fill_text_len; ++fill_char_count) {
    int32_t char_len = utf8_char_length(fill_text[pos]);
    if (char_len == 0) char_len = 1;
    if (char_len > fill_text_len - pos) char_len = fill_text_len - pos;
    pos += char_len;
  }

  // The padding is `full_repeats` whole copies of fill_text followed by its first
  // `partial_chars` characters.
  const int32_t full_repeats = pad_char_count / fill_char_count;
  const int32_t partial_chars = pad_char_count % fill_char_count;

  // Byte size of the leading `partial_chars` characters. None of them is the last
  // character of fill_text, so their declared lengths stay within fill_text.
  int32_t partial_bytes = 0;
  for (int32_t n = 0; n < partial_chars; ++n) {
    int32_t char_len = utf8_char_length(fill_text[partial_bytes]);
    if (char_len == 0) char_len = 1;
    partial_bytes += char_len;
  }

  // The output size must be computed in BYTES: multi-byte characters make it larger
  // than return_length. Use 64-bit arithmetic to detect int32 overflow.
  const int64_t total_bytes =
      static_cast<int64_t>(full_repeats) * fill_text_len + partial_bytes + text_len;
  if (total_bytes > INT32_MAX) {
    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
    *out_len = 0;
    return "";
  }

  char* ret = reinterpret_cast<gdv_binary>(
      gdv_fn_context_arena_malloc(context, static_cast<gdv_int32>(total_bytes)));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context,
                                 "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // write the main string first ...
  memcpy(ret, text, text_len);
  char* cursor = ret + text_len;
  // ... and the padding after it
  for (int32_t n = 0; n < full_repeats; ++n) {
    memcpy(cursor, fill_text, fill_text_len);
    cursor += fill_text_len;
  }
  memcpy(cursor, fill_text, partial_bytes);

  *out_len = static_cast<gdv_int32>(total_bytes);
  return ret;
}

// Left-pad `text` up to `return_length` utf8 characters using a single space as the
// fill text. Delegates all the work to lpad_utf8_int32_utf8; the byte length of the
// result is stored in `out_len`.
FORCE_INLINE
const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
                            gdv_int32 return_length, gdv_int32* out_len) {
  const char* blank = " ";
  const gdv_int32 blank_len = 1;
  const char* padded = lpad_utf8_int32_utf8(context, text, text_len, return_length,
                                            blank, blank_len, out_len);
  return padded;
}

// Right-pad `text` up to `return_length` utf8 characters using a single space as the
// fill text. Delegates all the work to rpad_utf8_int32_utf8; the byte length of the
// result is stored in `out_len`.
FORCE_INLINE
const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
                            gdv_int32 return_length, gdv_int32* out_len) {
  const char* blank = " ";
  const gdv_int32 blank_len = 1;
  const char* padded = rpad_utf8_int32_utf8(context, text, text_len, return_length,
                                            blank, blank_len, out_len);
  return padded;
}

FORCE_INLINE
const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
const char* delimiter, gdv_int32 delim_len, gdv_int32 index,
Expand Down
138 changes: 138 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,144 @@ TEST(TestStringOps, TestLtrim) {
EXPECT_FALSE(ctx.has_error());
}

// Covers lpad_utf8_int32_utf8 (explicit fill text) and lpad_utf8_int32 (single-space
// fill): truncation, exact length, empty results, whole and partial repetitions of
// the fill text, and multi-byte (Cyrillic) characters. Lengths passed to the
// functions are byte lengths for the inputs and character counts for the target.
TEST(TestStringOps, TestLpadString) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  gdv_int32 out_len = 0;
  const char* out_str;

  // LPAD function tests - with defined fill pad text
  // target shorter than the text: truncate
  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  // target equal to the text length: unchanged
  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  // empty text or non-positive target: empty result
  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  // empty fill text: nothing to pad with, text is returned as is
  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  // whole and partial repetitions of the fill text
  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "FillFillTestString");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "FillFTestString");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "FillFillFiTestString");

  // multi-byte characters: the target length counts characters, not bytes
  out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "ддабвгд");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "дhello");

  // LPAD function tests - with NO pad text
  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  // The expected padding is built with an explicit count so the number of spaces
  // (target length minus text character count) is unambiguous.
  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), std::string(8, ' ') + "TestString");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), std::string(5, ' ') + "TestString");

  out_str = lpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), std::string(2, ' ') + "абвгд");
}

// Covers rpad_utf8_int32_utf8 (explicit fill text) and rpad_utf8_int32 (single-space
// fill): truncation, exact length, empty results, whole and partial repetitions of
// the fill text, and multi-byte (Cyrillic) characters. Lengths passed to the
// functions are byte lengths for the inputs and character counts for the target.
TEST(TestStringOps, TestRpadString) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  gdv_int32 out_len = 0;
  const char* out_str;

  // RPAD function tests - with defined fill pad text
  // target shorter than the text: truncate
  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  // target equal to the text length: unchanged
  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  // empty text or non-positive target: empty result
  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  // empty fill text: nothing to pad with, text is returned as is
  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  // whole and partial repetitions of the fill text
  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFill");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillF");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFillFi");

  // multi-byte characters: the target length counts characters, not bytes
  out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгддд");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "helloд");

  // RPAD function tests - with NO pad text
  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  // The expected padding is built with an explicit count so the number of spaces
  // (target length minus text character count) is unambiguous.
  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString" + std::string(8, ' '));

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString" + std::string(5, ' '));

  out_str = rpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгд" + std::string(2, ' '));
}

TEST(TestStringOps, TestRtrim) {
gandiva::ExecutionContext ctx;
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
Expand Down
14 changes: 14 additions & 0 deletions cpp/src/gandiva/precompiled/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,20 @@ gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
gdv_int32 sub_str_len, const char* str,
gdv_int32 str_len, gdv_int32 start_pos);

// Left-pads `text` with repetitions of `fill_text` until the result holds
// `return_length` utf8 characters. When `text` has more characters than
// `return_length` it is truncated instead; an empty `text` or a `return_length` <= 0
// yields an empty string, and an empty `fill_text` leaves `text` unpadded.
// `text_len` and `fill_text_len` are byte lengths; the byte length of the returned
// string is written to `out_len`.
const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
gdv_int32 return_length, const char* fill_text,
gdv_int32 fill_text_len, gdv_int32* out_len);

// Same contract as lpad_utf8_int32_utf8, but the repetitions of `fill_text` are
// placed on the right of `text`.
const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
gdv_int32 return_length, const char* fill_text,
gdv_int32 fill_text_len, gdv_int32* out_len);

// Left-pads `text` using a single space (" ") as the fill text.
const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
gdv_int32 return_length, gdv_int32* out_len);

// Right-pads `text` using a single space (" ") as the fill text.
const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
gdv_int32 return_length, gdv_int32* out_len);

const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
gdv_int32 text_len, const char* from_str,
gdv_int32 from_str_len,
Expand Down
Loading