Skip to content

Commit aecaff6

Browse files
committed
Add base implementation and tests for LPAD function considering string input values
1 parent 4363fef commit aecaff6

File tree

3 files changed

+114
-0
lines changed

3 files changed

+114
-0
lines changed

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1422,6 +1422,73 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
14221422
out_len);
14231423
}
14241424

1425+
// Left-pads `text` with repetitions of `fill_text` until the result holds
// `return_length` UTF-8 characters.
//
// context        opaque execution-context handle, forwarded to the arena allocator
//                and to the error-reporting helpers
// text           input string (not required to be NUL-terminated)
// text_len       length of `text` in bytes
// return_length  desired length of the result, in UTF-8 characters (not bytes)
// fill_text      string used as padding, repeated and cut on a character boundary
// fill_text_len  length of `fill_text` in bytes
// out_len        receives the length of the returned string in bytes
//
// Returns:
//   * ""            when `text` is empty, `return_length` <= 0, either input fails
//                   the utf8_length check, or the output cannot be allocated
//   * `text` itself when it already has `return_length` characters, when there is
//                   nothing to pad with, or (with a shorter `*out_len`) when it
//                   has to be truncated to `return_length` characters
//   * a new arena-allocated buffer holding padding followed by `text` otherwise
FORCE_INLINE
const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
                 gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len,
                 gdv_int32* out_len) {
  // if the text length or the defined return length (number of characters to return)
  // is <=0, then return an empty string.
  if (text_len <= 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters in the defined text and fill_text
  const int32_t text_char_count = utf8_length(context, text, text_len);
  const int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len);
  // text_char_count is zero if input has invalid utf8 char
  // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char
  if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) {
    *out_len = 0;
    return "";
  }

  // A non-positive fill length means there is nothing to pad with. Checking `<= 0`
  // (rather than `== 0`) keeps a negative length out of the padding arithmetic below.
  const bool fill_is_empty = fill_text_len <= 0;

  if (return_length == text_char_count ||
      (return_length > text_char_count && fill_is_empty)) {
    // the text already has the requested length, or it should be padded but there
    // is nothing to pad with: return the text unchanged.
    *out_len = text_len;
    return text;
  }

  if (return_length < text_char_count) {
    // the text is longer than requested: truncate it on a character boundary.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  }

  // From here on return_length > text_char_count and fill_char_count > 0.
  // The padding consists of `full_repeats` whole copies of fill_text followed by
  // the first `partial_chars` characters of fill_text.
  const int32_t pad_char_count = return_length - text_char_count;
  const int32_t full_repeats = pad_char_count / fill_char_count;
  const int32_t partial_chars = pad_char_count % fill_char_count;

  // Byte size of the leading `partial_chars` characters of fill_text. The walk stays
  // inside fill_text because partial_chars < fill_char_count and fill_text was
  // accepted by utf8_length above.
  int32_t partial_bytes = 0;
  for (int32_t i = 0; i < partial_chars; ++i) {
    partial_bytes += utf8_char_length(fill_text[partial_bytes]);
  }

  // return_length counts characters, and a UTF-8 character may occupy several bytes,
  // so the buffer has to be sized from the real byte counts. 64-bit arithmetic keeps
  // the intermediate product from overflowing.
  const int64_t total_bytes =
      static_cast<int64_t>(full_repeats) * fill_text_len + partial_bytes + text_len;
  if (total_bytes > INT32_MAX) {
    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
    *out_len = 0;
    return "";
  }

  char* ret = reinterpret_cast<gdv_binary>(
      gdv_fn_context_arena_malloc(context, static_cast<gdv_int32>(total_bytes)));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context,
                                 "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // write the padding first ...
  char* cursor = ret;
  for (int32_t i = 0; i < full_repeats; ++i) {
    memcpy(cursor, fill_text, fill_text_len);
    cursor += fill_text_len;
  }
  memcpy(cursor, fill_text, partial_bytes);
  cursor += partial_bytes;

  // ... and then the original text
  memcpy(cursor, text, text_len);

  *out_len = static_cast<gdv_int32>(total_bytes);
  return ret;
}
1491+
14251492
FORCE_INLINE
14261493
const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
14271494
const char* delimiter, gdv_int32 delim_len, gdv_int32 index,

cpp/src/gandiva/precompiled/string_ops_test.cc

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,49 @@ TEST(TestStringOps, TestLtrim) {
696696
EXPECT_FALSE(ctx.has_error());
697697
}
698698

699+
// Exercises lpad with plain string inputs: truncation, exact-length pass-through,
// empty/zero/negative lengths, empty fill text, whole and partial fill repetitions,
// and multi-byte (Cyrillic) text and fill strings.
TEST(TestStringOps, TestLpadString) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);

  // Calls lpad and materialises the (pointer, byte length) result as a std::string
  // so that every case below is a single comparison.
  auto padded = [ctx_ptr](const char* text, gdv_int32 text_len, gdv_int32 return_length,
                          const char* fill_text, gdv_int32 fill_text_len) {
    gdv_int32 result_len = 0;
    const char* result = lpad(ctx_ptr, text, text_len, return_length, fill_text,
                              fill_text_len, &result_len);
    return std::string(result, result_len);
  };

  // requested length shorter than / equal to the text
  EXPECT_EQ(padded("TestString", 10, 4, "fill", 4), "Test");
  EXPECT_EQ(padded("TestString", 10, 10, "fill", 4), "TestString");

  // empty text, zero and negative requested length
  EXPECT_EQ(padded("TestString", 0, 10, "fill", 4), "");
  EXPECT_EQ(padded("TestString", 10, 0, "fill", 4), "");
  EXPECT_EQ(padded("TestString", 10, -500, "fill", 4), "");

  // nothing to pad with
  EXPECT_EQ(padded("TestString", 10, 500, "", 0), "TestString");

  // whole and partial repetitions of the fill text
  EXPECT_EQ(padded("TestString", 10, 18, "Fill", 4), "FillFillTestString");
  EXPECT_EQ(padded("TestString", 10, 15, "Fill", 4), "FillFTestString");
  EXPECT_EQ(padded("TestString", 10, 20, "Fill", 4), "FillFillFiTestString");

  // multi-byte characters in the text and/or the fill text
  EXPECT_EQ(padded("абвгд", 10, 7, "д", 2), "ддабвгд");
  EXPECT_EQ(padded("абвгд", 10, 20, "абвгд", 10), "абвгдабвгдабвгдабвгд");
  EXPECT_EQ(padded("hello", 5, 6, "д", 2), "дhello");
}
741+
699742
TEST(TestStringOps, TestRtrim) {
700743
gandiva::ExecutionContext ctx;
701744
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);

cpp/src/gandiva/precompiled/types.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,10 @@ gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
407407
gdv_int32 sub_str_len, const char* str,
408408
gdv_int32 str_len, gdv_int32 start_pos);
409409

410+
// Left-pads `text` with repetitions of `fill_text` so that the result holds
// `return_length` UTF-8 characters; a longer `text` is truncated to that many
// characters instead. `text_len` and `fill_text_len` are byte lengths. The byte
// length of the returned string is written to `*out_len`; an empty string is
// returned when `text` is empty or `return_length` is not positive.
const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
411+
gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len,
412+
gdv_int32* out_len);
413+
410414
const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
411415
gdv_int32 text_len, const char* from_str,
412416
gdv_int32 from_str_len,

0 commit comments

Comments
 (0)