Skip to content

Commit 0477cfc

Browse files
jpedroantunes authored and praveenbingo committed
ARROW-12567: [C++][Gandiva] Implement LPAD and RPAD functions for string input values
#### Implement LPAD and RPAD functions for string input values.

- LPAD([string] basetext, [number] x, [optional string] padtext)
- RPAD([string] basetext, [number] x, [optional string] padtext)

#### Description

lpad - Prepends padtext to basetext in a way that allows as many characters as possible from padtext given an output string length of x. When x is less than or equal to the length of basetext, only characters from basetext are printed in the output. If padtext is omitted then spaces are prepended.

rpad - Appends padtext to basetext in a way that allows as many characters as possible from padtext given an output string length of x. When x is less than or equal to the length of basetext, only characters from basetext are printed in the output. If padtext is omitted then spaces are appended.

Closes apache#10173 from jpedroantunes/feature/lpad-rpad-functions and squashes the following commits:

4efc0fe <João Pedro> Add utf8_length method that ignore invalid char considering size 1
33a5a14 <João Pedro> Fix identation on function string registry
4c4b2f4 <João Pedro> Change lpad and rpad functions signature and definition
26b90b0 <João Pedro> Correct ci lint errors on gandiva
66594a0 <João Pedro> Correct lint local errors on gandiva
b6b63e9 <João Pedro> Add projector test for RPAD string function
dc72148 <João Pedro> Add function registry for RPAD string function without pad text
c270fb1 <João Pedro> Add base implementation and tests for RPAD functions
08d2053 <João Pedro> Add function registry for LPAD string function without pad text
585cad3 <João Pedro> Add base implementation and tests for LPAD function without pad texts considering string input values
73927fc <João Pedro> Add projector test for LPAD string function
2c929a9 <João Pedro> Add function registry for LPAD string function
aecaff6 <João Pedro> Add base implementation and tests for LPAD function considering string input values

Authored-by: João Pedro <joaop@simbioseventures.com>
Signed-off-by: Praveen <praveen@dremio.com>
1 parent 3791510 commit 0477cfc

File tree

5 files changed

+406
-0
lines changed

5 files changed

+406
-0
lines changed

cpp/src/gandiva/function_registry_string.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,20 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
146146
utf8(), kResultNullIfNull, "substr_utf8_int64",
147147
NativeFunction::kNeedsContext),
148148

149+
NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
150+
kResultNullIfNull, "lpad_utf8_int32_utf8",
151+
NativeFunction::kNeedsContext),
152+
153+
NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
154+
kResultNullIfNull, "lpad_utf8_int32", NativeFunction::kNeedsContext),
155+
156+
NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
157+
kResultNullIfNull, "rpad_utf8_int32_utf8",
158+
NativeFunction::kNeedsContext),
159+
160+
NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
161+
kResultNullIfNull, "rpad_utf8_int32", NativeFunction::kNeedsContext),
162+
149163
NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(),
150164
kResultNullIfNull, "concatOperator_utf8_utf8",
151165
NativeFunction::kNeedsContext),

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,27 @@ gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
192192
return count;
193193
}
194194

195+
// Count the number of utf8 characters, ignoring invalid char, considering size 1
196+
FORCE_INLINE
197+
gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) {
198+
int char_len = 0;
199+
int count = 0;
200+
for (int i = 0; i < data_len; i += char_len) {
201+
char_len = utf8_char_length(data[i]);
202+
if (char_len == 0 || i + char_len > data_len) { // invalid byte or incomplete glyph
203+
// if invalid byte or incomplete glyph, ignore it
204+
char_len = 1;
205+
}
206+
for (int j = 1; j < char_len; ++j) {
207+
if ((data[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph
208+
char_len += 1;
209+
}
210+
}
211+
++count;
212+
}
213+
return count;
214+
}
215+
195216
// Get the byte position corresponding to a character position for a non-empty utf8
196217
// sequence
197218
FORCE_INLINE
@@ -1580,6 +1601,141 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
15801601
out_len);
15811602
}
15821603

1604+
// Left-pad `text` with `fill_text` so that the result has `return_length` utf8
// characters.
//
//  - text_len <= 0 or return_length <= 0: returns "" with *out_len = 0.
//  - text already has return_length characters, or padding is needed but fill_text is
//    empty: returns `text` itself with *out_len = text_len.
//  - text has more than return_length characters: returns `text` with *out_len set to
//    the byte position utf8_byte_pos() reports for return_length.
//  - otherwise: returns a buffer obtained from gdv_fn_context_arena_malloc() holding
//    fill_text repeated (the last repetition possibly cut at a character boundary)
//    followed by text. If the result cannot be sized or allocated, an error message
//    is set on the context and "" is returned with *out_len = 0.
//
// Bytes of fill_text for which utf8_char_length() returns 0, and glyphs that would
// run past the end of fill_text, are treated as one-byte characters.
FORCE_INLINE
const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length, const char* fill_text,
                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
  // nothing to pad, or nothing may be returned
  if (text_len <= 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters on text, ignoring invalid bytes
  int text_char_count = utf8_length_ignore_invalid(text, text_len);

  if (return_length == text_char_count ||
      (return_length > text_char_count && fill_text_len <= 0)) {
    // the text already has the requested length, or it needs padding but there is
    // nothing to pad with: return the text directly.
    *out_len = text_len;
    return text;
  }
  if (return_length < text_char_count) {
    // truncate the text to the requested number of characters.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  }

  // From here on return_length > text_char_count and fill_text is not empty.
  // Measure fill_text in characters so the padding can be sized in bytes up front.
  int32_t fill_char_count = 0;
  int32_t fill_pos = 0;
  while (fill_pos < fill_text_len) {
    int32_t char_len = utf8_char_length(fill_text[fill_pos]);
    // invalid byte or glyph cut off by the end of fill_text: one-byte character
    if (char_len == 0 || fill_pos + char_len > fill_text_len) char_len = 1;
    fill_pos += char_len;
    ++fill_char_count;
  }

  // The padding is `full_repeats` whole copies of fill_text followed by its first
  // `partial_chars` characters.
  const int32_t pad_chars = return_length - text_char_count;
  const int32_t full_repeats = pad_chars / fill_char_count;
  const int32_t partial_chars = pad_chars % fill_char_count;

  int32_t partial_bytes = 0;
  for (int32_t n = 0; n < partial_chars; ++n) {
    int32_t char_len = utf8_char_length(fill_text[partial_bytes]);
    if (char_len == 0 || partial_bytes + char_len > fill_text_len) char_len = 1;
    partial_bytes += char_len;
  }

  // return_length counts characters, not bytes: size the buffer from the real byte
  // lengths (in 64 bits, so the arithmetic itself cannot overflow).
  const int64_t pad_bytes =
      static_cast<int64_t>(full_repeats) * fill_text_len + partial_bytes;
  const int64_t total_bytes = pad_bytes + text_len;
  // largest length representable in *out_len, assuming gdv_int32 is a 32-bit signed
  // integer -- TODO confirm against types.h
  const int64_t max_out_bytes = 0x7FFFFFFF;
  if (total_bytes > max_out_bytes) {
    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
    *out_len = 0;
    return "";
  }

  char* ret = reinterpret_cast<gdv_binary>(
      gdv_fn_context_arena_malloc(context, static_cast<gdv_int32>(total_bytes)));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // write the padding first, then the main string
  char* cursor = ret;
  for (int32_t rep = 0; rep < full_repeats; ++rep) {
    memcpy(cursor, fill_text, fill_text_len);
    cursor += fill_text_len;
  }
  memcpy(cursor, fill_text, partial_bytes);
  cursor += partial_bytes;
  memcpy(cursor, text, text_len);

  *out_len = static_cast<gdv_int32>(total_bytes);
  return ret;
}
1665+
1666+
// Right-pad `text` with `fill_text` so that the result has `return_length` utf8
// characters.
//
//  - text_len <= 0 or return_length <= 0: returns "" with *out_len = 0.
//  - text already has return_length characters, or padding is needed but fill_text is
//    empty: returns `text` itself with *out_len = text_len.
//  - text has more than return_length characters: returns `text` with *out_len set to
//    the byte position utf8_byte_pos() reports for return_length.
//  - otherwise: returns a buffer obtained from gdv_fn_context_arena_malloc() holding
//    text followed by fill_text repeated (the last repetition possibly cut at a
//    character boundary). If the result cannot be sized or allocated, an error
//    message is set on the context and "" is returned with *out_len = 0.
//
// Bytes of fill_text for which utf8_char_length() returns 0, and glyphs that would
// run past the end of fill_text, are treated as one-byte characters.
FORCE_INLINE
const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length, const char* fill_text,
                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
  // nothing to pad, or nothing may be returned
  if (text_len <= 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters on text, ignoring invalid bytes
  int text_char_count = utf8_length_ignore_invalid(text, text_len);

  if (return_length == text_char_count ||
      (return_length > text_char_count && fill_text_len <= 0)) {
    // the text already has the requested length, or it needs padding but there is
    // nothing to pad with: return the text directly.
    *out_len = text_len;
    return text;
  }
  if (return_length < text_char_count) {
    // truncate the text to the requested number of characters.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  }

  // From here on return_length > text_char_count and fill_text is not empty.
  // Measure fill_text in characters so the padding can be sized in bytes up front.
  int32_t fill_char_count = 0;
  int32_t fill_pos = 0;
  while (fill_pos < fill_text_len) {
    int32_t char_len = utf8_char_length(fill_text[fill_pos]);
    // invalid byte or glyph cut off by the end of fill_text: one-byte character
    if (char_len == 0 || fill_pos + char_len > fill_text_len) char_len = 1;
    fill_pos += char_len;
    ++fill_char_count;
  }

  // The padding is `full_repeats` whole copies of fill_text followed by its first
  // `partial_chars` characters.
  const int32_t pad_chars = return_length - text_char_count;
  const int32_t full_repeats = pad_chars / fill_char_count;
  const int32_t partial_chars = pad_chars % fill_char_count;

  int32_t partial_bytes = 0;
  for (int32_t n = 0; n < partial_chars; ++n) {
    int32_t char_len = utf8_char_length(fill_text[partial_bytes]);
    if (char_len == 0 || partial_bytes + char_len > fill_text_len) char_len = 1;
    partial_bytes += char_len;
  }

  // return_length counts characters, not bytes: size the buffer from the real byte
  // lengths (in 64 bits, so the arithmetic itself cannot overflow).
  const int64_t pad_bytes =
      static_cast<int64_t>(full_repeats) * fill_text_len + partial_bytes;
  const int64_t total_bytes = pad_bytes + text_len;
  // largest length representable in *out_len, assuming gdv_int32 is a 32-bit signed
  // integer -- TODO confirm against types.h
  const int64_t max_out_bytes = 0x7FFFFFFF;
  if (total_bytes > max_out_bytes) {
    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
    *out_len = 0;
    return "";
  }

  char* ret = reinterpret_cast<gdv_binary>(
      gdv_fn_context_arena_malloc(context, static_cast<gdv_int32>(total_bytes)));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // write the main string first, then the padding
  memcpy(ret, text, text_len);
  char* cursor = ret + text_len;
  for (int32_t rep = 0; rep < full_repeats; ++rep) {
    memcpy(cursor, fill_text, fill_text_len);
    cursor += fill_text_len;
  }
  memcpy(cursor, fill_text, partial_bytes);

  *out_len = static_cast<gdv_int32>(total_bytes);
  return ret;
}
1726+
1727+
// LPAD without an explicit pad text: forwards to lpad_utf8_int32_utf8 using a single
// space character as the fill text.
FORCE_INLINE
const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
                            gdv_int32 return_length, gdv_int32* out_len) {
  const char* const default_fill = " ";
  const gdv_int32 default_fill_len = 1;
  return lpad_utf8_int32_utf8(context, text, text_len, return_length, default_fill,
                              default_fill_len, out_len);
}
1732+
1733+
// RPAD without an explicit pad text: forwards to rpad_utf8_int32_utf8 using a single
// space character as the fill text.
FORCE_INLINE
const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
                            gdv_int32 return_length, gdv_int32* out_len) {
  const char* const default_fill = " ";
  const gdv_int32 default_fill_len = 1;
  return rpad_utf8_int32_utf8(context, text, text_len, return_length, default_fill,
                              default_fill_len, out_len);
}
1738+
15831739
FORCE_INLINE
15841740
const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
15851741
const char* delimiter, gdv_int32 delim_len, gdv_int32 index,

cpp/src/gandiva/precompiled/string_ops_test.cc

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,144 @@ TEST(TestStringOps, TestLtrim) {
801801
EXPECT_FALSE(ctx.has_error());
802802
}
803803

804+
// Exercises lpad_utf8_int32_utf8 (explicit pad text) and lpad_utf8_int32 (pad with
// spaces): truncation, exact length, empty/zero/negative lengths, empty pad text,
// whole and partial repetitions of the pad text, and multi-byte utf8 input.
TEST(TestStringOps, TestLpadString) {
  gandiva::ExecutionContext ctx;
  gdv_int64 ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  gdv_int32 out_len = 0;
  const char* out_str;

  // LPAD function tests - with defined fill pad text
  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "FillFillTestString");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "FillFTestString");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "FillFillFiTestString");

  // multi-byte characters: lengths passed in are bytes, return_length is characters
  out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "ддабвгд");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");

  out_str = lpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "дhello");

  // LPAD function tests - with NO pad text
  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  // The expected space padding is built with an explicit count so the number of
  // spaces stays unambiguous: 18 - 10, 15 - 10 and 7 - 5 characters respectively.
  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), std::string(8, ' ') + "TestString");

  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), std::string(5, ' ') + "TestString");

  out_str = lpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), std::string(2, ' ') + "абвгд");
}
872+
873+
// Exercises rpad_utf8_int32_utf8 (explicit pad text) and rpad_utf8_int32 (pad with
// spaces): truncation, exact length, empty/zero/negative lengths, empty pad text,
// whole and partial repetitions of the pad text, and multi-byte utf8 input.
TEST(TestStringOps, TestRpadString) {
  gandiva::ExecutionContext ctx;
  gdv_int64 ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  gdv_int32 out_len = 0;
  const char* out_str;

  // RPAD function tests - with defined fill pad text
  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFill");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillF");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFillFi");

  // multi-byte characters: lengths passed in are bytes, return_length is characters
  out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгддд");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");

  out_str = rpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "helloд");

  // RPAD function tests - with NO pad text
  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  // The expected space padding is built with an explicit count so the number of
  // spaces stays unambiguous: 18 - 10, 15 - 10 and 7 - 5 characters respectively.
  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString" + std::string(8, ' '));

  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString" + std::string(5, ' '));

  out_str = rpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгд" + std::string(2, ' '));
}
941+
804942
TEST(TestStringOps, TestRtrim) {
805943
gandiva::ExecutionContext ctx;
806944
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);

cpp/src/gandiva/precompiled/types.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,20 @@ gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
419419
gdv_int32 sub_str_len, const char* str,
420420
gdv_int32 str_len, gdv_int32 start_pos);
421421

422+
// LPAD with an explicit pad text: pads `text` on the left with `fill_text` up to
// `return_length` characters; the byte length of the result is stored in *out_len.
const char* lpad_utf8_int32_utf8(gdv_int64 context,
                                 const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length,
                                 const char* fill_text, gdv_int32 fill_text_len,
                                 gdv_int32* out_len);

// RPAD with an explicit pad text: pads `text` on the right with `fill_text` up to
// `return_length` characters; the byte length of the result is stored in *out_len.
const char* rpad_utf8_int32_utf8(gdv_int64 context,
                                 const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length,
                                 const char* fill_text, gdv_int32 fill_text_len,
                                 gdv_int32* out_len);

// LPAD without a pad text argument.
const char* lpad_utf8_int32(gdv_int64 context,
                            const char* text, gdv_int32 text_len,
                            gdv_int32 return_length,
                            gdv_int32* out_len);

// RPAD without a pad text argument.
const char* rpad_utf8_int32(gdv_int64 context,
                            const char* text, gdv_int32 text_len,
                            gdv_int32 return_length,
                            gdv_int32* out_len);
435+
422436
const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
423437
gdv_int32 text_len, const char* from_str,
424438
gdv_int32 from_str_len,

0 commit comments

Comments
 (0)