Skip to content

Commit 2ca3236

Browse files
zhouyuanjpedroantunesZMZ91
authored
lpad (#113)
* ARROW-12567: [C++][Gandiva] Implement LPAD and RPAD functions for string input values - LPAD([string] basetext, [number] x, [optional string] padtext) - RPAD([string] basetext, [number] x, [optional string] padtext) lpad - Prepends padtext to basetext in a way that allows as many characters as possible from padtext given an output string length of x. When x is less than or equal to the length of basetext, only characters from basetext are printed in the output. If padtext is omitted then spaces are prepended. rpad - Appends padtext to basetext in a way that allows as many characters as possible from padtext given an output string length of x. When x is less than or equal to the length of basetext, only characters from basetext are printed in the output. If padtext is omitted then spaces are appended. Closes apache#10173 from jpedroantunes/feature/lpad-rpad-functions and squashes the following commits: 4efc0fe <João Pedro> Add utf8_length method that ignore invalid char considering size 1 33a5a14 <João Pedro> Fix identation on function string registry 4c4b2f4 <João Pedro> Change lpad and rpad functions signature and definition 26b90b0 <João Pedro> Correct ci lint errors on gandiva 66594a0 <João Pedro> Correct lint local errors on gandiva b6b63e9 <João Pedro> Add projector test for RPAD string function dc72148 <João Pedro> Add function registry for RPAD string function without pad text c270fb1 <João Pedro> Add base implementation and tests for RPAD functions 08d2053 <João Pedro> Add function registry for LPAD string function without pad text 585cad3 <João Pedro> Add base implementation and tests for LPAD function without pad texts considering string input values 73927fc <João Pedro> Add projector test for LPAD string function 2c929a9 <João Pedro> Add function registry for LPAD string function aecaff6 <João Pedro> Add base implementation and tests for LPAD function considering string input values Authored-by: João Pedro <joaop@simbioseventures.com> Signed-off-by: 
Praveen <praveen@dremio.com> * ARROW-13780: [Gandiva][UDF] Fix bug in udf space/rpad/lpad - add max/min return length for space/lpad/rpad udfs - correct return length Closes apache#11016 from ZMZ91/bugfix/limit_return_chars_count Authored-by: ZMZ <zmz@yanhuangdata.com> Signed-off-by: Pindikura Ravindra <ravindra@dremio.com> Signed-off-by: Yuan Zhou <yuan.zhou@intel.com> * fix concat Signed-off-by: Yuan Zhou <yuan.zhou@intel.com> Co-authored-by: João Pedro <joaop@simbioseventures.com> Co-authored-by: ZMZ <zmz@yanhuangdata.com>
1 parent 6a43921 commit 2ca3236

File tree

7 files changed

+47708
-1
lines changed

7 files changed

+47708
-1
lines changed

cpp/src/gandiva/function_registry_string.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,20 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
217217
NativeFunction::kNeedsFunctionHolder |
218218
NativeFunction::kCanReturnErrors),
219219

220+
NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
221+
kResultNullIfNull, "lpad_utf8_int32_utf8",
222+
NativeFunction::kNeedsContext),
223+
224+
NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
225+
kResultNullIfNull, "lpad_utf8_int32", NativeFunction::kNeedsContext),
226+
227+
NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
228+
kResultNullIfNull, "rpad_utf8_int32_utf8",
229+
NativeFunction::kNeedsContext),
230+
231+
NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
232+
kResultNullIfNull, "rpad_utf8_int32", NativeFunction::kNeedsContext),
233+
220234
NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(),
221235
kResultNullIfNull, "concatOperator_utf8_utf8",
222236
NativeFunction::kNeedsContext),

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 216 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,27 @@ gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
191191
return count;
192192
}
193193

194+
// Count the number of utf8 characters, ignoring invalid char, considering size 1
195+
FORCE_INLINE
196+
gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) {
197+
int char_len = 0;
198+
int count = 0;
199+
for (int i = 0; i < data_len; i += char_len) {
200+
char_len = utf8_char_length(data[i]);
201+
if (char_len == 0 || i + char_len > data_len) { // invalid byte or incomplete glyph
202+
// if invalid byte or incomplete glyph, ignore it
203+
char_len = 1;
204+
}
205+
for (int j = 1; j < char_len; ++j) {
206+
if ((data[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph
207+
char_len += 1;
208+
}
209+
}
210+
++count;
211+
}
212+
return count;
213+
}
214+
194215
// Get the byte position corresponding to a character position for a non-empty utf8
195216
// sequence
196217
FORCE_INLINE
@@ -281,6 +302,37 @@ const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
281302
return ret;
282303
}
283304

305+
// set max/min str length for space_int32, space_int64, lpad_utf8_int32_utf8
306+
// and rpad_utf8_int32_utf8 to avoid exceptions
307+
static const gdv_int32 max_str_length = 65536;
308+
static const gdv_int32 min_str_length = 0;
309+
// Returns a string of 'n' spaces.
310+
#define SPACE_STR(IN_TYPE) \
311+
GANDIVA_EXPORT \
312+
const char* space_##IN_TYPE(gdv_int64 ctx, gdv_##IN_TYPE n, int32_t* out_len) { \
313+
n = std::min(static_cast<gdv_##IN_TYPE>(max_str_length), n); \
314+
n = std::max(static_cast<gdv_##IN_TYPE>(min_str_length), n); \
315+
gdv_int32 n_times = static_cast<gdv_int32>(n); \
316+
if (n_times <= 0) { \
317+
*out_len = 0; \
318+
return ""; \
319+
} \
320+
char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(ctx, n_times)); \
321+
if (ret == nullptr) { \
322+
gdv_fn_context_set_error_msg(ctx, "Could not allocate memory for output string"); \
323+
*out_len = 0; \
324+
return ""; \
325+
} \
326+
for (int i = 0; i < n_times; i++) { \
327+
ret[i] = ' '; \
328+
} \
329+
*out_len = n_times; \
330+
return ret; \
331+
}
332+
333+
SPACE_STR(int32)
334+
SPACE_STR(int64)
335+
284336
// Reverse a utf8 sequence
285337
FORCE_INLINE
286338
const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
@@ -767,11 +819,13 @@ const char* concat_utf8_utf8_utf8(gdv_int64 context, const char* in1, gdv_int32
767819
bool in3_validity, gdv_int32* out_len) {
768820
if (!in1_validity) {
769821
in1_len = 0;
822+
in2_len = 0;
770823
}
771-
if (!in2_validity) {
824+
if (!in2_validity || (!in1_validity && !in3_validity)) {
772825
in2_len = 0;
773826
}
774827
if (!in3_validity) {
828+
in2_len = 0;
775829
in3_len = 0;
776830
}
777831
return concatOperator_utf8_utf8_utf8(context, in1, in1_len, in2, in2_len, in3, in3_len,
@@ -1424,6 +1478,167 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
14241478
out_len);
14251479
}
14261480

1481+
// Returns the number of bytes to allocate for the output of lpad/rpad: the text_len
// bytes of the base text plus the bytes of padding needed to grow the text from
// actual_text_len characters to return_length characters by cycling through fill_text.
//
// The fill text is walked one glyph per head byte, with an invalid head byte
// (utf8_char_length == 0) counted as a one-byte glyph. A glyph cut short by the end
// of fill_text is counted at its declared length, so for malformed fill text the
// result is an upper bound rather than an exact size. For well-formed fill text it is
// exact.
//
// Returns text_len when no padding is required or when the fill text is empty.
FORCE_INLINE
gdv_int32 evaluate_return_char_length(gdv_int32 text_len, gdv_int32 actual_text_len,
                                      gdv_int32 return_length, const char* fill_text,
                                      gdv_int32 fill_text_len) {
  const gdv_int32 pad_chars = return_length - actual_text_len;
  if (pad_chars <= 0 || fill_text_len <= 0) {
    // nothing to pad with; this also keeps the division below away from zero
    return text_len;
  }

  // measure one complete pass over the fill text
  gdv_int32 chars_per_pass = 0;
  gdv_int32 bytes_per_pass = 0;
  while (bytes_per_pass < fill_text_len) {
    gdv_int32 char_len = utf8_char_length(fill_text[bytes_per_pass]);
    // invalid head byte: consider it as a char of size 1 so the walk always advances
    if (char_len == 0) char_len = 1;
    bytes_per_pass += char_len;
    ++chars_per_pass;
  }

  // whole repetitions of the fill text, then the leading glyphs of one last pass
  const gdv_int32 full_passes = pad_chars / chars_per_pass;
  const gdv_int32 remaining_chars = pad_chars % chars_per_pass;
  gdv_int32 return_char_length = text_len + full_passes * bytes_per_pass;

  // remaining_chars < chars_per_pass, so fill_index stays inside fill_text here
  gdv_int32 fill_index = 0;
  for (gdv_int32 i = 0; i < remaining_chars; i++) {
    gdv_int32 char_len = utf8_char_length(fill_text[fill_index]);
    if (char_len == 0) char_len = 1;
    fill_index += char_len;
    return_char_length += char_len;
  }
  return return_char_length;
}
1498+
1499+
// Left-pads 'text' with 'fill_text' so that the result has 'return_length' utf8
// characters (invalid bytes are counted as characters of size 1).
//  - return_length is first clamped into [min_str_length, max_str_length];
//  - an empty text or a return_length <= 0 gives an empty string;
//  - if the text already has return_length characters, or padding is needed but
//    fill_text is empty, the text is returned unchanged;
//  - if the text is longer than return_length, it is truncated to its first
//    return_length characters;
//  - otherwise fill_text is repeated, and cut on a glyph boundary, in front of text.
// The padded result is allocated with gdv_fn_context_arena_malloc; if that returns
// nullptr an error message is set on the context and an empty string is returned.
// The output byte length is written to *out_len.
FORCE_INLINE
const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length, const char* fill_text,
                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
  // if the text length or the defined return length (number of characters to return)
  // is <=0, then return an empty string.
  return_length = std::min(max_str_length, return_length);
  return_length = std::max(min_str_length, return_length);
  if (text_len == 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters on text, ignoring invalid bytes
  int actual_text_len = utf8_length_ignore_invalid(text, text_len);

  if (return_length == actual_text_len ||
      (return_length > actual_text_len && fill_text_len <= 0)) {
    // case where the return length is same as the text's length, or if it need to
    // fill into text but "fill_text" is empty, then return text directly.
    *out_len = text_len;
    return text;
  } else if (return_length < actual_text_len) {
    // case where it truncates the result on return length.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  } else {
    // case (return_length > actual_text_len)
    // case where it needs to copy "fill_text" on the string left. The total number
    // of chars to copy is given by (return_length - actual_text_len)
    gdv_int32 return_char_length = evaluate_return_char_length(
        text_len, actual_text_len, return_length, fill_text, fill_text_len);
    char* ret = reinterpret_cast<gdv_binary>(
        gdv_fn_context_arena_malloc(context, return_char_length));
    if (ret == nullptr) {
      gdv_fn_context_set_error_msg(context,
                                   "Could not allocate memory for output string");
      *out_len = 0;
      return "";
    }
    const int32_t pad_chars = return_length - actual_text_len;
    // bytes of the buffer left for padding once the text itself is accounted for
    const int32_t pad_capacity = return_char_length - text_len;
    // try to fulfill the return string with the "fill_text" continuously
    int32_t copied_chars_count = 0;
    int32_t copied_chars_position = 0;
    bool buffer_full = false;
    while (!buffer_full && copied_chars_count < pad_chars) {
      // for each char, evaluate its length to consider it when mem copying
      int32_t fill_index = 0;
      while (fill_index < fill_text_len && copied_chars_count < pad_chars) {
        int32_t char_len = utf8_char_length(fill_text[fill_index]);
        // ignore invalid char on the fill text, considering it as size 1
        if (char_len == 0) char_len = 1;
        // a glyph cut short by the end of the fill text only has the bytes that are
        // really there; never read past fill_text_len
        if (char_len > fill_text_len - fill_index) {
          char_len = fill_text_len - fill_index;
        }
        // never write past the allocation: stop on a glyph boundary instead
        if (copied_chars_position + fill_index + char_len > pad_capacity) {
          buffer_full = true;
          break;
        }
        fill_index += char_len;
        copied_chars_count++;
      }
      memcpy(ret + copied_chars_position, fill_text, fill_index);
      copied_chars_position += fill_index;
    }
    // after fulfilling the text, copy the main string
    memcpy(ret + copied_chars_position, text, text_len);
    *out_len = copied_chars_position + text_len;
    return ret;
  }
}
1564+
1565+
// Right-pads 'text' with 'fill_text' so that the result has 'return_length' utf8
// characters (invalid bytes are counted as characters of size 1).
//  - return_length is first clamped into [min_str_length, max_str_length];
//  - an empty text or a return_length <= 0 gives an empty string;
//  - if the text already has return_length characters, or padding is needed but
//    fill_text is empty, the text is returned unchanged;
//  - if the text is longer than return_length, it is truncated to its first
//    return_length characters;
//  - otherwise fill_text is repeated, and cut on a glyph boundary, after text.
// The padded result is allocated with gdv_fn_context_arena_malloc; if that returns
// nullptr an error message is set on the context and an empty string is returned.
// The output byte length is written to *out_len.
FORCE_INLINE
const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
                                 gdv_int32 return_length, const char* fill_text,
                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
  // if the text length or the defined return length (number of characters to return)
  // is <=0, then return an empty string.
  return_length = std::min(max_str_length, return_length);
  return_length = std::max(min_str_length, return_length);
  if (text_len == 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters on text, ignoring invalid bytes
  int actual_text_len = utf8_length_ignore_invalid(text, text_len);

  if (return_length == actual_text_len ||
      (return_length > actual_text_len && fill_text_len <= 0)) {
    // case where the return length is same as the text's length, or if it need to
    // fill into text but "fill_text" is empty, then return text directly.
    *out_len = text_len;
    return text;
  } else if (return_length < actual_text_len) {
    // case where it truncates the result on return length.
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  } else {
    // case (return_length > actual_text_len)
    // case where it needs to copy "fill_text" on the string right
    gdv_int32 return_char_length = evaluate_return_char_length(
        text_len, actual_text_len, return_length, fill_text, fill_text_len);
    char* ret = reinterpret_cast<gdv_binary>(
        gdv_fn_context_arena_malloc(context, return_char_length));
    if (ret == nullptr) {
      gdv_fn_context_set_error_msg(context,
                                   "Could not allocate memory for output string");
      *out_len = 0;
      return "";
    }
    // fulfill the initial text copying the main input string
    memcpy(ret, text, text_len);
    const int32_t pad_chars = return_length - actual_text_len;
    // bytes of the buffer left for padding once the text itself is accounted for
    const int32_t pad_capacity = return_char_length - text_len;
    // try to fulfill the return string with the "fill_text" continuously
    int32_t copied_chars_count = 0;
    int32_t copied_chars_position = 0;
    bool buffer_full = false;
    while (!buffer_full && copied_chars_count < pad_chars) {
      // for each char, evaluate its length to consider it when mem copying
      int32_t fill_length = 0;
      while (fill_length < fill_text_len && copied_chars_count < pad_chars) {
        int32_t char_len = utf8_char_length(fill_text[fill_length]);
        // ignore invalid char on the fill text, considering it as size 1
        if (char_len == 0) char_len = 1;
        // a glyph cut short by the end of the fill text only has the bytes that are
        // really there; never read past fill_text_len
        if (char_len > fill_text_len - fill_length) {
          char_len = fill_text_len - fill_length;
        }
        // never write past the allocation: stop on a glyph boundary instead
        if (copied_chars_position + fill_length + char_len > pad_capacity) {
          buffer_full = true;
          break;
        }
        fill_length += char_len;
        copied_chars_count++;
      }
      memcpy(ret + text_len + copied_chars_position, fill_text, fill_length);
      copied_chars_position += fill_length;
    }
    *out_len = copied_chars_position + text_len;
    return ret;
  }
}
1629+
1630+
// lpad without an explicit pad text: delegates to lpad_utf8_int32_utf8 using a single
// space as the fill text.
FORCE_INLINE
const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
                            gdv_int32 return_length, gdv_int32* out_len) {
  const char* const single_space = " ";
  const gdv_int32 single_space_len = 1;
  return lpad_utf8_int32_utf8(context, text, text_len, return_length, single_space,
                              single_space_len, out_len);
}
1635+
1636+
// rpad without an explicit pad text: delegates to rpad_utf8_int32_utf8 using a single
// space as the fill text.
FORCE_INLINE
const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
                            gdv_int32 return_length, gdv_int32* out_len) {
  const char* const single_space = " ";
  const gdv_int32 single_space_len = 1;
  return rpad_utf8_int32_utf8(context, text, text_len, return_length, single_space,
                              single_space_len, out_len);
}
1641+
14271642
FORCE_INLINE
14281643
const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
14291644
const char* delimiter, gdv_int32 delim_len, gdv_int32 index,

0 commit comments

Comments
 (0)