Skip to content

Commit 4efc0fe

Browse files
committed
Add utf8_length method that ignores invalid chars, counting each as size 1
1 parent 33a5a14 commit 4efc0fe

File tree

1 file changed

+29
-18
lines changed

1 file changed

+29
-18
lines changed

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,27 @@ gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
190190
return count;
191191
}
192192

193+
// Count the number of utf8 characters, ignoring invalid char, considering size 1
194+
FORCE_INLINE
195+
gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) {
196+
int char_len = 0;
197+
int count = 0;
198+
for (int i = 0; i < data_len; i += char_len) {
199+
char_len = utf8_char_length(data[i]);
200+
if (char_len == 0 || i + char_len > data_len) { // invalid byte or incomplete glyph
201+
// if invalid byte or incomplete glyph, ignore it
202+
char_len = 1;
203+
}
204+
for (int j = 1; j < char_len; ++j) {
205+
if ((data[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph
206+
char_len += 1;
207+
}
208+
}
209+
++count;
210+
}
211+
return count;
212+
}
213+
193214
// Get the byte position corresponding to a character position for a non-empty utf8
194215
// sequence
195216
FORCE_INLINE
@@ -1433,15 +1454,8 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
14331454
return "";
14341455
}
14351456

1436-
// initially counts the number of utf8 characters in the defined text and fill_text
1437-
int32_t text_char_count = utf8_length(context, text, text_len);
1438-
int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len);
1439-
// text_char_count is zero if input has invalid utf8 char
1440-
// fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char
1441-
if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) {
1442-
*out_len = 0;
1443-
return "";
1444-
}
1457+
// count the number of utf8 characters on text, ignoring invalid bytes
1458+
int text_char_count = utf8_length_ignore_invalid(text, text_len);
14451459

14461460
if (return_length == text_char_count ||
14471461
(return_length > text_char_count && fill_text_len == 0)) {
@@ -1477,6 +1491,8 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
14771491
break;
14781492
}
14791493
char_len = utf8_char_length(fill_text[fill_index]);
1494+
// ignore invalid char on the fill text, considering it as size 1
1495+
if (char_len == 0) char_len += 1;
14801496
copied_chars_count++;
14811497
}
14821498
memcpy(ret + copied_chars_position, fill_text, fill_index);
@@ -1500,15 +1516,8 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
15001516
return "";
15011517
}
15021518

1503-
// initially counts the number of utf8 characters in the defined text and fill_text
1504-
int32_t text_char_count = utf8_length(context, text, text_len);
1505-
int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len);
1506-
// text_char_count is zero if input has invalid utf8 char
1507-
// fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char
1508-
if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) {
1509-
*out_len = 0;
1510-
return "";
1511-
}
1519+
// count the number of utf8 characters on text, ignoring invalid bytes
1520+
int text_char_count = utf8_length_ignore_invalid(text, text_len);
15121521

15131522
if (return_length == text_char_count ||
15141523
(return_length > text_char_count && fill_text_len == 0)) {
@@ -1545,6 +1554,8 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
15451554
break;
15461555
}
15471556
char_len = utf8_char_length(fill_text[fill_length]);
1557+
// ignore invalid char on the fill text, considering it as size 1
1558+
if (char_len == 0) char_len += 1;
15481559
copied_chars_count++;
15491560
}
15501561
memcpy(ret + text_len + copied_chars_position, fill_text, fill_length);

0 commit comments

Comments
 (0)