@@ -190,6 +190,27 @@ gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
190190 return count;
191191}
192192
193+ // Count the number of utf8 characters, ignoring invalid char, considering size 1
194+ FORCE_INLINE
195+ gdv_int32 utf8_length_ignore_invalid (const char * data, gdv_int32 data_len) {
196+ int char_len = 0 ;
197+ int count = 0 ;
198+ for (int i = 0 ; i < data_len; i += char_len) {
199+ char_len = utf8_char_length (data[i]);
200+ if (char_len == 0 || i + char_len > data_len) { // invalid byte or incomplete glyph
201+ // if invalid byte or incomplete glyph, ignore it
202+ char_len = 1 ;
203+ }
204+ for (int j = 1 ; j < char_len; ++j) {
205+ if ((data[i + j] & 0xC0 ) != 0x80 ) { // bytes following head-byte of glyph
206+ char_len += 1 ;
207+ }
208+ }
209+ ++count;
210+ }
211+ return count;
212+ }
213+
193214// Get the byte position corresponding to a character position for a non-empty utf8
194215// sequence
195216FORCE_INLINE
@@ -1433,15 +1454,8 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
14331454 return " " ;
14341455 }
14351456
1436- // initially counts the number of utf8 characters in the defined text and fill_text
1437- int32_t text_char_count = utf8_length (context, text, text_len);
1438- int32_t fill_char_count = utf8_length (context, fill_text, fill_text_len);
1439- // text_char_count is zero if input has invalid utf8 char
1440- // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char
1441- if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0 )) {
1442- *out_len = 0 ;
1443- return " " ;
1444- }
1457+ // count the number of utf8 characters on text, ignoring invalid bytes
1458+ int text_char_count = utf8_length_ignore_invalid (text, text_len);
14451459
14461460 if (return_length == text_char_count ||
14471461 (return_length > text_char_count && fill_text_len == 0 )) {
@@ -1477,6 +1491,8 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
14771491 break ;
14781492 }
14791493 char_len = utf8_char_length (fill_text[fill_index]);
1494+ // ignore invalid char on the fill text, considering it as size 1
1495+ if (char_len == 0 ) char_len += 1 ;
14801496 copied_chars_count++;
14811497 }
14821498 memcpy (ret + copied_chars_position, fill_text, fill_index);
@@ -1500,15 +1516,8 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
15001516 return " " ;
15011517 }
15021518
1503- // initially counts the number of utf8 characters in the defined text and fill_text
1504- int32_t text_char_count = utf8_length (context, text, text_len);
1505- int32_t fill_char_count = utf8_length (context, fill_text, fill_text_len);
1506- // text_char_count is zero if input has invalid utf8 char
1507- // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char
1508- if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0 )) {
1509- *out_len = 0 ;
1510- return " " ;
1511- }
1519+ // count the number of utf8 characters on text, ignoring invalid bytes
1520+ int text_char_count = utf8_length_ignore_invalid (text, text_len);
15121521
15131522 if (return_length == text_char_count ||
15141523 (return_length > text_char_count && fill_text_len == 0 )) {
@@ -1545,6 +1554,8 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32
15451554 break ;
15461555 }
15471556 char_len = utf8_char_length (fill_text[fill_length]);
1557+ // ignore invalid char on the fill text, considering it as size 1
1558+ if (char_len == 0 ) char_len += 1 ;
15481559 copied_chars_count++;
15491560 }
15501561 memcpy (ret + text_len + copied_chars_position, fill_text, fill_length);
0 commit comments