Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions UPGRADING
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ PHP 8.4 UPGRADE NOTES
5. Changed Functions
========================================

- MBString:
. The behavior of mb_strcut is more consistent now on invalid UTF-8 and UTF-16
strings. (For valid UTF-8 and UTF-16 strings, there is no change.)

- PGSQL:
. pg_select, the conditions arguments accepts an empty array and is optional.

Expand Down Expand Up @@ -168,3 +172,5 @@ PHP 8.4 UPGRADE NOTES

* The performance of strspn() is greatly improved. It now runs in linear time
instead of being bounded by quadratic time.

* mb_strcut() is much faster now for UTF-8 and UTF-16 strings.
95 changes: 92 additions & 3 deletions ext/mbstring/libmbfl/filters/mbfilter_utf16.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf

static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end);
static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end);
static zend_string* mb_cut_utf16le(unsigned char *str, size_t from, size_t len, unsigned char *end);

static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};

Expand All @@ -190,7 +193,7 @@ const mbfl_encoding mbfl_encoding_utf16 = {
mb_utf16_to_wchar,
mb_wchar_to_utf16be,
NULL,
NULL,
mb_cut_utf16
};

const mbfl_encoding mbfl_encoding_utf16be = {
Expand All @@ -205,7 +208,7 @@ const mbfl_encoding mbfl_encoding_utf16be = {
mb_utf16be_to_wchar,
mb_wchar_to_utf16be,
NULL,
NULL,
mb_cut_utf16be
};

const mbfl_encoding mbfl_encoding_utf16le = {
Expand All @@ -220,7 +223,7 @@ const mbfl_encoding mbfl_encoding_utf16le = {
mb_utf16le_to_wchar,
mb_wchar_to_utf16le,
NULL,
NULL,
mb_cut_utf16le
};

const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
Expand Down Expand Up @@ -1043,3 +1046,89 @@ static void mb_wchar_to_utf16le_avx2(uint32_t *in, size_t len, mb_convert_buf *b
}

#endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */

static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end)
{
if (len > end - (str + from)) {
len = end - (str + from);
}
from &= ~1;
len &= ~1;
unsigned char *start = str + from;
if (len < 2 || (end - start) < 2) {
return zend_empty_string;
}
/* Check if 1st codepoint is 2nd part of surrogate pair */
if (from > 0) {
uint32_t start_cp = (*start << 8) + *(start + 1);
if (start_cp >= 0xDC00 && start_cp <= 0xDFFF) {
uint32_t preceding_cp = (*(start - 2) << 8) + *(start - 1);
if (preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF) {
from -= 2;
}
}
}
/* Same for ending cut point */
unsigned char *_end = start + len;
if (_end > end) {
_end = end;
}
uint32_t ending_cp = (*(_end - 2) << 8) + *(_end - 1);
if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF) {
_end -= 2;
}
return zend_string_init_fast((char*)start, _end - start);
}

static zend_string* mb_cut_utf16le(unsigned char *str, size_t from, size_t len, unsigned char *end)
{
if (len > end - (str + from)) {
len = end - (str + from);
}
from &= ~1;
len &= ~1;
unsigned char *start = str + from;
if (len < 2 || (end - start) < 2) {
return zend_empty_string;
}
/* Check if 1st codepoint is 2nd part of surrogate pair */
if (from > 0) {
uint32_t start_cp = (*(start + 1) << 8) + *start;
if (start_cp >= 0xDC00 && start_cp <= 0xDFFF) {
uint32_t preceding_cp = (*(start - 1) << 8) + *(start - 2);
if (preceding_cp >= 0xD800 && preceding_cp <= 0xDBFF) {
from -= 2;
}
}
}
/* Same for ending cut point */
unsigned char *_end = start + len;
if (_end > end) {
_end = end;
}
uint32_t ending_cp = (*(_end - 1) << 8) + *(_end - 2);
if (ending_cp >= 0xD800 && ending_cp <= 0xDBFF) {
_end -= 2;
}
return zend_string_init_fast((char*)start, _end - start);
}

static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end)
{
if (len < 2 || (end - str) < 2) {
return zend_empty_string;
}
uint32_t cp = (*str << 8) + *(str + 1);
if (cp == 0xFFFE) {
/* Little-endian BOM */
if (from < 2) {
from = 2;
}
return mb_cut_utf16le(str, from, len, end);
} else {
if (cp == 0xFEFF && from < 2) {
from = 2;
}
return mb_cut_utf16be(str, from, len, end);
}
}
2 changes: 1 addition & 1 deletion ext/mbstring/tests/mb_strcut.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ OK
Single byte: []
With from=1: []
Bad surrogate: []
Bad surrogate followed by other bytes: [003f1243]
Bad surrogate followed by other bytes: [d9001243]
BE byte order mark: []
LE byte order mark: []
Length=0: []
Expand Down