Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions Zend/zend_operators.c
Original file line number Diff line number Diff line change
Expand Up @@ -2952,6 +2952,34 @@ ZEND_API zend_string* ZEND_FASTCALL zend_string_toupper_ex(zend_string *str, boo
}
/* }}} */

/**
 * Check whether a byte buffer consists solely of 7-bit ASCII bytes.
 *
 * @param str    Start of the buffer; it is only read, never written.
 * @param length Number of bytes to inspect.
 * @return true if every inspected byte is below 0x80 (this includes the case
 *         length == 0), false as soon as a byte >= 0x80 is encountered.
 */
ZEND_API bool ZEND_FASTCALL zend_str_is_utf8_pure_ascii(const char *str, size_t length) /* {{{ */
{
	/* Keep the const qualifier of the input: the buffer is never modified. */
	const unsigned char *p = (const unsigned char *) str;
	const unsigned char *end = p + length;

#ifdef HAVE_BLOCKCONV
	/* NOTE(review): BLOCKCONV_FOUND() presumably inspects blconv_mingle, so the
	 * variable name must stay as is - confirm against the macro definition. */
	__m128i blconv_80 = _mm_set1_epi8(0x80), blconv_operand, blconv_mingle;

	/* Compare the number of remaining bytes against the stride instead of
	 * computing p + BLOCKCONV_STRIDE, which would form a pointer beyond
	 * one-past-the-end of the buffer when fewer bytes than a stride remain. */
	while ((size_t) (end - p) >= BLOCKCONV_STRIDE) {
		blconv_operand = _mm_loadu_si128((const __m128i *) p);
		/* max(x, 0x80) == x holds (unsigned) exactly for bytes >= 0x80,
		 * so the comparison marks every non-ASCII byte in the block. */
		blconv_mingle = _mm_cmpeq_epi8(_mm_max_epu8(blconv_operand, blconv_80), blconv_operand);
		if (BLOCKCONV_FOUND()) {
			return false;
		}
		p += BLOCKCONV_STRIDE;
	}
#endif

	/* Scalar pass: handles the tail left by the block loop, or the whole
	 * buffer when the block loop is not compiled in. */
	while (p < end) {
		if (*p >= 0x80) {
			return false;
		}
		p++;
	}

	return true;
}
/* }}} */

ZEND_API int ZEND_FASTCALL zend_binary_strcmp(const char *s1, size_t len1, const char *s2, size_t len2) /* {{{ */
{
int retval;
Expand Down
1 change: 1 addition & 0 deletions Zend/zend_operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ ZEND_API char* ZEND_FASTCALL zend_str_tolower_dup_ex(const char *source,
ZEND_API char* ZEND_FASTCALL zend_str_toupper_dup_ex(const char *source, size_t length);
ZEND_API zend_string* ZEND_FASTCALL zend_string_tolower_ex(zend_string *str, bool persistent);
ZEND_API zend_string* ZEND_FASTCALL zend_string_toupper_ex(zend_string *str, bool persistent);
/* Returns true if none of the `length` bytes starting at `str` is >= 0x80, i.e. the
 * buffer holds 7-bit ASCII only (an empty buffer yields true); returns false otherwise. */
ZEND_API bool ZEND_FASTCALL zend_str_is_utf8_pure_ascii(const char *str, size_t length);

#define zend_string_tolower(str) zend_string_tolower_ex(str, 0)
#define zend_string_toupper(str) zend_string_toupper_ex(str, 0)
Expand Down
31 changes: 21 additions & 10 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -2588,22 +2588,29 @@ PHP_FUNCTION(mb_convert_case)
/* {{{ Returns a upper cased version of source_string */
PHP_FUNCTION(mb_strtoupper)
{
zend_string *str;
zend_string *from_encoding = NULL;
char *str;
size_t str_len, ret_len;
const mbfl_encoding *enc;
char *newstr;
size_t ret_len;

ZEND_PARSE_PARAMETERS_START(1, 2)
Z_PARAM_STRING(str, str_len)
Z_PARAM_STR(str)
Z_PARAM_OPTIONAL
Z_PARAM_STR_OR_NULL(from_encoding)
ZEND_PARSE_PARAMETERS_END();

const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
enc = php_mb_get_encoding(from_encoding, 2);
if (!enc) {
RETURN_THROWS();
}

char *newstr = mbstring_convert_case(PHP_UNICODE_CASE_UPPER, str, str_len, &ret_len, enc);
// Fast path: for UTF-8 encoding and an input string consisting only of 7-bit (lower-range) ASCII characters, use plain ASCII case conversion
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, but that deoptimizes the performance for non-ASCII strings. I think we need to measure this.

Copy link
Contributor Author

@mvorisek mvorisek Jul 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The performance drop for non-ASCII strings is around 5%, which is negligible: in short, the time saved by a single ASCII-only string conversion makes up for this drop across the next 20 non-ASCII conversions. 😅

Please note, the zend_str_is_utf8_pure_ascii check is optimized with SSE2.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, but how often would there be pure ASCII strings? In the worst case, never.

Copy link
Contributor Author

@mvorisek mvorisek Jul 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Often 😃. Consider, for example, using this function to lowercase a list of names in order to build a case-insensitive index (for short strings, there is a "worst case gain" of 250%; for longer strings, the gain is even greater).

if (enc == &mbfl_encoding_utf8 && zend_str_is_utf8_pure_ascii(ZSTR_VAL(str), ZSTR_LEN(str))) {
RETURN_STR(zend_string_toupper(str));
}

newstr = mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), &ret_len, enc);
/* If newstr is NULL something went wrong in mbfl and this is a bug */
ZEND_ASSERT(newstr != NULL);

Expand All @@ -2616,15 +2623,14 @@ PHP_FUNCTION(mb_strtoupper)
/* {{{ Returns a lower cased version of source_string */
PHP_FUNCTION(mb_strtolower)
{
zend_string *str;
zend_string *from_encoding = NULL;
char *str;
size_t str_len;
const mbfl_encoding *enc;
char *newstr;
size_t ret_len;
const mbfl_encoding *enc;

ZEND_PARSE_PARAMETERS_START(1, 2)
Z_PARAM_STRING(str, str_len)
Z_PARAM_STR(str)
Z_PARAM_OPTIONAL
Z_PARAM_STR_OR_NULL(from_encoding)
ZEND_PARSE_PARAMETERS_END();
Expand All @@ -2634,7 +2640,12 @@ PHP_FUNCTION(mb_strtolower)
RETURN_THROWS();
}

newstr = mbstring_convert_case(PHP_UNICODE_CASE_LOWER, str, str_len, &ret_len, enc);
// Fast path: for UTF-8 encoding and an input string consisting only of 7-bit (lower-range) ASCII characters, use plain ASCII case conversion
if (enc == &mbfl_encoding_utf8 && zend_str_is_utf8_pure_ascii(ZSTR_VAL(str), ZSTR_LEN(str))) {
RETURN_STR(zend_string_tolower(str));
}

newstr = mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), &ret_len, enc);
/* If newstr is NULL something went wrong in mbfl and this is a bug */
ZEND_ASSERT(newstr != NULL);

Expand Down