Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions UPGRADING
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ PHP 8.4 UPGRADE NOTES
. mb_encode_numericentity() and mb_decode_numericentity() now check that
the $map is composed only of integers; if not, a ValueError is thrown.
. mb_http_input() now always throws a ValueError if the $type is invalid.
. On invalid strings (those with encoding errors), mb_substr() now interprets
character indices in the same manner as most other mbstring functions. This
means that character indices returned by mb_strpos() can be passed to mb_substr().
. For SJIS-Mac (MacJapanese) strings, character indices passed to mb_substr() now
refer to the indices of the Unicode codepoints which are produced when the string
is converted to Unicode. This is significant because around 40 SJIS-Mac characters
convert to a sequence of multiple Unicode codepoints.

- PDO_DBLIB:
. setAttribute, DBLIB_ATTR_STRINGIFY_UNIQUEIDENTIFIER and DBLIB_ATTR_DATETIME_CONVERT
Expand Down
46 changes: 5 additions & 41 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "libmbfl/mbfl/mbfilter_wchar.h"
#include "libmbfl/mbfl/eaw_table.h"
#include "libmbfl/filters/mbfilter_base64.h"
#include "libmbfl/filters/mbfilter_cjk.h"
#include "libmbfl/filters/mbfilter_qprint.h"
#include "libmbfl/filters/mbfilter_htmlent.h"
#include "libmbfl/filters/mbfilter_uuencode.h"
Expand Down Expand Up @@ -2112,8 +2113,9 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
unsigned char *in = (unsigned char*)ZSTR_VAL(input);
size_t in_len = ZSTR_LEN(input);

if (from >= in_len || len == 0) {
/* No supported text encoding decodes to more than one codepoint per byte
if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
/* Other than MacJapanese, no supported text encoding decodes to
* more than one codepoint per byte.
* So if the number of codepoints to skip >= number of input bytes,
* then definitely the output should be empty */
return zend_empty_string;
Expand All @@ -2134,30 +2136,6 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
len = in_len;
}
return zend_string_init_fast((const char*)in, len);
} else if (enc->mblen_table) {
/* The use of the `mblen_table` means that for encodings like MacJapanese,
* we treat each character in its native charset as "1 character", even if it
* maps to a sequence of several codepoints */
const unsigned char *mbtab = enc->mblen_table;
unsigned char *limit = in + in_len;
while (from && in < limit) {
in += mbtab[*in];
from--;
}
if (in >= limit) {
return zend_empty_string;
} else if (len == MBFL_SUBSTR_UNTIL_END) {
return zend_string_init_fast((const char*)in, limit - in);
}
unsigned char *end = in;
while (len && end < limit) {
end += mbtab[*end];
len--;
}
if (end > limit) {
end = limit;
}
return zend_string_init_fast((const char*)in, end - in);
}

return mb_get_substr_slow(in, in_len, from, len, enc);
Expand Down Expand Up @@ -2350,21 +2328,7 @@ PHP_FUNCTION(mb_substr)

size_t mblen = 0;
if (from < 0 || (!len_is_null && len < 0)) {
if (enc->mblen_table) {
/* Because we use the `mblen_table` when iterating over the string and
* extracting the requested part, we also need to use it here for counting
* the "length" of the string
* Otherwise, we can get wrong results for text encodings like MacJapanese,
* where one native 'character' can map to a sequence of several codepoints */
const unsigned char *mbtab = enc->mblen_table;
unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
while (p < e) {
p += mbtab[*p];
mblen++;
}
} else {
mblen = mb_get_strlen(str, enc);
}
mblen = mb_get_strlen(str, enc);
}

/* if "from" position is negative, count start position from the end
Expand Down
9 changes: 8 additions & 1 deletion ext/mbstring/tests/mb_strstr.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ var_dump(FROM_EUC_JP(mb_strstr(EUC_JP("あいうえおかきくけこ"), EUC_JP(
var_dump(bin2hex(mb_strstr("\xdd\x00", "", false, 'UTF-8')));
var_dump(bin2hex(mb_strstr("M\xff\xff\xff\x00", "\x00", false, "SJIS")));

// Test handling of invalid UTF-8 string
// Thanks to Stefan Schiller
var_dump(mb_strstr("\xf0start", "start", false, "UTF-8"));
var_dump(mb_strstr("\xf0start", "start", true, "UTF-8"));

?>
--EXPECT--
string(18) "おかきくけこ"
Expand All @@ -36,5 +41,7 @@ string(12) "あいうえ"
string(18) "おかきくけこ"
string(18) "おかきくけこ"
string(12) "あいうえ"
string(4) "dd00"
string(4) "3f00"
string(2) "00"
string(5) "start"
string(1) "?"
39 changes: 25 additions & 14 deletions ext/mbstring/tests/mb_substr.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,15 @@ print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UT
print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";

echo "Testing agreement with mb_strpos on invalid UTF-8 string:\n";
/* Stefan Schiller pointed out that on invalid UTF-8 strings, character indices returned
* by mb_strpos would not extract the desired part of the string when passed to mb_substr.
* This is the test case which he provided: */
$data = "\xF0AAA<b>";
$pos = mb_strpos($data, "<", 0, "UTF-8");
$out = mb_substr($data, 0, $pos, "UTF-8");
print $out . "\n";

echo "Regression:\n";
/* During development, one >= comparison in mb_get_substr was wrongly written as >
* This was caught by libFuzzer */
Expand All @@ -138,30 +147,30 @@ SJIS:
4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
5:
-- Testing illegal SJIS byte 0x80 --
6380
806162
633f
3f6162
SJIS-2004:
6380
806162
633f
3f6162
MacJapanese:
6380
806162
SJIS-Mobile#DOCOMO:
6380
806162
633f
3f6162
SJIS-Mobile#KDDI:
6380
806162
633f
3f6162
SJIS-Mobile#SoftBank:
6380
806162
633f
3f6162
-- Testing MacJapanese characters which map to 3-5 codepoints each --
616263
85ab85ac
85ac
3f3f
58
616263
85bf85c0
85c0
3f3f
78
ISO-2022-JP:
1: 1b2442212121721b284241
2: 43
Expand Down Expand Up @@ -200,5 +209,7 @@ UTF-7:
3: йте
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
5:
Testing agreement with mb_strpos on invalid UTF-8 string:
?AAA
Regression:
1b28493d3d3d3d3d3d3d3e3d3d3d1b28423f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f000000003f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f1b28493d3d3d3d3d3d3d3e1b2842013a4f1b28492a1b2842