Skip to content

Commit fe5e635

Browse files
committed
Add macro for determining if UTF-8 is Unicode-strict
1 parent 16ef187 commit fe5e635

File tree

6 files changed

+323
-12
lines changed

6 files changed

+323
-12
lines changed

ext/XS-APItest/APItest.xs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5315,6 +5315,13 @@ test_isUTF8_CHAR(char *s, STRLEN len)
53155315
OUTPUT:
53165316
RETVAL
53175317

5318+
STRLEN
5319+
test_isSTRICT_UTF8_CHAR(char *s, STRLEN len)
5320+
CODE:
5321+
RETVAL = isSTRICT_UTF8_CHAR((U8 *) s, (U8 *) s + len);
5322+
OUTPUT:
5323+
RETVAL
5324+
53185325
bool
53195326
test_is_utf8_valid_partial_char_flags(char *s, STRLEN len, U32 flags)
53205327
CODE:

ext/XS-APItest/t/utf8.t

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,11 +422,15 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
422422
$this_utf8_flags &=
423423
~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
424424
}
425+
426+
my $valid_under_strict = 1;
425427
if ($n > 0x10FFFF) {
426428
$this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
429+
$valid_under_strict = 0;
427430
}
428431
elsif (($n & 0xFFFE) == 0xFFFE) {
429432
$this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
433+
$valid_under_strict = 0;
430434
}
431435

432436
undef @warnings;
@@ -467,6 +471,29 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
467471

468472
undef @warnings;
469473

474+
$ret = test_isSTRICT_UTF8_CHAR($bytes, $len);
475+
my $expected_len = ($valid_under_strict) ? $len : 0;
476+
is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes) returns expected length: $expected_len");
477+
478+
unless (is(scalar @warnings, 0,
479+
"Verify isSTRICT_UTF8_CHAR() for $hex_n generated no warnings"))
480+
{
481+
diag "The warnings were: " . join(", ", @warnings);
482+
}
483+
484+
undef @warnings;
485+
486+
$ret = test_isSTRICT_UTF8_CHAR($bytes, $len - 1);
487+
is($ret, 0, "Verify isSTRICT_UTF8_CHAR() with too short length parameter returns 0");
488+
489+
unless (is(scalar @warnings, 0,
490+
"Verify isSTRICT_UTF8_CHAR() generated no warnings"))
491+
{
492+
diag "The warnings were: " . join(", ", @warnings);
493+
}
494+
495+
undef @warnings;
496+
470497
$ret_ref = test_valid_utf8_to_uvchr($bytes);
471498
is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
472499
is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len");
@@ -734,6 +761,14 @@ foreach my $test (@malformations) {
734761
diag "The warnings were: " . join(", ", @warnings);
735762
}
736763

764+
$ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
765+
is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0");
766+
unless (is(scalar @warnings, 0,
767+
"$testname: isSTRICT_UTF8_CHAR() generated no warnings"))
768+
{
769+
diag "The warnings were: " . join(", ", @warnings);
770+
}
771+
737772
for my $j (1 .. $length - 1) {
738773
my $partial = substr($bytes, 0, $j);
739774

@@ -1240,6 +1275,25 @@ foreach my $test (@tests) {
12401275
diag "The warnings were: " . join(", ", @warnings);
12411276
}
12421277

1278+
undef @warnings;
1279+
$ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
1280+
if ($will_overflow) {
1281+
is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
1282+
}
1283+
else {
1284+
my $expected_ret = ( $testname =~ /surrogate|non-character/
1285+
|| $allowed_uv > 0x10FFFF)
1286+
? 0
1287+
: $length;
1288+
is($ret, $expected_ret,
1289+
"isSTRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
1290+
}
1291+
unless (is(scalar @warnings, 0,
1292+
"isSTRICT_UTF8_CHAR() $testname: generated no warnings"))
1293+
{
1294+
diag "The warnings were: " . join(", ", @warnings);
1295+
}
1296+
12431297
# Test partial character handling, for each byte not a full character
12441298
for my $j (1.. $length - 1) {
12451299

regcharclass.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1876,6 +1876,6 @@
18761876
* 5c7eb94310e2aaa15702fd6bed24ff0e7ab5448f9a8231d8c49ca96c9e941089 lib/unicore/mktables
18771877
* cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version
18781878
* 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
1879-
* 1876ece914e2c14ed38c8a589adaa3d8193532c3a5bbe9ea5c3279bc9d29b279 regen/regcharclass.pl
1879+
* e3dc81163da3e92f7be01e9b953f6edb548eba93f1abb3d334e3b0469573c46d regen/regcharclass.pl
18801880
* 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl
18811881
* ex: set ro: */

regen/regcharclass.pl

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1660,6 +1660,49 @@ sub make_macro {
16601660
#=> UTF8 :no_length_checks only_ebcdic_platform
16611661
#0xA0 - 0x1FFFFF
16621662
1663+
#STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrogates nor non-character code points
1664+
#=> UTF8 :no_length_checks only_ascii_platform
1665+
#0x0080 - 0xD7FF
1666+
#0xE000 - 0xFDCF
1667+
#0xFDF0 - 0xFFFD
1668+
#0x10000 - 0x1FFFD
1669+
#0x20000 - 0x2FFFD
1670+
#0x30000 - 0x3FFFD
1671+
#0x40000 - 0x4FFFD
1672+
#0x50000 - 0x5FFFD
1673+
#0x60000 - 0x6FFFD
1674+
#0x70000 - 0x7FFFD
1675+
#0x80000 - 0x8FFFD
1676+
#0x90000 - 0x9FFFD
1677+
#0xA0000 - 0xAFFFD
1678+
#0xB0000 - 0xBFFFD
1679+
#0xC0000 - 0xCFFFD
1680+
#0xD0000 - 0xDFFFD
1681+
#0xE0000 - 0xEFFFD
1682+
#0xF0000 - 0xFFFFD
1683+
#0x100000 - 0x10FFFD
1684+
#
1685+
#STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrogates nor non-character code points
1686+
#=> UTF8 :no_length_checks only_ebcdic_platform
1687+
#0x00A0 - 0xD7FF
1688+
#0xE000 - 0xFDCF
1689+
#0xFDF0 - 0xFFFD
1690+
#0x10000 - 0x1FFFD
1691+
#0x20000 - 0x2FFFD
1692+
#0x30000 - 0x3FFFD
1693+
#0x40000 - 0x4FFFD
1694+
#0x50000 - 0x5FFFD
1695+
#0x60000 - 0x6FFFD
1696+
#0x70000 - 0x7FFFD
1697+
#0x80000 - 0x8FFFD
1698+
#0x90000 - 0x9FFFD
1699+
#0xA0000 - 0xAFFFD
1700+
#0xB0000 - 0xBFFFD
1701+
#0xC0000 - 0xCFFFD
1702+
#0xD0000 - 0xDFFFD
1703+
#0xE0000 - 0xEFFFD
1704+
#0xF0000 - 0xFFFFD
1705+
#0x100000 - 0x10FFFD
16631706
16641707
QUOTEMETA: Meta-characters that \Q should quote
16651708
=> high :fast

utf8.h

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,55 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
338338
/* The above macro handles UTF-8 that has this start byte as the maximum */
339339
#define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF7
340340

341+
/* A helper macro for isSTRICT_UTF8_CHAR, so use that one instead of this.
342+
* Like is_UTF8_CHAR_utf8_no_length_checks(), this was moved here and LIKELYs
343+
* added manually.
344+
*
345+
STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no
346+
surrogates nor non-character code points
347+
*/
348+
/*** GENERATED CODE ***/
349+
#define is_STRICT_UTF8_CHAR_utf8_no_length_checks(s) \
350+
( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
351+
( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
352+
: ( 0xE0 == ((U8*)s)[0] ) ? \
353+
( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
354+
: ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) || 0xEE == ((U8*)s)[0] ) ?\
355+
( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
356+
: ( 0xED == ((U8*)s)[0] ) ? \
357+
( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
358+
: ( 0xEF == ((U8*)s)[0] ) ? \
359+
( ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xB6 ) || ( 0xB8 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBE ) ) ?\
360+
( LIKELY( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ? 3 : 0 ) \
361+
: ( 0xB7 == ((U8*)s)[1] ) ? \
362+
( LIKELY( ( ((U8*)s)[2] & 0xF0 ) == 0x80 || ( ((U8*)s)[2] & 0xF0 ) == 0xB0 ) ? 3 : 0 )\
363+
: ( ( 0xBF == ((U8*)s)[1] ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBD ) ) ? 3 : 0 )\
364+
: ( 0xF0 == ((U8*)s)[0] ) ? \
365+
( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x9E ) || ( 0xA0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xAE ) || ( 0xB0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBE ) ) ?\
366+
( LIKELY( ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
367+
: ( ((U8*)s)[1] == 0x9F || ( ( ((U8*)s)[1] & 0xEF ) == 0xAF ) ) ? \
368+
( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ? \
369+
( LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
370+
: LIKELY( ( 0xBF == ((U8*)s)[2] ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
371+
: 0 ) \
372+
: ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF3 ) ? \
373+
( ( ( ( ((U8*)s)[1] & 0xC8 ) == 0x80 ) || ( ( ((U8*)s)[1] & 0xCC ) == 0x88 ) || ( ( ((U8*)s)[1] & 0xCE ) == 0x8C ) || ( ( ((U8*)s)[1] & 0xCF ) == 0x8E ) ) ?\
374+
( LIKELY( ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
375+
: ( ( ((U8*)s)[1] & 0xCF ) == 0x8F ) ? \
376+
( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ? \
377+
( LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
378+
: LIKELY( ( 0xBF == ((U8*)s)[2] ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
379+
: 0 ) \
380+
: ( 0xF4 == ((U8*)s)[0] ) ? \
381+
( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x8E ) ? \
382+
( LIKELY( ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
383+
: ( 0x8F == ((U8*)s)[1] ) ? \
384+
( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ? \
385+
( LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 ) \
386+
: LIKELY( ( 0xBF == ((U8*)s)[2] ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
387+
: 0 ) \
388+
: 0 )
389+
341390
#endif /* EBCDIC vs ASCII */
342391

343392
/* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -889,9 +938,6 @@ point's representation.
889938

890939
#define SHARP_S_SKIP 2
891940

892-
/* If you want to exclude surrogates, and beyond legal Unicode, see the blame
893-
* log for earlier versions which gave details for these */
894-
895941
/*
896942
897943
=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
@@ -932,6 +978,35 @@ is a valid UTF-8 character.
932978

933979
#define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
934980

981+
/*
982+
983+
=for apidoc Am|STRLEN|isSTRICT_UTF8_CHAR|const U8 *s|const U8 *e
984+
985+
Evaluates to non-zero if the first few bytes of the string starting at C<s> and
986+
looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
987+
Unicode code point completely acceptable for open interchange between all
988+
applications; otherwise it evaluates to 0. If non-zero, the value gives how
989+
many bytes starting at C<s> comprise the code point's representation.
990+
991+
The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
992+
be a surrogate nor a non-character code point. Thus this excludes any code
993+
point from Perl's extended UTF-8.
994+
995+
This is used to efficiently decide if the next few bytes in C<s> are
996+
legal Unicode-acceptable UTF-8 for a single character.
997+
998+
=cut
999+
*/
1000+
1001+
#define isSTRICT_UTF8_CHAR(s, e) \
1002+
(UNLIKELY((e) <= (s)) \
1003+
? 0 \
1004+
: (UTF8_IS_INVARIANT(*s)) \
1005+
? 1 \
1006+
: UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
1007+
? 0 \
1008+
: is_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
1009+
9351010
/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
9361011
* retained solely for backwards compatibility */
9371012
#define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == n)

0 commit comments

Comments
 (0)