Skip to content

Commit b34eff6

Browse files
committed
Add macro for Unicode Corregindum #9 strict
This macro follows Unicode Corrigendum #9 to allow non-character code points. These are still discouraged but not completely forbidden. It's best for code that isn't intended to operate on arbitrary other code text to use the original definition, but code that does things, such as source code control, should change to use this definition if it wants to be Unicode-strict. Perl can't adopt C9 wholesale, as it might create security holes in existing applications that rely on Perl keeping non-chars out.
1 parent f2ee672 commit b34eff6

File tree

6 files changed

+164
-2
lines changed

6 files changed

+164
-2
lines changed

ext/XS-APItest/APItest.xs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5322,6 +5322,13 @@ test_isSTRICT_UTF8_CHAR(char *s, STRLEN len)
53225322
OUTPUT:
53235323
RETVAL
53245324

5325+
STRLEN
5326+
test_isC9_STRICT_UTF8_CHAR(char *s, STRLEN len)
5327+
CODE:
5328+
RETVAL = isC9_STRICT_UTF8_CHAR((U8 *) s, (U8 *) s + len);
5329+
OUTPUT:
5330+
RETVAL
5331+
53255332
bool
53265333
test_is_utf8_valid_partial_char_flags(char *s, STRLEN len, U32 flags)
53275334
CODE:

ext/XS-APItest/t/utf8.t

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,9 +424,11 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
424424
}
425425

426426
my $valid_under_strict = 1;
427+
my $valid_under_c9strict = 1;
427428
if ($n > 0x10FFFF) {
428429
$this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
429430
$valid_under_strict = 0;
431+
$valid_under_c9strict = 0;
430432
}
431433
elsif (($n & 0xFFFE) == 0xFFFE) {
432434
$this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
@@ -492,6 +494,27 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
492494
diag "The warnings were: " . join(", ", @warnings);
493495
}
494496

497+
$ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len);
498+
$expected_len = ($valid_under_c9strict) ? $len : 0;
499+
is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes) returns expected length: $len");
500+
501+
unless (is(scalar @warnings, 0,
502+
"Verify isC9_STRICT_UTF8_CHAR() for $hex_n generated no warnings"))
503+
{
504+
diag "The warnings were: " . join(", ", @warnings);
505+
}
506+
507+
undef @warnings;
508+
509+
$ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len - 1);
510+
is($ret, 0, "Verify isC9_STRICT_UTF8_CHAR() with too short length parameter returns 0");
511+
512+
unless (is(scalar @warnings, 0,
513+
"Verify isC9_STRICT_UTF8_CHAR() generated no warnings"))
514+
{
515+
diag "The warnings were: " . join(", ", @warnings);
516+
}
517+
495518
undef @warnings;
496519

497520
$ret_ref = test_valid_utf8_to_uvchr($bytes);
@@ -769,6 +792,14 @@ foreach my $test (@malformations) {
769792
diag "The warnings were: " . join(", ", @warnings);
770793
}
771794

795+
$ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
796+
is($ret, 0, "$testname: isC9_STRICT_UTF8_CHAR returns 0");
797+
unless (is(scalar @warnings, 0,
798+
"$testname: isC9_STRICT_UTF8_CHAR() generated no warnings"))
799+
{
800+
diag "The warnings were: " . join(", ", @warnings);
801+
}
802+
772803
for my $j (1 .. $length - 1) {
773804
my $partial = substr($bytes, 0, $j);
774805

@@ -1294,6 +1325,25 @@ foreach my $test (@tests) {
12941325
diag "The warnings were: " . join(", ", @warnings);
12951326
}
12961327

1328+
undef @warnings;
1329+
$ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
1330+
if ($will_overflow) {
1331+
is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
1332+
}
1333+
else {
1334+
my $expected_ret = ( $testname =~ /surrogate/
1335+
|| $allowed_uv > 0x10FFFF)
1336+
? 0
1337+
: $length;
1338+
is($ret, $expected_ret,
1339+
"isC9_STRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
1340+
}
1341+
unless (is(scalar @warnings, 0,
1342+
"isC9_STRICT_UTF8_CHAR() $testname: generated no warnings"))
1343+
{
1344+
diag "The warnings were: " . join(", ", @warnings);
1345+
}
1346+
12971347
# Test partial character handling, for each byte not a full character
12981348
for my $j (1.. $length - 1) {
12991349

regcharclass.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1876,6 +1876,6 @@
18761876
* 5c7eb94310e2aaa15702fd6bed24ff0e7ab5448f9a8231d8c49ca96c9e941089 lib/unicore/mktables
18771877
* cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version
18781878
* 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
1879-
* e3dc81163da3e92f7be01e9b953f6edb548eba93f1abb3d334e3b0469573c46d regen/regcharclass.pl
1879+
* 66e20f857451956f9fc7ad7432de972e84fb857885009838878bcf6f91ffbeef regen/regcharclass.pl
18801880
* 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl
18811881
* ex: set ro: */

regen/regcharclass.pl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1704,6 +1704,16 @@ sub make_macro {
17041704
#0xF0000 - 0xFFFFD
17051705
#0x100000 - 0x10FFFD
17061706
1707+
#C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrogates
1708+
#=> UTF8 :no_length_checks only_ascii_platform
1709+
#0x0080 - 0xD7FF
1710+
#0xE000 - 0x10FFFF
1711+
#
1712+
#C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points including non-character code points, no surrogates
1713+
#=> UTF8 :no_length_checks only_ebcdic_platform
1714+
#0x00A0 - 0xD7FF
1715+
#0xE000 - 0x10FFFF
1716+
17071717
QUOTEMETA: Meta-characters that \Q should quote
17081718
=> high :fast
17091719
\p{_Perl_Quotemeta}

utf8.h

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,28 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
387387
: 0 ) \
388388
: 0 )
389389

390+
/* Similarly,
391+
C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code
392+
points, no surrogates
393+
0x0080 - 0xD7FF
394+
0xE000 - 0x10FFFF
395+
*/
396+
/*** GENERATED CODE ***/
397+
#define is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s) \
398+
( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
399+
( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
400+
: ( 0xE0 == ((U8*)s)[0] ) ? \
401+
( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
402+
: ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) || ( ((U8*)s)[0] & 0xFE ) == 0xEE ) ?\
403+
( LIKELY( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
404+
: ( 0xED == ((U8*)s)[0] ) ? \
405+
( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
406+
: ( 0xF0 == ((U8*)s)[0] ) ? \
407+
( LIKELY( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
408+
: ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF3 ) ? \
409+
( LIKELY( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
410+
: LIKELY( ( ( ( 0xF4 == ((U8*)s)[0] ) && ( ( ((U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
411+
390412
#endif /* EBCDIC vs ASCII */
391413

392414
/* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -993,7 +1015,8 @@ be a surrogate nor a non-character code point. Thus this excludes any code
9931015
point from Perl's extended UTF-8.
9941016
9951017
This is used to efficiently decide if the next few bytes in C<s> is
996-
legal Unicode-acceptable UTF-8 for a single character.
1018+
legal Unicode-acceptable UTF-8 for a single character. Use
1019+
C<L</isC9_STRICT_UTF8_CHAR>> to also accept non-character code points.
9971020
9981021
=cut
9991022
*/
@@ -1007,6 +1030,36 @@ legal Unicode-acceptable UTF-8 for a single character.
10071030
? 0 \
10081031
: is_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
10091032

1033+
/*
1034+
1035+
=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e
1036+
1037+
Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1038+
looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1039+
Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1040+
the value gives how many many bytes starting at C<s> comprise the code point's
1041+
representation.
1042+
1043+
The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1044+
differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1045+
code points. This corresponds to
1046+
L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1047+
which said that non-character code points are merely discouraged rather than
1048+
completely forbidden in open interchange. See
1049+
L<perlunicode/Noncharacter code points>.
1050+
1051+
=cut
1052+
*/
1053+
1054+
#define isC9_STRICT_UTF8_CHAR(s, e) \
1055+
(UNLIKELY((e) <= (s)) \
1056+
? 0 \
1057+
: (UTF8_IS_INVARIANT(*s)) \
1058+
? 1 \
1059+
: UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
1060+
? 0 \
1061+
: is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
1062+
10101063
/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
10111064
* retained solely for backwards compatibility */
10121065
#define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == n)

0 commit comments

Comments
 (0)