Add macro for determining if UTF-8 is Unicode-strict

khwilliamson · khwilliamson · commit fe5e6357de2d · 2016-09-14T20:37:40.000-06:00
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
@@ -5315,6 +5315,13 @@ test_isUTF8_CHAR(char *s, STRLEN len)
     OUTPUT:
         RETVAL
 
+STRLEN
+test_isSTRICT_UTF8_CHAR(char *s, STRLEN len)
+    CODE:
+        RETVAL = isSTRICT_UTF8_CHAR((U8 *) s, (U8 *) s + len);
+    OUTPUT:
+        RETVAL
+
 bool
 test_is_utf8_valid_partial_char_flags(char *s, STRLEN len, U32 flags)
     CODE:
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
@@ -422,11 +422,15 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
         $this_utf8_flags &=
                         ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
     }
+
+    my $valid_under_strict = 1;
     if ($n > 0x10FFFF) {
         $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
+        $valid_under_strict = 0;
     }
     elsif (($n & 0xFFFE) == 0xFFFE) {
         $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
+        $valid_under_strict = 0;
     }
 
     undef @warnings;
@@ -467,6 +471,29 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
 
     undef @warnings;
 
+    $ret = test_isSTRICT_UTF8_CHAR($bytes, $len);
+    my $expected_len = ($valid_under_strict) ? $len : 0;
+    is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes) returns expected length: $expected_len");
+
+    unless (is(scalar @warnings, 0,
+               "Verify isSTRICT_UTF8_CHAR() for $hex_n generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
+    undef @warnings;
+
+    $ret = test_isSTRICT_UTF8_CHAR($bytes, $len - 1);
+    is($ret, 0, "Verify isSTRICT_UTF8_CHAR() with too short length parameter returns 0");
+
+    unless (is(scalar @warnings, 0,
+               "Verify isSTRICT_UTF8_CHAR() generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
+    undef @warnings;
+
     $ret_ref = test_valid_utf8_to_uvchr($bytes);
     is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
     is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len");
@@ -734,6 +761,14 @@ foreach my $test (@malformations) {
         diag "The warnings were: " . join(", ", @warnings);
     }
 
+    $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
+    is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0");
+    unless (is(scalar @warnings, 0,
+               "$testname: isSTRICT_UTF8_CHAR() generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
     for my $j (1 .. $length - 1) {
         my $partial = substr($bytes, 0, $j);
 
@@ -1240,6 +1275,25 @@ foreach my $test (@tests) {
             diag "The warnings were: " . join(", ", @warnings);
         }
 
+        undef @warnings;
+        $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
+        if ($will_overflow) {
+            is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
+        }
+        else {
+            my $expected_ret = (   $testname =~ /surrogate|non-character/
+                                || $allowed_uv > 0x10FFFF)
+                               ? 0
+                               : $length;
+            is($ret, $expected_ret,
+               "isSTRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
+        }
+        unless (is(scalar @warnings, 0,
+                "isSTRICT_UTF8_CHAR() $testname: generated no warnings"))
+        {
+            diag "The warnings were: " . join(", ", @warnings);
+        }
+
         # Test partial character handling, for each byte not a full character
         for my $j (1.. $length - 1) {
 
diff --git a/regcharclass.h b/regcharclass.h
@@ -1876,6 +1876,6 @@
  * 5c7eb94310e2aaa15702fd6bed24ff0e7ab5448f9a8231d8c49ca96c9e941089 lib/unicore/mktables
  * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
- * 1876ece914e2c14ed38c8a589adaa3d8193532c3a5bbe9ea5c3279bc9d29b279 regen/regcharclass.pl
+ * e3dc81163da3e92f7be01e9b953f6edb548eba93f1abb3d334e3b0469573c46d regen/regcharclass.pl
  * 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl
  * ex: set ro: */
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
@@ -1660,6 +1660,49 @@ sub make_macro {
 #=> UTF8 :no_length_checks only_ebcdic_platform
 #0xA0 - 0x1FFFFF
 
+#STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrrogates nor non-character code points
+#=> UTF8 :no_length_checks only_ascii_platform
+#0x0080 - 0xD7FF
+#0xE000 - 0xFDCF
+#0xFDF0 - 0xFFFD
+#0x10000 - 0x1FFFD
+#0x20000 - 0x2FFFD
+#0x30000 - 0x3FFFD
+#0x40000 - 0x4FFFD
+#0x50000 - 0x5FFFD
+#0x60000 - 0x6FFFD
+#0x70000 - 0x7FFFD
+#0x80000 - 0x8FFFD
+#0x90000 - 0x9FFFD
+#0xA0000 - 0xAFFFD
+#0xB0000 - 0xBFFFD
+#0xC0000 - 0xCFFFD
+#0xD0000 - 0xDFFFD
+#0xE0000 - 0xEFFFD
+#0xF0000 - 0xFFFFD
+#0x100000 - 0x10FFFD
+#
+#STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrrogates nor non-character code points
+#=> UTF8 :no_length_checks only_ebcdic_platform
+#0x00A0 - 0xD7FF
+#0xE000 - 0xFDCF
+#0xFDF0 - 0xFFFD
+#0x10000 - 0x1FFFD
+#0x20000 - 0x2FFFD
+#0x30000 - 0x3FFFD
+#0x40000 - 0x4FFFD
+#0x50000 - 0x5FFFD
+#0x60000 - 0x6FFFD
+#0x70000 - 0x7FFFD
+#0x80000 - 0x8FFFD
+#0x90000 - 0x9FFFD
+#0xA0000 - 0xAFFFD
+#0xB0000 - 0xBFFFD
+#0xC0000 - 0xCFFFD
+#0xD0000 - 0xDFFFD
+#0xE0000 - 0xEFFFD
+#0xF0000 - 0xFFFFD
+#0x100000 - 0x10FFFD
 
 QUOTEMETA: Meta-characters that \Q should quote
 => high :fast
diff --git a/utf8.h b/utf8.h
@@ -338,6 +338,55 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
 /* The above macro handles UTF-8 that has this start byte as the maximum */
 #define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF7
 
+/* A helper macro for isSTRICT_UTF8_CHAR, so use that one instead of this.
+ * Like is_UTF8_CHAR_utf8_no_length_checks(), this was moved here and LIKELYs
+ * added manually.
+ *
+	STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no
+                          surrrogates nor non-character code points
+*/
+/*** GENERATED CODE ***/
+#define is_STRICT_UTF8_CHAR_utf8_no_length_checks(s)                        \
+( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                          \
+    ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                          \
+: ( 0xE0 == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) || 0xEE == ((U8*)s)[0] ) ?\
+    ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xED == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xEF == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xB6 ) || ( 0xB8 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBE ) ) ?\
+	( LIKELY( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ? 3 : 0 )                      \
+    : ( 0xB7 == ((U8*)s)[1] ) ?                                             \
+	( LIKELY( ( ((U8*)s)[2] & 0xF0 ) == 0x80 || ( ((U8*)s)[2] & 0xF0 ) == 0xB0 ) ? 3 : 0 )\
+    : ( ( 0xBF == ((U8*)s)[1] ) && ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBD ) ) ? 3 : 0 )\
+: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x9E ) || ( 0xA0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xAE ) || ( 0xB0 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBE ) ) ?\
+	( LIKELY( ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+    : ( ((U8*)s)[1] == 0x9F || ( ( ((U8*)s)[1] & 0xEF ) == 0xAF ) ) ?       \
+	( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ?                  \
+	    ( LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 )                  \
+	: LIKELY( ( 0xBF == ((U8*)s)[2] ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
+    : 0 )                                                                   \
+: ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF3 ) ?                          \
+    ( ( ( ( ((U8*)s)[1] & 0xC8 ) == 0x80 ) || ( ( ((U8*)s)[1] & 0xCC ) == 0x88 ) || ( ( ((U8*)s)[1] & 0xCE ) == 0x8C ) || ( ( ((U8*)s)[1] & 0xCF ) == 0x8E ) ) ?\
+	( LIKELY( ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+    : ( ( ((U8*)s)[1] & 0xCF ) == 0x8F ) ?                                  \
+	( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ?                  \
+	    ( LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 )                  \
+	: LIKELY( ( 0xBF == ((U8*)s)[2] ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
+    : 0 )                                                                   \
+: ( 0xF4 == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0x80 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x8E ) ?                      \
+	( LIKELY( ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+    : ( 0x8F == ((U8*)s)[1] ) ?                                             \
+	( ( 0x80 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xBE ) ?                  \
+	    ( LIKELY( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ? 4 : 0 )                  \
+	: LIKELY( ( 0xBF == ((U8*)s)[2] ) && ( 0x80 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0xBD ) ) ? 4 : 0 )\
+    : 0 )                                                                   \
+: 0 )
+
 #endif /* EBCDIC vs ASCII */
 
 /* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -889,9 +938,6 @@ point's representation.
 
 #define SHARP_S_SKIP 2
 
-/* If you want to exclude surrogates, and beyond legal Unicode, see the blame
- * log for earlier versions which gave details for these */
-
 /*
 
 =for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
@@ -932,6 +978,35 @@ is a valid UTF-8 character.
 
 #define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
 
+/*
+
+=for apidoc Am|STRLEN|isSTRICT_UTF8_CHAR|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode code point completely acceptable for open interchange between all
+applications; otherwise it evaluates to 0.  If non-zero, the value gives how
+many many bytes starting at C<s> comprise the code point's representation.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
+be a surrogate nor a non-character code point.  Thus this excludes any code
+point from Perl's extended UTF-8.
+
+This is used to efficiently decide if the next few bytes in C<s> is
+legal Unicode-acceptable UTF-8 for a single character.
+
+=cut
+*/
+
+#define isSTRICT_UTF8_CHAR(s, e)                                            \
+    (UNLIKELY((e) <= (s))                                                   \
+    ? 0                                                                     \
+    : (UTF8_IS_INVARIANT(*s))                                               \
+      ? 1                                                                   \
+      : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                 \
+        ? 0                                                                 \
+        : is_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
+
 /* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
  * retained solely for backwards compatibility */
 #define IS_UTF8_CHAR(p, n)      (isUTF8_CHAR(p, (p) + (n)) == n)
diff --git a/utfebcdic.h b/utfebcdic.h