@@ -338,6 +338,55 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
338
338
/* The above macro handles UTF-8 that has this start byte as the maximum */
339
339
/* Largest first byte of a UTF-8 sequence that the preceding helper macro
 * (defined outside this excerpt) handles.  0xF7 is binary 1111 0111.
 * NOTE(review): presumably the largest start byte of a 4-byte sequence --
 * confirm against the macro this constant accompanies. */
#define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF7
340
340
341
/* A helper macro for isSTRICT_UTF8_CHAR, so use that one instead of this.
 * Like is_UTF8_CHAR_utf8_no_length_checks(), this was moved here and LIKELYs
 * added manually.
 *
 * Evaluates to the length in bytes (2, 3 or 4) of the multi-byte sequence that
 * starts at 's' when that sequence is acceptable, and to 0 otherwise.  Any
 * start byte below 0xC2 (which includes all single-byte characters) or above
 * 0xF4 yields 0.
 *
 * No bounds checking is done: up to four bytes starting at 's' may be read, so
 * the caller must already know that the buffer is long enough.  's' is
 * evaluated many times and therefore must have no side effects.
 *
	STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no
	surrogates nor non-character code points
*/
/*** GENERATED CODE (subsequently hand-edited, see above) ***/
#define is_STRICT_UTF8_CHAR_utf8_no_length_checks(s)                        \
( ( 0xC2 <= ((const U8*)(s))[0] && ((const U8*)(s))[0] <= 0xDF ) ?          \
    ( LIKELY( ( ((const U8*)(s))[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )            \
: ( 0xE0 == ((const U8*)(s))[0] ) ?                                         \
    ( LIKELY( ( ( ((const U8*)(s))[1] & 0xE0 ) == 0xA0 ) && ( ( ((const U8*)(s))[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
: ( ( 0xE1 <= ((const U8*)(s))[0] && ((const U8*)(s))[0] <= 0xEC ) || 0xEE == ((const U8*)(s))[0] ) ?\
    ( LIKELY( ( ( ((const U8*)(s))[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)(s))[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
: ( 0xED == ((const U8*)(s))[0] ) ?                                         \
    ( LIKELY( ( ( ((const U8*)(s))[1] & 0xE0 ) == 0x80 ) && ( ( ((const U8*)(s))[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
: ( 0xEF == ((const U8*)(s))[0] ) ?                                         \
    ( ( ( 0x80 <= ((const U8*)(s))[1] && ((const U8*)(s))[1] <= 0xB6 ) || ( 0xB8 <= ((const U8*)(s))[1] && ((const U8*)(s))[1] <= 0xBE ) ) ?\
        ( LIKELY( ( ((const U8*)(s))[2] & 0xC0 ) == 0x80 ) ? 3 : 0 )        \
    : ( 0xB7 == ((const U8*)(s))[1] ) ?                                     \
        ( LIKELY( ( ( ((const U8*)(s))[2] & 0xF0 ) == 0x80 ) || ( ( ((const U8*)(s))[2] & 0xF0 ) == 0xB0 ) ) ? 3 : 0 )\
    : ( ( 0xBF == ((const U8*)(s))[1] ) && ( 0x80 <= ((const U8*)(s))[2] && ((const U8*)(s))[2] <= 0xBD ) ) ? 3 : 0 )\
: ( 0xF0 == ((const U8*)(s))[0] ) ?                                         \
    ( ( ( 0x90 <= ((const U8*)(s))[1] && ((const U8*)(s))[1] <= 0x9E ) || ( 0xA0 <= ((const U8*)(s))[1] && ((const U8*)(s))[1] <= 0xAE ) || ( 0xB0 <= ((const U8*)(s))[1] && ((const U8*)(s))[1] <= 0xBE ) ) ?\
        ( LIKELY( ( ( ((const U8*)(s))[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)(s))[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
    : ( ((const U8*)(s))[1] == 0x9F || ( ( ((const U8*)(s))[1] & 0xEF ) == 0xAF ) ) ?\
        ( ( 0x80 <= ((const U8*)(s))[2] && ((const U8*)(s))[2] <= 0xBE ) ?  \
            ( LIKELY( ( ((const U8*)(s))[3] & 0xC0 ) == 0x80 ) ? 4 : 0 )    \
        : LIKELY( ( 0xBF == ((const U8*)(s))[2] ) && ( 0x80 <= ((const U8*)(s))[3] && ((const U8*)(s))[3] <= 0xBD ) ) ? 4 : 0 )\
    : 0 )                                                                   \
: ( 0xF1 <= ((const U8*)(s))[0] && ((const U8*)(s))[0] <= 0xF3 ) ?          \
    ( ( ( ( ((const U8*)(s))[1] & 0xC8 ) == 0x80 ) || ( ( ((const U8*)(s))[1] & 0xCC ) == 0x88 ) || ( ( ((const U8*)(s))[1] & 0xCE ) == 0x8C ) || ( ( ((const U8*)(s))[1] & 0xCF ) == 0x8E ) ) ?\
        ( LIKELY( ( ( ((const U8*)(s))[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)(s))[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
    : ( ( ((const U8*)(s))[1] & 0xCF ) == 0x8F ) ?                          \
        ( ( 0x80 <= ((const U8*)(s))[2] && ((const U8*)(s))[2] <= 0xBE ) ?  \
            ( LIKELY( ( ((const U8*)(s))[3] & 0xC0 ) == 0x80 ) ? 4 : 0 )    \
        : LIKELY( ( 0xBF == ((const U8*)(s))[2] ) && ( 0x80 <= ((const U8*)(s))[3] && ((const U8*)(s))[3] <= 0xBD ) ) ? 4 : 0 )\
    : 0 )                                                                   \
: ( 0xF4 == ((const U8*)(s))[0] ) ?                                         \
    ( ( 0x80 <= ((const U8*)(s))[1] && ((const U8*)(s))[1] <= 0x8E ) ?      \
        ( LIKELY( ( ( ((const U8*)(s))[2] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)(s))[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
    : ( 0x8F == ((const U8*)(s))[1] ) ?                                     \
        ( ( 0x80 <= ((const U8*)(s))[2] && ((const U8*)(s))[2] <= 0xBE ) ?  \
            ( LIKELY( ( ((const U8*)(s))[3] & 0xC0 ) == 0x80 ) ? 4 : 0 )    \
        : LIKELY( ( 0xBF == ((const U8*)(s))[2] ) && ( 0x80 <= ((const U8*)(s))[3] && ((const U8*)(s))[3] <= 0xBD ) ) ? 4 : 0 )\
    : 0 )                                                                   \
: 0 )
389
+
341
390
#endif /* EBCDIC vs ASCII */
342
391
343
392
/* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -889,9 +938,6 @@ point's representation.
889
938
890
939
/* NOTE(review): the meaning of this constant is not established within this
 * excerpt; presumably it is the number of bytes to skip over a UTF-8 encoded
 * LATIN SMALL LETTER SHARP S (U+00DF occupies two bytes in UTF-8) -- confirm
 * against the code that uses it. */
#define SHARP_S_SKIP 2
891
940
892
- /* If you want to exclude surrogates, and beyond legal Unicode, see the blame
893
- * log for earlier versions which gave details for these */
894
-
895
941
/*
896
942
897
943
=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
@@ -932,6 +978,35 @@ is a valid UTF-8 character.
932
978
933
979
/* Function-like alias: forwards both arguments, unchanged, to isUTF8_CHAR()
 * and evaluates to whatever that yields.  There must be no white space
 * between the macro name and its parameter list, or the preprocessor treats
 * this as an object-like macro and never substitutes the arguments. */
#define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
934
980
981
/*

=for apidoc Am|STRLEN|isSTRICT_UTF8_CHAR|const U8 *s|const U8 *e

Evaluates to non-zero if the first few bytes of the string starting at C<s> and
looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
Unicode code point completely acceptable for open interchange between all
applications; otherwise it evaluates to 0.  If non-zero, the value gives how
many bytes starting at C<s> comprise the code point's representation.

The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
be a surrogate nor a non-character code point.  Thus this excludes any code
point from Perl's extended UTF-8.

This is used to efficiently decide if the next few bytes in C<s> are
legal Unicode-acceptable UTF-8 for a single character.

Both arguments are evaluated more than once, so they must not have side
effects.

=cut
*/

/* The tests run in this order: an empty (or inverted) range yields 0; an
 * invariant first byte is a complete one-byte character; a sequence that
 * UTF8SKIP() says would extend past 'e' yields 0; only then is the helper that
 * does no length checking consulted.  Each use of a parameter is
 * parenthesized so that expression arguments such as 'p + 1' are handled
 * correctly. */
#define isSTRICT_UTF8_CHAR(s, e)                                            \
    (UNLIKELY((e) <= (s))                                                   \
    ? 0                                                                     \
    : (UTF8_IS_INVARIANT(*(s)))                                             \
      ? 1                                                                   \
      : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                 \
        ? 0                                                                 \
        : is_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
1009
+
935
1010
/* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
 * retained solely for backwards compatibility.
 *
 * Evaluates to true when isUTF8_CHAR(), limited to the n bytes starting at p,
 * yields exactly n.  Both parameters are expanded more than once, so they
 * must be free of side effects. */
#define IS_UTF8_CHAR(p, n)      (isUTF8_CHAR(p, (p) + (n)) == (n))
0 commit comments