Skip to content

Commit 0d545a2

Browse files
author
Branislav Zahradník
committed
malformed utf8 message: handy macros for handling malformed UTF-8
- move the die/warn argument into the macro name
- make the `flags` argument optional
- encapsulate the lookup-and-die combination in a single macro
1 parent d7ec8f5 commit 0d545a2

File tree

7 files changed

+77
-65
lines changed

7 files changed

+77
-65
lines changed

doop.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,8 @@ S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const invmap)
369369
else {
370370
from = utf8_to_uvchr_buf(s, send, &s_len);
371371
if (from == 0 && *s != '\0') {
372-
_force_out_malformed_utf8_message(s, send, 0, MALFORMED_UTF8_DIE);
372+
FORCE_OUT_MALFORMED_UTF8_DIE(s, send);
373+
NOT_REACHED;
373374
}
374375
}
375376

@@ -486,7 +487,8 @@ S_do_trans_invmap(pTHX_ SV * const sv, AV * const invmap)
486487
else {
487488
from = utf8_to_uvchr_buf(s, send, &s_len);
488489
if (from == 0 && *s != '\0') {
489-
_force_out_malformed_utf8_message(s, send, 0, MALFORMED_UTF8_DIE);
490+
FORCE_OUT_MALFORMED_UTF8_DIE(s, send);
491+
NOT_REACHED;
490492
}
491493
}
492494

handy.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2277,15 +2277,15 @@ END_EXTERN_C
22772277

22782278
#define generic_utf8_safe_(classnum, p, e, above_latin1) \
22792279
((! _utf8_safe_assert(p, e)) \
2280-
? (_force_out_malformed_utf8_message((U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)\
2280+
? (FORCE_OUT_MALFORMED_UTF8_DIE((U8 *) (p), (U8 *) (e)), 0) \
22812281
: (UTF8_IS_INVARIANT(*(p))) \
22822282
? generic_isCC_(*(p), classnum) \
22832283
: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
22842284
? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
22852285
? generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \
22862286
classnum) \
2287-
: (_force_out_malformed_utf8_message( \
2288-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \
2287+
: (FORCE_OUT_MALFORMED_UTF8_DIE( \
2288+
(U8 *) (p), (U8 *) (e)), 0)) \
22892289
: above_latin1))
22902290
/* Like the above, but calls 'above_latin1(p)' to get the utf8 value.
22912291
* 'above_latin1' can be a macro */
@@ -2294,8 +2294,8 @@ END_EXTERN_C
22942294
#define generic_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \
22952295
generic_utf8_safe_(classnum, p, e, \
22962296
(UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2297-
? (_force_out_malformed_utf8_message( \
2298-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
2297+
? (FORCE_OUT_MALFORMED_UTF8_DIE( \
2298+
(U8 *) (p), (U8 *) (e)), 0) \
22992299
: above_latin1(p)))
23002300
/* Like the above, but passes classnum to _isFOO_utf8(), instead of having an
23012301
* 'above_latin1' parameter */
@@ -2384,8 +2384,8 @@ END_EXTERN_C
23842384
#define isXDIGIT_utf8_safe(p, e) \
23852385
generic_utf8_safe_no_upper_latin1_(CC_XDIGIT_, p, e, \
23862386
(UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2387-
? (_force_out_malformed_utf8_message( \
2388-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
2387+
? (FORCE_OUT_MALFORMED_UTF8_DIE( \
2388+
(U8 *) (p), (U8 *) (e)), 0) \
23892389
: is_XDIGIT_high(p)))
23902390

23912391
#define toFOLD_utf8(p,e,s,l) toFOLD_utf8_safe(p,e,s,l)
@@ -2433,8 +2433,8 @@ END_EXTERN_C
24332433
: (UTF8_IS_DOWNGRADEABLE_START(*(p)) \
24342434
? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \
24352435
? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \
2436-
: (_force_out_malformed_utf8_message( \
2437-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0)) \
2436+
: (FORCE_OUT_MALFORMED_UTF8_DIE( \
2437+
(U8 *) (p), (U8 *) (e)), 0)) \
24382438
: above_latin1))
24392439

24402440
#define generic_LC_invlist_utf8_safe_(macro, classnum, p, e) \
@@ -2447,8 +2447,8 @@ END_EXTERN_C
24472447
#define generic_LC_non_invlist_utf8_safe_(classnum, above_latin1, p, e) \
24482448
generic_LC_utf8_safe_(classnum, p, e, \
24492449
(UNLIKELY((e) - (p) < UTF8SKIP(p)) \
2450-
? (_force_out_malformed_utf8_message( \
2451-
(U8 *) (p), (U8 *) (e), 0, MALFORMED_UTF8_DIE), 0) \
2450+
? (FORCE_OUT_MALFORMED_UTF8_DIE( \
2451+
(U8 *) (p), (U8 *) (e)), 0) \
24522452
: above_latin1(p)))
24532453

24542454
#define isALPHANUMERIC_LC_utf8_safe(p, e) \

pp_pack.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3161,16 +3161,8 @@ PP_wrapped(pp_pack, 0, 1)
31613161
if (SvUTF8(cat)) {
31623162
STRLEN result_len;
31633163
const char * result = SvPV_nomg(cat, result_len);
3164-
const U8 * error_pos;
3165-
3166-
if (! is_utf8_string_loc((U8 *) result, result_len, &error_pos)) {
3167-
_force_out_malformed_utf8_message(error_pos,
3168-
(U8 *) result + result_len,
3169-
0, /* no flags */
3170-
MALFORMED_UTF8_DIE
3171-
);
3172-
NOT_REACHED; /* NOTREACHED */
3173-
}
3164+
3165+
ENSURE_NOT_MALFORMED_UTF8((U8 *) result, result_len);
31743166
}
31753167

31763168
SvSETMAGIC(cat);

regexec.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10919,9 +10919,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
1091910919
const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
1092010920
c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
1092110921
if (c_len == (STRLEN)-1) {
10922-
_force_out_malformed_utf8_message(p, p_end,
10923-
utf8n_flags,
10924-
MALFORMED_UTF8_DIE);
10922+
FORCE_OUT_MALFORMED_UTF8_DIE_FLAGS(p, p_end, utf8n_flags);
1092510923
NOT_REACHED; /* NOTREACHED */
1092610924
}
1092710925
if ( c > 255

toke.c

Lines changed: 9 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -902,20 +902,11 @@ Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
902902

903903
if (line) {
904904
Size_t len;
905-
const U8* first_bad_char_loc;
906905

907906
s = SvPV_const(line, len);
908907

909-
if ( SvUTF8(line)
910-
&& UNLIKELY(! is_utf8_string_loc((U8 *) s,
911-
SvCUR(line),
912-
&first_bad_char_loc)))
913-
{
914-
_force_out_malformed_utf8_message(first_bad_char_loc,
915-
(U8 *) s + SvCUR(line),
916-
0,
917-
MALFORMED_UTF8_DIE);
918-
NOT_REACHED; /* NOTREACHED */
908+
if (SvUTF8(line)) {
909+
ENSURE_NOT_MALFORMED_UTF8((U8 *) s, SvCUR(line));
919910
}
920911

921912
parser->linestr = flags & LEX_START_COPIED
@@ -1542,18 +1533,10 @@ Perl_lex_next_chunk(pTHX_ U32 flags)
15421533
PL_parser->bufptr = buf + bufptr_pos;
15431534

15441535
if (UTF) {
1545-
const U8* first_bad_char_loc;
1546-
if (UNLIKELY(! is_utf8_string_loc(
1547-
(U8 *) PL_parser->bufptr,
1548-
PL_parser->bufend - PL_parser->bufptr,
1549-
&first_bad_char_loc)))
1550-
{
1551-
_force_out_malformed_utf8_message(first_bad_char_loc,
1552-
(U8 *) PL_parser->bufend,
1553-
0,
1554-
MALFORMED_UTF8_DIE);
1555-
NOT_REACHED; /* NOTREACHED */
1556-
}
1536+
ENSURE_NOT_MALFORMED_UTF8(
1537+
(U8 *) PL_parser->bufptr,
1538+
PL_parser->bufend - PL_parser->bufptr
1539+
);
15571540
}
15581541

15591542
PL_parser->oldbufptr = buf + oldbufptr_pos;
@@ -1631,10 +1614,7 @@ Perl_lex_peek_unichar(pTHX_ U32 flags)
16311614
}
16321615
unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
16331616
if (retlen == (STRLEN)-1) {
1634-
_force_out_malformed_utf8_message((U8 *) s,
1635-
(U8 *) bufend,
1636-
0,
1637-
MALFORMED_UTF8_DIE);
1617+
FORCE_OUT_MALFORMED_UTF8_DIE((U8 *) s, (U8 *) bufend);
16381618
NOT_REACHED; /* NOTREACHED */
16391619
}
16401620
return unichar;
@@ -9695,16 +9675,8 @@ Perl_yylex(pTHX)
96959675
char *s = PL_bufptr;
96969676

96979677
if (UNLIKELY(PL_parser->recheck_utf8_validity)) {
9698-
const U8* first_bad_char_loc;
9699-
if (UTF && UNLIKELY(! is_utf8_string_loc((U8 *) PL_bufptr,
9700-
PL_bufend - PL_bufptr,
9701-
&first_bad_char_loc)))
9702-
{
9703-
_force_out_malformed_utf8_message(first_bad_char_loc,
9704-
(U8 *) PL_bufend,
9705-
0,
9706-
MALFORMED_UTF8_DIE);
9707-
NOT_REACHED; /* NOTREACHED */
9678+
if (UTF) {
9679+
ENSURE_NOT_MALFORMED_UTF8((U8 *) PL_bufptr, PL_bufend - PL_bufptr);
97089680
}
97099681
PL_parser->recheck_utf8_validity = FALSE;
97109682
}

utf8.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3308,7 +3308,7 @@ S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
33083308
PERL_ARGS_ASSERT_IS_UTF8_COMMON;
33093309

33103310
if (cp == 0 && (p >= e || *p != '\0')) {
3311-
_force_out_malformed_utf8_message(p, e, 0, MALFORMED_UTF8_DIE);
3311+
FORCE_OUT_MALFORMED_UTF8_DIE(p, e);
33123312
NOT_REACHED; /* NOTREACHED */
33133313
}
33143314

@@ -3853,7 +3853,8 @@ S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
38533853
STRLEN len_result; \
38543854
result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
38553855
if (len_result == (STRLEN) -1) { \
3856-
_force_out_malformed_utf8_message(p, e, 0, MALFORMED_UTF8_DIE ); \
3856+
FORCE_OUT_MALFORMED_UTF8_DIE(p, e); \
3857+
NOT_REACHED; \
38573858
}
38583859

38593860
#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \

utf8.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1315,6 +1315,53 @@ point's representation.
13151315
#define MALFORMED_UTF8_DIE TRUE
13161316
#define MALFORMED_UTF8_WARN FALSE
13171317

1318+
/* Convenience form of FORCE_OUT_MALFORMED_UTF8_DIE_FLAGS() for callers that
 * have no flags to pass: forwards 'start_pos' and 'end_pos' unchanged and
 * supplies 0 as the flags argument.  Expands to a single expression with no
 * trailing semicolon, so it can be used inside a comma expression. */
#define FORCE_OUT_MALFORMED_UTF8_DIE(start_pos, end_pos)                    \
    FORCE_OUT_MALFORMED_UTF8_DIE_FLAGS((start_pos), (end_pos), 0)
1324+
1325+
/* Calls _force_out_malformed_utf8_message() on the bytes delimited by
 * 'start_pos' and 'end_pos', passing MALFORMED_UTF8_DIE as the final
 * argument.  'flags' is forwarded unchanged as the third argument
 * (NOTE(review): presumably the same flag set accepted by utf8n_to_uvchr();
 * confirm against the function's documentation).
 *
 * Every argument is parenthesized in the expansion so that an arbitrary
 * expression can be passed safely.  Expands to a single expression with no
 * trailing semicolon. */
#define FORCE_OUT_MALFORMED_UTF8_DIE_FLAGS(start_pos, end_pos, flags)       \
    _force_out_malformed_utf8_message((start_pos),                          \
                                      (end_pos),                            \
                                      (flags),                              \
                                      MALFORMED_UTF8_DIE)
1332+
1333+
/* Convenience form of FORCE_OUT_MALFORMED_UTF8_WARN_FLAGS() for callers that
 * have no flags to pass: forwards 'start_pos' and 'end_pos' unchanged and
 * supplies 0 as the flags argument.  Expands to a single expression with no
 * trailing semicolon. */
#define FORCE_OUT_MALFORMED_UTF8_WARN(start_pos, end_pos)                   \
    FORCE_OUT_MALFORMED_UTF8_WARN_FLAGS((start_pos), (end_pos), 0)
1339+
1340+
/* Calls _force_out_malformed_utf8_message() on the bytes delimited by
 * 'start_pos' and 'end_pos', passing MALFORMED_UTF8_WARN as the final
 * argument.  'flags' is forwarded unchanged as the third argument
 * (NOTE(review): presumably the same flag set accepted by utf8n_to_uvchr();
 * confirm against the function's documentation).
 *
 * Every argument is parenthesized in the expansion so that an arbitrary
 * expression can be passed safely.  Expands to a single expression with no
 * trailing semicolon. */
#define FORCE_OUT_MALFORMED_UTF8_WARN_FLAGS(start_pos, end_pos, flags)      \
    _force_out_malformed_utf8_message((start_pos),                          \
                                      (end_pos),                            \
                                      (flags),                              \
                                      MALFORMED_UTF8_WARN)
1347+
1348+
/* Checks the 'length' bytes starting at 'start_pos' with
 * is_utf8_string_loc(); if the check fails, hands the location it reported
 * and the end of the buffer (start_pos + length) to
 * FORCE_OUT_MALFORMED_UTF8_DIE().
 *
 * This is a statement macro: write it followed by a semicolon.  It is wrapped
 * in do { ... } while (0) so that it behaves as exactly one statement, e.g.
 *
 *     if (cond) ENSURE_NOT_MALFORMED_UTF8(p, len); else ...
 *
 * compiles and the 'else' binds to the caller's 'if'.
 *
 * 'start_pos' and 'length' are each evaluated exactly once, so arguments with
 * side effects are safe.  'start_pos' must be convertible to 'const U8 *'
 * and 'length' to STRLEN. */
#define ENSURE_NOT_MALFORMED_UTF8(start_pos, length)                        \
    do {                                                                    \
        const U8 * const ensure_utf8_start_ = (start_pos);                  \
        const STRLEN ensure_utf8_len_ = (length);                           \
        const U8 * ensure_utf8_first_bad_;                                  \
        if (UNLIKELY(! is_utf8_string_loc(ensure_utf8_start_,               \
                                          ensure_utf8_len_,                 \
                                          &ensure_utf8_first_bad_)))        \
        {                                                                   \
            FORCE_OUT_MALFORMED_UTF8_DIE(                                   \
                ensure_utf8_first_bad_,                                     \
                ensure_utf8_start_ + ensure_utf8_len_);                     \
            NOT_REACHED;                                                    \
        }                                                                   \
    } while (0)
1364+
13181365
#endif /* PERL_UTF8_H_ */
13191366

13201367
/*

0 commit comments

Comments
 (0)