Skip to content

Commit 8c9a529

Browse files
committed
Fix \b and \B with PCRE2_EXTRA_ASCII_BSW in the interpreters
1 parent 26d5cd9 commit 8c9a529

File tree

7 files changed: +104 -29 lines changed

src/pcre2_compile.c

+31-11
Original file line numberDiff line numberDiff line change
@@ -4950,6 +4950,8 @@ for (;;)
49504950

49514951
case OP_WORD_BOUNDARY:
49524952
case OP_NOT_WORD_BOUNDARY:
4953+
case OP_UCP_WORD_BOUNDARY:
4954+
case OP_NOT_UCP_WORD_BOUNDARY:
49534955
if (!skipassert) return code;
49544956
/* Fall through */
49554957

@@ -8032,23 +8034,41 @@ for (;; pptr++)
80328034

80338035
/* For the rest (including \X when Unicode is supported - if not it's
80348036
faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8035-
not set; if it is set, these escapes do not show up here because they are
8036-
converted into Unicode property tests in parse_regex(). Note that \b and \B
8037-
do a one-character lookbehind, and \A also behaves as if it does. */
8037+
not set; if it is set, most of them do not show up here because they are
8038+
converted into Unicode property tests in parse_regex().
80388039
8039-
if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
8040-
if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
8041-
cb->max_lookbehind == 0)
8042-
cb->max_lookbehind = 1;
8040+
In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8041+
instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8042+
There are special UCP codes for \B and \b which are used in UCP mode unless
8043+
"word" matching is being forced to ASCII.
80438044
8044-
/* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8045-
instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
8045+
Note that \b and \B do a one-character lookbehind, and \A also behaves as
8046+
if it does. */
80468047

8048+
switch(meta_arg)
8049+
{
8050+
case ESC_C:
8051+
cb->external_flags |= PCRE2_HASBKC; /* Record */
80478052
#if PCRE2_CODE_UNIT_WIDTH == 32
8048-
*code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
8053+
meta_arg = OP_ALLANY;
80498054
#else
8050-
*code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
8055+
if (!utf) meta_arg = OP_ALLANY;
80518056
#endif
8057+
break;
8058+
8059+
case ESC_B:
8060+
case ESC_b:
8061+
if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8062+
meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8063+
OP_UCP_WORD_BOUNDARY;
8064+
/* Fall through */
8065+
8066+
case ESC_A:
8067+
if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8068+
break;
8069+
}
8070+
8071+
*code++ = meta_arg;
80528072
break; /* End META_ESCAPE */
80538073

80548074

src/pcre2_dfa_match.c

+14-6
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
77
88
Written by Philip Hazel
99
Original API code Copyright (c) 1997-2012 University of Cambridge
10-
New API code Copyright (c) 2016-2022 University of Cambridge
10+
New API code Copyright (c) 2016-2023 University of Cambridge
1111
1212
-----------------------------------------------------------------------------
1313
Redistribution and use in source and binary forms, with or without
@@ -187,7 +187,8 @@ static const uint8_t coptable[] = {
187187
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
188188
0, 0, /* COMMIT, COMMIT_ARG */
189189
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
190-
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
190+
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
191+
0, 0 /* \B and \b in UCP mode */
191192
};
192193

193194
/* This table identifies those opcodes that inspect a character. It is used to
@@ -264,7 +265,8 @@ static const uint8_t poptable[] = {
264265
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
265266
0, 0, /* COMMIT, COMMIT_ARG */
266267
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
267-
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
268+
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
269+
1, 1 /* \B and \b in UCP mode */
268270
};
269271

270272
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@@ -1100,6 +1102,8 @@ for (;;)
11001102
/*-----------------------------------------------------------------*/
11011103
case OP_WORD_BOUNDARY:
11021104
case OP_NOT_WORD_BOUNDARY:
1105+
case OP_NOT_UCP_WORD_BOUNDARY:
1106+
case OP_UCP_WORD_BOUNDARY:
11031107
{
11041108
int left_word, right_word;
11051109

@@ -1112,7 +1116,8 @@ for (;;)
11121116
#endif
11131117
GETCHARTEST(d, temp);
11141118
#ifdef SUPPORT_UNICODE
1115-
if ((mb->poptions & PCRE2_UCP) != 0)
1119+
if (codevalue == OP_UCP_WORD_BOUNDARY ||
1120+
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
11161121
{
11171122
if (d == '_') left_word = TRUE; else
11181123
{
@@ -1137,7 +1142,8 @@ for (;;)
11371142
mb->last_used_ptr = temp;
11381143
}
11391144
#ifdef SUPPORT_UNICODE
1140-
if ((mb->poptions & PCRE2_UCP) != 0)
1145+
if (codevalue == OP_UCP_WORD_BOUNDARY ||
1146+
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
11411147
{
11421148
if (c == '_') right_word = TRUE; else
11431149
{
@@ -1151,7 +1157,9 @@ for (;;)
11511157
}
11521158
else right_word = FALSE;
11531159

1154-
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1160+
if ((left_word == right_word) ==
1161+
(codevalue == OP_NOT_WORD_BOUNDARY ||
1162+
codevalue == OP_NOT_UCP_WORD_BOUNDARY))
11551163
{ ADD_ACTIVE(state_offset + 1, 0); }
11561164
}
11571165
break;

src/pcre2_internal.h

+12-5
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
77
88
Written by Philip Hazel
99
Original API code Copyright (c) 1997-2012 University of Cambridge
10-
New API code Copyright (c) 2016-2022 University of Cambridge
10+
New API code Copyright (c) 2016-2023 University of Cambridge
1111
1212
-----------------------------------------------------------------------------
1313
Redistribution and use in source and binary forms, with or without
@@ -1372,8 +1372,8 @@ enum {
13721372
OP_SOD, /* 1 Start of data: \A */
13731373
OP_SOM, /* 2 Start of match (subject + offset): \G */
13741374
OP_SET_SOM, /* 3 Set start of match (\K) */
1375-
OP_NOT_WORD_BOUNDARY, /* 4 \B */
1376-
OP_WORD_BOUNDARY, /* 5 \b */
1375+
OP_NOT_WORD_BOUNDARY, /* 4 \B -- see also OP_NOT_UCP_WORD_BOUNDARY */
1376+
OP_WORD_BOUNDARY, /* 5 \b -- see also OP_UCP_WORD_BOUNDARY */
13771377
OP_NOT_DIGIT, /* 6 \D */
13781378
OP_DIGIT, /* 7 \d */
13791379
OP_NOT_WHITESPACE, /* 8 \S */
@@ -1620,6 +1620,12 @@ enum {
16201620

16211621
OP_DEFINE, /* 167 */
16221622

1623+
/* These opcodes replace their normal counterparts in UCP mode when
1624+
PCRE2_EXTRA_ASCII_BSW is not set. */
1625+
1626+
OP_NOT_UCP_WORD_BOUNDARY, /* 168 */
1627+
OP_UCP_WORD_BOUNDARY, /* 169 */
1628+
16231629
/* This is not an opcode, but is used to check that tables indexed by opcode
16241630
are the correct length, in order to catch updating errors - there have been
16251631
some in the past. */
@@ -1679,7 +1685,7 @@ some cases doesn't actually use these names at all). */
16791685
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
16801686
"*THEN", "*THEN", "*COMMIT", "*COMMIT", "*FAIL", \
16811687
"*ACCEPT", "*ASSERT_ACCEPT", \
1682-
"Close", "Skip zero", "Define"
1688+
"Close", "Skip zero", "Define", "\\B (ucp)", "\\b (ucp)"
16831689

16841690

16851691
/* This macro defines the length of fixed length operations in the compiled
@@ -1775,7 +1781,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
17751781
1, 3, /* COMMIT, COMMIT_ARG */ \
17761782
1, 1, 1, /* FAIL, ACCEPT, ASSERT_ACCEPT */ \
17771783
1+IMM2_SIZE, 1, /* CLOSE, SKIPZERO */ \
1778-
1 /* DEFINE */
1784+
1, /* DEFINE */ \
1785+
1, 1 /* \B and \b in UCP mode */
17791786

17801787
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
17811788

src/pcre2_match.c

+7-5
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
77
88
Written by Philip Hazel
99
Original API code Copyright (c) 1997-2012 University of Cambridge
10-
New API code Copyright (c) 2015-2022 University of Cambridge
10+
New API code Copyright (c) 2015-2023 University of Cambridge
1111
1212
-----------------------------------------------------------------------------
1313
Redistribution and use in source and binary forms, with or without
@@ -6060,6 +6060,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
60606060

60616061
case OP_NOT_WORD_BOUNDARY:
60626062
case OP_WORD_BOUNDARY:
6063+
case OP_NOT_UCP_WORD_BOUNDARY:
6064+
case OP_UCP_WORD_BOUNDARY:
60636065
if (Feptr == mb->check_subject) prev_is_word = FALSE; else
60646066
{
60656067
PCRE2_SPTR lastptr = Feptr - 1;
@@ -6074,7 +6076,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
60746076
fc = *lastptr;
60756077
if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
60766078
#ifdef SUPPORT_UNICODE
6077-
if ((mb->poptions & PCRE2_UCP) != 0)
6079+
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
60786080
{
60796081
if (fc == '_') prev_is_word = TRUE; else
60806082
{
@@ -6108,7 +6110,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
61086110
fc = *Feptr;
61096111
if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
61106112
#ifdef SUPPORT_UNICODE
6111-
if ((mb->poptions & PCRE2_UCP) != 0)
6113+
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
61126114
{
61136115
if (fc == '_') cur_is_word = TRUE; else
61146116
{
@@ -6123,7 +6125,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
61236125

61246126
/* Now see if the situation is what we want */
61256127

6126-
if ((*Fecode++ == OP_WORD_BOUNDARY)?
6128+
if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)?
61276129
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
61286130
RRETURN(MATCH_NOMATCH);
61296131
break;
@@ -6853,7 +6855,7 @@ if (heapframes_size / 1024 > mb->heap_limit)
68536855
if (max_size < frame_size) return PCRE2_ERROR_HEAPLIMIT;
68546856
heapframes_size = max_size;
68556857
}
6856-
6858+
68576859
/* If an existing frame vector in the match_data block is large enough, we can
68586860
use it. Otherwise, free any pre-existing vector and get a new one. */
68596861

src/pcre2_study.c

+5-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
77
88
Written by Philip Hazel
99
Original API code Copyright (c) 1997-2012 University of Cambridge
10-
New API code Copyright (c) 2016-2021 University of Cambridge
10+
New API code Copyright (c) 2016-2023 University of Cambridge
1111
1212
-----------------------------------------------------------------------------
1313
Redistribution and use in source and binary forms, with or without
@@ -273,6 +273,8 @@ for (;;)
273273
case OP_DOLLM:
274274
case OP_NOT_WORD_BOUNDARY:
275275
case OP_WORD_BOUNDARY:
276+
case OP_NOT_UCP_WORD_BOUNDARY:
277+
case OP_UCP_WORD_BOUNDARY:
276278
cc += PRIV(OP_lengths)[*cc];
277279
break;
278280

@@ -1101,6 +1103,8 @@ do
11011103

11021104
case OP_WORD_BOUNDARY:
11031105
case OP_NOT_WORD_BOUNDARY:
1106+
case OP_UCP_WORD_BOUNDARY:
1107+
case OP_NOT_UCP_WORD_BOUNDARY:
11041108
tcode++;
11051109
break;
11061110

testdata/testinput5

+15
Original file line numberDiff line numberDiff line change
@@ -2411,6 +2411,21 @@
24112411
\= Expect no match
24122412
\x{660}\x{c0}\x{c0}
24132413

2414+
# WORD BOUNDARY
2415+
2416+
/\bABC\b/utf
2417+
\x{c0}ABC\x{d0}
2418+
2419+
/\bABC\b/utf,ucp
2420+
\= Expect no match
2421+
\x{c0}ABC\x{d0}
2422+
2423+
/\bABC\b/utf,ucp,ascii_bsw
2424+
\x{c0}ABC\x{d0}\=no_jit
2425+
2426+
/\bABC\b/utf,ucp,ascii_all
2427+
\x{c0}ABC\x{d0}\=no_jit
2428+
24142429
# POSIX
24152430

24162431
/[[:digit:]]+/utf,ucp

testdata/testoutput5

+20-1
Original file line numberDiff line numberDiff line change
@@ -4016,7 +4016,7 @@ MK: a\x{12345}b\x{09}(d)c
40164016
/(*UCP)(*UTF)[[:>:]]X/B
40174017
------------------------------------------------------------------
40184018
Bra
4019-
\b
4019+
\b (ucp)
40204020
Assert back
40214021
Reverse
40224022
prop Xwd
@@ -5334,6 +5334,25 @@ No match
53345334
\x{660}\x{c0}\x{c0}
53355335
No match
53365336

5337+
# WORD BOUNDARY
5338+
5339+
/\bABC\b/utf
5340+
\x{c0}ABC\x{d0}
5341+
0: ABC
5342+
5343+
/\bABC\b/utf,ucp
5344+
\= Expect no match
5345+
\x{c0}ABC\x{d0}
5346+
No match
5347+
5348+
/\bABC\b/utf,ucp,ascii_bsw
5349+
\x{c0}ABC\x{d0}\=no_jit
5350+
0: ABC
5351+
5352+
/\bABC\b/utf,ucp,ascii_all
5353+
\x{c0}ABC\x{d0}\=no_jit
5354+
0: ABC
5355+
53375356
# POSIX
53385357

53395358
/[[:digit:]]+/utf,ucp

0 commit comments

Comments
 (0)