Skip to content

Commit ff43150

Browse files
committed
regcomp.c - Resolve issues clearing buffers in CURLYX (MAJOR-CHANGE)
CURLYX doesn't reset capture buffers properly. It is possible for multiple buffers to be defined at once with values from different iterations of the loop, which doesn't make sense really. An example is this: "foobarfoo"=~/((foo)|(bar))+/ after this matches $1 should equal $2 and $3 should be undefined, or $1 should equal $3 and $2 should be undefined. Prior to this patch this would not be the case. The solution that this patches uses is to introduce a form of "layered transactional storage" for paren data. The existing pair of start/end data for capture data is extended with a start_new/end_new pair. When the vast majority of our code wants to check if a given capture buffer is defined they first check "start_new/end_new", if either is -1 then they fall back to whatever is in start/end. When a capture buffer is CLOSEd the data is written into the start_new/end_new pair instead of the start/end pair. When a CURLYX loop is executing and has matched something (at least one "A" in /A*B/ -- thus actually in WHILEM) it "commits" the start_new/end_new data by writing it into start/end. When we begin a new iteration of the loop we clear the start_new/end_new pairs that are contained by the loop, by setting them to -1. If the loop fails then we roll back as we used to. If the loop succeeds we continue. When we hit an END block we commit everything. Consider the example above. We start off with everything set to -1. $1 = (-1,-1):(-1,-1) $2 = (-1,-1):(-1,-1) $3 = (-1,-1):(-1,-1) In the first iteration we have matched "foo" and end up with this: $1 = (-1,-1):( 0, 3) $2 = (-1,-1):( 0, 3) $3 = (-1,-1):(-1,-1) We commit the results of $2 and $3, and then clear the new data in the beginning of the next loop: $1 = (-1,-1):( 0, 3) $2 = ( 0, 3):(-1,-1) $3 = (-1,-1):(-1,-1) We then match "bar": $1 = (-1,-1):( 0, 3) $2 = ( 0, 3):(-1,-1) $3 = (-1,-1):( 3, 7) and then commit the result and clear the new data: $1 = (-1,-1):( 0, 3) $2 = (-1,-1):(-1,-1) $3 = ( 3, 7):(-1,-1) and then we match "foo" again: $1 = (-1,-1):( 0, 3) $2 = (-1,-1):( 7,10) $3 = ( 3, 7):(-1,-1) And we then commit. We do a regcppush here as normal. $1 = (-1,-1):( 0, 3) $2 = ( 7,10):( 7,10) $3 = (-1,-1):(-1,-1) We then clear it again, but since we don't match when we regcppop we store the buffers back to the above layout. When we finally hit the END buffer we also do a commit as well on all buffers, including the 0th (for the full match). Fixes GH Issue #18865, and adds tests for it and other things.
1 parent fcf5001 commit ff43150

File tree

12 files changed

+221
-50
lines changed

12 files changed

+221
-50
lines changed

pod/perldebguts.pod

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -752,13 +752,13 @@ will be lost.
752752
PLUS node Match this (simple) thing 1 or more times:
753753
/A{1,}B/ where A is width 1 char
754754

755-
CURLY sv 2 Match this (simple) thing {n,m} times:
755+
CURLY sv 4 Match this (simple) thing {n,m} times:
756756
/A{m,n}B/ where A is width 1 char
757-
CURLYN no 2 Capture next-after-this simple thing:
757+
CURLYN no 4 Capture next-after-this simple thing:
758758
/(A){m,n}B/ where A is width 1 char
759-
CURLYM no 2 Capture this medium-complex thing {n,m}
759+
CURLYM no 4 Capture this medium-complex thing {n,m}
760760
times: /(A){m,n}B/ where A is fixed-length
761-
CURLYX sv 2 Match/Capture this complex thing {n,m}
761+
CURLYX sv 4 Match/Capture this complex thing {n,m}
762762
times.
763763

764764
# This terminator creates a loop structure for CURLYX

pp_ctl.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -381,9 +381,9 @@ Perl_rxres_save(pTHX_ void **rsp, REGEXP *rx)
381381

382382
if (!p || p[1] < RX_NPARENS(rx)) {
383383
#ifdef PERL_ANY_COW
384-
i = 7 + (RX_NPARENS(rx)+1) * 2;
384+
i = 7 + (RX_NPARENS(rx)+1) * 4;
385385
#else
386-
i = 6 + (RX_NPARENS(rx)+1) * 2;
386+
i = 6 + (RX_NPARENS(rx)+1) * 4;
387387
#endif
388388
if (!p)
389389
Newx(p, i, UV);
@@ -409,6 +409,8 @@ Perl_rxres_save(pTHX_ void **rsp, REGEXP *rx)
409409
for (i = 0; i <= RX_NPARENS(rx); ++i) {
410410
*p++ = (UV)RX_OFFSp(rx)[i].start;
411411
*p++ = (UV)RX_OFFSp(rx)[i].end;
412+
*p++ = (UV)RX_OFFSp(rx)[i].start_new;
413+
*p++ = (UV)RX_OFFSp(rx)[i].end_new;
412414
}
413415
}
414416

@@ -440,6 +442,8 @@ S_rxres_restore(pTHX_ void **rsp, REGEXP *rx)
440442
for (i = 0; i <= RX_NPARENS(rx); ++i) {
441443
RX_OFFSp(rx)[i].start = (I32)(*p++);
442444
RX_OFFSp(rx)[i].end = (I32)(*p++);
445+
RX_OFFSp(rx)[i].start_new = (I32)(*p++);
446+
RX_OFFSp(rx)[i].end_new = (I32)(*p++);
443447
}
444448
}
445449

regcomp.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4494,6 +4494,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
44944494
const char * const origparse = RExC_parse;
44954495
I32 min;
44964496
I32 max = REG_INFTY;
4497+
I32 npar_before = RExC_npar-1;
44974498

44984499
/* Save the original in case we change the emitted regop to a FAIL. */
44994500
const regnode_offset orig_emit = RExC_emit;
@@ -4509,6 +4510,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
45094510
RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
45104511
FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
45114512
}
4513+
I32 npar_after = RExC_npar-1;
45124514

45134515
op = *RExC_parse;
45144516
switch (op) {
@@ -4674,6 +4676,17 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
46744676
ARG1_SET(REGNODE_p(ret), min);
46754677
ARG2_SET(REGNODE_p(ret), max);
46764678

4679+
/* if we had a npar_after then we need to increment npar_before,
4680+
* we want to track the range of parens we need to reset each iteration
4681+
*/
4682+
if (npar_after!=npar_before) {
4683+
ARG3_SET(REGNODE_p(ret), (U16)npar_before+1);
4684+
ARG4_SET(REGNODE_p(ret), (U16)npar_after);
4685+
} else {
4686+
ARG3_SET(REGNODE_p(ret), 0);
4687+
ARG4_SET(REGNODE_p(ret), 0);
4688+
}
4689+
46774690
done_main_op:
46784691

46794692
/* Process any greediness modifiers */

regcomp.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,10 @@ struct regnode_2L {
216216
I32 arg2;
217217
};
218218

219-
/* 'Two field' -- Two 32 bit signed args */
219+
/* 'Two field' -- Two 32 bit signed args.
220+
* First fields must match regnode. Currently unused except to
221+
* facilitate regnode_4 behavior. Not simplifying that as this
222+
* node type could still be useful for other regops. */
220223
struct regnode_2 {
221224
U8 flags;
222225
U8 type;
@@ -225,6 +228,19 @@ struct regnode_2 {
225228
I32 arg2;
226229
};
227230

231+
/* 'Four field' -- Two 32 bit signed args, Two 16 bit unsigned args
232+
* Used for CURLY and CURLYX node types to track min/max and
233+
* first_paren/last_paren. First fields must match regnode_2 */
234+
struct regnode_4 {
235+
U8 flags;
236+
U8 type;
237+
U16 next_off;
238+
I32 arg1;
239+
I32 arg2;
240+
U16 arg3;
241+
U16 arg4;
242+
};
243+
228244
#define REGNODE_BBM_BITMAP_LEN \
229245
/* 6 info bits requires 64 bits; 5 => 32 */ \
230246
((1 << (UTF_CONTINUATION_BYTE_INFO_BITS)) / CHARBITS)
@@ -346,11 +362,15 @@ struct regnode_ssc {
346362
#define ARGp(p) ARGp_VALUE_inline(p)
347363
#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
348364
#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
365+
#define ARG3(p) ARG_VALUE(ARG3_LOC(p))
366+
#define ARG4(p) ARG_VALUE(ARG4_LOC(p))
349367
#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p))
350368

351369
#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
352370
#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
353371
#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
372+
#define ARG3_SET(p, val) ARG__SET(ARG3_LOC(p), (val))
373+
#define ARG4_SET(p, val) ARG__SET(ARG4_LOC(p), (val))
354374
#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val))
355375
#define ARGp_SET(p, val) ARGp_SET_inline((p),(val))
356376

@@ -436,6 +456,8 @@ struct regnode_ssc {
436456
#define ARGp_BYTES_LOC(p) (((struct regnode_p *)p)->arg1_sv_ptr_bytes)
437457
#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
438458
#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
459+
#define ARG3_LOC(p) (((struct regnode_4 *)p)->arg3)
460+
#define ARG4_LOC(p) (((struct regnode_4 *)p)->arg4)
439461
#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2)
440462

441463

regcomp.sym

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -217,10 +217,10 @@ TAIL NOTHING, no ; Match empty string. Can jump here from outsi
217217
STAR STAR, node 0 V ; Match this (simple) thing 0 or more times: /A{0,}B/ where A is width 1 char
218218
PLUS PLUS, node 0 V ; Match this (simple) thing 1 or more times: /A{1,}B/ where A is width 1 char
219219

220-
CURLY CURLY, sv 2 V ; Match this (simple) thing {n,m} times: /A{m,n}B/ where A is width 1 char
221-
CURLYN CURLY, no 2 V ; Capture next-after-this simple thing: /(A){m,n}B/ where A is width 1 char
222-
CURLYM CURLY, no 2 V ; Capture this medium-complex thing {n,m} times: /(A){m,n}B/ where A is fixed-length
223-
CURLYX CURLY, sv 2 V ; Match/Capture this complex thing {n,m} times.
220+
CURLY CURLY, sv 4 V ; Match this (simple) thing {n,m} times: /A{m,n}B/ where A is width 1 char
221+
CURLYN CURLY, no 4 V ; Capture next-after-this simple thing: /(A){m,n}B/ where A is width 1 char
222+
CURLYM CURLY, no 4 V ; Capture this medium-complex thing {n,m} times: /(A){m,n}B/ where A is fixed-length
223+
CURLYX CURLY, sv 4 V ; Match/Capture this complex thing {n,m} times.
224224

225225
#*This terminator creates a loop structure for CURLYX
226226
WHILEM WHILEM, no 0 V ; Do curly processing and see if rest matches.

regcomp_debug.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
464464
}
465465
} else if (k == CURLY) {
466466
U32 lo = ARG1(o), hi = ARG2(o);
467+
if (ARG3(o) || ARG4(o))
468+
Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3(o),ARG4(o)); /* paren before, paren after */
467469
if (op == CURLYM || op == CURLYN || op == CURLYX)
468470
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
469471
Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);

0 commit comments

Comments
 (0)