Skip to content

Commit 1d6325f

Browse files
committed
work in progress - reset branches properly
If a branch fails, we should reset all the capture groups (parens) it contained before we execute the next branch. This implementation is almost certainly not as efficient as it could be.
1 parent e169495 commit 1d6325f

File tree

10 files changed

+182
-31
lines changed

10 files changed

+182
-31
lines changed

pod/perldebguts.pod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,7 @@ will be lost.
668668
# pointer of each individual branch points; each branch
669669
# starts with the operand node of a BRANCH node.
670670
#
671-
BRANCH node Match this alternative, or the next...
671+
BRANCH node 1 Match this alternative, or the next...
672672

673673
# Literals
674674

@@ -796,7 +796,7 @@ will be lost.
796796

797797
# Support for long RE
798798
LONGJMP off 1 1 Jump far away.
799-
BRANCHJ off 1 1 BRANCH with long offset.
799+
BRANCHJ off 2L 1 BRANCH with long offset.
800800

801801
# Special Case Regops
802802
IFMATCH off 1 1 Succeeds if the following matches; non-zero

regcomp.c

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3975,6 +3975,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
39753975
parse_rest:
39763976
/* Pick up the branches, linking them together. */
39773977
segment_parse_start = RExC_parse;
3978+
I32 npar_before_regbranch = RExC_npar - 1;
39783979
br = regbranch(pRExC_state, &flags, 1, depth+1);
39793980

39803981
/* branch_len = (paren != 0); */
@@ -3986,9 +3987,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
39863987
if (*RExC_parse == '|') {
39873988
if (RExC_use_BRANCHJ) {
39883989
reginsert(pRExC_state, BRANCHJ, br, depth+1);
3990+
ARG2La_SET(REGNODE_p(br), npar_before_regbranch);
3991+
ARG2Lb_SET(REGNODE_p(br), (U16)RExC_npar - 1);
39893992
}
39903993
else {
39913994
reginsert(pRExC_state, BRANCH, br, depth+1);
3995+
ARGa_SET(REGNODE_p(br), (U16)npar_before_regbranch);
3996+
ARGb_SET(REGNODE_p(br), (U16)RExC_npar - 1);
39923997
}
39933998
have_branch = 1;
39943999
}
@@ -4031,6 +4036,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
40314036
if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
40324037
REQUIRE_BRANCHJ(flagp, 0);
40334038
}
4039+
assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ);
4040+
assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ);
4041+
if (OP(REGNODE_p(br)) == BRANCH) {
4042+
if (OP(REGNODE_p(lastbr)) == BRANCH)
4043+
ARGb_SET(REGNODE_p(lastbr),ARGa(REGNODE_p(br)));
4044+
else
4045+
ARG2Lb_SET(REGNODE_p(lastbr),ARGa(REGNODE_p(br)));
4046+
}
4047+
else
4048+
if (OP(REGNODE_p(br)) == BRANCHJ) {
4049+
if (OP(REGNODE_p(lastbr)) == BRANCH)
4050+
ARGb_SET(REGNODE_p(lastbr),ARG2La(REGNODE_p(br)));
4051+
else
4052+
ARG2Lb_SET(REGNODE_p(lastbr),ARG2La(REGNODE_p(br)));
4053+
}
4054+
40344055
lastbr = br;
40354056
*flagp |= flags & (HASWIDTH | POSTPONED);
40364057
}
@@ -4104,6 +4125,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
41044125
(IV)(ender - lastbr)
41054126
);
41064127
});
4128+
if (OP(REGNODE_p(lastbr)) == BRANCH) {
4129+
ARGb_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4130+
}
4131+
else
4132+
if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
4133+
ARG2Lb_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4134+
}
4135+
41074136
if (! REGTAIL(pRExC_state, lastbr, ender)) {
41084137
REQUIRE_BRANCHJ(flagp, 0);
41094138
}
@@ -4247,6 +4276,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
42474276
regnode_offset ret;
42484277
regnode_offset chain = 0;
42494278
regnode_offset latest;
4279+
regnode *branch_node = NULL;
42504280
I32 flags = 0, c = 0;
42514281
DECLARE_AND_GET_RE_DEBUG_FLAGS;
42524282

@@ -4257,10 +4287,14 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
42574287
if (first)
42584288
ret = 0;
42594289
else {
4260-
if (RExC_use_BRANCHJ)
4261-
ret = reganode(pRExC_state, BRANCHJ, 0);
4262-
else {
4263-
ret = reg_node(pRExC_state, BRANCH);
4290+
if (RExC_use_BRANCHJ) {
4291+
ret = reg2Lanode(pRExC_state, BRANCHJ, 0, 0);
4292+
branch_node = REGNODE_p(ret);
4293+
ARG2La_SET(branch_node, (U16)RExC_npar-1);
4294+
} else {
4295+
ret = reganode(pRExC_state, BRANCH, 0);
4296+
branch_node = REGNODE_p(ret);
4297+
ARGa_SET(branch_node, (U16)RExC_npar-1);
42644298
}
42654299
}
42664300

@@ -4298,11 +4332,10 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
42984332
chain = reg_node(pRExC_state, NOTHING);
42994333
if (ret == 0)
43004334
ret = chain;
4301-
}
4335+
}
43024336
if (c == 1) {
4303-
*flagp |= flags&SIMPLE;
4337+
*flagp |= flags & SIMPLE;
43044338
}
4305-
43064339
return ret;
43074340
}
43084341

@@ -13191,6 +13224,10 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
1319113224
PerlMemShared_free(trie->bitmap);
1319213225
if (trie->jump)
1319313226
PerlMemShared_free(trie->jump);
13227+
if (trie->j_before_paren)
13228+
PerlMemShared_free(trie->j_before_paren);
13229+
if (trie->j_after_paren)
13230+
PerlMemShared_free(trie->j_after_paren);
1319413231
PerlMemShared_free(trie->wordinfo);
1319513232
/* do this last!!!! */
1319613233
PerlMemShared_free(ri->data->data[n]);

regcomp.h

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,13 @@ struct regnode_1 {
182182
U8 flags;
183183
U8 type;
184184
U16 next_off;
185-
U32 arg1;
185+
union {
186+
U32 arg1;
187+
struct {
188+
U16 arg1a;
189+
U16 arg1b;
190+
};
191+
};
186192
};
187193

188194
/* Node whose argument is 'SV *'. This needs to be used very carefully in
@@ -213,7 +219,13 @@ struct regnode_2L {
213219
U8 type;
214220
U16 next_off;
215221
U32 arg1;
216-
I32 arg2;
222+
union {
223+
I32 arg2;
224+
struct {
225+
U16 arg2a;
226+
U16 arg2b;
227+
};
228+
};
217229
};
218230

219231
/* 'Two field' -- Two 32 bit signed args */
@@ -357,16 +369,24 @@ struct regnode_ssc {
357369

358370
#define ARG(p) ARG_VALUE(ARG_LOC(p))
359371
#define ARGp(p) ARGp_VALUE_inline(p)
372+
#define ARGa(p) ARG_VALUE(ARGa_LOC(p))
373+
#define ARGb(p) ARG_VALUE(ARGb_LOC(p))
360374
#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
361375
#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
362376
#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p))
377+
#define ARG2La(p) ARG_VALUE(ARG2La_LOC(p))
378+
#define ARG2Lb(p) ARG_VALUE(ARG2Lb_LOC(p))
363379
#define ARG3(p) ARG_VALUE(ARG3_LOC(p))
364380
#define ARG4(p) ARG_VALUE(ARG4_LOC(p))
365381

366382
#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
383+
#define ARGa_SET(p, val) ARG__SET(ARGa_LOC(p), (val))
384+
#define ARGb_SET(p, val) ARG__SET(ARGb_LOC(p), (val))
367385
#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
368386
#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
369387
#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val))
388+
#define ARG2La_SET(p, val) ARG__SET(ARG2La_LOC(p), (val))
389+
#define ARG2Lb_SET(p, val) ARG__SET(ARG2Lb_LOC(p), (val))
370390
#define ARGp_SET(p, val) ARGp_SET_inline((p),(val))
371391
#define ARG3_SET(p, val) ARG__SET(ARG3_LOC(p), (val))
372392
#define ARG4_SET(p, val) ARG__SET(ARG4_LOC(p), (val))
@@ -450,10 +470,14 @@ struct regnode_ssc {
450470

451471
#define NODE_ALIGN(node)
452472
#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
473+
#define ARGa_LOC(p) (((struct regnode_1 *)p)->arg1a)
474+
#define ARGb_LOC(p) (((struct regnode_1 *)p)->arg1b)
453475
#define ARGp_BYTES_LOC(p) (((struct regnode_p *)p)->arg1_sv_ptr_bytes)
454476
#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
455477
#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
456478
#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2)
479+
#define ARG2La_LOC(p) (((struct regnode_2L *)p)->arg2a)
480+
#define ARG2Lb_LOC(p) (((struct regnode_2L *)p)->arg2b)
457481
#define ARG3_LOC(p) (((struct regnode_4 *)p)->arg3)
458482
#define ARG4_LOC(p) (((struct regnode_4 *)p)->arg4)
459483

@@ -1143,6 +1167,11 @@ struct _reg_trie_data {
11431167
char *bitmap; /* stclass bitmap */
11441168
U16 *jump; /* optional 1 indexed array of offsets before tail
11451169
for the node following a given word. */
1170+
U16 *j_before_paren; /* optional 1 indexed array of parno reset data
1171+
for the given jump. */
1172+
U16 *j_after_paren; /* optional 1 indexed array of parno reset data
1173+
for the given jump. */
1174+
11461175
reg_trie_wordinfo *wordinfo; /* array of info per word */
11471176
U16 uniquecharcount; /* unique chars in trie (width of trans table) */
11481177
U32 startstate; /* initial state - used for common prefix optimisation */
@@ -1152,6 +1181,8 @@ struct _reg_trie_data {
11521181
U32 statecount; /* Build only - number of states in the states array
11531182
(including the unused zero state) */
11541183
U32 wordcount; /* Build only */
1184+
U16 before_paren;
1185+
U16 after_paren;
11551186
#ifdef DEBUGGING
11561187
STRLEN charcount; /* Build only */
11571188
#endif

regcomp.sym

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence
140140
#* pointer of each individual branch points; each branch
141141
#* starts with the operand node of a BRANCH node.
142142
#*
143-
BRANCH BRANCH, node 0 V ; Match this alternative, or the next...
143+
BRANCH BRANCH, node 1 V ; Match this alternative, or the next...
144144

145145
#*Literals
146146
# NOTE: the relative ordering of these types is important do not change it
@@ -252,7 +252,7 @@ REFFAN REF, num 1 V ; Match already matched string, using /aai rul
252252

253253
#*Support for long RE
254254
LONGJMP LONGJMP, off 1 . 1 ; Jump far away.
255-
BRANCHJ BRANCHJ, off 1 V 1 ; BRANCH with long offset.
255+
BRANCHJ BRANCHJ, off 2L V 1 ; BRANCH with long offset.
256256

257257
#*Special Case Regops
258258
IFMATCH BRANCHJ, off 1 . 1 ; Succeeds if the following matches; non-zero flags "f", next_off "o" means lookbehind assertion starting "f..(f-o)" characters before current

regcomp_debug.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -404,8 +404,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
404404
sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
405405

406406
k = REGNODE_TYPE(op);
407-
408-
if (k == EXACT) {
407+
if (op == BRANCH) {
408+
Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARGa(o),(IV)ARGb(o));
409+
}
410+
else if (op == BRANCHJ) {
411+
Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2La(o),(IV)ARG2Lb(o));
412+
}
413+
else if (k == EXACT) {
409414
sv_catpvs(sv, " ");
410415
/* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
411416
* is a crude hack but it may be the best for now since
@@ -458,6 +463,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
458463
);
459464
sv_catpvs(sv, "]");
460465
}
466+
if (trie->before_paren || trie->after_paren)
467+
Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")",
468+
(IV)trie->before_paren,(IV)trie->after_paren);
461469
} else if (k == CURLY) {
462470
U32 lo = ARG1(o), hi = ARG2(o);
463471
if (ARG3(o) || ARG4(o))

regcomp_trie.c

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -469,10 +469,26 @@ is the recommended Unicode-aware way of saying
469469
trie->wordinfo[curword].accept = state; \
470470
\
471471
if ( noper_next < tail ) { \
472-
if (!trie->jump) \
472+
if (!trie->jump) { \
473473
trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
474474
sizeof(U16) ); \
475+
trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
476+
sizeof(U16) ); \
477+
trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
478+
sizeof(U16) ); \
479+
} \
475480
trie->jump[curword] = (U16)(noper_next - convert); \
481+
U16 set_before_paren; \
482+
U16 set_after_paren; \
483+
if (OP(cur) == BRANCH) { \
484+
set_before_paren = ARGa(cur); \
485+
set_after_paren = ARGb(cur); \
486+
} else { \
487+
set_before_paren = ARG2La(cur); \
488+
set_after_paren = ARG2Lb(cur); \
489+
} \
490+
trie->j_before_paren[curword] = set_before_paren; \
491+
trie->j_after_paren[curword] = set_after_paren; \
476492
if (!jumper) \
477493
jumper = noper_next; \
478494
if (!nextbranch) \
@@ -533,6 +549,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
533549
U32 next_alloc = 0;
534550
regnode *jumper = NULL;
535551
regnode *nextbranch = NULL;
552+
regnode *lastbranch = NULL;
536553
regnode *convert = NULL;
537554
U32 *prev_states; /* temp array mapping each state to previous one */
538555
/* we just use folder as a flag in utf8 */
@@ -569,6 +586,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
569586
default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) );
570587
}
571588

589+
/* create the trie struct, all zeroed */
572590
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
573591
trie->refcount = 1;
574592
trie->startstate = 1;
@@ -639,6 +657,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
639657
STRLEN maxchars = 0;
640658
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
641659
bitmap?*/
660+
lastbranch = cur;
642661

643662
if (OP(noper) == NOTHING) {
644663
/* skip past a NOTHING at the start of an alternation
@@ -797,6 +816,13 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
797816
trie->maxlen = maxchars;
798817
}
799818
} /* end first pass */
819+
trie->before_paren = OP(first) == BRANCH
820+
? ARGa(first)
821+
: ARG2La(first); /* BRANCHJ */
822+
823+
trie->after_paren = OP(lastbranch) == BRANCH
824+
? ARGb(lastbranch)
825+
: ARG2Lb(lastbranch); /* BRANCHJ */
800826
DEBUG_TRIE_COMPILE_r(
801827
Perl_re_indentf( aTHX_
802828
"TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
@@ -1308,6 +1334,9 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
13081334
#ifdef DEBUGGING
13091335
regnode *optimize = NULL;
13101336
#endif /* DEBUGGING */
1337+
/* make sure we have enough room to inject the TRIE op */
1338+
assert((!trie->jump) || !trie->jump[1] ||
1339+
(trie->jump[1] >= (sizeof(tregnode_TRIE)/sizeof(struct regnode))));
13111340
/*
13121341
This means we convert either the first branch or the first Exact,
13131342
depending on whether the thing following (in 'last') is a branch
@@ -1478,10 +1507,10 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
14781507
*/
14791508
if ( !trie->states[trie->startstate].wordnum
14801509
&& trie->bitmap
1481-
&& ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
1510+
&& ( (char *)jumper - (char *)convert) >= (int)sizeof(tregnode_TRIEC) )
14821511
{
14831512
OP( convert ) = TRIEC;
1484-
Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1513+
Copy(trie->bitmap, ((tregnode_TRIEC *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
14851514
PerlMemShared_free(trie->bitmap);
14861515
trie->bitmap= NULL;
14871516
} else
@@ -1608,14 +1637,14 @@ Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *s
16081637
#endif
16091638

16101639
if ( OP(source) == TRIE ) {
1611-
struct regnode_1 *op = (struct regnode_1 *)
1612-
PerlMemShared_calloc(1, sizeof(struct regnode_1));
1613-
StructCopy(source, op, struct regnode_1);
1640+
tregnode_TRIE *op = (tregnode_TRIE *)
1641+
PerlMemShared_calloc(1, sizeof(tregnode_TRIE));
1642+
StructCopy(source, op, tregnode_TRIE);
16141643
stclass = (regnode *)op;
16151644
} else {
1616-
struct regnode_charclass *op = (struct regnode_charclass *)
1617-
PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
1618-
StructCopy(source, op, struct regnode_charclass);
1645+
tregnode_TRIEC *op = (tregnode_TRIEC *)
1646+
PerlMemShared_calloc(1, sizeof(tregnode_TRIEC));
1647+
StructCopy(source, op, tregnode_TRIEC);
16191648
stclass = (regnode *)op;
16201649
}
16211650
OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */

0 commit comments

Comments
 (0)