Skip to content

Commit ba6840f

Browse files
committed
fix Perl #126182, out of memory due to infinite pattern recursion
The way we tracked if pattern recursion was infinite did not work properly. A pattern like "a"=~/(.(?2))((?<=(?=(?1)).))/ would loop forever, slowly eat up all available ram as it added pattern recursion stack frames. This patch changes the rules for recursion so that recursively entering a given pattern "subroutine" twice from the same position fails the match. This means that where previously we might have seen fatal exception we will now simply fail. This means that "aaabbb"=~/a(?R)?b/ succeeds with $& equal to "aaabbb".
1 parent d5a00e4 commit ba6840f

File tree

5 files changed

+75
-18
lines changed

5 files changed

+75
-18
lines changed

regcomp.c

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4719,11 +4719,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
47194719
RExC_study_chunk_recursed_bytes, U8);
47204720
}
47214721
/* we havent recursed into this paren yet, so recurse into it */
4722-
DEBUG_STUDYDATA("set:", data,depth);
4722+
DEBUG_STUDYDATA("gosub-set:", data,depth);
47234723
PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
47244724
my_recursed_depth= recursed_depth + 1;
47254725
} else {
4726-
DEBUG_STUDYDATA("inf:", data,depth);
4726+
DEBUG_STUDYDATA("gosub-inf:", data,depth);
47274727
/* some form of infinite recursion, assume infinite length
47284728
* */
47294729
if (flags & SCF_DO_SUBSTR) {
@@ -7617,8 +7617,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
76177617
if (r->minlen < minlen)
76187618
r->minlen = minlen;
76197619

7620-
if (RExC_seen & REG_RECURSE_SEEN )
7620+
if (RExC_seen & REG_RECURSE_SEEN ) {
76217621
r->intflags |= PREGf_RECURSE_SEEN;
7622+
Newxz(r->recurse_locinput, r->nparens + 1, char *);
7623+
}
76227624
if (RExC_seen & REG_GPOS_SEEN)
76237625
r->intflags |= PREGf_GPOS_SEEN;
76247626
if (RExC_seen & REG_LOOKBEHIND_SEEN)
@@ -19041,6 +19043,8 @@ Perl_pregfree2(pTHX_ REGEXP *rx)
1904119043
#endif
1904219044
Safefree(r->offs);
1904319045
SvREFCNT_dec(r->qr_anoncv);
19046+
if (r->recurse_locinput)
19047+
Safefree(r->recurse_locinput);
1904419048
rx->sv_u.svu_rx = 0;
1904519049
}
1904619050

@@ -19124,6 +19128,8 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
1912419128
#endif
1912519129
ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
1912619130
SvREFCNT_inc_void(ret->qr_anoncv);
19131+
if (r->recurse_locinput)
19132+
Newxz(ret->recurse_locinput,r->nparens + 1,char *);
1912719133

1912819134
return ret_x;
1912919135
}
@@ -19262,7 +19268,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
1926219268
#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
1926319269

1926419270
/*
19265-
re_dup - duplicate a regexp.
19271+
re_dup_guts - duplicate a regexp.
1926619272

1926719273
This routine is expected to clone a given regexp structure. It is only
1926819274
compiled under USE_ITHREADS.
@@ -19330,6 +19336,8 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
1933019336

1933119337
RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
1933219338
ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
19339+
if (r->recurse_locinput)
19340+
Newxz(ret->recurse_locinput,r->nparens + 1,char *);
1933319341

1933419342
if (ret->pprivate)
1933519343
RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
@@ -19384,6 +19392,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
1938419392
char, regexp_internal);
1938519393
Copy(ri->program, reti->program, len+1, regnode);
1938619394

19395+
1938719396
reti->num_code_blocks = ri->num_code_blocks;
1938819397
if (ri->code_blocks) {
1938919398
int n;
@@ -19444,7 +19453,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
1944419453
d->data[i] = ri->data->data[i];
1944519454
break;
1944619455
default:
19447-
Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
19456+
Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
1944819457
ri->data->what[i]);
1944919458
}
1945019459
}

regcomp.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,6 @@
9090
/* This is the stuff that used to live in regexp.h that was truly
9191
private to the engine itself. It now lives here. */
9292

93-
94-
9593
typedef struct regexp_internal {
9694
int name_list_idx; /* Optional data index of an array of paren names */
9795
union {

regexec.c

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3126,6 +3126,9 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
31263126
));
31273127
}
31283128

3129+
if (prog->recurse_locinput)
3130+
Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3131+
31293132
/* Simplest case: anchored match need be tried only once, or with
31303133
* MBOL, only at the beginning of each line.
31313134
*
@@ -5184,7 +5187,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
51845187
bool is_utf8_pat = reginfo->is_utf8_pat;
51855188
bool match = FALSE;
51865189

5187-
51885190
#ifdef DEBUGGING
51895191
GET_RE_DEBUG_FLAGS_DECL;
51905192
#endif
@@ -6494,11 +6496,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
64946496
regexp *re;
64956497
regexp_internal *rei;
64966498
regnode *startpoint;
6499+
U32 arg;
64976500

64986501
case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
6499-
if (cur_eval && cur_eval->locinput==locinput) {
6500-
if ( EVAL_CLOSE_PAREN_IS( cur_eval, (U32)ARG(scan) ) )
6501-
Perl_croak(aTHX_ "Infinite recursion in regex");
6502+
arg= (U32)ARG(scan);
6503+
if (cur_eval && cur_eval->locinput == locinput) {
65026504
if ( ++nochange_depth > max_nochange_depth )
65036505
Perl_croak(aTHX_
65046506
"Pattern subroutine nesting without pos change"
@@ -6510,7 +6512,32 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
65106512
re = rex;
65116513
rei = rexi;
65126514
startpoint = scan + ARG2L(scan);
6513-
EVAL_CLOSE_PAREN_SET( st, ARG(scan) ); /* ST.close_paren = 1 + ARG(scan) */
6515+
EVAL_CLOSE_PAREN_SET( st, arg ); /* ST.close_paren = 1 + ARG(scan) */
6516+
/* Detect infinite recursion
6517+
*
6518+
* A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6519+
* or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6520+
* So we track the position in the string we are at each time
6521+
* we recurse and if we try to enter the same routine twice from
6522+
* the same position we fail. This means that a pattern like
6523+
* "aaabbb"=~/a(?R)?b/ works as expected and does not throw an
6524+
* error.
6525+
*/
6526+
if ( rex->recurse_locinput[arg] == locinput ) {
6527+
DEBUG_r({
6528+
GET_RE_DEBUG_FLAGS_DECL;
6529+
DEBUG_EXECUTE_r({
6530+
PerlIO_printf(Perl_debug_log,
6531+
"%*s pattern left-recursion without consuming input always fails...\n",
6532+
REPORT_CODE_OFF + depth*2, "");
6533+
});
6534+
});
6535+
/* this would be infinite recursion, so we fail */
6536+
sayNO;
6537+
} else {
6538+
ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6539+
rex->recurse_locinput[arg]= locinput;
6540+
}
65146541

65156542
/* Save all the positions seen so far. */
65166543
ST.cp = regcppush(rex, 0, maxopenparen);
@@ -6547,10 +6574,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
65476574
n = ARG(scan);
65486575

65496576
if (rexi->data->what[n] == 'r') { /* code from an external qr */
6550-
newcv = (ReANY(
6551-
(REGEXP*)(rexi->data->data[n])
6552-
))->qr_anoncv
6553-
;
6577+
newcv = (ReANY(
6578+
(REGEXP*)(rexi->data->data[n])
6579+
))->qr_anoncv;
65546580
nop = (OP*)rexi->data->data[n+1];
65556581
}
65566582
else if (rexi->data->what[n] == 'l') { /* literal code */
@@ -6771,6 +6797,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
67716797
startpoint = rei->program + 1;
67726798
EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
67736799
* close_paren only for GOSUB */
6800+
ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
67746801
/* Save all the seen positions so far. */
67756802
ST.cp = regcppush(rex, 0, maxopenparen);
67766803
REGCP_SET(ST.lastcp);
@@ -6812,6 +6839,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
68126839

68136840
case EVAL_AB: /* cleanup after a successful (??{A})B */
68146841
/* note: this is called twice; first after popping B, then A */
6842+
if ( cur_eval && cur_eval->u.eval.close_paren )
6843+
rex->recurse_locinput[cur_eval->u.eval.close_paren - 1] = cur_eval->u.eval.prev_recurse_locinput;
6844+
68156845
rex_sv = ST.prev_rex;
68166846
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
68176847
SET_reg_curpm(rex_sv);
@@ -6837,6 +6867,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
68376867

68386868
case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
68396869
/* note: this is called twice; first after popping B, then A */
6870+
if ( cur_eval && cur_eval->u.eval.close_paren )
6871+
rex->recurse_locinput[cur_eval->u.eval.close_paren - 1] = cur_eval->u.eval.prev_recurse_locinput;
6872+
68406873
rex_sv = ST.prev_rex;
68416874
is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
68426875
SET_reg_curpm(rex_sv);
@@ -7905,6 +7938,8 @@ NULL
79057938
fake_end:
79067939
if (cur_eval) {
79077940
/* we've just finished A in /(??{A})B/; now continue with B */
7941+
if ( cur_eval->u.eval.close_paren )
7942+
rex->recurse_locinput[cur_eval->u.eval.close_paren - 1] = cur_eval->u.eval.prev_recurse_locinput;
79087943

79097944
st->u.eval.prev_rex = rex_sv; /* inner */
79107945

regexp.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ struct reg_code_block {
102102
const struct regexp_engine* engine; \
103103
REGEXP *mother_re; /* what re is this a lightweight copy of? */ \
104104
HV *paren_names; /* Optional hash of paren names */ \
105+
/*--------------------------------------------------------*/ \
105106
/* Information about the match that the perl core uses to */ \
106107
/* manage things */ \
107108
U32 extflags; /* Flags used both externally and internally */ \
@@ -116,12 +117,15 @@ struct reg_code_block {
116117
U32 intflags; /* Engine Specific Internal flags */ \
117118
void *pprivate; /* Data private to the regex engine which */ \
118119
/* created this object. */ \
120+
/*--------------------------------------------------------*/ \
119121
/* Data about the last/current match. These are modified */ \
120122
/* during matching */ \
121123
U32 lastparen; /* last open paren matched */ \
122124
U32 lastcloseparen; /* last close paren matched */ \
123125
/* Array of offsets for (@-) and (@+) */ \
124126
regexp_paren_pair *offs; \
127+
char **recurse_locinput; /* used to detect infinite recursion, XXX: move to internal */ \
128+
/*--------------------------------------------------------*/ \
125129
/* saved or original string so \digit works forever. */ \
126130
char *subbeg; \
127131
SV_SAVED_COPY /* If non-NULL, SV which is COW from original */\
@@ -130,11 +134,13 @@ struct reg_code_block {
130134
SSize_t subcoffset; /* suboffset equiv, but in chars (for @-/@+) */ \
131135
/* Information about the match that isn't often used */ \
132136
SSize_t maxlen; /* mininum possible number of chars in string to match */\
137+
/*--------------------------------------------------------*/ \
133138
/* offset from wrapped to the start of precomp */ \
134139
PERL_BITFIELD32 pre_prefix:4; \
135140
/* original flags used to compile the pattern, may differ */ \
136141
/* from extflags in various ways */ \
137142
PERL_BITFIELD32 compflags:9; \
143+
/*--------------------------------------------------------*/ \
138144
CV *qr_anoncv /* the anon sub wrapped round qr/(?{..})/ */
139145

140146
typedef struct regexp {
@@ -657,7 +663,7 @@ typedef struct {
657663
/* structures for holding and saving the state maintained by regmatch() */
658664

659665
#ifndef MAX_RECURSE_EVAL_NOCHANGE_DEPTH
660-
#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 1000
666+
#define MAX_RECURSE_EVAL_NOCHANGE_DEPTH 10
661667
#endif
662668

663669
typedef I32 CHECKPOINT;
@@ -749,6 +755,7 @@ typedef struct regmatch_state {
749755
CHECKPOINT lastcp;
750756
U32 close_paren; /* which close bracket is our end (+1) */
751757
regnode *B; /* the node following us */
758+
char *prev_recurse_locinput;
752759
} eval;
753760

754761
struct {

t/re/pat.t

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ BEGIN {
2323
skip_all_without_unicode_tables();
2424
}
2525

26-
plan tests => 776; # Update this when adding/deleting tests.
26+
plan tests => 781; # Update this when adding/deleting tests.
2727

2828
run_tests() unless caller;
2929

@@ -1721,6 +1721,14 @@ EOP
17211721
fresh_perl_is($code, "", {},
17221722
"perl [#126406] panic");
17231723
}
1724+
{
1725+
# [perl #126182] test for infinite pattern recursion
1726+
ok("aaabbb"=~/a(?R)?b/, "optional self recursion works");
1727+
ok("aaabbb"=~/a(?R)?b/, "optional self recursion works");
1728+
ok(not("aa"=~/(?R)a/), "left-recursion fails fast");
1729+
ok("bbaa"=~/(?&x)(?(DEFINE)(?<x>(?&y)*a)(?<y>(?&x)*b))/,"inter-cyclic optional left recursion works");
1730+
ok(not("a"=~/(.(?2))((?<=(?=(?1)).))/),"look ahead left-recursion fails fast");
1731+
}
17241732
} # End of sub run_tests
17251733

17261734
1;

0 commit comments

Comments
 (0)