Skip to content

Regex bug fixes, refactoring (macroization) and code improvements #20918

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Mar 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2b5d6c6
regexp.h - fixup mistake in comment
demerphq Jan 26, 2023
fc3bf60
t/re/regexp.t - in skip_amp tests (via _noamp.t) do not TODO tests wi…
demerphq Mar 9, 2023
34f8743
regcomp.c - track parens related to CURLYX and CURLYM
demerphq Mar 7, 2023
4c12e54
regexec.c - teach BRANCH and BRANCHJ nodes to reset capture buffers
demerphq Jan 9, 2023
805273d
regexec.c - incredibly inefficient solution to backref problem
demerphq Jan 14, 2023
9b7643d
regexec.c - make REF into a backtracking state
demerphq Jan 14, 2023
9939750
regex engine - simplify regnode structures and make them consistent
demerphq Jan 15, 2023
c41e6a0
regcomp.c - extend REF to hold the paren it needs to regcppush
demerphq Jan 15, 2023
4e47002
regcomp.c - Use RXp_OFFSp() to access offset data
demerphq Jan 25, 2023
b82f4ce
regexp.h - standardize macros, and parenthesize parameters
demerphq Jan 25, 2023
1065670
regexec.c - use RXp_LASTPAREN(rex) to access rex->lastparen
demerphq Jan 26, 2023
7568ad6
regexp.h - add missing defines
demerphq Jan 26, 2023
4a94860
dump.c - use RXp_ macros to access regexp struct members
demerphq Jan 26, 2023
e8a5523
regexec.c - use RXp_LASTCLOSEPAREN(r) to access r->lastcloseparen
demerphq Jan 26, 2023
3697eb9
regexec.c - use macro to access rex->subbeg
demerphq Jan 26, 2023
0bce65a
regexec.c - use RXp_SUBLEN(ret) for ret->sublen
demerphq Jan 26, 2023
3a05036
regexec.c - use RXp_SUBOFFSET(rx) instead of rx->suboffset
demerphq Jan 26, 2023
eb05738
regexec.c - use RXp_SUBCOFFSET instead of rx->subcoffset
demerphq Jan 26, 2023
04d422e
regexec.c - use RXp_SAVED_COPY(rex) instead of rex->saved_copy
demerphq Jan 26, 2023
e9a6ab1
regcomp.c - use macro wrappers to minimize impact of struct split
demerphq Jan 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 23 additions & 23 deletions dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -2653,61 +2653,61 @@ Perl_do_sv_dump(pTHX_ I32 level, PerlIO *file, SV *sv, I32 nest, I32 maxnest, bo
#undef SV_SET_STRINGIFY_I32_ARRAY

Perl_dump_indent(aTHX_ level, file, " LASTPAREN = %" UVuf "\n",
(UV)(r->lastparen));
(UV)(RXp_LASTPAREN(r)));
Perl_dump_indent(aTHX_ level, file, " LASTCLOSEPAREN = %" UVuf "\n",
(UV)(r->lastcloseparen));
(UV)(RXp_LASTCLOSEPAREN(r)));
Perl_dump_indent(aTHX_ level, file, " MINLEN = %" IVdf "\n",
(IV)(r->minlen));
(IV)(RXp_MINLEN(r)));
Perl_dump_indent(aTHX_ level, file, " MINLENRET = %" IVdf "\n",
(IV)(r->minlenret));
(IV)(RXp_MINLENRET(r)));
Perl_dump_indent(aTHX_ level, file, " GOFS = %" UVuf "\n",
(UV)(r->gofs));
(UV)(RXp_GOFS(r)));
Perl_dump_indent(aTHX_ level, file, " PRE_PREFIX = %" UVuf "\n",
(UV)(r->pre_prefix));
(UV)(RXp_PRE_PREFIX(r)));
Perl_dump_indent(aTHX_ level, file, " SUBLEN = %" IVdf "\n",
(IV)(r->sublen));
(IV)(RXp_SUBLEN(r)));
Perl_dump_indent(aTHX_ level, file, " SUBOFFSET = %" IVdf "\n",
(IV)(r->suboffset));
(IV)(RXp_SUBOFFSET(r)));
Perl_dump_indent(aTHX_ level, file, " SUBCOFFSET = %" IVdf "\n",
(IV)(r->subcoffset));
if (r->subbeg)
(IV)(RXp_SUBCOFFSET(r)));
if (RXp_SUBBEG(r))
Perl_dump_indent(aTHX_ level, file, " SUBBEG = 0x%" UVxf " %s\n",
PTR2UV(r->subbeg),
pv_display(d, r->subbeg, r->sublen, 50, pvlim));
PTR2UV(RXp_SUBBEG(r)),
pv_display(d, RXp_SUBBEG(r), RXp_SUBLEN(r), 50, pvlim));
else
Perl_dump_indent(aTHX_ level, file, " SUBBEG = 0x0\n");
Perl_dump_indent(aTHX_ level, file, " PAREN_NAMES = 0x%" UVxf "\n",
PTR2UV(r->paren_names));
PTR2UV(RXp_PAREN_NAMES(r)));
Perl_dump_indent(aTHX_ level, file, " SUBSTRS = 0x%" UVxf "\n",
PTR2UV(r->substrs));
PTR2UV(RXp_SUBSTRS(r)));
Perl_dump_indent(aTHX_ level, file, " PPRIVATE = 0x%" UVxf "\n",
PTR2UV(r->pprivate));
PTR2UV(RXp_PPRIVATE(r)));
Perl_dump_indent(aTHX_ level, file, " OFFS = 0x%" UVxf "\n",
PTR2UV(r->offs));
if (r->offs) {
PTR2UV(RXp_OFFSp(r)));
if (RXp_OFFSp(r)) {
U32 n;
sv_setpvs(d,"[ ");
/* note offs[0] is for the whole match, and
* the data for $1 is in offs[1]. Thus we have to
* show one more than we have nparens. */
for(n = 0; n <= r->nparens; n++) {
sv_catpvf(d,"%" IVdf ":%" IVdf "%s",
(IV)(r->offs[n].start), (IV)(r->offs[n].end),
(IV)RXp_OFFSp(r)[n].start, (IV)RXp_OFFSp(r)[n].end,
n+1 > r->nparens ? " ]\n" : ", ");
}
Perl_dump_indent(aTHX_ level, file, " %" SVf, d);
}
Perl_dump_indent(aTHX_ level, file, " QR_ANONCV = 0x%" UVxf "\n",
PTR2UV(r->qr_anoncv));
PTR2UV(RXp_QR_ANONCV(r)));
#ifdef PERL_ANY_COW
Perl_dump_indent(aTHX_ level, file, " SAVED_COPY = 0x%" UVxf "\n",
PTR2UV(r->saved_copy));
PTR2UV(RXp_SAVED_COPY(r)));
#endif
/* this should go LAST or the output gets really confusing */
Perl_dump_indent(aTHX_ level, file, " MOTHER_RE = 0x%" UVxf "\n",
PTR2UV(r->mother_re));
if (nest < maxnest && r->mother_re)
do_sv_dump(level+1, file, (SV *)r->mother_re, nest+1,
PTR2UV(RXp_MOTHER_RE(r)));
if (nest < maxnest && RXp_MOTHER_RE(r))
do_sv_dump(level+1, file, (SV *)RXp_MOTHER_RE(r), nest+1,
maxnest, dumpops, pvlim);
}
break;
Expand Down
15 changes: 7 additions & 8 deletions embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -5141,9 +5141,6 @@ ES |regnode_offset|reg |NN RExC_state_t *pRExC_state \
|I32 paren \
|NN I32 *flagp \
|U32 depth
ES |regnode_offset|reganode|NN RExC_state_t *pRExC_state \
|U8 op \
|U32 arg
ES |regnode_offset|regatom |NN RExC_state_t *pRExC_state \
|NN I32 *flagp \
|U32 depth
Expand All @@ -5167,11 +5164,6 @@ ES |void |reginsert |NN RExC_state_t *pRExC_state \
|const U8 op \
|const regnode_offset operand \
|const U32 depth
ES |regnode_offset|reg2Lanode \
|NN RExC_state_t *pRExC_state \
|const U8 op \
|const U32 arg1 \
|const I32 arg2
ES |regnode_offset|reg_la_NOTHING \
|NN RExC_state_t *pRExC_state \
|U32 flags \
Expand All @@ -5180,6 +5172,13 @@ ES |regnode_offset|reg_la_OPFAIL \
|NN RExC_state_t *pRExC_state \
|U32 flags \
|NN const char *type
ES |regnode_offset|reg1node|NN RExC_state_t *pRExC_state \
|U8 op \
|U32 arg
ES |regnode_offset|reg2node|NN RExC_state_t *pRExC_state \
|const U8 op \
|const U32 arg1 \
|const I32 arg2
ES |regnode_offset|reg_node|NN RExC_state_t *pRExC_state \
|U8 op
ES |regnode_offset|regnode_guts \
Expand Down
4 changes: 2 additions & 2 deletions embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -1845,13 +1845,13 @@
# define parse_lparen_question_flags(a) S_parse_lparen_question_flags(aTHX_ a)
# define parse_uniprop_string(a,b,c,d,e,f,g,h,i,j) S_parse_uniprop_string(aTHX_ a,b,c,d,e,f,g,h,i,j)
# define reg(a,b,c,d) S_reg(aTHX_ a,b,c,d)
# define reg2Lanode(a,b,c,d) S_reg2Lanode(aTHX_ a,b,c,d)
# define reg1node(a,b,c) S_reg1node(aTHX_ a,b,c)
# define reg2node(a,b,c,d) S_reg2node(aTHX_ a,b,c,d)
# define reg_la_NOTHING(a,b,c) S_reg_la_NOTHING(aTHX_ a,b,c)
# define reg_la_OPFAIL(a,b,c) S_reg_la_OPFAIL(aTHX_ a,b,c)
# define reg_node(a,b) S_reg_node(aTHX_ a,b)
# define reg_scan_name(a,b) S_reg_scan_name(aTHX_ a,b)
# define reg_skipcomment S_reg_skipcomment
# define reganode(a,b,c) S_reganode(aTHX_ a,b,c)
# define regatom(a,b,c) S_regatom(aTHX_ a,b,c)
# define regbranch(a,b,c,d) S_regbranch(aTHX_ a,b,c,d)
# define regclass(a,b,c,d,e,f,g,h,i) S_regclass(aTHX_ a,b,c,d,e,f,g,h,i)
Expand Down
38 changes: 19 additions & 19 deletions pod/perldebguts.pod
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ will be lost.
# pointer of each individual branch points; each branch
# starts with the operand node of a BRANCH node.
#
BRANCH node Match this alternative, or the next...
BRANCH node 1 Match this alternative, or the next...

# Literals

Expand Down Expand Up @@ -752,13 +752,13 @@ will be lost.
PLUS node Match this (simple) thing 1 or more times:
/A{1,}B/ where A is width 1 char

CURLY sv 2 Match this (simple) thing {n,m} times:
CURLY sv 3 Match this (simple) thing {n,m} times:
/A{m,n}B/ where A is width 1 char
CURLYN no 2 Capture next-after-this simple thing:
CURLYN no 3 Capture next-after-this simple thing:
/(A){m,n}B/ where A is width 1 char
CURLYM no 2 Capture this medium-complex thing {n,m}
CURLYM no 3 Capture this medium-complex thing {n,m}
times: /(A){m,n}B/ where A is fixed-length
CURLYX sv 2 Match/Capture this complex thing {n,m}
CURLYX sv 3 Match/Capture this complex thing {n,m}
times.

# This terminator creates a loop structure for CURLYX
Expand All @@ -773,30 +773,30 @@ will be lost.
SROPEN none Same as OPEN, but for script run
SRCLOSE none Close preceding SROPEN

REF num 1 Match some already matched string
REFF num 1 Match already matched string, using /di
REF num 2 Match some already matched string
REFF num 2 Match already matched string, using /di
rules.
REFFL num 1 Match already matched string, using /li
REFFL num 2 Match already matched string, using /li
rules.
REFFU num 1 Match already matched string, using /ui.
REFFA num 1 Match already matched string, using /aai
REFFU num 2 Match already matched string, using /ui.
REFFA num 2 Match already matched string, using /aai
rules.

# Named references. Code in regcomp.c assumes that these all are after
# the numbered references
REFN no-sv 1 Match some already matched string
REFFN no-sv 1 Match already matched string, using /di
REFN no-sv 2 Match some already matched string
REFFN no-sv 2 Match already matched string, using /di
rules.
REFFLN no-sv 1 Match already matched string, using /li
REFFLN no-sv 2 Match already matched string, using /li
rules.
REFFUN num 1 Match already matched string, using /ui
REFFUN num 2 Match already matched string, using /ui
rules.
REFFAN num 1 Match already matched string, using /aai
REFFAN num 2 Match already matched string, using /aai
rules.

# Support for long RE
LONGJMP off 1 1 Jump far away.
BRANCHJ off 1 1 BRANCH with long offset.
BRANCHJ off 2 1 BRANCH with long offset.

# Special Case Regops
IFMATCH off 1 1 Succeeds if the following matches; non-zero
Expand All @@ -814,7 +814,7 @@ will be lost.
# The heavy worker

EVAL evl/flags Execute some Perl code.
2L
2

# Modifiers

Expand All @@ -825,7 +825,7 @@ will be lost.
RENUM off 1 1 Group with independently numbered parens.

# Regex Subroutines
GOSUB num/ofs 2L recurse to paren arg1 at (signed) ofs arg2
GOSUB num/ofs 2 recurse to paren arg1 at (signed) ofs arg2

# Special conditionals
GROUPPN no-sv 1 Whether the group matched.
Expand All @@ -836,7 +836,7 @@ will be lost.
ENDLIKE none Used only for the type field of verbs
OPFAIL no-sv 1 Same as (?!), but with verb arg
ACCEPT no-sv/num Accepts the current matched string, with
2L verbar
2 verbar

# Verbs With Arguments
VERB no-sv 1 Used only for the type field of verbs
Expand Down
4 changes: 2 additions & 2 deletions pp_ctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -381,9 +381,9 @@ Perl_rxres_save(pTHX_ void **rsp, REGEXP *rx)

if (!p || p[1] < RX_NPARENS(rx)) {
#ifdef PERL_ANY_COW
i = 7 + (RX_NPARENS(rx)+1) * 2;
i = 7 + (RX_NPARENS(rx)+1) * 4;
#else
i = 6 + (RX_NPARENS(rx)+1) * 2;
i = 6 + (RX_NPARENS(rx)+1) * 4;
#endif
if (!p)
Newx(p, i, UV);
Expand Down
14 changes: 7 additions & 7 deletions proto.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading