Skip to content

Commit 9baa5b2

Browse files
Issue #22437: The number of capturing groups in a regular expression is no
longer limited to 100.
1 parent c31e622 commit 9baa5b2

File tree

8 files changed

+76
-27
lines changed

8 files changed

+76
-27
lines changed

Doc/whatsnew/3.5.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,12 @@ os
217217
* :class:`os.stat_result` now has a :attr:`~os.stat_result.st_file_attributes`
218218
attribute on Windows (contributed by Ben Hoyt in :issue:`21719`).
219219

220+
re
221+
--
222+
223+
* The number of capturing groups in a regular expression is no longer limited to 100.
224+
(Contributed by Serhiy Storchaka in :issue:`22437`.)
225+
220226
shutil
221227
------
222228

Lib/sre_compile.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -470,12 +470,6 @@ def compile(p, flags=0):
470470

471471
# print code
472472

473-
# XXX: <fl> get rid of this limitation!
474-
if p.pattern.groups > 100:
475-
raise AssertionError(
476-
"sorry, but this version only supports 100 named groups"
477-
)
478-
479473
# map in either direction
480474
groupindex = p.pattern.groupdict
481475
indexgroup = [None] * p.pattern.groups

Lib/sre_constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
MAGIC = 20031017
1717

18-
from _sre import MAXREPEAT
18+
from _sre import MAXREPEAT, MAXGROUPS
1919

2020
# SRE standard exception (access as sre.error)
2121
# should this really be here?

Lib/sre_parse.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ def __init__(self):
7272
def opengroup(self, name=None):
7373
gid = self.groups
7474
self.groups = gid + 1
75+
if self.groups > MAXGROUPS:
76+
raise error("groups number is too large")
7577
if name is not None:
7678
ogid = self.groupdict.get(name, None)
7779
if ogid is not None:
@@ -695,8 +697,14 @@ def _parse(source, state):
695697
else:
696698
try:
697699
condgroup = int(condname)
700+
if condgroup < 0:
701+
raise ValueError
698702
except ValueError:
699703
raise error("bad character in group name")
704+
if not condgroup:
705+
raise error("bad group number")
706+
if condgroup >= MAXGROUPS:
707+
raise error("the group number is too large")
700708
else:
701709
# flags
702710
if not source.next in FLAGS:
@@ -822,6 +830,8 @@ def addgroup(index):
822830
index = int(name)
823831
if index < 0:
824832
raise error("negative group number")
833+
if index >= MAXGROUPS:
834+
raise error("the group number is too large")
825835
except ValueError:
826836
if not name.isidentifier():
827837
raise error("bad character in group name")

Lib/test/test_re.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ def test_bug_462270(self):
193193
def test_symbolic_groups(self):
194194
re.compile('(?P<a>x)(?P=a)(?(a)y)')
195195
re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
196+
re.compile('(?P<a1>x)\1(?(1)y)')
196197
self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
197198
self.assertRaises(re.error, re.compile, '(?Px)')
198199
self.assertRaises(re.error, re.compile, '(?P=)')
@@ -212,6 +213,10 @@ def test_symbolic_groups(self):
212213
re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
213214
re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
214215
self.assertRaises(re.error, re.compile, '(?P<©>x)')
216+
# Support > 100 groups.
217+
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
218+
pat = '(?:%s)(?(200)z|t)' % pat
219+
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
215220

216221
def test_symbolic_refs(self):
217222
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
@@ -228,6 +233,9 @@ def test_symbolic_refs(self):
228233
self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
229234
self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
230235
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
236+
# Support > 100 groups.
237+
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
238+
self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
231239

232240
def test_re_subn(self):
233241
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -404,6 +412,10 @@ def test_re_groupref_exists(self):
404412
self.assertIsNone(p.match('abd'))
405413
self.assertIsNone(p.match('ac'))
406414

415+
# Support > 100 groups.
416+
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
417+
pat = '(?:%s)(?(200)z)' % pat
418+
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
407419

408420
def test_re_groupref(self):
409421
self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
@@ -1070,8 +1082,10 @@ def test_dealloc(self):
10701082
# a RuntimeError is raised instead of OverflowError.
10711083
long_overflow = 2**128
10721084
self.assertRaises(TypeError, re.finditer, "a", {})
1073-
self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
1074-
self.assertRaises(TypeError, _sre.compile, {}, 0, [])
1085+
with self.assertRaises(OverflowError):
1086+
_sre.compile("abc", 0, [long_overflow], 0, [], [])
1087+
with self.assertRaises(TypeError):
1088+
_sre.compile({}, 0, [], 0, [], [])
10751089

10761090
def test_search_dot_unicode(self):
10771091
self.assertTrue(re.search("123.*-", '123abc-'))

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,9 @@ Core and Builtins
145145
Library
146146
-------
147147

148+
- Issue #22437: The number of capturing groups in a regular expression is no
149+
longer limited to 100.
150+
148151
- Issue #17442: InteractiveInterpreter now displays the full chained traceback
149152
in its showtraceback method, to match the built in interactive interpreter.
150153

Modules/_sre.c

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
357357

358358
memset(state, 0, sizeof(SRE_STATE));
359359

360+
state->mark = PyMem_New(void *, pattern->groups * 2);
361+
if (!state->mark) {
362+
PyErr_NoMemory();
363+
goto err;
364+
}
360365
state->lastmark = -1;
361366
state->lastindex = -1;
362367

@@ -409,6 +414,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
409414

410415
return string;
411416
err:
417+
PyMem_Del(state->mark);
418+
state->mark = NULL;
412419
if (state->buffer.buf)
413420
PyBuffer_Release(&state->buffer);
414421
return NULL;
@@ -421,6 +428,8 @@ state_fini(SRE_STATE* state)
421428
PyBuffer_Release(&state->buffer);
422429
Py_XDECREF(state->string);
423430
data_stack_dealloc(state);
431+
PyMem_Del(state->mark);
432+
state->mark = NULL;
424433
}
425434

426435
/* calculate offset from start of string */
@@ -560,6 +569,7 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
560569
PyObject *pattern = NULL;
561570
SRE_STATE state;
562571
Py_ssize_t status;
572+
PyObject *match;
563573

564574
if (!PyArg_ParseTupleAndKeywords(args, kwargs,
565575
"|Onn$O:match", _keywords,
@@ -579,19 +589,22 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
579589
status = sre_match(&state, PatternObject_GetCode(self), 0);
580590

581591
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
582-
if (PyErr_Occurred())
592+
if (PyErr_Occurred()) {
593+
state_fini(&state);
583594
return NULL;
595+
}
584596

597+
match = pattern_new_match(self, &state, status);
585598
state_fini(&state);
586-
587-
return (PyObject *)pattern_new_match(self, &state, status);
599+
return match;
588600
}
589601

590602
static PyObject*
591603
pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
592604
{
593605
SRE_STATE state;
594606
Py_ssize_t status;
607+
PyObject *match;
595608

596609
PyObject *string = NULL, *string2 = NULL;
597610
Py_ssize_t start = 0;
@@ -616,19 +629,22 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
616629
status = sre_match(&state, PatternObject_GetCode(self), 1);
617630

618631
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
619-
if (PyErr_Occurred())
632+
if (PyErr_Occurred()) {
633+
state_fini(&state);
620634
return NULL;
635+
}
621636

637+
match = pattern_new_match(self, &state, status);
622638
state_fini(&state);
623-
624-
return pattern_new_match(self, &state, status);
639+
return match;
625640
}
626641

627642
static PyObject*
628643
pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
629644
{
630645
SRE_STATE state;
631646
Py_ssize_t status;
647+
PyObject *match;
632648

633649
PyObject *string = NULL, *string2 = NULL;
634650
Py_ssize_t start = 0;
@@ -652,12 +668,14 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
652668

653669
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
654670

655-
state_fini(&state);
656-
657-
if (PyErr_Occurred())
671+
if (PyErr_Occurred()) {
672+
state_fini(&state);
658673
return NULL;
674+
}
659675

660-
return pattern_new_match(self, &state, status);
676+
match = pattern_new_match(self, &state, status);
677+
state_fini(&state);
678+
return match;
661679
}
662680

663681
static PyObject*
@@ -1417,7 +1435,7 @@ _compile(PyObject* self_, PyObject* args)
14171435
PyObject* groupindex = NULL;
14181436
PyObject* indexgroup = NULL;
14191437

1420-
if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
1438+
if (!PyArg_ParseTuple(args, "OiO!nOO", &pattern, &flags,
14211439
&PyList_Type, &code, &groups,
14221440
&groupindex, &indexgroup))
14231441
return NULL;
@@ -1933,10 +1951,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
19331951
static int
19341952
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
19351953
{
1936-
if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
1954+
if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
1955+
code >= end || end[-1] != SRE_OP_SUCCESS)
19371956
FAIL;
1938-
if (groups == 0) /* fix for simplejson */
1939-
groups = 100; /* 100 groups should always be safe */
19401957
return _validate_inner(code, end-1, groups);
19411958
}
19421959

@@ -2747,6 +2764,12 @@ PyMODINIT_FUNC PyInit__sre(void)
27472764
Py_DECREF(x);
27482765
}
27492766

2767+
x = PyLong_FromUnsignedLong(SRE_MAXGROUPS);
2768+
if (x) {
2769+
PyDict_SetItemString(d, "MAXGROUPS", x);
2770+
Py_DECREF(x);
2771+
}
2772+
27502773
x = PyUnicode_FromString(copyright);
27512774
if (x) {
27522775
PyDict_SetItemString(d, "copyright", x);

Modules/sre.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818
#define SRE_CODE Py_UCS4
1919
#if SIZEOF_SIZE_T > 4
2020
# define SRE_MAXREPEAT (~(SRE_CODE)0)
21+
# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2)
2122
#else
2223
# define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX)
24+
# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2)
2325
#endif
2426

2527
typedef struct {
@@ -52,9 +54,6 @@ typedef struct {
5254

5355
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
5456

55-
/* FIXME: <fl> shouldn't be a constant, really... */
56-
#define SRE_MARK_SIZE 200
57-
5857
typedef struct SRE_REPEAT_T {
5958
Py_ssize_t count;
6059
SRE_CODE* pattern; /* points to REPEAT operator arguments */
@@ -76,7 +75,7 @@ typedef struct {
7675
/* registers */
7776
Py_ssize_t lastindex;
7877
Py_ssize_t lastmark;
79-
void* mark[SRE_MARK_SIZE];
78+
void** mark;
8079
/* dynamically allocated stuff */
8180
char* data_stack;
8281
size_t data_stack_size;

0 commit comments

Comments
 (0)