Skip to content

gh-122358: Remove re._compile #122357

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
107 changes: 53 additions & 54 deletions Lib/re/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,17 +164,17 @@ class RegexFlag:
def match(pattern, string, flags=0):
"""Try to apply the pattern at the start of the string, returning
a Match object, or None if no match was found."""
return _compile(pattern, flags).match(string)
return compile(pattern, flags).match(string)

def fullmatch(pattern, string, flags=0):
"""Try to apply the pattern to all of the string, returning
a Match object, or None if no match was found."""
return _compile(pattern, flags).fullmatch(string)
return compile(pattern, flags).fullmatch(string)

def search(pattern, string, flags=0):
"""Scan through string looking for a match to the pattern, returning
a Match object, or None if no match was found."""
return _compile(pattern, flags).search(string)
return compile(pattern, flags).search(string)

class _ZeroSentinel(int):
pass
Expand Down Expand Up @@ -205,7 +205,7 @@ def sub(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel
DeprecationWarning, stacklevel=2
)

return _compile(pattern, flags).sub(repl, string, count)
return compile(pattern, flags).sub(repl, string, count)
sub.__text_signature__ = '(pattern, repl, string, count=0, flags=0)'

def subn(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel):
Expand Down Expand Up @@ -235,7 +235,7 @@ def subn(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentine
DeprecationWarning, stacklevel=2
)

return _compile(pattern, flags).subn(repl, string, count)
return compile(pattern, flags).subn(repl, string, count)
subn.__text_signature__ = '(pattern, repl, string, count=0, flags=0)'

def split(pattern, string, *args, maxsplit=_zero_sentinel, flags=_zero_sentinel):
Expand Down Expand Up @@ -264,7 +264,7 @@ def split(pattern, string, *args, maxsplit=_zero_sentinel, flags=_zero_sentinel)
DeprecationWarning, stacklevel=2
)

return _compile(pattern, flags).split(string, maxsplit)
return compile(pattern, flags).split(string, maxsplit)
split.__text_signature__ = '(pattern, string, maxsplit=0, flags=0)'

def findall(pattern, string, flags=0):
Expand All @@ -275,60 +275,17 @@ def findall(pattern, string, flags=0):
has more than one group.

Empty matches are included in the result."""
return _compile(pattern, flags).findall(string)
return compile(pattern, flags).findall(string)

def finditer(pattern, string, flags=0):
"""Return an iterator over all non-overlapping matches in the
string. For each match, the iterator returns a Match object.

Empty matches are included in the result."""
return _compile(pattern, flags).finditer(string)
return compile(pattern, flags).finditer(string)

def compile(pattern, flags=0):
"Compile a regular expression pattern, returning a Pattern object."
return _compile(pattern, flags)

def purge():
"Clear the regular expression caches"
_cache.clear()
_cache2.clear()
_compile_template.cache_clear()


# SPECIAL_CHARS
# closing ')', '}' and ']'
# '-' (a range in character set)
# '&', '~', (extended character set operations)
# '#' (comment) and WHITESPACE (ignored) in verbose mode
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'}

def escape(pattern):
"""
Escape special characters in a string.
"""
if isinstance(pattern, str):
return pattern.translate(_special_chars_map)
else:
pattern = str(pattern, 'latin1')
return pattern.translate(_special_chars_map).encode('latin1')

Pattern = type(_compiler.compile('', 0))
Match = type(_compiler.compile('', 0).match(''))

# --------------------------------------------------------------------
# internals

# Use the fact that dict keeps the insertion order.
# _cache2 uses the simple FIFO policy which has better latency.
# _cache uses the LRU policy which has better hit rate.
_cache = {} # LRU
_cache2 = {} # FIFO
_MAXCACHE = 512
_MAXCACHE2 = 256
assert _MAXCACHE2 < _MAXCACHE

def _compile(pattern, flags):
# internal: compile pattern
"""Compile a regular expression pattern, returning a Pattern object."""
if isinstance(flags, RegexFlag):
flags = flags.value
try:
Expand Down Expand Up @@ -371,6 +328,45 @@ def _compile(pattern, flags):
_cache2[key] = p
return p

def purge():
    """Clear every cache kept by the re module.

    Empties both compiled-pattern caches and the memoized
    replacement-template compiler, forcing subsequent calls to
    recompile from scratch.
    """
    # The three caches are independent, so clearing order is irrelevant.
    _compile_template.cache_clear()
    _cache2.clear()
    _cache.clear()


# Translation table mapping every regex metacharacter (and whitespace,
# '#', '&', '~' — significant in verbose mode / extended set operations,
# plus closing ')', '}', ']' and the in-set range marker '-') to its
# backslash-escaped form.
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'}

def escape(pattern):
    """Escape special characters in a string.

    Accepts str or a bytes-like object; bytes input is escaped
    byte-wise and returned as bytes.
    """
    if not isinstance(pattern, str):
        # latin-1 maps each byte to the code point of the same value,
        # so str.translate() applies per byte and encode() restores bytes.
        as_text = str(pattern, 'latin1')
        return as_text.translate(_special_chars_map).encode('latin1')
    return pattern.translate(_special_chars_map)

Pattern = type(_compiler.compile('', 0))
Match = type(_compiler.compile('', 0).match(''))

# --------------------------------------------------------------------
# internals

# Use the fact that dict keeps the insertion order.
# _cache2 uses the simple FIFO policy which has better latency.
# _cache uses the LRU policy which has better hit rate.
_cache = {} # LRU
_cache2 = {} # FIFO
_MAXCACHE = 512
_MAXCACHE2 = 256
assert _MAXCACHE2 < _MAXCACHE

@functools.lru_cache(_MAXCACHE)
def _compile_template(pattern, repl):
# internal: compile replacement pattern
Expand All @@ -381,9 +377,12 @@ def _compile_template(pattern, repl):
import copyreg

def _pickle(p):
return _compile, (p.pattern, p.flags)
return compile, (p.pattern, p.flags)

# compatibility alias to deserialize old pickles
_compile = compile

copyreg.pickle(Pattern, _pickle, _compile)
copyreg.pickle(Pattern, _pickle, compile)

# --------------------------------------------------------------------
# experimental stuff (see python-dev discussions for details)
Expand Down
9 changes: 6 additions & 3 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

# XXX: show string offset and offending character for all errors

import os
from ._constants import *

SPECIAL_CHARS = ".\\[{()*+?^$|"
Expand Down Expand Up @@ -508,6 +509,8 @@ def _parse_sub(source, state, verbose, nested):
subpattern.append((BRANCH, (None, items)))
return subpattern

_warn_skips = (os.path.dirname(__file__),)

def _parse(source, state, verbose, nested, first=False):
# parse a simple pattern
subpattern = SubPattern(state)
Expand Down Expand Up @@ -557,7 +560,7 @@ def _parse(source, state, verbose, nested, first=False):
import warnings
warnings.warn(
'Possible nested set at position %d' % source.tell(),
FutureWarning, stacklevel=nested + 6
FutureWarning, skip_file_prefixes=_warn_skips
)
negate = sourcematch("^")
# check remaining characters
Expand All @@ -580,7 +583,7 @@ def _parse(source, state, verbose, nested, first=False):
'symmetric difference' if this == '~' else
'union',
source.tell() - 1),
FutureWarning, stacklevel=nested + 6
FutureWarning, skip_file_prefixes=_warn_skips
)
code1 = LITERAL, _ord(this)
if sourcematch("-"):
Expand All @@ -603,7 +606,7 @@ def _parse(source, state, verbose, nested, first=False):
warnings.warn(
'Possible set difference at position %d' % (
source.tell() - 2),
FutureWarning, stacklevel=nested + 6
FutureWarning, skip_file_prefixes=_warn_skips
)
code2 = LITERAL, _ord(that)
if code1[0] != LITERAL or code2[0] != LITERAL:
Expand Down
17 changes: 16 additions & 1 deletion Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -1362,8 +1362,23 @@ def test_pickling(self):
pickled = pickle.dumps(oldpat, proto)
newpat = pickle.loads(pickled)
self.assertEqual(newpat, oldpat)
# current pickle expects the _compile() reconstructor in re module

def test_unpickling(self):
    """Old pickles reference the re._compile reconstructor; verify the
    compatibility alias keeps them loadable."""
    import pickle
    from re import _compile  # noqa: F401
    expected = re.compile(".*")
    # Example pickles of the pattern above, captured at several pickle
    # protocol versions, each naming the legacy re._compile reconstructor.
    legacy_pickles = (
        b'cre\n_compile\np0\n(V.*\np1\nI32\ntp2\nRp3\n.',
        b'cre\n_compile\nq\x00(X\x02\x00\x00\x00.*q\x01K tq\x02Rq\x03.',
        b'\x80\x03cre\n_compile\nq\x00X\x02\x00\x00\x00.*q\x01K \x86q\x02Rq\x03.',
        b'\x80\x04\x95\x1e\x00\x00\x00\x00\x00\x00\x00\x8c\x02re\x94\x8c\x08'
        b'_compile\x94\x93\x94\x8c\x02.*\x94K \x86\x94R\x94.',
    )
    for payload in legacy_pickles:
        self.assertEqual(pickle.loads(payload), expected)

def test_copying(self):
import copy
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove :func:`!re._compile`, leaving a compatibility alias to :func:`re.compile`.
Loading