Skip to content

bpo-30349: Raise FutureWarning for nested sets and set operations #1553

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,20 @@ The special characters are:
place it at the beginning of the set. For example, both ``[()[\]{}]`` and
``[]()[{}]`` will match a parenthesis.

* Support of nested sets and set operations as in `Unicode Technical
Standard #18`_ might be added in the future. This would change the
syntax, so to facilitate this change a :exc:`FutureWarning` will be raised
in ambiguous cases for the time being.
That includes sets starting with a literal ``'['`` or containing literal
character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To
avoid a warning, escape them with a backslash.

.. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/

.. versionchanged:: 3.7
:exc:`FutureWarning` is raised if a character set contains constructs
that will change semantically in the future.

``|``
``A|B``, where *A* and *B* can be arbitrary REs, creates a regular expression that
will match either *A* or *B*. An arbitrary number of REs can be separated by the
Expand Down Expand Up @@ -829,7 +843,7 @@ form.

>>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
>>> print('[%s]+' % re.escape(legal_chars))
[abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+
[abcdefghijklmnopqrstuvwxyz0123456789!\#\$%\&'\*\+\-\.\^_`\|\~:]+

>>> operators = ['+', '-', '*', '/', '**']
>>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))
Expand Down
2 changes: 1 addition & 1 deletion Doc/tools/susp-ignored.csv
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a
whatsnew/3.2,,:location,zope9-location = ${zope9:location}
whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
library/re,,`,!#$%&'*+-.^_`|~:
library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~:
library/re,,`,!\#\$%\&'\*\+\-\.\^_`\|\~:
library/tarfile,,:xz,'x:xz'
library/xml.etree.elementtree,,:sometag,prefix:sometag
library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""
Expand Down
11 changes: 11 additions & 0 deletions Doc/whatsnew/3.7.rst
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,17 @@ Changes in the Python API
argument ``os.scandir`` instead of ``os.listdir`` when listing the directory
fails.

* Support of nested sets and set operations in regular expressions as in
`Unicode Technical Standard #18`_ might be added in the future. This would
change the syntax, so to facilitate this change a :exc:`FutureWarning` will
be raised in ambiguous cases for the time being.
That includes sets starting with a literal ``'['`` or containing literal
character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To
avoid a warning, escape them with a backslash.
(Contributed by Serhiy Storchaka in :issue:`30349`.)

.. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/


Changes in the C API
--------------------
Expand Down
9 changes: 4 additions & 5 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1354,15 +1354,14 @@ def __str__(self):

_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
re.escape(''.join(ATOM_ENDS)))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
re.escape(''.join(TOKEN_ENDS)))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
re.escape(''.join(ATTRIBUTE_ENDS)))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
'\\','\\\\').replace(']',r'\]'))).match
re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match

def _validate_xtext(xtext):
"""If input token contains ASCII non-printables, register a defect."""
Expand Down
3 changes: 2 additions & 1 deletion Lib/re.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,9 @@ def template(pattern, flags=0):
# SPECIAL_CHARS
# closing ')', '}' and ']'
# '-' (a range in character set)
# '&', '~' (extended character set operations)
# '#' (comment) and WHITESPACE (ignored) in verbose mode
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'}
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'}

def escape(pattern):
"""
Expand Down
24 changes: 24 additions & 0 deletions Lib/sre_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,12 @@ def _parse(source, state, verbose, nested, first=False):
setappend = set.append
## if sourcematch(":"):
## pass # handle character classes
if source.next == '[':
import warnings
warnings.warn(
'Possible nested set at position %d' % source.tell(),
FutureWarning, stacklevel=nested + 6
)
negate = sourcematch("^")
# check remaining characters
while True:
Expand All @@ -529,6 +535,17 @@ def _parse(source, state, verbose, nested, first=False):
elif this[0] == "\\":
code1 = _class_escape(source, this)
else:
if set and this in '-&~|' and source.next == this:
import warnings
warnings.warn(
'Possible set %s at position %d' % (
'difference' if this == '-' else
'intersection' if this == '&' else
'symmetric difference' if this == '~' else
'union',
source.tell() - 1),
FutureWarning, stacklevel=nested + 6
)
code1 = LITERAL, _ord(this)
if sourcematch("-"):
# potential range
Expand All @@ -545,6 +562,13 @@ def _parse(source, state, verbose, nested, first=False):
if that[0] == "\\":
code2 = _class_escape(source, that)
else:
if that == '-':
import warnings
warnings.warn(
'Possible set difference at position %d' % (
source.tell() - 2),
FutureWarning, stacklevel=nested + 6
)
code2 = LITERAL, _ord(that)
if code1[0] != LITERAL or code2[0] != LITERAL:
msg = "bad character range %s-%s" % (this, that)
Expand Down
47 changes: 46 additions & 1 deletion Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,51 @@ def test_not_literal(self):
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")

def test_possible_set_operations(self):
# Character sets that look like nested sets or set operations must emit
# FutureWarning when compiled, while still matching with the current
# semantics (the doubled characters are treated as literals or ranges).
# s is the search text: every character with code point 0-127.
s = bytes(range(128)).decode()
# '--' (possible set difference): currently parsed as the ranges 0-9 and
# '-'-'1'.
with self.assertWarns(FutureWarning):
p = re.compile(r'[0-9--1]')
self.assertEqual(p.findall(s), list('-./0123456789'))
# '--' at the very start of the set: only the match result is checked
# here (not wrapped in assertWarns).
self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
# '-' used as the upper bound of a range ('%'-'-') followed by more input.
with self.assertWarns(FutureWarning):
p = re.compile(r'[%--1]')
self.assertEqual(p.findall(s), list("%&'()*+,-1"))
# Same range, with '-' as the last character of the set.
with self.assertWarns(FutureWarning):
p = re.compile(r'[%--]')
self.assertEqual(p.findall(s), list("%&'()*+,-"))

# '&&' (possible set intersection): currently matches a literal '&'.
with self.assertWarns(FutureWarning):
p = re.compile(r'[0-9&&1]')
self.assertEqual(p.findall(s), list('&0123456789'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[\d&&1]')
self.assertEqual(p.findall(s), list('&0123456789'))
# '&&' at the start of the set: only the match result is checked.
self.assertEqual(re.findall(r'[&&1]', s), list('&1'))

# '||' (possible set union): currently matches a literal '|'.
with self.assertWarns(FutureWarning):
p = re.compile(r'[0-9||a]')
self.assertEqual(p.findall(s), list('0123456789a|'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[\d||a]')
self.assertEqual(p.findall(s), list('0123456789a|'))
# '||' at the start of the set: only the match result is checked.
self.assertEqual(re.findall(r'[||1]', s), list('1|'))

# '~~' (possible symmetric difference): currently matches a literal '~'.
with self.assertWarns(FutureWarning):
p = re.compile(r'[0-9~~1]')
self.assertEqual(p.findall(s), list('0123456789~'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[\d~~1]')
self.assertEqual(p.findall(s), list('0123456789~'))
# '~~' at the start of the set: only the match result is checked.
self.assertEqual(re.findall(r'[~~1]', s), list('1~'))

# Set starting with a literal '[' (possible nested set). The expected
# result contains ']' but not '|': the set closes at the first ']' and
# the trailing '|]' is an alternative matching a literal ']'.
with self.assertWarns(FutureWarning):
p = re.compile(r'[[0-9]|]')
self.assertEqual(p.findall(s), list('0123456789[]'))

# POSIX-style '[:digit:]' is not treated as a character class: the
# expected result is the literal characters '[', ':', 'd', 'i', 'g', 't'
# plus ']' from the trailing alternative.
with self.assertWarns(FutureWarning):
p = re.compile(r'[[:digit:]|]')
self.assertEqual(p.findall(s), list(':[]dgit'))

def test_search_coverage(self):
self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
Expand All @@ -932,7 +977,7 @@ def assertMatch(self, pattern, text, match=None, span=None,
self.assertEqual(m.group(), match)
self.assertEqual(m.span(), span)

LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~'
LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`'

def test_re_escape(self):
p = ''.join(chr(i) for i in range(256))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
FutureWarning is now emitted if a regular expression contains character set
constructs that will change semantically in the future (nested sets and set
operations).