Skip to content

Commit

Permalink
pythongh-121188: Sanitize invalid XML characters in regrtest (python#…
Browse files Browse the repository at this point in the history
…121195)

When creating the JUnit XML file, regrtest now escapes characters
which are invalid in XML, such as the chr(27) control character used
in ANSI escape sequences.
  • Loading branch information
vstinner authored Jul 1, 2024
1 parent f80376b commit af8c3d7
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 5 deletions.
12 changes: 7 additions & 5 deletions Lib/test/libregrtest/testresult.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import traceback
import unittest
from test import support
from test.libregrtest.utils import sanitize_xml

class RegressionTestResult(unittest.TextTestResult):
USE_XML = False
Expand Down Expand Up @@ -65,23 +66,24 @@ def _add_result(self, test, capture=False, **args):
if capture:
if self._stdout_buffer is not None:
stdout = self._stdout_buffer.getvalue().rstrip()
ET.SubElement(e, 'system-out').text = stdout
ET.SubElement(e, 'system-out').text = sanitize_xml(stdout)
if self._stderr_buffer is not None:
stderr = self._stderr_buffer.getvalue().rstrip()
ET.SubElement(e, 'system-err').text = stderr
ET.SubElement(e, 'system-err').text = sanitize_xml(stderr)

for k, v in args.items():
if not k or not v:
continue

e2 = ET.SubElement(e, k)
if hasattr(v, 'items'):
for k2, v2 in v.items():
if k2:
e2.set(k2, str(v2))
e2.set(k2, sanitize_xml(str(v2)))
else:
e2.text = str(v2)
e2.text = sanitize_xml(str(v2))
else:
e2.text = str(v)
e2.text = sanitize_xml(str(v))

@classmethod
def __makeErrorDict(cls, err_type, err_value, err_tb):
Expand Down
22 changes: 22 additions & 0 deletions Lib/test/libregrtest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os.path
import platform
import random
import re
import shlex
import signal
import subprocess
Expand Down Expand Up @@ -712,3 +713,24 @@ def get_signal_name(exitcode):
pass

return None


# Matches runs of characters that are escaped before text is stored in the
# JUnit XML tree.  The pattern is deliberately a non-raw string: Python
# resolves the escapes below, so the character class holds the code points
# themselves.
ILLEGAL_XML_CHARS_RE = re.compile(
    '['
    # Control characters; newline (\x0A and \x0D) and TAB (\x09) are legal
    '\x00-\x08\x0B\x0C\x0E-\x1F'
    # Surrogate characters
    '\uD800-\uDFFF'
    # Special Unicode characters
    '\uFFFE'
    '\uFFFF'
    # Match multiple sequential invalid characters for better efficiency
    ']+')


def _sanitize_xml_replace(regs):
    r"""Return the backslash-escaped spelling of one matched run.

    *regs* is the match object handed over by ``ILLEGAL_XML_CHARS_RE.sub()``;
    ``regs[0]`` is the whole run of matched characters.  Characters up to
    U+00FF are written as ``\xNN`` (two lowercase hex digits).  Anything
    above that uses the escape produced by ``ascii()`` with the surrounding
    quotes stripped, for example ``\udc80``.
    """
    text = regs[0]
    return ''.join(f'\\x{ord(ch):02x}' if ch <= '\xff' else ascii(ch)[1:-1]
                   for ch in text)


def sanitize_xml(text):
    r"""Return *text* with characters that are invalid in XML escaped.

    Every run matched by ``ILLEGAL_XML_CHARS_RE`` is replaced by printable
    backslash escapes (``\x1b``, ``\udc80``, ...).  Text without such
    characters is returned unchanged, including newlines, tabs and
    non-ASCII characters.
    """
    return ILLEGAL_XML_CHARS_RE.sub(_sanitize_xml_replace, text)
59 changes: 59 additions & 0 deletions Lib/test/test_regrtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import tempfile
import textwrap
import unittest
from xml.etree import ElementTree

from test import support
from test.support import import_helper
from test.support import os_helper
Expand Down Expand Up @@ -2254,6 +2256,44 @@ def test_pass(self):
self.check_executed_tests(output, testname, stats=1, parallel=True)
self.assertNotIn('SPAM SPAM SPAM', output)

def test_xml(self):
    # Check that a test printing a character which is invalid in XML
    # (ESC, chr(27)) still produces a --junit-xml file that ElementTree can
    # parse, with the character stored as the literal text "\x1b".
    #
    # The program below is a raw string on purpose: the backslash escape
    # must reach the generated test file unchanged, so that the generated
    # test itself prints the real ESC character.
    code = textwrap.dedent(r"""
        import unittest
        from test import support
        class VerboseTests(unittest.TestCase):
            def test_failed(self):
                print("abc \x1b def")
                self.fail()
    """)
    testname = self.create_test(code=code)

    # Run sequentially
    filename = os_helper.TESTFN
    self.addCleanup(os_helper.unlink, filename)

    output = self.run_tests(testname, "--junit-xml", filename,
                            exitcode=EXITCODE_BAD_TEST)
    self.check_executed_tests(output, testname,
                              failed=testname,
                              stats=TestStats(1, 1, 0))

    # Test generated XML
    with open(filename, encoding="utf8") as fp:
        content = fp.read()

    # Parsing is itself part of the check: fromstring() raises if the
    # file is not well-formed XML.
    testsuite = ElementTree.fromstring(content)
    self.assertEqual(int(testsuite.get('tests')), 1)
    self.assertEqual(int(testsuite.get('errors')), 0)
    self.assertEqual(int(testsuite.get('failures')), 1)

    testcase = testsuite[0][0]
    self.assertEqual(testcase.get('status'), 'run')
    self.assertEqual(testcase.get('result'), 'completed')
    self.assertGreater(float(testcase.get('time')), 0)
    # NOTE(review): if the report contains no <system-out> element, this
    # loop body never runs and the check passes vacuously.
    for out in testcase.iter('system-out'):
        self.assertEqual(out.text, r"abc \x1b def")


class TestUtils(unittest.TestCase):
def test_format_duration(self):
Expand Down Expand Up @@ -2437,6 +2477,25 @@ def id(self):
self.assertTrue(match_test(test_chdir))
self.assertFalse(match_test(test_copy))

def test_sanitize_xml(self):
    # Unit test for utils.sanitize_xml().  The expected values are raw
    # strings, so r'\x1b' is the four-character text backslash, 'x', '1',
    # 'b', not the ESC character itself.
    sanitize_xml = utils.sanitize_xml

    # escape invalid XML characters
    self.assertEqual(sanitize_xml('abc \x1b\x1f def'),
                     r'abc \x1b\x1f def')
    self.assertEqual(sanitize_xml('nul:\x00, bell:\x07'),
                     r'nul:\x00, bell:\x07')
    self.assertEqual(sanitize_xml('surrogate:\uDC80'),
                     r'surrogate:\udc80')
    self.assertEqual(sanitize_xml('illegal \uFFFE and \uFFFF'),
                     r'illegal \ufffe and \uffff')

    # no escape for valid XML characters
    self.assertEqual(sanitize_xml('a\n\tb'),
                     'a\n\tb')
    self.assertEqual(sanitize_xml('valid t\xe9xt \u20ac'),
                     'valid t\xe9xt \u20ac')


if __name__ == '__main__':
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
When creating the JUnit XML file, regrtest now escapes characters which are
invalid in XML, such as the chr(27) control character used in ANSI escape
sequences. Patch by Victor Stinner.

0 comments on commit af8c3d7

Please sign in to comment.