-
-
Notifications
You must be signed in to change notification settings - Fork 34.1k
gh-140875: Fix handling of unclosed charrefs before EOF in HTMLParser #140904
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
fe8bcb2
8105625
ebc287b
5dedffc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -109,12 +109,13 @@ def get_events(self): | |
|
|
||
| class TestCaseBase(unittest.TestCase): | ||
|
|
||
| def get_collector(self): | ||
| return EventCollector(convert_charrefs=False) | ||
| def get_collector(self, convert_charrefs=False): | ||
| return EventCollector(convert_charrefs=convert_charrefs) | ||
|
|
||
| def _run_check(self, source, expected_events, collector=None): | ||
| def _run_check(self, source, expected_events, | ||
| *, collector=None, convert_charrefs=False): | ||
| if collector is None: | ||
| collector = self.get_collector() | ||
| collector = self.get_collector(convert_charrefs=convert_charrefs) | ||
| parser = collector | ||
| for s in source: | ||
| parser.feed(s) | ||
|
|
@@ -128,7 +129,7 @@ def _run_check(self, source, expected_events, collector=None): | |
|
|
||
| def _run_check_extra(self, source, events): | ||
| self._run_check(source, events, | ||
| EventCollectorExtra(convert_charrefs=False)) | ||
| collector=EventCollectorExtra(convert_charrefs=False)) | ||
|
|
||
|
|
||
| class HTMLParserTestCase(TestCaseBase): | ||
|
|
@@ -187,10 +188,87 @@ def test_malformatted_charref(self): | |
| ]) | ||
|
|
||
| def test_unclosed_entityref(self): | ||
| self._run_check("&entityref foo", [ | ||
| ("entityref", "entityref"), | ||
| ("data", " foo"), | ||
| ]) | ||
| self._run_check('> <', [('entityref', 'gt'), ('data', ' '), ('entityref', 'lt')], | ||
| convert_charrefs=False) | ||
| self._run_check('> <', [('data', '> <')], convert_charrefs=True) | ||
|
|
||
| self._run_check('&undefined <', | ||
| [('entityref', 'undefined'), ('data', ' '), ('entityref', 'lt')], | ||
| convert_charrefs=False) | ||
| self._run_check('&undefined <', [('data', '&undefined <')], | ||
| convert_charrefs=True) | ||
|
|
||
| self._run_check('>undefined <', | ||
| [('entityref', 'gtundefined'), ('data', ' '), ('entityref', 'lt')], | ||
| convert_charrefs=False) | ||
| self._run_check('>undefined <', [('data', '>undefined <')], | ||
| convert_charrefs=True) | ||
|
|
||
| self._run_check('& <', [('data', '& '), ('entityref', 'lt')], | ||
| convert_charrefs=False) | ||
| self._run_check('& <', [('data', '& <')], convert_charrefs=True) | ||
|
|
||
| def test_eof_in_entityref(self): | ||
| self._run_check('>', [('entityref', 'gt')], convert_charrefs=False) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It was |
||
| self._run_check('>', [('data', '>')], convert_charrefs=True) | ||
|
|
||
| self._run_check('&g', [('entityref', 'g')], convert_charrefs=False) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ampersand was only swallowed in the case of 1-character name before EOF. |
||
| self._run_check('&g', [('data', '&g')], convert_charrefs=True) | ||
|
|
||
| self._run_check('&undefined', [('entityref', 'undefined')], | ||
| convert_charrefs=False) | ||
| self._run_check('&undefined', [('data', '&undefined')], | ||
| convert_charrefs=True) | ||
|
|
||
| self._run_check('>undefined', [('entityref', 'gtundefined')], | ||
| convert_charrefs=False) | ||
| self._run_check('>undefined', [('data', '>undefined')], | ||
| convert_charrefs=True) | ||
|
|
||
| self._run_check('&', [('data', '&')], convert_charrefs=False) | ||
| self._run_check('&', [('data', '&')], convert_charrefs=True) | ||
|
|
||
| def test_unclosed_charref(self): | ||
| self._run_check('{ <', [('charref', '123'), ('data', ' '), ('entityref', 'lt')], | ||
| convert_charrefs=False) | ||
| self._run_check('{ <', [('data', '{ <')], convert_charrefs=True) | ||
| self._run_check('« <', [('charref', 'xab'), ('data', ' '), ('entityref', 'lt')], | ||
| convert_charrefs=False) | ||
| self._run_check('« <', [('data', '\xab <')], convert_charrefs=True) | ||
|
|
||
| self._run_check('� <', | ||
| [('charref', '123456789'), ('data', ' '), ('entityref', 'lt')], | ||
| convert_charrefs=False) | ||
| self._run_check('� <', [('data', '\ufffd <')], | ||
| convert_charrefs=True) | ||
| self._run_check('� <', | ||
| [('charref', 'x123456789'), ('data', ' '), ('entityref', 'lt')], | ||
| convert_charrefs=False) | ||
| self._run_check('� <', [('data', '\ufffd <')], | ||
| convert_charrefs=True) | ||
|
|
||
| self._run_check('&# <', [('data', '&# '), ('entityref', 'lt')], convert_charrefs=False) | ||
| self._run_check('&# <', [('data', '&# <')], convert_charrefs=True) | ||
| self._run_check('&#x <', [('data', '&#x '), ('entityref', 'lt')], convert_charrefs=False) | ||
| self._run_check('&#x <', [('data', '&#x <')], convert_charrefs=True) | ||
|
|
||
| def test_eof_in_charref(self): | ||
| self._run_check('{', [('charref', '123')], convert_charrefs=False) | ||
| self._run_check('{', [('data', '{')], convert_charrefs=True) | ||
| self._run_check('«', [('charref', 'xab')], convert_charrefs=False) | ||
| self._run_check('«', [('data', '\xab')], convert_charrefs=True) | ||
|
|
||
| self._run_check('�', [('charref', '123456789')], | ||
| convert_charrefs=False) | ||
| self._run_check('�', [('data', '\ufffd')], convert_charrefs=True) | ||
| self._run_check('�', [('charref', 'x123456789')], | ||
| convert_charrefs=False) | ||
| self._run_check('�', [('data', '\ufffd')], convert_charrefs=True) | ||
|
|
||
| self._run_check('&#', [('data', '&#')], convert_charrefs=False) | ||
| self._run_check('&#', [('data', '&#')], convert_charrefs=True) | ||
| self._run_check('&#x', [('data', '&#x')], convert_charrefs=False) | ||
| self._run_check('&#x', [('data', '&#x')], convert_charrefs=True) | ||
|
|
||
| def test_bad_nesting(self): | ||
| # Strangely, this *is* supposed to test that overlapping | ||
|
|
@@ -762,20 +840,6 @@ def test_correct_detection_of_start_tags(self): | |
| ] | ||
| self._run_check(html, expected) | ||
|
|
||
| def test_EOF_in_charref(self): | ||
| # see #17802 | ||
| # This test checks that the UnboundLocalError reported in the issue | ||
| # is not raised, however I'm not sure the returned values are correct. | ||
| # Maybe HTMLParser should use self.unescape for these | ||
| data = [ | ||
| ('a&', [('data', 'a&')]), | ||
| ('a&b', [('data', 'ab')]), | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the reported bug. It was in the tests! |
||
| ('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]), | ||
| ('a&b;', [('data', 'a'), ('entityref', 'b')]), | ||
| ] | ||
| for html, expected in data: | ||
| self._run_check(html, expected) | ||
|
|
||
| def test_eof_in_comments(self): | ||
| data = [ | ||
| ('<!--', [('comment', '')]), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| Fix handling of unclosed character references (named and numerical) | ||
| followed by the end of file in :class:`html.parser.HTMLParser` with | ||
| ``convert_charrefs=False``. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the reason for emitting
&#as data now, instead of emitting it later with the rest of the data? IOW, in the case of'&x y', this will emithandle_data('&#')+handle_data(' y')instead of a singlehandle_data('&# y').Having multiple
handle_datais not wrong per se, but if we remove theelifblock and let theelsebreak, we can simplify the code and emit a singlehandle_data.The same might apply below, where a single
&is emitted. Also note that for' z &x y', the firsthandle_datagets called with only'z ', even if followed by one or more additionalhandle_data.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Try
&#x <. With this PR it emitshandle_data('&#'),handle_data('x '),handle_entityref('lt')which is not optimal, but correct. If remove thiselif, it will emithandle_data('&#x <'), which is incorrect.