diff --git a/tornado/httputil.py b/tornado/httputil.py index b21d8046c4..9ce992d82b 100644 --- a/tornado/httputil.py +++ b/tornado/httputil.py @@ -62,6 +62,9 @@ from asyncio import Future # noqa: F401 import unittest # noqa: F401 +# To be used with str.strip() and related methods. +HTTP_WHITESPACE = " \t" + @lru_cache(1000) def _normalize_header(name: str) -> str: @@ -171,7 +174,7 @@ def parse_line(self, line: str) -> None: # continuation of a multi-line header if self._last_key is None: raise HTTPInputError("first header line cannot start with whitespace") - new_part = " " + line.lstrip() + new_part = " " + line.lstrip(HTTP_WHITESPACE) self._as_list[self._last_key][-1] += new_part self._dict[self._last_key] += new_part else: @@ -179,7 +182,7 @@ def parse_line(self, line: str) -> None: name, value = line.split(":", 1) except ValueError: raise HTTPInputError("no colon in header line") - self.add(name, value.strip()) + self.add(name, value.strip(HTTP_WHITESPACE)) @classmethod def parse(cls, headers: str) -> "HTTPHeaders": diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py index aa9b6ee253..6d618839e0 100644 --- a/tornado/test/httputil_test.py +++ b/tornado/test/httputil_test.py @@ -334,6 +334,25 @@ def test_unicode_newlines(self): gen_log.warning("failed while trying %r in %s", newline, encoding) raise + def test_unicode_whitespace(self): + # Only tabs and spaces are to be stripped according to the HTTP standard. + # Other unicode whitespace is to be left as-is. In the context of headers, + # this specifically means the whitespace characters falling within the + # latin1 charset. + whitespace = [ + (" ", True), # SPACE + ("\t", True), # TAB + ("\u00a0", False), # NON-BREAKING SPACE + ("\u0085", False), # NEXT LINE + ] + for c, stripped in whitespace: + headers = HTTPHeaders.parse("Transfer-Encoding: %schunked" % c) + if stripped: + expected = [("Transfer-Encoding", "chunked")] + else: + expected = [("Transfer-Encoding", "%schunked" % c)] + self.assertEqual(expected, list(headers.get_all())) + def test_optional_cr(self): # Both CRLF and LF should be accepted as separators. CR should not be # part of the data when followed by LF, but it is a normal char