Skip to content

httputil: Forbid control chars and CR in header values #3489

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion tornado/httputil.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
import unicodedata
from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl

from tornado.escape import native_str, parse_qs_bytes, utf8
from tornado.escape import native_str, parse_qs_bytes, utf8, to_unicode
from tornado.log import gen_log
from tornado.util import ObjectDict, unicode_type

Expand Down Expand Up @@ -100,6 +100,12 @@ class _ABNF:
# RFC 9110 (HTTP Semantics)
obs_text = re.compile(r"[\x80-\xFF]")
field_vchar = re.compile(rf"(?:{VCHAR.pattern}|{obs_text.pattern})")
# Not exactly from the RFC to simplify and combine field-content and field-value.
field_value = re.compile(
rf"|"
rf"{field_vchar.pattern}|"
rf"{field_vchar.pattern}(?:{field_vchar.pattern}| |\t)*{field_vchar.pattern}"
)
tchar = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]")
token = re.compile(rf"{tchar.pattern}+")
field_name = token
Expand Down Expand Up @@ -195,6 +201,10 @@ def add(self, name: str, value: str) -> None:
"""Adds a new value for the given key."""
if not _ABNF.field_name.fullmatch(name):
raise HTTPInputError("Invalid header name %r" % name)
if not _ABNF.field_value.fullmatch(to_unicode(value)):
# TODO: the fact we still support bytes here (contrary to type annotations)
# and still test for it should probably be changed.
raise HTTPInputError("Invalid header value %r" % value)
norm_name = _normalize_header(name)
self._last_key = norm_name
if norm_name in self:
Expand Down Expand Up @@ -254,6 +264,8 @@ def parse_line(self, line: str) -> None:
if self._last_key is None:
raise HTTPInputError("first header line cannot start with whitespace")
new_part = " " + line.strip(HTTP_WHITESPACE)
if not _ABNF.field_value.fullmatch(new_part[1:]):
raise HTTPInputError("Invalid header continuation %r" % new_part)
self._as_list[self._last_key][-1] += new_part
self._dict[self._last_key] += new_part
else:
Expand Down
33 changes: 25 additions & 8 deletions tornado/test/httputil_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,17 @@ def test_continuation(self):
data = "Foo: bar\r\n\fasdf"
self.assertRaises(HTTPInputError, HTTPHeaders.parse, data)

def test_forbidden_ascii_characters(self):
# Control characters and ASCII whitespace other than space, tab, and CRLF are not allowed in
# headers.
for c in range(0xFF):
data = f"Foo: bar{chr(c)}baz\r\n"
if c == 0x09 or (c >= 0x20 and c != 0x7F):
headers = HTTPHeaders.parse(data)
self.assertEqual(headers["Foo"], f"bar{chr(c)}baz")
else:
self.assertRaises(HTTPInputError, HTTPHeaders.parse, data)

def test_unicode_newlines(self):
# Ensure that only \r\n is recognized as a header separator, and not
# the other newline-like unicode characters.
Expand All @@ -311,10 +322,13 @@ def test_unicode_newlines(self):
# and cpython's unicodeobject.c (which defines the implementation
# of unicode_type.splitlines(), and uses a different list than TR13).
newlines = [
"\u001b", # VERTICAL TAB
"\u001c", # FILE SEPARATOR
"\u001d", # GROUP SEPARATOR
"\u001e", # RECORD SEPARATOR
# The following ascii characters are sometimes treated as newline-like,
# but they're disallowed in HTTP headers. This test covers unicode
# characters that are permitted in headers (under the obs-text rule).
# "\u001b", # VERTICAL TAB
# "\u001c", # FILE SEPARATOR
# "\u001d", # GROUP SEPARATOR
# "\u001e", # RECORD SEPARATOR
"\u0085", # NEXT LINE
"\u2028", # LINE SEPARATOR
"\u2029", # PARAGRAPH SEPARATOR
Expand Down Expand Up @@ -363,13 +377,16 @@ def test_unicode_whitespace(self):
self.assertEqual(expected, list(headers.get_all()))

def test_optional_cr(self):
# Bare CR is not a valid line separator
with self.assertRaises(HTTPInputError):
HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n")

# Both CRLF and LF should be accepted as separators. CR should not be
# part of the data when followed by LF, but it is a normal char
# otherwise (or should bare CR be an error?)
headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n")
# part of the data when followed by LF.
headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nMore: more\r\n")
self.assertEqual(
sorted(headers.get_all()),
[("Cr", "cr\rMore: more"), ("Crlf", "crlf"), ("Lf", "lf")],
[("Crlf", "crlf"), ("Lf", "lf"), ("More", "more")],
)

def test_copy(self):
Expand Down