Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing the Forwarded header #2173

Merged
merged 1 commit into from
Aug 9, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Thomas Grainger
Tolga Tezel
Vaibhav Sagar
Vamsi Krishna Avula
Vasiliy Faronov
Vasyl Baran
Vikas Kawadia
Vitalik Verhovodov
Expand Down
76 changes: 40 additions & 36 deletions aiohttp/web_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
_TCHAR = string.digits + string.ascii_letters + r"!#$%&'*+.^_`|~-"
# '-' at the end to prevent interpretation as range in a char class

_TOKEN = r'[{tchar}]*'.format(tchar=_TCHAR)
_TOKEN = r'[{tchar}]+'.format(tchar=_TCHAR)

_QDTEXT = r'[{}]'.format(
r''.join(chr(c) for c in (0x09, 0x20, 0x21) + tuple(range(0x23, 0x7F))))
Expand All @@ -39,12 +39,8 @@
_QUOTED_STRING = r'"(?:{quoted_pair}|{qdtext})*"'.format(
qdtext=_QDTEXT, quoted_pair=_QUOTED_PAIR)

_FORWARDED_PARAMS = (
r'[bB][yY]|[fF][oO][rR]|[hH][oO][sS][tT]|[pP][rR][oO][tT][oO]')

_FORWARDED_PAIR = (
r'^({forwarded_params})=({token}|{quoted_string})$'.format(
forwarded_params=_FORWARDED_PARAMS,
r'({token})=({token}|{quoted_string})'.format(
token=_TOKEN,
quoted_string=_QUOTED_STRING))

Expand Down Expand Up @@ -208,37 +204,45 @@ def forwarded(self):

Returns a tuple containing one or more immutable dicts
"""
def _parse_forwarded(forwarded_headers):
for field_value in forwarded_headers:
# by=...;for=..., For=..., BY=...
for forwarded_elm in field_value.split(','):
# by=...;for=...
fvparams = dict()
forwarded_pairs = (
_FORWARDED_PAIR_RE.findall(pair)
for pair in forwarded_elm.strip().split(';'))
for forwarded_pair in forwarded_pairs:
# by=...
if len(forwarded_pair) != 1:
# non-compliant syntax
break
param, value = forwarded_pair[0]
if param.lower() in fvparams:
# duplicate param in field-value
break
if value and value[0] == '"':
# quoted string: replace quotes and escape
# sequences
value = _QUOTED_PAIR_REPLACE_RE.sub(
r'\1', value[1:-1])
fvparams[param.lower()] = value
elems = []
for field_value in self._message.headers.getall(hdrs.FORWARDED, ()):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fafhrd91 @asvetlov
It is easily possible to build FSM which will parse this structure using regex module. Unfortunatelly built-in regexps are not able to capture all repeating matches.

Pros:

  • Less code
  • Less error-prone
  • Faster
  • Ability to parse other structures using higly-efficient FSMs

Cons:

  • Dependency on regex module
  • More difficult to understand (unsure, maybe more simplier)
  • Non-pythonic way.

(Anyway, maybe merge as shown here, but re-write later, I can do that)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@socketpair please merge the PR first than feel free to create a new one for regexps based approach.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@asvetlov how can I sure that other members agree with my decision to merge ? (sorry, I'm asking BEFORE making something awful)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@socketpair I’m not sure an FSM can handle my test_single_forwarded_header_injection1 case, which necessarily involves a form of backtracking. And I think this case is important to handle, at least until deployment experience shows that all reverse proxies take precautions against that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@socketpair the trick borrowed from CPython works: declare intention to merge (LGTM or github pull request approval), wait a day and do merge tomorrow if nobody objects.

length = len(field_value)
pos = 0
need_separator = False
elem = {}
elems.append(types.MappingProxyType(elem))
while 0 <= pos < length:
match = _FORWARDED_PAIR_RE.match(field_value, pos)
if match is not None: # got a valid forwarded-pair
if need_separator:
# bad syntax here, skip to next comma
pos = field_value.find(',', pos)
else:
yield types.MappingProxyType(fvparams)
continue
yield dict()

return tuple(
_parse_forwarded(self._message.headers.getall(hdrs.FORWARDED, ())))
(name, value) = match.groups()
if value[0] == '"':
# quoted string: remove quotes and unescape
value = _QUOTED_PAIR_REPLACE_RE.sub(r'\1',
value[1:-1])
elem[name.lower()] = value
pos += len(match.group(0))
need_separator = True
elif field_value[pos] == ',': # next forwarded-element
need_separator = False
elem = {}
elems.append(types.MappingProxyType(elem))
pos += 1
elif field_value[pos] == ';': # next forwarded-pair
need_separator = False
pos += 1
elif field_value[pos] in ' \t':
# Allow whitespace even between forwarded-pairs, though
# RFC 7239 doesn't. This simplifies code and is in line
# with Postel's law.
pos += 1
else:
# bad syntax here, skip to next comma
pos = field_value.find(',', pos)
return tuple(elems)

@reify
def _scheme(self):
Expand Down
1 change: 1 addition & 0 deletions changes/2170.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix parsing the Forwarded header according to RFC 7239.
88 changes: 85 additions & 3 deletions tests/test_web_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,61 @@ def test_single_forwarded_header_quoted_escaped():
assert req.forwarded[0]['proto'] == 'lala land~ 123!&'


def test_single_forwarded_header_custom_param():
header = r'BY=identifier;PROTO=https;SOME="other, \"value\""'
req = make_mocked_request('GET', '/',
headers=CIMultiDict({'Forwarded': header}))
assert len(req.forwarded) == 1
assert req.forwarded[0]['by'] == 'identifier'
assert req.forwarded[0]['proto'] == 'https'
assert req.forwarded[0]['some'] == 'other, "value"'


def test_single_forwarded_header_empty_params():
# This is allowed by the grammar given in RFC 7239
header = ';For=identifier;;PROTO=https;;;'
req = make_mocked_request('GET', '/',
headers=CIMultiDict({'Forwarded': header}))
assert req.forwarded[0]['for'] == 'identifier'
assert req.forwarded[0]['proto'] == 'https'


def test_single_forwarded_header_bad_separator():
header = 'BY=identifier PROTO=https'
req = make_mocked_request('GET', '/',
headers=CIMultiDict({'Forwarded': header}))
assert 'proto' not in req.forwarded[0]


def test_single_forwarded_header_injection1():
# We might receive a header like this if we're sitting behind a reverse
# proxy that blindly appends a forwarded-element without checking
# the syntax of existing field-values. We should be able to recover
# the appended element anyway.
header = 'for=_injected;by=", for=_real'
req = make_mocked_request('GET', '/',
headers=CIMultiDict({'Forwarded': header}))
assert len(req.forwarded) == 2
assert 'by' not in req.forwarded[0]
assert req.forwarded[1]['for'] == '_real'


def test_single_forwarded_header_injection2():
header = 'very bad syntax, for=_real'
req = make_mocked_request('GET', '/',
headers=CIMultiDict({'Forwarded': header}))
assert len(req.forwarded) == 2
assert 'for' not in req.forwarded[0]
assert req.forwarded[1]['for'] == '_real'


def test_single_forwarded_header_long_quoted_string():
header = 'for="' + '\\\\' * 5000 + '"'
req = make_mocked_request('GET', '/',
headers=CIMultiDict({'Forwarded': header}))
assert req.forwarded[0]['for'] == '\\' * 5000


def test_multiple_forwarded_headers():
headers = CIMultiDict()
headers.add('Forwarded', 'By=identifier1;for=identifier2, BY=identifier3')
Expand All @@ -303,10 +358,37 @@ def test_multiple_forwarded_headers():
assert req.forwarded[2]['for'] == 'identifier5'


def test_multiple_forwarded_headers_bad_syntax():
headers = CIMultiDict()
headers.add('Forwarded', 'for=_1;by=_2')
headers.add('Forwarded', 'invalid value')
headers.add('Forwarded', '')
headers.add('Forwarded', 'for=_3;by=_4')
req = make_mocked_request('GET', '/', headers=headers)
assert len(req.forwarded) == 4
assert req.forwarded[0]['for'] == '_1'
assert 'for' not in req.forwarded[1]
assert 'for' not in req.forwarded[2]
assert req.forwarded[3]['by'] == '_4'


def test_multiple_forwarded_headers_injection():
headers = CIMultiDict()
# This could be sent by an attacker, hoping to "shadow" the second header.
headers.add('Forwarded', 'for=_injected;by="')
# This is added by our trusted reverse proxy.
headers.add('Forwarded', 'for=_real;by=_actual_proxy')
req = make_mocked_request('GET', '/', headers=headers)
assert len(req.forwarded) == 2
assert 'by' not in req.forwarded[0]
assert req.forwarded[1]['for'] == '_real'
assert req.forwarded[1]['by'] == '_actual_proxy'


def test_https_scheme_by_forwarded_header():
header = 'by=_1;for=_2;host=_3;proto=https'
req = make_mocked_request('GET', '/',
headers=CIMultiDict(
{'Forwarded': 'by=;for=;host=;proto=https'}))
headers=CIMultiDict({'Forwarded': header}))
assert "https" == req.scheme
assert req.secure is True

Expand Down Expand Up @@ -338,7 +420,7 @@ def test_https_scheme_by_x_forwarded_proto_header_no_tls():
def test_host_by_forwarded_header():
headers = CIMultiDict()
headers.add('Forwarded', 'By=identifier1;for=identifier2, BY=identifier3')
headers.add('Forwarded', 'by=;for=;host=example.com')
headers.add('Forwarded', 'by=unknown;for=unknown;host=example.com')
req = make_mocked_request('GET', '/', headers=headers)
assert req.host == 'example.com'

Expand Down