diff --git a/CHANGES/4972.bugfix b/CHANGES/4972.bugfix new file mode 100644 index 00000000000..6654f8a645d --- /dev/null +++ b/CHANGES/4972.bugfix @@ -0,0 +1 @@ +Fix inconsistency between Python and C http request parsers in parsing pct-encoded URL. diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 93d4ab5b1ed..d712817b89c 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -250,6 +250,7 @@ Sergey Ninua Sergey Skripnick Serhii Charykov Serhii Kostel +Serhiy Storchaka Simon Kennedy Sin-Woo Bang Stanislas Plum diff --git a/aiohttp/_http_parser.pyx b/aiohttp/_http_parser.pyx index eb2157f6bb7..04360b89009 100644 --- a/aiohttp/_http_parser.pyx +++ b/aiohttp/_http_parser.pyx @@ -868,7 +868,7 @@ cdef _parse_url(char* buf_data, size_t length): return URL_build(scheme=schema, user=user, password=password, host=host, port=port, - path=path, query=query, fragment=fragment) + path=path, query_string=query, fragment=fragment, encoded=True) else: raise InvalidURLError("invalid url {!r}".format(buf_data)) finally: diff --git a/aiohttp/web_urldispatcher.py b/aiohttp/web_urldispatcher.py index 8494e7aa52f..08b989e8022 100644 --- a/aiohttp/web_urldispatcher.py +++ b/aiohttp/web_urldispatcher.py @@ -31,6 +31,7 @@ from typing_extensions import TypedDict from yarl import URL +from yarl import __version__ as yarl_version # type: ignore from . import hdrs from .abc import AbstractMatchInfo, AbstractRouter, AbstractView @@ -61,6 +62,8 @@ else: BaseDict = dict +YARL_VERSION = tuple(map(int, yarl_version.split('.')[:2])) + HTTP_METHOD_RE = re.compile(r"^[0-9A-Za-z!#\$%&'\*\+\-\.\^_`\|~]+$") ROUTE_RE = re.compile(r'(\{[_a-zA-Z][^{}]*(?:\{[^{}]*\}[^{}]*)*\})') PATH_SEP = re.escape('/') @@ -421,9 +424,9 @@ def __init__(self, path: str, *, name: Optional[str]=None) -> None: if '{' in part or '}' in part: raise ValueError("Invalid path '{}'['{}']".format(path, part)) - path = URL.build(path=part).raw_path - formatter += path - pattern += re.escape(path) + part = _requote_path(part) + formatter += part + pattern += re.escape(part) try: compiled = re.compile(pattern) @@ -451,7 +454,7 @@ def _match(self, path: str) -> Optional[Dict[str, str]]: if match is None: return None else: - return {key: URL.build(path=value, encoded=True).path + return {key: _unquote_path(value) for key, value in match.groupdict().items()} def raw_match(self, path: str) -> bool: @@ -462,9 +465,9 @@ def get_info(self) -> _InfoDict: 'pattern': self._pattern} def url_for(self, **parts: str) -> URL: - url = self._formatter.format_map({k: URL.build(path=v).raw_path + url = self._formatter.format_map({k: _quote_path(v) for k, v in parts.items()}) - return URL.build(path=url) + return URL.build(path=url, encoded=True) def __repr__(self) -> str: name = "'" + self.name + "' " if self.name is not None else "" @@ -478,7 +481,7 @@ def __init__(self, prefix: str, *, name: Optional[str]=None) -> None: assert not prefix or prefix.startswith('/'), prefix assert prefix in ('', '/') or not prefix.endswith('/'), prefix super().__init__(name=name) - self._prefix = URL.build(path=prefix).raw_path + self._prefix = _requote_path(prefix) @property def canonical(self) -> str: @@ -535,17 +538,17 @@ def url_for(self, *, filename: Union[str, Path], # type: ignore append_version = self._append_version if isinstance(filename, Path): filename = str(filename) - while filename.startswith('/'): - filename = filename[1:] - filename = '/' + filename + filename = filename.lstrip('/') + url = URL.build(path=self._prefix, encoded=True) # filename is not encoded - url = URL.build(path=self._prefix + filename) + if YARL_VERSION < (1, 6): + url = url / filename.replace('%', '%25') + else: + url = url / filename if append_version: try: - if filename.startswith('/'): - filename = filename[1:] filepath = self._directory.joinpath(filename).resolve() if not self._follow_symlinks: filepath.relative_to(self._directory) @@ -592,8 +595,7 @@ async def resolve(self, request: Request) -> _Resolve: if method not in allowed_methods: return None, allowed_methods - match_dict = {'filename': URL.build(path=path[len(self._prefix)+1:], - encoded=True).path} + match_dict = {'filename': _unquote_path(path[len(self._prefix)+1:])} return (UrlMappingMatchInfo(match_dict, self._routes[method]), allowed_methods) @@ -1032,8 +1034,7 @@ def add_resource(self, path: str, *, if resource.name == name and resource.raw_match(path): return cast(Resource, resource) if not ('{' in path or '}' in path or ROUTE_RE.search(path)): - url = URL.build(path=path) - resource = PlainResource(url.raw_path, name=name) + resource = PlainResource(_requote_path(path), name=name) self.register_resource(resource) return resource resource = DynamicResource(path, name=name) @@ -1152,3 +1153,22 @@ def add_routes(self, for route_def in routes: registered_routes.extend(route_def.register(self)) return registered_routes + + +def _quote_path(value: str) -> str: + if YARL_VERSION < (1, 6): + value = value.replace('%', '%25') + return URL.build(path=value, encoded=False).raw_path + + +def _unquote_path(value: str) -> str: + return URL.build(path=value, encoded=True).path + + +def _requote_path(value: str) -> str: + # Quote non-ascii characters and other characters which must be quoted, + # but preserve existing %-sequences. + result = _quote_path(value) + if '%' in value: + result = result.replace('%25', '%') + return result diff --git a/requirements/ci-wheel.txt b/requirements/ci-wheel.txt index f33579936ea..9178daea2eb 100644 --- a/requirements/ci-wheel.txt +++ b/requirements/ci-wheel.txt @@ -11,7 +11,7 @@ pytest==6.1.1 pytest-cov==2.10.1 pytest-mock==3.3.1 typing_extensions==3.7.4.3 -yarl==1.4.2 +yarl==1.6.1 # Using PEP 508 env markers to control dependency on runtimes: diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index a282d52af43..ac4d5f03c76 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -2,6 +2,7 @@ import asyncio from unittest import mock +from urllib.parse import quote import pytest from multidict import CIMultiDict @@ -787,6 +788,55 @@ def test_url_parse_non_strict_mode(parser) -> None: assert payload.is_eof() +@pytest.mark.parametrize( + ('uri', 'path', 'query', 'fragment'), + [ + ('/path%23frag', '/path#frag', {}, ''), + ('/path%2523frag', '/path%23frag', {}, ''), + ('/path?key=value%23frag', '/path', {'key': 'value#frag'}, ''), + ('/path?key=value%2523frag', '/path', {'key': 'value%23frag'}, ''), + ('/path#frag%20', '/path', {}, 'frag '), + ('/path#frag%2520', '/path', {}, 'frag%20'), + ] +) +def test_parse_uri_percent_encoded(parser, uri, path, query, fragment) -> None: + text = ('GET %s HTTP/1.1\r\n\r\n' % (uri,)).encode() + messages, upgrade, tail = parser.feed_data(text) + msg = messages[0][0] + + assert msg.path == uri + assert msg.url == URL(uri) + assert msg.url.path == path + assert msg.url.query == query + assert msg.url.fragment == fragment + + +def test_parse_uri_utf8(parser) -> None: + text = ('GET /путь?ключ=знач#фраг HTTP/1.1\r\n\r\n').encode() + messages, upgrade, tail = parser.feed_data(text) + msg = messages[0][0] + + assert msg.path == '/путь?ключ=знач#фраг' + assert msg.url.path == '/путь' + assert msg.url.query == {'ключ': 'знач'} + assert msg.url.fragment == 'фраг' + + +def test_parse_uri_utf8_percent_encoded(parser) -> None: + text = ( + 'GET %s HTTP/1.1\r\n\r\n' % + quote('/путь?ключ=знач#фраг', safe='/?=#') + ).encode() + messages, upgrade, tail = parser.feed_data(text) + msg = messages[0][0] + + assert msg.path == quote('/путь?ключ=знач#фраг', safe='/?=#') + assert msg.url == URL('/путь?ключ=знач#фраг') + assert msg.url.path == '/путь' + assert msg.url.query == {'ключ': 'знач'} + assert msg.url.fragment == 'фраг' + + @pytest.mark.skipif('HttpRequestParserC' not in dir(aiohttp.http_parser), reason="C based HTTP parser not available") def test_parse_bad_method_for_c_parser_raises(loop, protocol): diff --git a/tests/test_urldispatch.py b/tests/test_urldispatch.py index a05f1d9c7d8..5aa25c76d83 100644 --- a/tests/test_urldispatch.py +++ b/tests/test_urldispatch.py @@ -467,6 +467,20 @@ def test_add_static_append_version_not_follow_symlink(router, assert '/st/append_version_symlink/data.unknown_mime_type' == str(url) +def test_add_static_quoting(router) -> None: + resource = router.add_static('/пре %2Fфикс', + pathlib.Path(aiohttp.__file__).parent, + name='static') + assert router['static'] is resource + url = resource.url_for(filename='/1 2/файл%2F.txt') + assert url.path == '/пре /фикс/1 2/файл%2F.txt' + assert str(url) == ( + '/%D0%BF%D1%80%D0%B5%20%2F%D1%84%D0%B8%D0%BA%D1%81' + '/1%202/%D1%84%D0%B0%D0%B9%D0%BB%252F.txt' + ) + assert len(resource) == 2 + + def test_plain_not_match(router) -> None: handler = make_handler() router.add_route('GET', '/get/path', handler, name='name') @@ -629,10 +643,14 @@ def test_route_dynamic_with_regex(router) -> None: def test_route_dynamic_quoting(router) -> None: handler = make_handler() - route = router.add_route('GET', r'/{arg}', handler) - - url = route.url_for(arg='1 2/текст') - assert '/1%202/%D1%82%D0%B5%D0%BA%D1%81%D1%82' == str(url) + route = router.add_route('GET', r'/пре %2Fфикс/{arg}', handler) + + url = route.url_for(arg='1 2/текст%2F') + assert url.path == '/пре /фикс/1 2/текст%2F' + assert str(url) == ( + '/%D0%BF%D1%80%D0%B5%20%2F%D1%84%D0%B8%D0%BA%D1%81' + '/1%202/%D1%82%D0%B5%D0%BA%D1%81%D1%82%252F' + ) async def test_regular_match_info(router) -> None: diff --git a/vendor/http-parser b/vendor/http-parser index 2343fd6b521..77310eeb839 160000 --- a/vendor/http-parser +++ b/vendor/http-parser @@ -1 +1 @@ -Subproject commit 2343fd6b5214b2ded2cdcf76de2bf60903bb90cd +Subproject commit 77310eeb839c4251c07184a5db8885a572a08352