Fix inconsistency between Python and C http request parsers. (#4973)

They both now correctly parse URLs containing pct-encoded sequences and work with yarl 1.6.0. Co-authored-by: Sviatoslav Sydorenko <wk.cvs.github@sydorenko.org.ua> Co-authored-by: Andrew Svetlov <andrew.svetlov@gmail.com>
aio-libs · Oct 21, 2020 · d321923 · d321923
1 parent 967b1b9
commit d321923
Show file tree

Hide file tree

Showing 8 changed files with 114 additions and 24 deletions.
diff --git a/CHANGES/4972.bugfix b/CHANGES/4972.bugfix
@@ -0,0 +1 @@
+Fix inconsistency between Python and C http request parsers in parsing pct-encoded URL.
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
@@ -250,6 +250,7 @@ Sergey Ninua
 Sergey Skripnick
 Serhii Charykov
 Serhii Kostel
+Serhiy Storchaka
 Simon Kennedy
 Sin-Woo Bang
 Stanislas Plum

diff --git a/aiohttp/_http_parser.pyx b/aiohttp/_http_parser.pyx
@@ -868,7 +868,7 @@ cdef _parse_url(char* buf_data, size_t length):
 
             return URL_build(scheme=schema,
                              user=user, password=password, host=host, port=port,
-                             path=path, query=query, fragment=fragment)
+                             path=path, query_string=query, fragment=fragment, encoded=True)
         else:
             raise InvalidURLError("invalid url {!r}".format(buf_data))
     finally:

diff --git a/aiohttp/web_urldispatcher.py b/aiohttp/web_urldispatcher.py
@@ -31,6 +31,7 @@
 
 from typing_extensions import TypedDict
 from yarl import URL
+from yarl import __version__ as yarl_version  # type: ignore
 
 from . import hdrs
 from .abc import AbstractMatchInfo, AbstractRouter, AbstractView
@@ -61,6 +62,8 @@
 else:
     BaseDict = dict
 
+YARL_VERSION = tuple(map(int, yarl_version.split('.')[:2]))
+
 HTTP_METHOD_RE = re.compile(r"^[0-9A-Za-z!#\$%&'\*\+\-\.\^_`\|~]+$")
 ROUTE_RE = re.compile(r'(\{[_a-zA-Z][^{}]*(?:\{[^{}]*\}[^{}]*)*\})')
 PATH_SEP = re.escape('/')
@@ -421,9 +424,9 @@ def __init__(self, path: str, *, name: Optional[str]=None) -> None:
             if '{' in part or '}' in part:
                 raise ValueError("Invalid path '{}'['{}']".format(path, part))
 
-            path = URL.build(path=part).raw_path
-            formatter += path
-            pattern += re.escape(path)
+            part = _requote_path(part)
+            formatter += part
+            pattern += re.escape(part)
 
         try:
             compiled = re.compile(pattern)
@@ -451,7 +454,7 @@ def _match(self, path: str) -> Optional[Dict[str, str]]:
         if match is None:
             return None
         else:
-            return {key: URL.build(path=value, encoded=True).path
+            return {key: _unquote_path(value)
                     for key, value in match.groupdict().items()}
 
     def raw_match(self, path: str) -> bool:
@@ -462,9 +465,9 @@ def get_info(self) -> _InfoDict:
                 'pattern': self._pattern}
 
     def url_for(self, **parts: str) -> URL:
-        url = self._formatter.format_map({k: URL.build(path=v).raw_path
+        url = self._formatter.format_map({k: _quote_path(v)
                                           for k, v in parts.items()})
-        return URL.build(path=url)
+        return URL.build(path=url, encoded=True)
 
     def __repr__(self) -> str:
         name = "'" + self.name + "' " if self.name is not None else ""
@@ -478,7 +481,7 @@ def __init__(self, prefix: str, *, name: Optional[str]=None) -> None:
         assert not prefix or prefix.startswith('/'), prefix
         assert prefix in ('', '/') or not prefix.endswith('/'), prefix
         super().__init__(name=name)
-        self._prefix = URL.build(path=prefix).raw_path
+        self._prefix = _requote_path(prefix)
 
     @property
     def canonical(self) -> str:
@@ -535,17 +538,17 @@ def url_for(self, *, filename: Union[str, Path],  # type: ignore
             append_version = self._append_version
         if isinstance(filename, Path):
             filename = str(filename)
-        while filename.startswith('/'):
-            filename = filename[1:]
-        filename = '/' + filename
+        filename = filename.lstrip('/')
 
+        url = URL.build(path=self._prefix, encoded=True)
         # filename is not encoded
-        url = URL.build(path=self._prefix + filename)
+        if YARL_VERSION < (1, 6):
+            url = url / filename.replace('%', '%25')
+        else:
+            url = url / filename
 
         if append_version:
             try:
-                if filename.startswith('/'):
-                    filename = filename[1:]
                 filepath = self._directory.joinpath(filename).resolve()
                 if not self._follow_symlinks:
                     filepath.relative_to(self._directory)
@@ -592,8 +595,7 @@ async def resolve(self, request: Request) -> _Resolve:
         if method not in allowed_methods:
             return None, allowed_methods
 
-        match_dict = {'filename': URL.build(path=path[len(self._prefix)+1:],
-                                            encoded=True).path}
+        match_dict = {'filename': _unquote_path(path[len(self._prefix)+1:])}
         return (UrlMappingMatchInfo(match_dict, self._routes[method]),
                 allowed_methods)
 
@@ -1032,8 +1034,7 @@ def add_resource(self, path: str, *,
             if resource.name == name and resource.raw_match(path):
                 return cast(Resource, resource)
         if not ('{' in path or '}' in path or ROUTE_RE.search(path)):
-            url = URL.build(path=path)
-            resource = PlainResource(url.raw_path, name=name)
+            resource = PlainResource(_requote_path(path), name=name)
             self.register_resource(resource)
             return resource
         resource = DynamicResource(path, name=name)
@@ -1152,3 +1153,22 @@ def add_routes(self,
         for route_def in routes:
             registered_routes.extend(route_def.register(self))
         return registered_routes
+
+
+def _quote_path(value: str) -> str:
+    if YARL_VERSION < (1, 6):
+        value = value.replace('%', '%25')
+    return URL.build(path=value, encoded=False).raw_path
+
+
+def _unquote_path(value: str) -> str:
+    return URL.build(path=value, encoded=True).path
+
+
+def _requote_path(value: str) -> str:
+    # Quote non-ascii characters and other characters which must be quoted,
+    # but preserve existing %-sequences.
+    result = _quote_path(value)
+    if '%' in value:
+        result = result.replace('%25', '%')
+    return result
diff --git a/requirements/ci-wheel.txt b/requirements/ci-wheel.txt
@@ -11,7 +11,7 @@ pytest==6.1.1
 pytest-cov==2.10.1
 pytest-mock==3.3.1
 typing_extensions==3.7.4.3
-yarl==1.4.2
+yarl==1.6.1
 
 # Using PEP 508 env markers to control dependency on runtimes:
 

diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py
@@ -2,6 +2,7 @@
 
 import asyncio
 from unittest import mock
+from urllib.parse import quote
 
 import pytest
 from multidict import CIMultiDict
@@ -787,6 +788,55 @@ def test_url_parse_non_strict_mode(parser) -> None:
     assert payload.is_eof()
 
 
+@pytest.mark.parametrize(
+    ('uri', 'path', 'query', 'fragment'),
+    [
+        ('/path%23frag', '/path#frag', {}, ''),
+        ('/path%2523frag', '/path%23frag', {}, ''),
+        ('/path?key=value%23frag', '/path', {'key': 'value#frag'}, ''),
+        ('/path?key=value%2523frag', '/path', {'key': 'value%23frag'}, ''),
+        ('/path#frag%20', '/path', {}, 'frag '),
+        ('/path#frag%2520', '/path', {}, 'frag%20'),
+    ]
+)
+def test_parse_uri_percent_encoded(parser, uri, path, query, fragment) -> None:
+    text = ('GET %s HTTP/1.1\r\n\r\n' % (uri,)).encode()
+    messages, upgrade, tail = parser.feed_data(text)
+    msg = messages[0][0]
+
+    assert msg.path == uri
+    assert msg.url == URL(uri)
+    assert msg.url.path == path
+    assert msg.url.query == query
+    assert msg.url.fragment == fragment
+
+
+def test_parse_uri_utf8(parser) -> None:
+    text = ('GET /путь?ключ=знач#фраг HTTP/1.1\r\n\r\n').encode()
+    messages, upgrade, tail = parser.feed_data(text)
+    msg = messages[0][0]
+
+    assert msg.path == '/путь?ключ=знач#фраг'
+    assert msg.url.path == '/путь'
+    assert msg.url.query == {'ключ': 'знач'}
+    assert msg.url.fragment == 'фраг'
+
+
+def test_parse_uri_utf8_percent_encoded(parser) -> None:
+    text = (
+        'GET %s HTTP/1.1\r\n\r\n' %
+        quote('/путь?ключ=знач#фраг', safe='/?=#')
+    ).encode()
+    messages, upgrade, tail = parser.feed_data(text)
+    msg = messages[0][0]
+
+    assert msg.path == quote('/путь?ключ=знач#фраг', safe='/?=#')
+    assert msg.url == URL('/путь?ключ=знач#фраг')
+    assert msg.url.path == '/путь'
+    assert msg.url.query == {'ключ': 'знач'}
+    assert msg.url.fragment == 'фраг'
+
+
 @pytest.mark.skipif('HttpRequestParserC' not in dir(aiohttp.http_parser),
                     reason="C based HTTP parser not available")
 def test_parse_bad_method_for_c_parser_raises(loop, protocol):

diff --git a/tests/test_urldispatch.py b/tests/test_urldispatch.py
@@ -467,6 +467,20 @@ def test_add_static_append_version_not_follow_symlink(router,
     assert '/st/append_version_symlink/data.unknown_mime_type' == str(url)
 
 
+def test_add_static_quoting(router) -> None:
+    resource = router.add_static('/пре %2Fфикс',
+                                 pathlib.Path(aiohttp.__file__).parent,
+                                 name='static')
+    assert router['static'] is resource
+    url = resource.url_for(filename='/1 2/файл%2F.txt')
+    assert url.path == '/пре /фикс/1 2/файл%2F.txt'
+    assert str(url) == (
+        '/%D0%BF%D1%80%D0%B5%20%2F%D1%84%D0%B8%D0%BA%D1%81'
+        '/1%202/%D1%84%D0%B0%D0%B9%D0%BB%252F.txt'
+    )
+    assert len(resource) == 2
+
+
 def test_plain_not_match(router) -> None:
     handler = make_handler()
     router.add_route('GET', '/get/path', handler, name='name')
@@ -629,10 +643,14 @@ def test_route_dynamic_with_regex(router) -> None:
 
 def test_route_dynamic_quoting(router) -> None:
     handler = make_handler()
-    route = router.add_route('GET', r'/{arg}', handler)
-
-    url = route.url_for(arg='1 2/текст')
-    assert '/1%202/%D1%82%D0%B5%D0%BA%D1%81%D1%82' == str(url)
+    route = router.add_route('GET', r'/пре %2Fфикс/{arg}', handler)
+
+    url = route.url_for(arg='1 2/текст%2F')
+    assert url.path == '/пре /фикс/1 2/текст%2F'
+    assert str(url) == (
+        '/%D0%BF%D1%80%D0%B5%20%2F%D1%84%D0%B8%D0%BA%D1%81'
+        '/1%202/%D1%82%D0%B5%D0%BA%D1%81%D1%82%252F'
+    )
 
 
 async def test_regular_match_info(router) -> None:

diff --git a/vendor/http-parser b/vendor/http-parser
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Fix inconsistency between Python and C http request parsers in parsing pct-encoded URL.
+6 −6		Makefile
+44 −150		http_parser.c
+5 −12		http_parser.h
+4 −132		test.c