Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ def test_safe_url_string(self):

self.assertTrue(isinstance(safe_url_string(b'http://example.com/'), str))

def test_safe_url_string_unsafe_chars(self):
    """Check which "unwise" characters safe_url_string percent-encodes.

    ``{``, ``}``, ``\\``, ``^`` and the backtick are expected to be
    escaped, while ``|``, ``[``, ``]`` and ``,`` are expected to be
    left untouched.
    """
    raw_url = r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|"
    expected = r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|"
    self.assertEqual(safe_url_string(raw_url), expected)

def test_safe_url_string_with_query(self):
safeurl = safe_url_string(u"http://www.example.com/£?unit=µ")
self.assertTrue(isinstance(safeurl, str))
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ envlist = py27, pypy, py33, py34, py35, py36

[testenv]
deps =
pytest
pytest !=3.1.1, !=3.1.2
pytest-cov
commands =
py.test \
Expand Down
96 changes: 51 additions & 45 deletions w3lib/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
import re
import posixpath
import warnings
import six
import string
from collections import namedtuple
import six
from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
urldefrag, urlencode, urlparse,
quote, parse_qs, parse_qsl,
Expand All @@ -24,51 +25,14 @@ def _quote_byte(error):

# Register ``_quote_byte`` as the codec error handler named 'percentencode',
# so it can be selected by that name wherever an ``errors=`` handler is accepted.
codecs.register_error('percentencode', _quote_byte)

# Character classes taken from RFC 3986:
# section 2.2 (reserved = gen-delims + sub-delims) and section 2.3 (unreserved).
RFC3986_GEN_DELIMS = b':/?#[]@'
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
RFC3986_RESERVED = b''.join((RFC3986_GEN_DELIMS, RFC3986_SUB_DELIMS))
RFC3986_UNRESERVED = ''.join(
    (string.ascii_letters, string.digits, "-._~")
).encode('ascii')
# Not part of the RFC character classes; additionally treated as safe,
# see https://github.com/scrapy/w3lib/pull/25
EXTRA_SAFE_CHARS = b'|'

# Python 2.x exposed this character set as ``urllib.always_safe``; it became
# private in Python 3.x, so the same characters are rebuilt here.
_ALWAYS_SAFE_BYTES = (
    string.ascii_uppercase + string.ascii_lowercase + string.digits + '_.-'
).encode('ascii')


def urljoin_rfc(base, ref, encoding='utf-8'):
    r"""
    .. warning::

        This function is deprecated and will be removed in the future.
        It is not supported with Python 3.
        Please use ``urlparse.urljoin`` instead.

    Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the given
    encoding).

    Always returns a str.

    >>> import w3lib.url
    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
    'http://www.example.com/otherpath/index2.html'
    >>>

    >>> # Note: the following does not work in Python 3
    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm')  # doctest: +SKIP
    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
    >>>

    """
    # stacklevel=2 attributes the warning to the code calling urljoin_rfc,
    # which is where the deprecated usage has to be fixed.
    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
                  DeprecationWarning, stacklevel=2)

    # Both arguments are passed through to_bytes with the given encoding
    # before being joined.
    str_base = to_bytes(base, encoding)
    str_ref = to_bytes(ref, encoding)
    return urljoin(str_base, str_ref)

# Legacy character sets. They no longer feed into ``_safe_chars`` below and
# are kept only in case other code still refers to them.
_reserved = b';/?:@&=+$|,#'  # RFC 3986 (Generic Syntax)
_unreserved_marks = b"-_.!~*'()"  # RFC 3986 sec 2.3

# Bytes left unescaped when URL components are quoted (passed as the ``safe``
# argument of ``quote``). The earlier assignment built from
# ``_ALWAYS_SAFE_BYTES`` was overwritten immediately and has been dropped.
# b'%' is included presumably so that existing percent-escapes are not
# encoded a second time -- TODO confirm.
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'

def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
"""Convert the given URL into a legal URL by escaping unsafe characters
Expand Down Expand Up @@ -117,6 +81,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
quote(to_bytes(parts.fragment, encoding), _safe_chars),
))


# Matches an optional leading "/" followed by one or more "../" segments.
_parent_dirs = re.compile(r'/?(\.\./)+')

def safe_download_url(url):
Expand All @@ -137,9 +102,11 @@ def safe_download_url(url):
path = '/'
return urlunsplit((scheme, netloc, path, query, ''))


def is_url(text):
    """Return True if ``text`` starts with a ``file``, ``http`` or ``https`` scheme.

    The scheme must be followed by ``://``; a bare scheme name such as
    ``"http"`` is not considered a URL. The comparison is case-sensitive.
    """
    scheme, separator, _ = text.partition("://")
    # partition() returns the whole string as its first item when "://" is
    # missing, so the separator has to be checked explicitly.
    return bool(separator) and scheme in ('file', 'http', 'https')


def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
"""Return the value of a url parameter, given the url and parameter name

Expand Down Expand Up @@ -175,6 +142,7 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
)
return queryparams.get(parameter, [default])[0]


def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
"""Clean URL arguments leaving only those passed in the parameterlist keeping order

Expand Down Expand Up @@ -229,6 +197,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
url += '#' + fragment
return url


def add_or_replace_parameter(url, name, new_value):
"""Add or remove a parameter to a given url

Expand Down Expand Up @@ -270,13 +239,15 @@ def path_to_file_uri(path):
x = x.replace('|', ':') # http://bugs.python.org/issue5861
return 'file:///%s' % x.lstrip('/')


def file_uri_to_path(uri):
    """Return the local filesystem path that a File URI points to.

    Only the path component of ``uri`` is used; it is converted with
    ``url2pathname``. See http://en.wikipedia.org/wiki/File_URI_scheme
    """
    return url2pathname(urlparse(uri).path)


def any_to_uri(uri_or_path):
"""If given a path name, return its File URI, otherwise return it
unmodified
Expand Down Expand Up @@ -584,3 +555,38 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False):
value = _coerce_result(value)
r.append((name, value))
return r


def urljoin_rfc(base, ref, encoding='utf-8'):
    r"""
    .. warning::

        This function is deprecated and will be removed in the future.
        It is not supported with Python 3.
        Please use ``urlparse.urljoin`` instead.

    Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the given
    encoding).

    Always returns a str.

    >>> import w3lib.url
    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
    'http://www.example.com/otherpath/index2.html'
    >>>

    >>> # Note: the following does not work in Python 3
    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm')  # doctest: +SKIP
    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
    >>>

    """
    # stacklevel=2 attributes the warning to the code calling urljoin_rfc,
    # which is where the deprecated usage has to be fixed.
    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
                  DeprecationWarning, stacklevel=2)

    # Both arguments are passed through to_bytes with the given encoding
    # before being joined.
    str_base = to_bytes(base, encoding)
    str_ref = to_bytes(ref, encoding)
    return urljoin(str_base, str_ref)