diff --git a/debian/control b/debian/control
index 85ecdd13518..4be62895ff3 100644
--- a/debian/control
+++ b/debian/control
@@ -9,7 +9,7 @@ Homepage: http://scrapy.org/
 Package: scrapy-SUFFIX
 Architecture: all
 Depends: ${python:Depends}, python-lxml, python-twisted, python-openssl,
- python-w3lib (>= 1.2), python-queuelib, python-cssselect (>= 0.9), python-six (>=1.5.2)
+ python-w3lib (>= 1.8.0), python-queuelib, python-cssselect (>= 0.9), python-six (>=1.5.2)
 Recommends: python-setuptools
 Conflicts: python-scrapy, scrapy, scrapy-0.11
 Provides: python-scrapy, scrapy
diff --git a/requirements.txt b/requirements.txt
index 0df9a558ce9..005b8f4f5d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,6 @@ Twisted>=10.0.0
 lxml
 pyOpenSSL
 cssselect>=0.9
-w3lib>=1.2
+w3lib>=1.8.0
 queuelib
 six>=1.5.2
diff --git a/scrapy/contrib/downloadermiddleware/ajaxcrawl.py b/scrapy/contrib/downloadermiddleware/ajaxcrawl.py
index c2ab67ae7cf..fcbfdb1e7a9 100644
--- a/scrapy/contrib/downloadermiddleware/ajaxcrawl.py
+++ b/scrapy/contrib/downloadermiddleware/ajaxcrawl.py
@@ -84,6 +84,6 @@ def _has_ajaxcrawlable_meta(text):
 
     text = _script_re.sub(u'', text)
     text = _noscript_re.sub(u'', text)
-    text = html.remove_comments(html.remove_entities(text))
+    text = html.remove_comments(html.replace_entities(text))
     return _ajax_crawlable_re.search(text) is not None
diff --git a/scrapy/contrib/linkextractors/regex.py b/scrapy/contrib/linkextractors/regex.py
index e9d77e618fa..905eb89692a 100644
--- a/scrapy/contrib/linkextractors/regex.py
+++ b/scrapy/contrib/linkextractors/regex.py
@@ -1,7 +1,7 @@
 import re
 from six.moves.urllib.parse import urljoin
 
-from w3lib.html import remove_tags, remove_entities, replace_escape_chars
+from w3lib.html import remove_tags, replace_entities, replace_escape_chars
 
 from scrapy.link import Link
 from .sgml import SgmlLinkExtractor
@@ -21,7 +21,7 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
         if base_url is None:
             base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
 
-        clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
+        clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
         clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
 
         links_text = linkre.findall(response_text)
diff --git a/scrapy/utils/misc.py b/scrapy/utils/misc.py
index e72a5d04235..be394eb1d7c 100644
--- a/scrapy/utils/misc.py
+++ b/scrapy/utils/misc.py
@@ -5,7 +5,7 @@ from pkgutil import iter_modules
 
 import six
-from w3lib.html import remove_entities
+from w3lib.html import replace_entities
 
 from scrapy.utils.python import flatten
 from scrapy.item import BaseItem
 
@@ -94,9 +94,9 @@ def extract_regex(regex, text, encoding='utf-8'):
     strings = flatten(strings)
 
     if isinstance(text, unicode):
-        return [remove_entities(s, keep=['lt', 'amp']) for s in strings]
+        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
     else:
-        return [remove_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
+        return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
 
 
 def md5sum(file):
diff --git a/setup.py b/setup.py
index 6efe640740e..252068c2059 100644
--- a/setup.py
+++ b/setup.py
@@ -124,7 +124,7 @@ def is_not_module(filename):
 else:
     setup_args['install_requires'] = [
         'Twisted>=10.0.0',
-        'w3lib>=1.2',
+        'w3lib>=1.8.0',
         'queuelib',
         'lxml',
         'pyOpenSSL',
diff --git a/tox.ini b/tox.ini
index 20d54b6583b..38ef6c4cc21 100644
--- a/tox.ini
+++ b/tox.ini
@@ -46,7 +46,7 @@ deps =
     pyOpenSSL>=0.13.1
     cssselect>=0.9
     queuelib>=1.1.1
-    w3lib>=1.5
+    w3lib>=1.8.0
     Pillow
     # tests requirements
     mock
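
Note (not part of the patch): newer w3lib releases deprecate remove_entities() in favour of replace_entities(); the behaviour is the same, the new name just reflects that entities are replaced with their corresponding characters rather than stripped. A minimal sketch of the renamed helper, assuming w3lib>=1.8.0 is installed:

    from w3lib.html import replace_entities

    # Entities are decoded to their corresponding characters.
    replace_entities(u'Price: &pound;100 &amp; up')
    # -> u'Price: \xa3100 & up'

    # keep= preserves selected entities, as extract_regex() does in scrapy/utils/misc.py above.
    replace_entities(u'a &lt; b &amp; c', keep=['lt', 'amp'])
    # -> u'a &lt; b &amp; c'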