From fa84730e702a1d94ddeee0acbb81ac7394348b98 Mon Sep 17 00:00:00 2001 From: tpeng Date: Wed, 12 Nov 2014 12:28:02 +0100 Subject: [PATCH 1/3] avoid download large response introduce DOWNLOAD_MAXSIZE and DOWNLOAD_WARNSIZE in settings and download_maxsize/download_warnsize in spider/request meta, so downloader stop downloading as soon as the received data exceed the limit. also check the twsisted response's length in advance to stop downloading as early as possible. --- docs/topics/settings.rst | 34 +++++++++++++ scrapy/core/downloader/handlers/http11.py | 45 +++++++++++++++-- scrapy/settings/default_settings.py | 3 ++ tests/test_downloader_handlers.py | 60 +++++++++++++++++++++++ 4 files changed, 137 insertions(+), 5 deletions(-) diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index 9000f024284..4022267438a 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -422,6 +422,40 @@ The amount of time (in secs) that the downloader will wait before timing out. spider attribute and per-request using :reqmeta:`download_timeout` Request.meta key. +.. setting:: DOWNLOAD_MAXSIZE + +DOWNLOAD_MAXSIZE +---------------- + +Default: `1073741824` (1024Mb) + +The maximum response size (in bytes) that downloader will download. + +If you want to disable it set to 0. + +.. note:: + + This size can be set per spider using :attr:`download_maxsize` + spider attribute and per-request using :reqmeta:`download_maxsize` + Request.meta key. + +.. setting:: DOWNLOAD_WARNSIZE + +DOWNLOAD_WARNSIZE +---------------- + +Default: `33554432` (32Mb) + +The response size (in bytes) that downloader will start to warn. + +If you want to disable it set to 0. + +.. note:: + + This size can be set per spider using :attr:`download_warnsize` + spider attribute and per-request using :reqmeta:`download_warnsize` + Request.meta key. + .. 
setting:: DUPEFILTER_CLASS DUPEFILTER_CLASS diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index 23cd07c5144..dd3ad488b9d 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -9,7 +9,7 @@ from zope.interface import implements from twisted.internet import defer, reactor, protocol from twisted.web.http_headers import Headers as TxHeaders -from twisted.web.iweb import IBodyProducer +from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH from twisted.internet.error import TimeoutError from twisted.web.http import PotentialDataLoss from scrapy.xlib.tx import Agent, ProxyAgent, ResponseDone, \ @@ -19,6 +19,7 @@ from scrapy.responsetypes import responsetypes from scrapy.core.downloader.webclient import _parse from scrapy.utils.misc import load_object +from scrapy import log class HTTP11DownloadHandler(object): @@ -29,10 +30,14 @@ def __init__(self, settings): self._pool._factory.noisy = False self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY']) self._contextFactory = self._contextFactoryClass() + self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE') + self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE') def download_request(self, request, spider): """Return a deferred for the HTTP download""" - agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool) + agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool, + maxsize=getattr(spider, 'download_maxsize', self._default_maxsize), + warnsize=getattr(spider, 'download_warnsize', self._default_warnsize)) return agent.download_request(request) def close(self): @@ -131,11 +136,14 @@ class ScrapyAgent(object): _ProxyAgent = ProxyAgent _TunnelingAgent = TunnelingAgent - def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None): + def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None, 
+ maxsize=0, warnsize=0): self._contextFactory = contextFactory self._connectTimeout = connectTimeout self._bindAddress = bindAddress self._pool = pool + self._maxsize = maxsize + self._warnsize = warnsize def _get_agent(self, request, timeout): bindaddress = request.meta.get('bindaddress') or self._bindAddress @@ -197,11 +205,25 @@ def _cb_bodyready(self, txresponse, request): if txresponse.length == 0: return txresponse, '', None + maxsize = request.meta.get('download_maxsize', self._maxsize) + warnsize = request.meta.get('download_warnsize', self._warnsize) + expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1 + + if maxsize and expected_size > maxsize: + log.msg("Expected response size (%s) larger than download max size (%s)." % (expected_size, maxsize), + logLevel=log.ERROR) + txresponse._transport._producer.loseConnection() + raise defer.CancelledError() + + if warnsize and expected_size > warnsize: + log.msg("Expected response size (%s) larger than download warn size (%s)."
% (expected_size, warnsize), + logLevel=log.WARNING) + def _cancel(_): txresponse._transport._producer.loseConnection() d = defer.Deferred(_cancel) - txresponse.deliverBody(_ResponseReader(d, txresponse, request)) + txresponse.deliverBody(_ResponseReader(d, txresponse, request, maxsize, warnsize)) return d def _cb_bodydone(self, result, request, url): @@ -232,14 +254,27 @@ def stopProducing(self): class _ResponseReader(protocol.Protocol): - def __init__(self, finished, txresponse, request): + def __init__(self, finished, txresponse, request, maxsize, warnsize): self._finished = finished self._txresponse = txresponse self._request = request self._bodybuf = BytesIO() + self._maxsize = maxsize + self._warnsize = warnsize + self._bytes_received = 0 def dataReceived(self, bodyBytes): self._bodybuf.write(bodyBytes) + self._bytes_received += len(bodyBytes) + + if self._maxsize and self._bytes_received > self._maxsize: + log.msg("Received (%s) bytes larger than download max size (%s)." % (self._bytes_received, self._maxsize), + logLevel=log.ERROR) + self._finished.cancel() + + if self._warnsize and self._bytes_received > self._warnsize: + log.msg("Received (%s) bytes larger than download warn size (%s)." 
% (self._bytes_received, self._warnsize), + logLevel=log.WARNING) def connectionLost(self, reason): if self._finished.called: diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index f01203c420f..1b7b3bf2973 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -66,6 +66,9 @@ DOWNLOAD_TIMEOUT = 180 # 3mins +DOWNLOAD_MAXSIZE = 1073741824 # 1024m +DOWNLOAD_WARNSIZE = 33554432 # 32m + DOWNLOADER = 'scrapy.core.downloader.Downloader' DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory' diff --git a/tests/test_downloader_handlers.py b/tests/test_downloader_handlers.py index c444d35fa0c..55bb7ccf73f 100644 --- a/tests/test_downloader_handlers.py +++ b/tests/test_downloader_handlers.py @@ -30,6 +30,8 @@ from scrapy.utils.test import get_crawler from scrapy.exceptions import NotConfigured +from tests.mockserver import MockServer +from tests.spiders import SingleRequestSpider class DummyDH(object): @@ -211,6 +213,64 @@ class Http11TestCase(HttpTestCase): if 'http11' not in optional_features: skip = 'HTTP1.1 not supported in twisted < 11.1.0' + def test_download_without_maxsize_limit(self): + request = Request(self.getURL('file')) + d = self.download_request(request, Spider('foo')) + d.addCallback(lambda r: r.body) + d.addCallback(self.assertEquals, "0123456789") + return d + + @defer.inlineCallbacks + def test_download_with_maxsize_per_req(self): + meta = {'download_maxsize': 2} + request = Request(self.getURL('file'), meta=meta) + d = self.download_request(request, Spider('foo')) + yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted) + + @defer.inlineCallbacks + def test_download_with_small_maxsize_per_spider(self): + request = Request(self.getURL('file')) + d = self.download_request(request, Spider('foo', download_maxsize=2)) + yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted) + + def 
test_download_with_large_maxsize_per_spider(self): + request = Request(self.getURL('file')) + d = self.download_request(request, Spider('foo', download_maxsize=100)) + d.addCallback(lambda r: r.body) + d.addCallback(self.assertEquals, "0123456789") + return d + + +class Http11MockServerTestCase(unittest.TestCase): + """HTTP 1.1 test case with MockServer""" + if 'http11' not in optional_features: + skip = 'HTTP1.1 not supported in twisted < 11.1.0' + + def setUp(self): + self.mockserver = MockServer() + self.mockserver.__enter__() + + def tearDown(self): + self.mockserver.__exit__(None, None, None) + + @defer.inlineCallbacks + def test_download_with_content_length(self): + crawler = get_crawler(SingleRequestSpider) + # http://localhost:8998/partial set Content-Length to 1024, use download_maxsize= 1000 to avoid + # download it + yield crawler.crawl(seed=Request(url='http://localhost:8998/partial', meta={'download_maxsize': 1000})) + failure = crawler.spider.meta['failure'] + self.assertIsInstance(failure.value, defer.CancelledError) + + @defer.inlineCallbacks + def test_download(self): + crawler = get_crawler(SingleRequestSpider) + yield crawler.crawl(seed=Request(url='http://localhost:8998')) + failure = crawler.spider.meta.get('failure') + self.assertTrue(failure == None) + reason = crawler.spider.meta['close_reason'] + self.assertTrue(reason, 'finished') + class UriResource(resource.Resource): """Return the full uri that was requested""" From a69f042d1064c4beea46fa70084835dd6c91c143 Mon Sep 17 00:00:00 2001 From: tpeng Date: Wed, 19 Nov 2014 11:50:07 +0100 Subject: [PATCH 2/3] add 2 more test cases and minor doc fixes --- docs/topics/settings.rst | 10 ++++++--- scrapy/settings/default_settings.py | 4 ++-- tests/mockserver.py | 7 ++++-- tests/test_downloader_handlers.py | 34 +++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index 4022267438a..0e9e53de4e7 100644 --- 
a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -427,7 +427,7 @@ The amount of time (in secs) that the downloader will wait before timing out. DOWNLOAD_MAXSIZE ---------------- -Default: `1073741824` (1024Mb) +Default: `1073741824` (1024MB) The maximum response size (in bytes) that downloader will download. @@ -439,12 +439,14 @@ If you want to disable it set to 0. spider attribute and per-request using :reqmeta:`download_maxsize` Request.meta key. + This feature needs Twisted >= 11.1. + .. setting:: DOWNLOAD_WARNSIZE DOWNLOAD_WARNSIZE ----------------- +----------------- -Default: `33554432` (32Mb) +Default: `33554432` (32MB) The response size (in bytes) that downloader will start to warn. @@ -456,6 +458,8 @@ If you want to disable it set to 0. spider attribute and per-request using :reqmeta:`download_warnsize` Request.meta key. + This feature needs Twisted >= 11.1. + .. setting:: DUPEFILTER_CLASS DUPEFILTER_CLASS diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 1b7b3bf2973..cf216385a6e 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -66,8 +66,8 @@ DOWNLOAD_TIMEOUT = 180 # 3mins -DOWNLOAD_MAXSIZE = 1073741824 # 1024m -DOWNLOAD_WARNSIZE = 33554432 # 32m +DOWNLOAD_MAXSIZE = 1024*1024*1024 # 1024m +DOWNLOAD_WARNSIZE = 32*1024*1024 # 32m DOWNLOADER = 'scrapy.core.downloader.Downloader' diff --git a/tests/mockserver.py b/tests/mockserver.py index 6910532b633..2c0ad66fba5 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -1,9 +1,10 @@ from __future__ import print_function import sys, time, random, urllib, os, json from subprocess import Popen, PIPE -from twisted.web.server import Site, NOT_DONE_YET -from twisted.web.resource import Resource +from twisted.web.server import Site, NOT_DONE_YET, GzipEncoderFactory +from twisted.web.resource import Resource, EncodingResourceWrapper from twisted.internet import reactor, defer, ssl +from 
twisted.web.test.test_webclient import PayloadResource from scrapy import twisted_version @@ -167,6 +168,8 @@ def __init__(self): self.putChild("drop", Drop()) self.putChild("raw", Raw()) self.putChild("echo", Echo()) + self.putChild('payload', PayloadResource()) + self.putChild("xpayload", EncodingResourceWrapper(PayloadResource(), [GzipEncoderFactory()])) def getChild(self, name, request): return self diff --git a/tests/test_downloader_handlers.py b/tests/test_downloader_handlers.py index 55bb7ccf73f..127c4a4bda2 100644 --- a/tests/test_downloader_handlers.py +++ b/tests/test_downloader_handlers.py @@ -220,6 +220,20 @@ def test_download_without_maxsize_limit(self): d.addCallback(self.assertEquals, "0123456789") return d + @defer.inlineCallbacks + def test_download_with_maxsize(self): + request = Request(self.getURL('file')) + + # 10 is minimal size for this request and the limit is only counted on + # response body. (regardless of headers) + d = self.download_request(request, Spider('foo', download_maxsize=10)) + d.addCallback(lambda r: r.body) + d.addCallback(self.assertEquals, "0123456789") + yield d + + d = self.download_request(request, Spider('foo', download_maxsize=9)) + yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted) + @defer.inlineCallbacks def test_download_with_maxsize_per_req(self): meta = {'download_maxsize': 2} @@ -271,6 +285,26 @@ def test_download(self): reason = crawler.spider.meta['close_reason'] self.assertTrue(reason, 'finished') + @defer.inlineCallbacks + def test_download_gzip_response(self): + crawler = get_crawler(SingleRequestSpider) + body = '1'*100 # PayloadResource requires body length to be 100 + request = Request('http://localhost:8998/payload', method='POST', body=body, meta={'download_maxsize': 50}) + yield crawler.crawl(seed=request) + failure = crawler.spider.meta['failure'] + # download_maxsize < 100, hence the CancelledError + self.assertIsInstance(failure.value, defer.CancelledError) + + 
request.headers.setdefault('Accept-Encoding', 'gzip,deflate') + request = request.replace(url='http://localhost:8998/xpayload') + yield crawler.crawl(seed=request) + + # download_maxsize = 50 is enough for the gzipped response + failure = crawler.spider.meta.get('failure') + self.assertTrue(failure == None) + reason = crawler.spider.meta['close_reason'] + self.assertTrue(reason, 'finished') + class UriResource(resource.Resource): """Return the full uri that was requested""" From cd193827546d2e20029c28961622ae7def7d541d Mon Sep 17 00:00:00 2001 From: tpeng Date: Tue, 25 Nov 2014 14:09:51 +0100 Subject: [PATCH 3/3] attemp to fix travis fails --- tests/mockserver.py | 15 ++++++++---- tests/test_downloader_handlers.py | 38 ++++++++++++++++++------------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index 2c0ad66fba5..b73208c5cad 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -1,10 +1,10 @@ from __future__ import print_function import sys, time, random, urllib, os, json +import six from subprocess import Popen, PIPE -from twisted.web.server import Site, NOT_DONE_YET, GzipEncoderFactory -from twisted.web.resource import Resource, EncodingResourceWrapper +from twisted.web.server import Site, NOT_DONE_YET +from twisted.web.resource import Resource from twisted.internet import reactor, defer, ssl -from twisted.web.test.test_webclient import PayloadResource from scrapy import twisted_version @@ -168,8 +168,13 @@ def __init__(self): self.putChild("drop", Drop()) self.putChild("raw", Raw()) self.putChild("echo", Echo()) - self.putChild('payload', PayloadResource()) - self.putChild("xpayload", EncodingResourceWrapper(PayloadResource(), [GzipEncoderFactory()])) + + if six.PY2 and twisted_version > (12, 3, 0): + from twisted.web.test.test_webclient import PayloadResource + from twisted.web.server import GzipEncoderFactory + from twisted.web.resource import EncodingResourceWrapper + 
self.putChild('payload', PayloadResource()) + self.putChild("xpayload", EncodingResourceWrapper(PayloadResource(), [GzipEncoderFactory()])) def getChild(self, name, request): return self diff --git a/tests/test_downloader_handlers.py b/tests/test_downloader_handlers.py index 127c4a4bda2..9021af3b403 100644 --- a/tests/test_downloader_handlers.py +++ b/tests/test_downloader_handlers.py @@ -1,5 +1,6 @@ import os import twisted +import six from twisted.trial import unittest from twisted.protocols.policies import WrappingFactory @@ -287,23 +288,28 @@ def test_download(self): @defer.inlineCallbacks def test_download_gzip_response(self): - crawler = get_crawler(SingleRequestSpider) - body = '1'*100 # PayloadResource requires body length to be 100 - request = Request('http://localhost:8998/payload', method='POST', body=body, meta={'download_maxsize': 50}) - yield crawler.crawl(seed=request) - failure = crawler.spider.meta['failure'] - # download_maxsize < 100, hence the CancelledError - self.assertIsInstance(failure.value, defer.CancelledError) - - request.headers.setdefault('Accept-Encoding', 'gzip,deflate') - request = request.replace(url='http://localhost:8998/xpayload') - yield crawler.crawl(seed=request) - # download_maxsize = 50 is enough for the gzipped response - failure = crawler.spider.meta.get('failure') - self.assertTrue(failure == None) - reason = crawler.spider.meta['close_reason'] - self.assertTrue(reason, 'finished') + if six.PY2 and twisted_version > (12, 3, 0): + + crawler = get_crawler(SingleRequestSpider) + body = '1'*100 # PayloadResource requires body length to be 100 + request = Request('http://localhost:8998/payload', method='POST', body=body, meta={'download_maxsize': 50}) + yield crawler.crawl(seed=request) + failure = crawler.spider.meta['failure'] + # download_maxsize < 100, hence the CancelledError + self.assertIsInstance(failure.value, defer.CancelledError) + + request.headers.setdefault('Accept-Encoding', 'gzip,deflate') + request = 
request.replace(url='http://localhost:8998/xpayload') + yield crawler.crawl(seed=request) + + # download_maxsize = 50 is enough for the gzipped response + failure = crawler.spider.meta.get('failure') + self.assertTrue(failure == None) + reason = crawler.spider.meta['close_reason'] + self.assertTrue(reason, 'finished') + else: + raise unittest.SkipTest("xpayload and payload endpoint only enabled for twisted > 12.3.0 and python 2.x") class UriResource(resource.Resource):