diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst
index 9000f024284..0e9e53de4e7 100644
--- a/docs/topics/settings.rst
+++ b/docs/topics/settings.rst
@@ -422,6 +422,44 @@ The amount of time (in secs) that the downloader will wait before timing out.
     spider attribute and per-request using :reqmeta:`download_timeout`
     Request.meta key.
 
+.. setting:: DOWNLOAD_MAXSIZE
+
+DOWNLOAD_MAXSIZE
+----------------
+
+Default: ``1073741824`` (1024MB)
+
+The maximum response size (in bytes) that the downloader will download.
+
+If you want to disable it, set it to ``0``.
+
+.. note::
+
+    This size can be set per spider using the :attr:`download_maxsize`
+    spider attribute and per-request using the :reqmeta:`download_maxsize`
+    Request.meta key.
+
+    This feature needs Twisted >= 11.1.
+
+.. setting:: DOWNLOAD_WARNSIZE
+
+DOWNLOAD_WARNSIZE
+-----------------
+
+Default: ``33554432`` (32MB)
+
+The response size (in bytes) at which the downloader will start issuing warnings.
+
+If you want to disable it, set it to ``0``.
+
+.. note::
+
+    This size can be set per spider using the :attr:`download_warnsize`
+    spider attribute and per-request using the :reqmeta:`download_warnsize`
+    Request.meta key.
+
+    This feature needs Twisted >= 11.1.
+
 .. setting:: DUPEFILTER_CLASS
 
 DUPEFILTER_CLASS
diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py
index 23cd07c5144..dd3ad488b9d 100644
--- a/scrapy/core/downloader/handlers/http11.py
+++ b/scrapy/core/downloader/handlers/http11.py
@@ -9,7 +9,7 @@
 from zope.interface import implements
 from twisted.internet import defer, reactor, protocol
 from twisted.web.http_headers import Headers as TxHeaders
-from twisted.web.iweb import IBodyProducer
+from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
 from twisted.internet.error import TimeoutError
 from twisted.web.http import PotentialDataLoss
 from scrapy.xlib.tx import Agent, ProxyAgent, ResponseDone, \
@@ -19,6 +19,7 @@
 from scrapy.responsetypes import responsetypes
 from scrapy.core.downloader.webclient import _parse
 from scrapy.utils.misc import load_object
+from scrapy import log
 
 
 class HTTP11DownloadHandler(object):
@@ -29,10 +30,14 @@ def __init__(self, settings):
         self._pool._factory.noisy = False
         self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
         self._contextFactory = self._contextFactoryClass()
+        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
+        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
 
     def download_request(self, request, spider):
         """Return a deferred for the HTTP download"""
-        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
+        agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
+                            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
+                            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))
         return agent.download_request(request)
 
     def close(self):
@@ -131,11 +136,14 @@ class ScrapyAgent(object):
     _ProxyAgent = ProxyAgent
     _TunnelingAgent = TunnelingAgent
 
-    def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None):
+    def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
+                 maxsize=0, warnsize=0):
         self._contextFactory = contextFactory
         self._connectTimeout = connectTimeout
         self._bindAddress = bindAddress
         self._pool = pool
+        self._maxsize = maxsize
+        self._warnsize = warnsize
 
     def _get_agent(self, request, timeout):
         bindaddress = request.meta.get('bindaddress') or self._bindAddress
@@ -197,11 +205,25 @@ def _cb_bodyready(self, txresponse, request):
         if txresponse.length == 0:
             return txresponse, '', None
 
+        maxsize = request.meta.get('download_maxsize', self._maxsize)
+        warnsize = request.meta.get('download_warnsize', self._warnsize)
+        expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
+
+        if maxsize and expected_size > maxsize:
+            log.msg("Expected response size (%s) larger than download max size (%s)." % (expected_size, maxsize),
+                    logLevel=log.ERROR)
+            txresponse._transport._producer.loseConnection()
+            raise defer.CancelledError()
+
+        if warnsize and expected_size > warnsize:
+            log.msg("Expected response size (%s) larger than download warn size (%s)." % (expected_size, warnsize),
+                    logLevel=log.WARNING)
+
         def _cancel(_):
             txresponse._transport._producer.loseConnection()
 
         d = defer.Deferred(_cancel)
-        txresponse.deliverBody(_ResponseReader(d, txresponse, request))
+        txresponse.deliverBody(_ResponseReader(d, txresponse, request, maxsize, warnsize))
         return d
 
     def _cb_bodydone(self, result, request, url):
@@ -232,14 +254,27 @@ def stopProducing(self):
 
 class _ResponseReader(protocol.Protocol):
 
-    def __init__(self, finished, txresponse, request):
+    def __init__(self, finished, txresponse, request, maxsize, warnsize):
         self._finished = finished
         self._txresponse = txresponse
         self._request = request
         self._bodybuf = BytesIO()
+        self._maxsize = maxsize
+        self._warnsize = warnsize
+        self._bytes_received = 0
 
     def dataReceived(self, bodyBytes):
         self._bodybuf.write(bodyBytes)
+        self._bytes_received += len(bodyBytes)
+
+        if self._maxsize and self._bytes_received > self._maxsize:
+            log.msg("Received (%s) bytes larger than download max size (%s)." % (self._bytes_received, self._maxsize),
+                    logLevel=log.ERROR)
+            self._finished.cancel()
+
+        if self._warnsize and self._bytes_received > self._warnsize:
+            log.msg("Received (%s) bytes larger than download warn size (%s)." % (self._bytes_received, self._warnsize),
+                    logLevel=log.WARNING)
 
     def connectionLost(self, reason):
         if self._finished.called:
diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py
index f01203c420f..cf216385a6e 100644
--- a/scrapy/settings/default_settings.py
+++ b/scrapy/settings/default_settings.py
@@ -66,6 +66,9 @@
 
 DOWNLOAD_TIMEOUT = 180      # 3mins
 
+DOWNLOAD_MAXSIZE = 1024*1024*1024   # 1024 MB
+DOWNLOAD_WARNSIZE = 32*1024*1024    # 32 MB
+
 DOWNLOADER = 'scrapy.core.downloader.Downloader'
 
 DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
diff --git a/tests/mockserver.py b/tests/mockserver.py
index 6910532b633..b73208c5cad 100644
--- a/tests/mockserver.py
+++ b/tests/mockserver.py
@@ -1,5 +1,7 @@
 from __future__ import print_function
 import sys, time, random, urllib, os, json
+import six
+from scrapy import twisted_version
 from subprocess import Popen, PIPE
 from twisted.web.server import Site, NOT_DONE_YET
 from twisted.web.resource import Resource
@@ -168,6 +170,13 @@ def __init__(self):
         self.putChild("raw", Raw())
         self.putChild("echo", Echo())
 
+        if six.PY2 and twisted_version > (12, 3, 0):
+            from twisted.web.test.test_webclient import PayloadResource
+            from twisted.web.server import GzipEncoderFactory
+            from twisted.web.resource import EncodingResourceWrapper
+            self.putChild("payload", PayloadResource())
+            self.putChild("xpayload", EncodingResourceWrapper(PayloadResource(), [GzipEncoderFactory()]))
+
     def getChild(self, name, request):
         return self
 
diff --git a/tests/test_downloader_handlers.py b/tests/test_downloader_handlers.py
index c444d35fa0c..9021af3b403 100644
--- a/tests/test_downloader_handlers.py
+++ b/tests/test_downloader_handlers.py
@@ -1,5 +1,7 @@
 import os
 import twisted
+import six
+from scrapy import twisted_version
 from twisted.trial import unittest
 from twisted.protocols.policies import WrappingFactory
 
@@ -30,6 +32,8 @@
 from scrapy.utils.test import get_crawler
 from scrapy.exceptions import NotConfigured
 
+from tests.mockserver import MockServer
+from tests.spiders import SingleRequestSpider
 
 
 class DummyDH(object):
@@ -211,6 +215,101 @@ class Http11TestCase(HttpTestCase):
     if 'http11' not in optional_features:
         skip = 'HTTP1.1 not supported in twisted < 11.1.0'
 
+    def test_download_without_maxsize_limit(self):
+        request = Request(self.getURL('file'))
+        d = self.download_request(request, Spider('foo'))
+        d.addCallback(lambda r: r.body)
+        d.addCallback(self.assertEqual, "0123456789")
+        return d
+
+    @defer.inlineCallbacks
+    def test_download_with_maxsize(self):
+        request = Request(self.getURL('file'))
+
+        # 10 bytes is the body size of this response; the limit only counts
+        # the response body, regardless of headers.
+        d = self.download_request(request, Spider('foo', download_maxsize=10))
+        d.addCallback(lambda r: r.body)
+        d.addCallback(self.assertEqual, "0123456789")
+        yield d
+
+        d = self.download_request(request, Spider('foo', download_maxsize=9))
+        yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)
+
+    @defer.inlineCallbacks
+    def test_download_with_maxsize_per_req(self):
+        meta = {'download_maxsize': 2}
+        request = Request(self.getURL('file'), meta=meta)
+        d = self.download_request(request, Spider('foo'))
+        yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)
+
+    @defer.inlineCallbacks
+    def test_download_with_small_maxsize_per_spider(self):
+        request = Request(self.getURL('file'))
+        d = self.download_request(request, Spider('foo', download_maxsize=2))
+        yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)
+
+    def test_download_with_large_maxsize_per_spider(self):
+        request = Request(self.getURL('file'))
+        d = self.download_request(request, Spider('foo', download_maxsize=100))
+        d.addCallback(lambda r: r.body)
+        d.addCallback(self.assertEqual, "0123456789")
+        return d
+
+
+class Http11MockServerTestCase(unittest.TestCase):
+    """HTTP 1.1 test case with MockServer"""
+    if 'http11' not in optional_features:
+        skip = 'HTTP1.1 not supported in twisted < 11.1.0'
+
+    def setUp(self):
+        self.mockserver = MockServer()
+        self.mockserver.__enter__()
+
+    def tearDown(self):
+        self.mockserver.__exit__(None, None, None)
+
+    @defer.inlineCallbacks
+    def test_download_with_content_length(self):
+        crawler = get_crawler(SingleRequestSpider)
+        # http://localhost:8998/partial sets Content-Length to 1024; use
+        # download_maxsize=1000 to avoid downloading it
+        yield crawler.crawl(seed=Request(url='http://localhost:8998/partial', meta={'download_maxsize': 1000}))
+        failure = crawler.spider.meta['failure']
+        self.assertIsInstance(failure.value, defer.CancelledError)
+
+    @defer.inlineCallbacks
+    def test_download(self):
+        crawler = get_crawler(SingleRequestSpider)
+        yield crawler.crawl(seed=Request(url='http://localhost:8998'))
+        failure = crawler.spider.meta.get('failure')
+        self.assertIsNone(failure)
+        reason = crawler.spider.meta['close_reason']
+        self.assertEqual(reason, 'finished')
+
+    @defer.inlineCallbacks
+    def test_download_gzip_response(self):
+        if six.PY2 and twisted_version > (12, 3, 0):
+            crawler = get_crawler(SingleRequestSpider)
+            body = '1' * 100  # PayloadResource requires a body of length 100
+            request = Request('http://localhost:8998/payload', method='POST', body=body, meta={'download_maxsize': 50})
+            yield crawler.crawl(seed=request)
+            failure = crawler.spider.meta['failure']
+            # download_maxsize < 100, hence the CancelledError
+            self.assertIsInstance(failure.value, defer.CancelledError)
+
+            request.headers.setdefault('Accept-Encoding', 'gzip,deflate')
+            request = request.replace(url='http://localhost:8998/xpayload')
+            yield crawler.crawl(seed=request)
+
+            # download_maxsize = 50 is enough for the gzipped response
+            failure = crawler.spider.meta.get('failure')
+            self.assertIsNone(failure)
+            reason = crawler.spider.meta['close_reason']
+            self.assertEqual(reason, 'finished')
+        else:
+            raise unittest.SkipTest("xpayload and payload endpoints only enabled for twisted > 12.3.0 and python 2.x")
+
 
 class UriResource(resource.Resource):
     """Return the full uri that was requested"""
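Reviewer note: the patch threads the limits through three levels, in increasing precedence: the DOWNLOAD_MAXSIZE/DOWNLOAD_WARNSIZE settings (read in HTTP11DownloadHandler.__init__), the download_maxsize/download_warnsize spider attributes (read via getattr() in download_request()), and the same-named Request.meta keys (read in _cb_bodyready()). A minimal usage sketch follows; the spider name, URL and byte counts are made up for illustration:

```python
# Sketch (not part of the patch): the three levels at which the new limits apply.

# 1) Project-wide defaults, normally in settings.py (lowest precedence):
#
#     DOWNLOAD_MAXSIZE = 100 * 1024 * 1024    # abort downloads over 100 MB
#     DOWNLOAD_WARNSIZE = 10 * 1024 * 1024    # log a warning over 10 MB

from scrapy.http import Request
from scrapy.spider import Spider


class ExampleSpider(Spider):
    name = 'example'

    # 2) Per-spider attributes, picked up with getattr() by the HTTP 1.1
    #    download handler (override the project settings):
    download_maxsize = 50 * 1024 * 1024
    download_warnsize = 5 * 1024 * 1024

    def start_requests(self):
        # 3) Per-request meta keys (highest precedence). A value of 0
        #    disables the check, since both checks are guarded by
        #    `if maxsize and ...`.
        yield Request('http://example.com/huge-file.bin',
                      meta={'download_maxsize': 0})

    def parse(self, response):
        pass
```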
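For completeness, here is a sketch of how an aborted oversized download surfaces to spider code. As the tests assert through SingleRequestSpider, the download deferred is cancelled, so the failure wraps twisted's defer.CancelledError and a standard Request errback can detect it; the spider below is hypothetical:

```python
# Hypothetical spider (not part of the patch): _ResponseReader cancels the
# download deferred once the body exceeds maxsize, so the request's errback
# receives a failure wrapping defer.CancelledError.

from twisted.internet import defer

from scrapy.http import Request
from scrapy.spider import Spider


class SizeAwareSpider(Spider):
    name = 'size_aware'

    def start_requests(self):
        yield Request('http://example.com/large.iso',
                      meta={'download_maxsize': 10 * 1024 * 1024},
                      callback=self.parse,
                      errback=self.handle_error)

    def handle_error(self, failure):
        # Distinguish the size-limit abort from other download errors.
        if failure.check(defer.CancelledError):
            self.log("Download aborted: response exceeded download_maxsize")

    def parse(self, response):
        pass
```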