Merge pull request scrapy#946 from tpeng/limit-response-size
avoid download large response
pablohoffman committed Nov 25, 2014
2 parents 8d8e1b2 + cd19382 commit dedea72
Showing 5 changed files with 189 additions and 5 deletions.
38 changes: 38 additions & 0 deletions docs/topics/settings.rst
@@ -422,6 +422,44 @@ The amount of time (in secs) that the downloader will wait before timing out.
spider attribute and per-request using :reqmeta:`download_timeout`
Request.meta key.

.. setting:: DOWNLOAD_MAXSIZE

DOWNLOAD_MAXSIZE
----------------

Default: `1073741824` (1024 MB)

The maximum response size (in bytes) that the downloader will download.

If you want to disable it, set it to 0.

.. note::

This size can be set per spider using :attr:`download_maxsize`
spider attribute and per-request using :reqmeta:`download_maxsize`
Request.meta key.

This feature needs Twisted >= 11.1.
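
A hedged usage sketch (spider name, URL, and values are illustrative, not
part of this patch), showing the three override levels described above::

    import scrapy

    class SizeCappedSpider(scrapy.Spider):
        name = 'size_capped'                # hypothetical spider
        download_maxsize = 5 * 1024 * 1024  # per-spider cap: 5 MB

        def start_requests(self):
            # Request.meta takes precedence over the spider attribute,
            # which in turn overrides the DOWNLOAD_MAXSIZE setting
            yield scrapy.Request('http://example.com/big-file',
                                 meta={'download_maxsize': 1024 * 1024})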

.. setting:: DOWNLOAD_WARNSIZE

DOWNLOAD_WARNSIZE
-----------------

Default: `33554432` (32 MB)

The response size (in bytes) at which the downloader will start to warn.

If you want to disable it, set it to 0.

.. note::

This size can be set per spider using :attr:`download_warnsize`
spider attribute and per-request using :reqmeta:`download_warnsize`
Request.meta key.

This feature needs Twisted >= 11.1.
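
For instance, a minimal project-wide configuration combining both thresholds
(values are illustrative)::

    # settings.py
    DOWNLOAD_WARNSIZE = 8 * 1024 * 1024    # log a warning past 8 MB
    DOWNLOAD_MAXSIZE = 64 * 1024 * 1024    # cancel the download past 64 MB
    # set either to 0 to disable that check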

.. setting:: DUPEFILTER_CLASS

DUPEFILTER_CLASS
45 changes: 40 additions & 5 deletions scrapy/core/downloader/handlers/http11.py
@@ -9,7 +9,7 @@
from zope.interface import implements
from twisted.internet import defer, reactor, protocol
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
from twisted.internet.error import TimeoutError
from twisted.web.http import PotentialDataLoss
from scrapy.xlib.tx import Agent, ProxyAgent, ResponseDone, \
@@ -19,6 +19,7 @@
from scrapy.responsetypes import responsetypes
from scrapy.core.downloader.webclient import _parse
from scrapy.utils.misc import load_object
from scrapy import log


class HTTP11DownloadHandler(object):
Expand All @@ -29,10 +30,14 @@ def __init__(self, settings):
self._pool._factory.noisy = False
self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
self._contextFactory = self._contextFactoryClass()
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')

def download_request(self, request, spider):
"""Return a deferred for the HTTP download"""
agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool)
agent = ScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))
return agent.download_request(request)

def close(self):
@@ -131,11 +136,14 @@ class ScrapyAgent(object):
_ProxyAgent = ProxyAgent
_TunnelingAgent = TunnelingAgent

def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None):
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
maxsize=0, warnsize=0):
self._contextFactory = contextFactory
self._connectTimeout = connectTimeout
self._bindAddress = bindAddress
self._pool = pool
self._maxsize = maxsize
self._warnsize = warnsize

def _get_agent(self, request, timeout):
bindaddress = request.meta.get('bindaddress') or self._bindAddress
@@ -197,11 +205,25 @@ def _cb_bodyready(self, txresponse, request):
if txresponse.length == 0:
return txresponse, '', None

maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1

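# If the server did not send Content-Length, fall back to -1 so the
# pre-download checks below are skipped; _ResponseReader still enforces
# the limits on the bytes actually received.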
if maxsize and expected_size > maxsize:
log.msg("Expected response size (%s) larger than download max size (%s)." % (expected_size, maxsize),
logLevel=log.ERROR)
txresponse._transport._producer.loseConnection()
raise defer.CancelledError()

if warnsize and expected_size > warnsize:
log.msg("Expected response size (%s) larger than downlod warn size (%s)." % (expected_size, warnsize),
logLevel=log.WARNING)

def _cancel(_):
txresponse._transport._producer.loseConnection()

d = defer.Deferred(_cancel)
txresponse.deliverBody(_ResponseReader(d, txresponse, request))
txresponse.deliverBody(_ResponseReader(d, txresponse, request, maxsize, warnsize))
return d

def _cb_bodydone(self, result, request, url):
@@ -232,14 +254,27 @@ def stopProducing(self):

class _ResponseReader(protocol.Protocol):

def __init__(self, finished, txresponse, request):
def __init__(self, finished, txresponse, request, maxsize, warnsize):
self._finished = finished
self._txresponse = txresponse
self._request = request
self._bodybuf = BytesIO()
self._maxsize = maxsize
self._warnsize = warnsize
self._bytes_received = 0

def dataReceived(self, bodyBytes):
self._bodybuf.write(bodyBytes)
self._bytes_received += len(bodyBytes)

if self._maxsize and self._bytes_received > self._maxsize:
log.msg("Received (%s) bytes larger than download max size (%s)." % (self._bytes_received, self._maxsize),
logLevel=log.ERROR)
self._finished.cancel()

if self._warnsize and self._bytes_received > self._warnsize:
log.msg("Received (%s) bytes larger than download warn size (%s)." % (self._bytes_received, self._warnsize),
logLevel=log.WARNING)

def connectionLost(self, reason):
if self._finished.called:
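
The effective limit is resolved per request: Request.meta wins over the spider
attribute, which wins over the project setting. A hedged standalone sketch of
that precedence (the helper name is hypothetical):

    def _resolve_maxsize(settings, spider, request):
        default = settings.getint('DOWNLOAD_MAXSIZE')               # project default
        per_spider = getattr(spider, 'download_maxsize', default)   # spider attribute
        return request.meta.get('download_maxsize', per_spider)     # per-request meta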
3 changes: 3 additions & 0 deletions scrapy/settings/default_settings.py
@@ -66,6 +66,9 @@

DOWNLOAD_TIMEOUT = 180 # 3mins

DOWNLOAD_MAXSIZE = 1024*1024*1024   # 1024 MB
DOWNLOAD_WARNSIZE = 32*1024*1024    # 32 MB

DOWNLOADER = 'scrapy.core.downloader.Downloader'

DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
8 changes: 8 additions & 0 deletions tests/mockserver.py
@@ -1,5 +1,6 @@
from __future__ import print_function
import sys, time, random, urllib, os, json
import six
from subprocess import Popen, PIPE
from twisted.web.server import Site, NOT_DONE_YET
from twisted.web.resource import Resource
@@ -168,6 +169,13 @@ def __init__(self):
self.putChild("raw", Raw())
self.putChild("echo", Echo())

if six.PY2 and twisted_version > (12, 3, 0):
from twisted.web.test.test_webclient import PayloadResource
from twisted.web.server import GzipEncoderFactory
from twisted.web.resource import EncodingResourceWrapper
self.putChild('payload', PayloadResource())
self.putChild("xpayload", EncodingResourceWrapper(PayloadResource(), [GzipEncoderFactory()]))

def getChild(self, name, request):
return self

100 changes: 100 additions & 0 deletions tests/test_downloader_handlers.py
@@ -1,5 +1,6 @@
import os
import twisted
import six

from twisted.trial import unittest
from twisted.protocols.policies import WrappingFactory
@@ -30,6 +31,8 @@
from scrapy.utils.test import get_crawler
from scrapy.exceptions import NotConfigured

from tests.mockserver import MockServer
from tests.spiders import SingleRequestSpider

class DummyDH(object):

@@ -211,6 +214,103 @@ class Http11TestCase(HttpTestCase):
if 'http11' not in optional_features:
skip = 'HTTP1.1 not supported in twisted < 11.1.0'

def test_download_without_maxsize_limit(self):
request = Request(self.getURL('file'))
d = self.download_request(request, Spider('foo'))
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, "0123456789")
return d

@defer.inlineCallbacks
def test_download_with_maxsize(self):
request = Request(self.getURL('file'))

# 10 is the minimal size for this request, and the limit only counts
# the response body (regardless of headers)
d = self.download_request(request, Spider('foo', download_maxsize=10))
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, "0123456789")
yield d

d = self.download_request(request, Spider('foo', download_maxsize=9))
yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)

@defer.inlineCallbacks
def test_download_with_maxsize_per_req(self):
meta = {'download_maxsize': 2}
request = Request(self.getURL('file'), meta=meta)
d = self.download_request(request, Spider('foo'))
yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)

@defer.inlineCallbacks
def test_download_with_small_maxsize_per_spider(self):
request = Request(self.getURL('file'))
d = self.download_request(request, Spider('foo', download_maxsize=2))
yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)

def test_download_with_large_maxsize_per_spider(self):
request = Request(self.getURL('file'))
d = self.download_request(request, Spider('foo', download_maxsize=100))
d.addCallback(lambda r: r.body)
d.addCallback(self.assertEquals, "0123456789")
return d


class Http11MockServerTestCase(unittest.TestCase):
"""HTTP 1.1 test case with MockServer"""
if 'http11' not in optional_features:
skip = 'HTTP1.1 not supported in twisted < 11.1.0'

def setUp(self):
self.mockserver = MockServer()
self.mockserver.__enter__()

def tearDown(self):
self.mockserver.__exit__(None, None, None)

@defer.inlineCallbacks
def test_download_with_content_length(self):
crawler = get_crawler(SingleRequestSpider)
# http://localhost:8998/partial sets Content-Length to 1024; use
# download_maxsize=1000 to avoid downloading it
yield crawler.crawl(seed=Request(url='http://localhost:8998/partial', meta={'download_maxsize': 1000}))
failure = crawler.spider.meta['failure']
self.assertIsInstance(failure.value, defer.CancelledError)

@defer.inlineCallbacks
def test_download(self):
crawler = get_crawler(SingleRequestSpider)
yield crawler.crawl(seed=Request(url='http://localhost:8998'))
failure = crawler.spider.meta.get('failure')
self.assertIsNone(failure)
reason = crawler.spider.meta['close_reason']
self.assertEqual(reason, 'finished')

@defer.inlineCallbacks
def test_download_gzip_response(self):

if six.PY2 and twisted_version > (12, 3, 0):

crawler = get_crawler(SingleRequestSpider)
body = '1'*100 # PayloadResource requires body length to be 100
request = Request('http://localhost:8998/payload', method='POST', body=body, meta={'download_maxsize': 50})
yield crawler.crawl(seed=request)
failure = crawler.spider.meta['failure']
# download_maxsize < 100, hence the CancelledError
self.assertIsInstance(failure.value, defer.CancelledError)

request.headers.setdefault('Accept-Encoding', 'gzip,deflate')
request = request.replace(url='http://localhost:8998/xpayload')
yield crawler.crawl(seed=request)

# download_maxsize = 50 is enough for the gzipped response
failure = crawler.spider.meta.get('failure')
self.assertIsNone(failure)
reason = crawler.spider.meta['close_reason']
self.assertEqual(reason, 'finished')
else:
raise unittest.SkipTest("xpayload and payload endpoints are only enabled for twisted > 12.3.0 and python 2.x")
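
A hedged usage sketch based on the gzip test above: the reader counts bytes as
they arrive on the wire, so when the server compresses the response the limit
applies to the compressed payload (URL and sizes mirror the mock server setup):

    from scrapy import Request

    req = Request('http://localhost:8998/xpayload',
                  method='POST', body='1' * 100,  # PayloadResource expects a 100-byte body
                  headers={'Accept-Encoding': 'gzip,deflate'},
                  meta={'download_maxsize': 50})  # 50 bytes suffices once gzipped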


class UriResource(resource.Resource):
"""Return the full uri that was requested"""
