Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,9 @@ to set ``meta['splash']['args']`` use ``SplashRequest(..., args=myargs)``.
and ``assert(splash:go(..))`` fails with an HTTP error
response.status is also set to HTTP error code.

Original URL, status and headers are available as ``response.real_url``,
``response.splash_response_status`` and ``response.splash_response_headers``.

This option is set to True by default if you use SplashRequest.
``render.json`` and ``execute`` endpoints may not have all the necessary
keys/values in the response.
Expand Down Expand Up @@ -631,7 +634,9 @@ aware of:

3. As seen by Scrapy, response.url is an URL of the Splash server.
scrapy-splash fixes it to be an URL of a requested page.
"Real" URL is still available as ``response.real_url``.
"Real" URL is still available as ``response.real_url``. scrapy-splash also
allows to handle ``response.status`` and ``response.headers`` transparently
on Scrapy side.

4. Some options depend on each other - for example, if you use timeout_
Splash option then you may want to set ``download_timeout``
Expand Down
7 changes: 4 additions & 3 deletions scrapy_splash/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
json_based_hash,
parse_x_splash_saved_arguments_header,
)
from scrapy_splash.response import get_splash_status, get_splash_headers


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -379,7 +380,7 @@ def process_response(self, request, response, spider):

# handle save_args/load_args
self._process_x_splash_saved_arguments(request, response)
if response.status == 498:
if get_splash_status(response) == 498:
logger.debug("Got HTTP 498 response for {}; "
"sending arguments again.".format(request),
extra={'spider': spider})
Expand All @@ -390,7 +391,7 @@ def process_response(self, request, response, spider):

response = self._change_response_class(request, response)

if self.log_400 and response.status == 400:
if self.log_400 and get_splash_status(response) == 400:
self._log_400(request, response, spider)

return response
Expand Down Expand Up @@ -423,7 +424,7 @@ def _log_400(self, request, response, spider):

def _process_x_splash_saved_arguments(self, request, response):
""" Keep track of arguments saved by Splash. """
saved_args = response.headers.get(b'X-Splash-Saved-Arguments')
saved_args = get_splash_headers(response).get(b'X-Splash-Saved-Arguments')
if not saved_args:
return
saved_args = parse_x_splash_saved_arguments_header(saved_args)
Expand Down
28 changes: 24 additions & 4 deletions scrapy_splash/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
from scrapy_splash.utils import headers_to_scrapy


def get_splash_status(resp):
    """Return the HTTP status of the reply Splash itself sent.

    Magic responses replace ``resp.status`` with the remote website's
    status and stash the original one in ``splash_response_status``;
    plain responses lack that attribute, so fall back to ``resp.status``.
    """
    if hasattr(resp, 'splash_response_status'):
        return resp.splash_response_status
    return resp.status


def get_splash_headers(resp):
    """Return the headers of the reply Splash itself sent.

    Magic responses replace ``resp.headers`` with the remote website's
    headers and stash the originals in ``splash_response_headers``;
    plain responses lack that attribute, so fall back to ``resp.headers``.
    """
    if hasattr(resp, 'splash_response_headers'):
        return resp.splash_response_headers
    return resp.headers


class _SplashResponseMixin(object):
"""
This mixin fixes response.url and adds response.real_url
Expand All @@ -30,14 +38,23 @@ def __init__(self, url, *args, **kwargs):
if _url is not None:
self.real_url = url
url = _url
self.splash_response_status = kwargs.pop('splash_response_status',
None)
self.splash_response_headers = kwargs.pop('splash_response_headers',
None)
super(_SplashResponseMixin, self).__init__(url, *args, **kwargs)
if self.splash_response_status is None:
self.splash_response_status = self.status
if self.splash_response_headers is None:
self.splash_response_headers = self.headers.copy()

def replace(self, *args, **kwargs):
    """Return a copy of this Response, overriding only the given attributes.

    All attributes not passed explicitly — including the Splash-specific
    ``real_url``, ``splash_response_status`` and ``splash_response_headers``
    — are carried over from ``self``. A ``cls`` keyword may select a
    different response class for the copy.
    """
    carried_over = ('url', 'status', 'headers', 'body', 'request', 'flags',
                    'real_url', 'splash_response_status',
                    'splash_response_headers')
    for attr in carried_over:
        if attr not in kwargs:
            kwargs[attr] = getattr(self, attr)
    response_cls = kwargs.pop('cls', self.__class__)
    return response_cls(*args, **kwargs)
Expand Down Expand Up @@ -80,11 +97,14 @@ class SplashJsonResponse(SplashResponse):
(['splash']['magic_response'] is not False), several other response
attributes (headers, body, url, status code) are set automatically:

* response.headers are filled from 'headers' keys;
* response.url is set to the value of 'url' key;
* response.url is set to the value of 'url' key, original url is
available as ``response.real_url``;
* response.headers are filled from 'headers' keys; original headers are
available as ``response.splash_response_headers``;
* response.status is set from the value of 'http_status' key; original
status is available as ``response.splash_response_status``;
* response.body is set to the value of 'html' key,
or to base64-decoded value of 'body' key;
* response.status is set from the value of 'http_status' key.
"""
def __init__(self, *args, **kwargs):
self.cookiejar = None
Expand Down
102 changes: 88 additions & 14 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
DEFAULT_SCRIPT = """
function main(splash)
splash:init_cookies(splash.args.cookies)
assert(splash:go{
splash:go{
splash.args.url,
headers=splash.args.headers,
http_method=splash.args.http_method,
body=splash.args.body,
})
assert(splash:wait(0.5))
}
local wait = tonumber(splash.args.wait or 0.5)
assert(splash:wait(wait))

local entries = splash:history()
local last_response = entries[#entries].response
Expand All @@ -40,6 +41,11 @@ class HelloWorld(HtmlResource):
extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}


class Http400Resource(HtmlResource):
    """Test resource that answers every request with HTTP 400 and a short body."""
    status_code = 400
    html = "Website returns HTTP 400 error"



class ManyCookies(Resource, object):
class SetMyCookie(HtmlResource):
Expand Down Expand Up @@ -94,6 +100,9 @@ def parse(self, response):
resp = items[0]['response']
assert resp.url == url
assert resp.css('body::text').get().strip() == "hello world!"
assert resp.status == resp.splash_response_status == 200
assert resp.headers == resp.splash_response_headers
assert resp.splash_response_headers['Content-Type'] == b"text/html; charset=utf-8"

resp2 = items[1]['response']
assert resp2.body == resp.body
Expand All @@ -118,12 +127,78 @@ def start_requests(self):
assert len(items) == 1
resp = items[0]['response']
assert resp.url == url + "/#foo"
assert resp.status == resp.splash_response_status == 200
assert resp.css('body::text').get().strip() == "hello world!"
assert resp.data['jsvalue'] == 3
assert resp.headers['X-MyHeader'] == b'my value'
assert resp.headers['Content-Type'] == b'text/html'
assert resp.splash_response_headers['Content-Type'] == b'application/json'
assert resp.data['args']['foo'] == 'bar'


@requires_splash
@inlineCallbacks
def test_bad_request(settings):
    """A Splash-level 400 and a remote-site 400 must be distinguishable.

    When Splash itself rejects the request, both ``response.status`` and
    ``response.splash_response_status`` are 400; when the remote site
    answers 400 but Splash succeeds, ``splash_response_status`` stays 200
    while ``response.status`` reflects the site's error.
    """
    class SplashFailureSpider(ResponseSpider):
        # A non-numeric 'wait' argument makes Splash reject the request.
        custom_settings = {'HTTPERROR_ALLOW_ALL': True}

        def start_requests(self):
            yield SplashRequest(self.url, endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT,
                                      'wait': 'bar'})

    class RemoteFailureSpider(ResponseSpider):
        custom_settings = {'HTTPERROR_ALLOW_ALL': True}

        def start_requests(self):
            yield SplashRequest(self.url, endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT})

    # Splash-level failure: both statuses report 400.
    items, url, crawler = yield crawl_items(SplashFailureSpider, HelloWorld,
                                            settings)
    resp = items[0]['response']
    assert resp.status == 400
    assert resp.splash_response_status == 400

    # Remote-site failure: Splash succeeded (200), the website returned 400.
    items, url, crawler = yield crawl_items(RemoteFailureSpider,
                                            Http400Resource, settings)
    resp = items[0]['response']
    assert resp.status == 400
    assert resp.splash_response_status == 200


@requires_splash
@inlineCallbacks
def test_cache_args(settings):
    """With ``cache_args``, the Lua source is sent only on the first request."""

    class ArgsCachingSpider(ResponseSpider):
        def _splash_request(self, target_url):
            return SplashRequest(target_url, endpoint='execute',
                                 args={'lua_source': DEFAULT_SCRIPT, 'x': 'yy'},
                                 cache_args=['lua_source'])

        def start_requests(self):
            yield self._splash_request(self.url)

        def parse(self, response):
            yield {'response': response}
            yield self._splash_request(self.url + "#foo")

    items, url, crawler = yield crawl_items(ArgsCachingSpider, HelloWorld,
                                            settings)
    assert len(items) == 2

    # First request carries the full script plus the non-cached argument.
    first = items[0]['response']
    assert b"function main(splash)" in first.request.body
    assert b"yy" in first.request.body
    print(first.body, first.request.body)

    # Second request omits the cached script but keeps the other arguments.
    second = items[1]['response']
    assert b"function main(splash)" not in second.request.body
    assert b"yy" in second.request.body
    print(second.body, second.request.body)


@requires_splash
@inlineCallbacks
def test_cookies(settings):
Expand Down Expand Up @@ -171,7 +246,6 @@ def parse_3(self, response):
args={'lua_source': DEFAULT_SCRIPT},
cookies={'bomb': BOMB})


def parse_4(self, response):
yield {'response': response}

Expand All @@ -185,19 +259,19 @@ def _cookie_dict(har_cookies):

# cookie should be sent to remote website, not to Splash
resp = items[0]['response']
splash_headers = resp.request.headers
splash_request_headers = resp.request.headers
cookies = resp.data['args']['cookies']
print(splash_headers)
print(splash_request_headers)
print(cookies)
assert _cookie_dict(cookies) == {
# 'login': '1', # FIXME
'x-set-splash': '1'
}
assert splash_headers.get(b'Cookie') is None
assert splash_request_headers.get(b'Cookie') is None

# new cookie should be also sent to remote website, not to Splash
resp2 = items[1]['response']
splash_headers = resp2.request.headers
splash_request_headers = resp2.request.headers
headers = resp2.data['args']['headers']
cookies = resp2.data['args']['cookies']
assert canonicalize_url(headers['Referer']) == canonicalize_url(url)
Expand All @@ -206,29 +280,29 @@ def _cookie_dict(har_cookies):
'x-set-splash': '1',
'sessionid': 'ABCD'
}
print(splash_headers)
print(splash_request_headers)
print(headers)
print(cookies)
assert splash_headers.get(b'Cookie') is None
assert splash_request_headers.get(b'Cookie') is None

# TODO/FIXME: Cookies fetched when working with Splash should be picked up
# by Scrapy
resp3 = items[2]['response']
splash_headers = resp3.request.headers
cookie_header = splash_headers.get(b'Cookie')
splash_request_headers = resp3.request.headers
cookie_header = splash_request_headers.get(b'Cookie')
assert b'x-set-scrapy=1' in cookie_header
assert b'login=1' in cookie_header
assert b'x-set-splash=1' in cookie_header
# assert b'sessionid=ABCD' in cookie_header # FIXME

# cookie bomb shouldn't cause problems
resp4 = items[3]['response']
splash_headers = resp4.request.headers
splash_request_headers = resp4.request.headers
cookies = resp4.data['args']['cookies']
assert _cookie_dict(cookies) == {
# 'login': '1',
'x-set-splash': '1',
'sessionid': 'ABCD',
'bomb': BOMB,
}
assert splash_headers.get(b'Cookie') is None
assert splash_request_headers.get(b'Cookie') is None
11 changes: 8 additions & 3 deletions tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ def cb():
assert response2.text == response2.body_as_unicode() == res_body
assert response2.encoding == 'utf8'
assert response2.headers == {b'Content-Type': [b'application/json']}
assert response2.status == 200
assert response2.splash_response_headers == response2.headers
assert response2.status == response2.splash_response_status == 200


def test_magic_response():
Expand Down Expand Up @@ -233,7 +234,9 @@ def test_magic_response():
b'X-My-Header': [b'foo'],
b'Set-Cookie': [b'bar=baz'],
}
assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']}
assert resp2.status == 404
assert resp2.splash_response_status == 200
assert resp2.url == "http://exmaple.com/#id42"
assert len(resp2.cookiejar) == 3
cookies = [c for c in resp2.cookiejar]
Expand Down Expand Up @@ -359,7 +362,8 @@ def test_magic_response2():
assert resp2.data == resp_data
assert resp2.body == b'binary data'
assert resp2.headers == {b'Content-Type': [b'text/plain']}
assert resp2.status == 200
assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']}
assert resp2.status == resp2.splash_response_status == 200
assert resp2.url == "http://example.com/"


Expand Down Expand Up @@ -397,12 +401,13 @@ def test_magic_response_http_error():
"error": 400,
"type": "ScriptError"
}
resp = TextResponse("http://mysplash.example.com/execute",
resp = TextResponse("http://mysplash.example.com/execute", status=400,
headers={b'Content-Type': b'application/json'},
body=json.dumps(resp_data).encode('utf8'))
resp = mw.process_response(req, resp, None)
assert resp.data == resp_data
assert resp.status == 404
assert resp.splash_response_status == 400
assert resp.url == "http://example.com/foo"


Expand Down
2 changes: 2 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ class HtmlResource(Resource):
content_type = 'text/html'
html = ''
extra_headers = {}
status_code = 200

def render_GET(self, request):
    """Serve ``self.html`` with the configured content type, extra headers
    and response code."""
    header_pairs = [(b'content-type', to_bytes(self.content_type))]
    header_pairs.extend(
        (to_bytes(name), to_bytes(value))
        for name, value in self.extra_headers.items()
    )
    for key, value in header_pairs:
        request.setHeader(key, value)
    request.setResponseCode(self.status_code)
    return to_bytes(self.html)


Expand Down