diff --git a/CHANGES.rst b/CHANGES.rst index 9cc25b7..95e9816 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,6 +4,19 @@ Changes 0.8.0 (2021-10-04) ------------------ +* **Security bug fix:** + + If you use HttpAuthMiddleware_ (i.e. the ``http_user`` and + ``http_pass`` spider attributes) for Splash authentication, any non-Splash + request will expose your credentials to the request target. This includes + ``robots.txt`` requests sent by Scrapy when the ``ROBOTSTXT_OBEY`` setting + is set to ``True``. + + Use the new ``SPLASH_USER`` and ``SPLASH_PASS`` settings instead to set + your Splash authentication credentials safely. + + .. _HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth + * Responses now expose the HTTP status code and headers from Splash as ``response.splash_response_status`` and ``response.splash_response_headers`` (#158) diff --git a/README.rst b/README.rst index dce25f9..478d0fe 100644 --- a/README.rst +++ b/README.rst @@ -582,12 +582,31 @@ on Splash server and is not sent with each request (it requires Splash 2.1+):: HTTP Basic Auth =============== -If you need HTTP Basic Authentication to access Splash, use -Scrapy's HttpAuthMiddleware_. +If you need to use HTTP Basic Authentication to access Splash, use the +``SPLASH_USER`` and ``SPLASH_PASS`` optional settings:: + + SPLASH_USER = 'user' + SPLASH_PASS = 'userpass' Another option is ``meta['splash']['splash_headers']``: it allows to set custom headers which are sent to Splash server; add Authorization header -to ``splash_headers`` if HttpAuthMiddleware doesn't fit for some reason. +to ``splash_headers`` if you want to change credentials per-request:: + + import scrapy + from w3lib.http import basic_auth_header + + class MySpider(scrapy.Spider): + # ... + def start_requests(self): + auth = basic_auth_header('user', 'userpass') + yield SplashRequest(url, self.parse, + splash_headers={'Authorization': auth}) + +**WARNING:** Don't use HttpAuthMiddleware_ +(i.e. ``http_user`` / ``http_pass`` spider attributes) for Splash +authentication: if your spider ever sends a non-Splash request, +you may expose Splash credentials to a remote website, as HttpAuthMiddleware +sets credentials for all requests unconditionally. .. 
_HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth diff --git a/example/scrashtest/settings.py b/example/scrashtest/settings.py index c7aa603..30feb75 100644 --- a/example/scrashtest/settings.py +++ b/example/scrashtest/settings.py @@ -20,3 +20,4 @@ # SPLASH_URL = 'http://192.168.59.103:8050/' DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' +ROBOTSTXT_OBEY = True \ No newline at end of file diff --git a/scrapy_splash/middleware.py b/scrapy_splash/middleware.py index 24ab23a..abf4711 100644 --- a/scrapy_splash/middleware.py +++ b/scrapy_splash/middleware.py @@ -10,11 +10,13 @@ from six.moves.urllib.parse import urljoin from six.moves.http_cookiejar import CookieJar +from w3lib.http import basic_auth_header import scrapy -from scrapy.exceptions import NotConfigured +from scrapy.exceptions import NotConfigured, IgnoreRequest from scrapy.http.headers import Headers from scrapy.http.response.text import TextResponse from scrapy import signals +from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware from scrapy_splash.responsetypes import responsetypes from scrapy_splash.cookies import jar_to_har, har_to_jar @@ -222,26 +224,34 @@ class SplashMiddleware(object): retry_498_priority_adjust = +50 remote_keys_key = '_splash_remote_keys' - def __init__(self, crawler, splash_base_url, slot_policy, log_400): + def __init__(self, crawler, splash_base_url, slot_policy, log_400, auth): self.crawler = crawler self.splash_base_url = splash_base_url self.slot_policy = slot_policy self.log_400 = log_400 self.crawler.signals.connect(self.spider_opened, signals.spider_opened) + self.auth = auth @classmethod def from_crawler(cls, crawler): - splash_base_url = crawler.settings.get('SPLASH_URL', - cls.default_splash_url) - log_400 = crawler.settings.getbool('SPLASH_LOG_400', True) - slot_policy = crawler.settings.get('SPLASH_SLOT_POLICY', - cls.default_policy) + s = crawler.settings + splash_base_url = s.get('SPLASH_URL', cls.default_splash_url) + log_400 = s.getbool('SPLASH_LOG_400', True) + slot_policy = s.get('SPLASH_SLOT_POLICY', cls.default_policy) if slot_policy not in SlotPolicy._known: raise NotConfigured("Incorrect slot policy: %r" % slot_policy) - return cls(crawler, splash_base_url, slot_policy, log_400) + splash_user = s.get('SPLASH_USER', '') + splash_pass = s.get('SPLASH_PASS', '') + auth = None + if splash_user or splash_pass: + auth = basic_auth_header(splash_user, splash_pass) + return cls(crawler, splash_base_url, slot_policy, log_400, auth) def spider_opened(self, spider): + if _http_auth_enabled(spider): + replace_downloader_middleware(self.crawler, RobotsTxtMiddleware, + SafeRobotsTxtMiddleware) if not hasattr(spider, 'state'): spider.state = {} @@ -260,21 +270,24 @@ def _remote_keys(self): def process_request(self, request, spider): if 'splash' not in request.meta: return + splash_options = request.meta['splash'] if request.method not in {'GET', 'POST'}: - logger.warning( + logger.error( "Currently only GET and POST requests are supported by " - "SplashMiddleware; %(request)s will be handled without Splash", + "SplashMiddleware; %(request)s is dropped", {'request': request}, extra={'spider': spider} ) - return request + self.crawler.stats.inc_value('splash/dropped/method/{}'.format( + request.method)) + raise IgnoreRequest("SplashRequest doesn't support " + "HTTP {} method".format(request.method)) if 
request.meta.get("_splash_processed"): # don't process the same request more than once return - splash_options = request.meta['splash'] request.meta['_splash_processed'] = True slot_policy = splash_options.get('slot_policy', self.slot_policy) @@ -319,6 +332,10 @@ def process_request(self, request, spider): if not splash_options.get('dont_send_headers'): headers = scrapy_headers_to_unicode_dict(request.headers) if headers: + # Headers set by HttpAuthMiddleware should be used for Splash, + # not for the remote website (backwards compatibility). + if _http_auth_enabled(spider): + headers.pop('Authorization', None) args.setdefault('headers', headers) body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4) @@ -353,6 +370,8 @@ def process_request(self, request, spider): splash_url = urljoin(splash_base_url, endpoint) headers = Headers({'Content-Type': 'application/json'}) + if self.auth is not None: + headers['Authorization'] = self.auth headers.update(splash_options.get('splash_headers', {})) new_request = request.replace( url=splash_url, @@ -361,6 +380,7 @@ def process_request(self, request, spider): headers=headers, priority=request.priority + self.rescheduling_priority_adjust ) + new_request.meta['dont_obey_robotstxt'] = True self.crawler.stats.inc_value('splash/%s/request_count' % endpoint) return new_request @@ -478,3 +498,39 @@ def _get_slot_key(self, request_or_response): return self.crawler.engine.downloader._get_slot_key( request_or_response, None ) + + +class SafeRobotsTxtMiddleware(RobotsTxtMiddleware): + def process_request(self, request, spider): + # disable robots.txt for Splash requests + if _http_auth_enabled(spider) and 'splash' in request.meta: + return + return super(SafeRobotsTxtMiddleware, self).process_request( + request, spider) + + +def _http_auth_enabled(spider): + # FIXME: this function should always return False if HttpAuthMiddleware is + # not in a middleware list. 
+ return getattr(spider, 'http_user', '') or getattr(spider, 'http_pass', '') + + +def replace_downloader_middleware(crawler, old_cls, new_cls): + """ Replace downloader middleware with another one """ + try: + new_mw = new_cls.from_crawler(crawler) + except NotConfigured: + return + + mw_manager = crawler.engine.downloader.middleware + mw_manager.middlewares = tuple([ + mw if mw.__class__ is not old_cls else new_mw + for mw in mw_manager.middlewares + ]) + for method_name, callbacks in mw_manager.methods.items(): + for idx, meth in enumerate(callbacks): + method_cls = meth.__self__.__class__ + if method_cls is old_cls: + new_meth = getattr(new_mw, method_name) + # logger.debug("{} is replaced with {}".format(meth, new_meth)) + callbacks[idx] = new_meth diff --git a/tests/conftest.py b/tests/conftest.py index 95b8e5e..2c98101 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,12 @@ import os import pytest -from scrapy.settings import Settings +from .mockserver import MockServer +from .resources import SplashProtected @pytest.fixture() -def settings(request): +def settings(): """ Default scrapy-splash settings """ s = dict( # collect scraped items to .collected_items attribute @@ -28,6 +29,12 @@ def settings(request): DUPEFILTER_CLASS='scrapy_splash.SplashAwareDupeFilter', HTTPCACHE_STORAGE='scrapy_splash.SplashAwareFSCacheStorage', ) - return Settings(s) + return s +@pytest.fixture() +def settings_auth(settings): + with MockServer(SplashProtected) as s: + print("splash url:", s.root_url) + settings['SPLASH_URL'] = s.root_url + yield settings diff --git a/tests/resources.py b/tests/resources.py new file mode 100644 index 0000000..cb1d6c5 --- /dev/null +++ b/tests/resources.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- +import os +from six.moves.urllib.parse import urlparse + +from twisted.web.resource import Resource +from zope.interface import implementer +from twisted.web import resource, guard, proxy +from twisted.cred.portal import IRealm, Portal +from twisted.cred.checkers import InMemoryUsernamePasswordDatabaseDontUse + +from scrapy_splash.utils import to_bytes + + +class HtmlResource(Resource): + isLeaf = True + content_type = 'text/html' + html = '' + extra_headers = {} + status_code = 200 + + def render_GET(self, request): + request.setHeader(b'content-type', to_bytes(self.content_type)) + for name, value in self.extra_headers.items(): + request.setHeader(to_bytes(name), to_bytes(value)) + request.setResponseCode(self.status_code) + return to_bytes(self.html) + + +class HelloWorld(HtmlResource): + html = """ +
+ """ + extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'} + + +class HelloWorldDisallowByRobots(HelloWorld): + """ Disallow itself via robots.txt """ + isLeaf = False + + def getChild(self, name, request): + if name == b"robots.txt": + return self.RobotsTxt() + return self + + class RobotsTxt(Resource): + isLeaf = True + def render_GET(self, request): + return b'User-Agent: *\nDisallow: /\n' + + +class HelloWorldDisallowAuth(HelloWorldDisallowByRobots): + """ Disallow itself via robots.txt if a request to robots.txt + contains basic auth header. """ + class RobotsTxt(HelloWorldDisallowByRobots.RobotsTxt): + def render_GET(self, request): + if request.requestHeaders.hasHeader('Authorization'): + return super(HelloWorldDisallowAuth.RobotsTxt, self).render_GET(request) + request.setResponseCode(404) + return b'' + + +class Http400Resource(HtmlResource): + status_code = 400 + html = "Website returns HTTP 400 error" + + +class ManyCookies(Resource, object): + class SetMyCookie(HtmlResource): + html = "hello!" + extra_headers = {'Set-Cookie': 'login=1'} + + def __init__(self): + super(ManyCookies, self).__init__() + self.putChild(b'', HelloWorld()) + self.putChild(b'login', self.SetMyCookie()) + + +def splash_proxy(): + splash_url = os.environ.get('SPLASH_URL') + p = urlparse(splash_url) + return lambda: proxy.ReverseProxyResource(p.hostname, int(p.port), b'') + + +def password_protected(resource_cls, username, password): + # Sorry, but this is nuts. A zillion of classes, arbitrary + # unicode / bytes requirements at random places. Is there a simpler + # way to get HTTP Basic Auth working in Twisted? + @implementer(IRealm) + class SimpleRealm(object): + def requestAvatar(self, avatarId, mind, *interfaces): + if resource.IResource in interfaces: + return resource.IResource, resource_cls(), lambda: None + raise NotImplementedError() + + creds = {username: password} + checkers = [InMemoryUsernamePasswordDatabaseDontUse(**creds)] + return lambda: guard.HTTPAuthSessionWrapper( + Portal(SimpleRealm(), checkers), + [guard.BasicCredentialFactory(b'example.com')]) + + +HelloWorldProtected = password_protected(HelloWorld, 'user', b'userpass') +HelloWorldProtected.__name__ = 'HelloWorldProtected' +HelloWorldProtected.__module__ = __name__ + +SplashProtected = password_protected(splash_proxy(), 'user', b'userpass') +SplashProtected.__name__ = 'SplashProtected' +SplashProtected.__module__ = __name__ diff --git a/tests/test_integration.py b/tests/test_integration.py index 60ef846..2a8c3b7 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,11 +1,22 @@ # -*- coding: utf-8 -*- +import pytest import scrapy +from pkg_resources import parse_version from pytest_twisted import inlineCallbacks -from twisted.web.resource import Resource from w3lib.url import canonicalize_url +from w3lib.http import basic_auth_header from scrapy_splash import SplashRequest -from .utils import crawl_items, requires_splash, HtmlResource +from .utils import crawl_items, requires_splash +from .resources import ( + HelloWorld, + Http400Resource, + ManyCookies, + HelloWorldProtected, + HelloWorldDisallowByRobots, + HelloWorldDisallowAuth, +) + DEFAULT_SCRIPT = """ function main(splash) @@ -16,7 +27,10 @@ http_method=splash.args.http_method, body=splash.args.body, } - local wait = tonumber(splash.args.wait or 0.5) + local wait = 0.01 + if splash.args.wait ~= nil then + wait = splash.args.wait + end assert(splash:wait(wait)) local entries = splash:history() @@ -34,40 +48,52 @@ """ -class 
HelloWorld(HtmlResource): - html = """ - - """ - extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'} +class ResponseSpider(scrapy.Spider): + """ Make a request to URL, return Scrapy response """ + custom_settings = { + 'HTTPERROR_ALLOW_ALL': True, + 'ROBOTSTXT_OBEY': True, + } + url = None + def start_requests(self): + yield SplashRequest(self.url) -class Http400Resource(HtmlResource): - status_code = 400 - html = "Website returns HTTP 400 error" + def parse(self, response): + yield {'response': response} +class LuaSpider(ResponseSpider): + """ Make a request to URL using default Lua script """ + headers = None + splash_headers = None -class ManyCookies(Resource, object): - class SetMyCookie(HtmlResource): - html = "hello!" - extra_headers = {'Set-Cookie': 'login=1'} + def start_requests(self): + yield SplashRequest(self.url, + endpoint='execute', + args={'lua_source': DEFAULT_SCRIPT}, + headers=self.headers, + splash_headers=self.splash_headers) - def __init__(self): - super(ManyCookies, self).__init__() - self.putChild(b'', HelloWorld()) - self.putChild(b'login', self.SetMyCookie()) +class ScrapyAuthSpider(LuaSpider): + """ Spider with incorrect (old, insecure) auth method """ + http_user = 'user' + http_pass = 'userpass' -class ResponseSpider(scrapy.Spider): - """ Make a request to URL, return Scrapy response """ - url = None +class NonSplashSpider(ResponseSpider): + """ Spider which uses HTTP auth and doesn't use Splash """ + http_user = 'user' + http_pass = 'userpass' def start_requests(self): - yield SplashRequest(self.url) + yield scrapy.Request(self.url) - def parse(self, response): - yield {'response': response} + +def assert_single_response(items): + assert len(items) == 1 + return items[0]['response'] @requires_splash @@ -75,8 +101,7 @@ def parse(self, response): def test_basic(settings): items, url, crawler = yield crawl_items(ResponseSpider, HelloWorld, settings) - assert len(items) == 1 - resp = items[0]['response'] + resp = assert_single_response(items) assert resp.url == url assert resp.css('body::text').extract_first().strip() == "hello world!" @@ -124,8 +149,7 @@ def start_requests(self): items, url, crawler = yield crawl_items(LuaScriptSpider, HelloWorld, settings) - assert len(items) == 1 - resp = items[0]['response'] + resp = assert_single_response(items) assert resp.url == url + "/#foo" assert resp.status == resp.splash_response_status == 200 assert resp.css('body::text').extract_first().strip() == "hello world!" 
@@ -140,29 +164,19 @@ def start_requests(self): @inlineCallbacks def test_bad_request(settings): class BadRequestSpider(ResponseSpider): - custom_settings = {'HTTPERROR_ALLOW_ALL': True} - def start_requests(self): yield SplashRequest(self.url, endpoint='execute', args={'lua_source': DEFAULT_SCRIPT, 'wait': 'bar'}) - class GoodRequestSpider(ResponseSpider): - custom_settings = {'HTTPERROR_ALLOW_ALL': True} - - def start_requests(self): - yield SplashRequest(self.url, endpoint='execute', - args={'lua_source': DEFAULT_SCRIPT}) - - items, url, crawler = yield crawl_items(BadRequestSpider, HelloWorld, settings) - resp = items[0]['response'] + resp = assert_single_response(items) assert resp.status == 400 assert resp.splash_response_status == 400 - items, url, crawler = yield crawl_items(GoodRequestSpider, Http400Resource, + items, url, crawler = yield crawl_items(LuaSpider, Http400Resource, settings) - resp = items[0]['response'] + resp = assert_single_response(items) assert resp.status == 400 assert resp.splash_response_status == 200 @@ -306,3 +320,218 @@ def _cookie_dict(har_cookies): 'bomb': BOMB, } assert splash_request_headers.get(b'Cookie') is None + + +@requires_splash +@inlineCallbacks +def test_access_http_auth(settings): + # website is protected + items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, + settings) + response = assert_single_response(items) + assert response.status == 401 + assert response.splash_response_status == 200 + + # header can be used to access it + AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')} + kwargs = {'headers': AUTH_HEADERS} + items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, + settings, kwargs) + response = assert_single_response(items) + assert 'hello' in response.body_as_unicode() + assert response.status == 200 + assert response.splash_response_status == 200 + + +@requires_splash +@inlineCallbacks +def test_protected_splash_no_auth(settings_auth): + items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, + settings_auth) + response = assert_single_response(items) + assert 'Unauthorized' in response.body_as_unicode() + assert 'hello' not in response.body_as_unicode() + assert response.status == 401 + assert response.splash_response_status == 401 + + +@requires_splash +@inlineCallbacks +def test_protected_splash_manual_headers_auth(settings_auth): + AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')} + kwargs = {'splash_headers': AUTH_HEADERS} + + # auth via splash_headers should work + items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, + settings_auth, kwargs) + response = assert_single_response(items) + assert 'hello' in response.body_as_unicode() + assert response.status == 200 + assert response.splash_response_status == 200 + + # but only for Splash, not for a remote website + items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, + settings_auth, kwargs) + response = assert_single_response(items) + assert 'hello' not in response.body_as_unicode() + assert response.status == 401 + assert response.splash_response_status == 200 + + +@requires_splash +@inlineCallbacks +def test_protected_splash_settings_auth(settings_auth): + settings_auth['SPLASH_USER'] = 'user' + settings_auth['SPLASH_PASS'] = 'userpass' + + # settings works + items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, + settings_auth) + response = assert_single_response(items) + assert 'Unauthorized' not in response.body_as_unicode() + assert 'hello' in 
response.body_as_unicode() + assert response.status == 200 + assert response.splash_response_status == 200 + + # they can be overridden via splash_headers + bad_auth = {'splash_headers': {'Authorization': 'foo'}} + items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, + settings_auth, bad_auth) + response = assert_single_response(items) + assert response.status == 401 + assert response.splash_response_status == 401 + + # auth error on remote website + items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, + settings_auth) + response = assert_single_response(items) + assert response.status == 401 + assert response.splash_response_status == 200 + + # auth both for Splash and for the remote website + REMOTE_AUTH = {'Authorization': basic_auth_header('user', 'userpass')} + remote_auth_kwargs = {'headers': REMOTE_AUTH} + items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, + settings_auth, remote_auth_kwargs) + response = assert_single_response(items) + assert response.status == 200 + assert response.splash_response_status == 200 + assert 'hello' in response.body_as_unicode() + + # enable remote auth, but not splash auth - request should fail + del settings_auth['SPLASH_USER'] + del settings_auth['SPLASH_PASS'] + items, url, crawler = yield crawl_items(LuaSpider, + HelloWorldProtected, + settings_auth, remote_auth_kwargs) + response = assert_single_response(items) + assert response.status == 401 + assert response.splash_response_status == 401 + + +@requires_splash +@inlineCallbacks +def test_protected_splash_httpauth_middleware(settings_auth): + # httpauth middleware should enable auth for Splash, for backwards + # compatibility reasons + items, url, crawler = yield crawl_items(ScrapyAuthSpider, HelloWorld, + settings_auth) + response = assert_single_response(items) + assert 'Unauthorized' not in response.body_as_unicode() + assert 'hello' in response.body_as_unicode() + assert response.status == 200 + assert response.splash_response_status == 200 + + # but not for a remote website + items, url, crawler = yield crawl_items(ScrapyAuthSpider, + HelloWorldProtected, + settings_auth) + response = assert_single_response(items) + assert 'hello' not in response.body_as_unicode() + assert response.status == 401 + assert response.splash_response_status == 200 + + # headers shouldn't be sent to robots.txt file + items, url, crawler = yield crawl_items(ScrapyAuthSpider, + HelloWorldDisallowAuth, + settings_auth) + response = assert_single_response(items) + assert 'hello' in response.body_as_unicode() + assert response.status == 200 + assert response.splash_response_status == 200 + + # httpauth shouldn't be disabled for non-Splash requests + items, url, crawler = yield crawl_items(NonSplashSpider, + HelloWorldProtected, + settings_auth) + response = assert_single_response(items) + assert 'hello' in response.body_as_unicode() + assert response.status == 200 + assert not hasattr(response, 'splash_response_status') + + +@pytest.mark.xfail( + parse_version(scrapy.__version__) < parse_version("1.1"), + reason="https://github.com/scrapy/scrapy/issues/1471", + strict=True, + run=True, +) +@requires_splash +@inlineCallbacks +def test_robotstxt_can_work(settings_auth): + + def assert_robots_disabled(items): + response = assert_single_response(items) + assert response.status == response.splash_response_status == 200 + assert b'hello' in response.body + + def assert_robots_enabled(items, crawler): + assert len(items) == 0 + assert 
crawler.stats.get_value('downloader/exception_type_count/scrapy.exceptions.IgnoreRequest') == 1 + + def _crawl_items(spider, resource): + return crawl_items( + spider, + resource, + settings_auth, + url_path='/', # https://github.com/scrapy/protego/issues/17 + ) + + # when old auth method is used, robots.txt should be disabled + items, url, crawler = yield _crawl_items(ScrapyAuthSpider, + HelloWorldDisallowByRobots) + assert_robots_disabled(items) + + # but robots.txt should still work for non-Splash requests + items, url, crawler = yield _crawl_items(NonSplashSpider, + HelloWorldDisallowByRobots) + assert_robots_enabled(items, crawler) + + # robots.txt should work when a proper auth method is used + settings_auth['SPLASH_USER'] = 'user' + settings_auth['SPLASH_PASS'] = 'userpass' + items, url, crawler = yield _crawl_items(LuaSpider, + HelloWorldDisallowByRobots) + assert_robots_enabled(items, crawler) + + # disable robotstxt middleware - robots middleware shouldn't work + class DontObeyRobotsSpider(LuaSpider): + custom_settings = { + 'HTTPERROR_ALLOW_ALL': True, + 'ROBOTSTXT_OBEY': False, + } + items, url, crawler = yield _crawl_items(DontObeyRobotsSpider, + HelloWorldDisallowByRobots) + assert_robots_disabled(items) + + # disable robotstxt middleware via request meta + class MetaDontObeyRobotsSpider(ResponseSpider): + def start_requests(self): + yield SplashRequest(self.url, + endpoint='execute', + meta={'dont_obey_robotstxt': True}, + args={'lua_source': DEFAULT_SCRIPT}) + + items, url, crawler = yield _crawl_items(MetaDontObeyRobotsSpider, + HelloWorldDisallowByRobots) + assert_robots_disabled(items) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 66b79ce..d3a3c49 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -32,8 +32,8 @@ def _get_crawler(settings_dict): return crawler -def _get_mw(): - crawler = _get_crawler({}) +def _get_mw(settings_dict=None): + crawler = _get_crawler(settings_dict or {}) return SplashMiddleware.from_crawler(crawler) @@ -70,6 +70,7 @@ def test_splash_request(): # check request preprocessing req2 = cookie_mw.process_request(req, None) or req req2 = mw.process_request(req2, None) or req2 + assert req2 is not None assert req2 is not req assert req2.url == "http://127.0.0.1:8050/render.html" @@ -139,7 +140,9 @@ def cb(): headers={'X-My-Header': 'value'} ) req2 = cookie_mw.process_request(req, None) or req - req2 = mw.process_request(req2, None) + req2 = mw.process_request(req2, None) or req2 + + assert req2.meta['ajax_crawlable'] is True assert req2.meta['splash'] == { 'endpoint': 'execute', 'splash_url': "http://mysplash.example.com", @@ -348,7 +351,7 @@ def test_magic_response2(): mw = _get_mw() req = SplashRequest('http://example.com/', magic_response=True, headers={'foo': 'bar'}, dont_send_headers=True) - req = mw.process_request(req, None) + req = mw.process_request(req, None) or req assert 'headers' not in req.meta['splash']['args'] resp_data = { @@ -372,7 +375,7 @@ def test_unicode_url(): req = SplashRequest( # note unicode URL u"http://example.com/", endpoint='execute') - req2 = mw.process_request(req, None) + req2 = mw.process_request(req, None) or req res = {'html': 'Hello'} res_body = json.dumps(res) response = TextResponse("http://mysplash.example.com/execute", @@ -387,7 +390,7 @@ def test_unicode_url(): def test_magic_response_http_error(): mw = _get_mw() req = SplashRequest('http://example.com/foo') - req = mw.process_request(req, None) + req = mw.process_request(req, None) or req resp_data = { "info": { 
@@ -414,7 +417,7 @@ def test_magic_response_http_error(): def test_change_response_class_to_text(): mw = _get_mw() req = SplashRequest('http://example.com/', magic_response=True) - req = mw.process_request(req, None) + req = mw.process_request(req, None) or req # Such response can come when downloading a file, # or returning splash:html(): the headers say it's binary, # but it can be decoded so it becomes a TextResponse. @@ -437,7 +440,7 @@ def test_change_response_class_to_json_binary(): # but this is ok because magic_response presumes we are expecting # a valid splash json response. req = SplashRequest('http://example.com/', magic_response=False) - req = mw.process_request(req, None) + req = mw.process_request(req, None) or req resp = Response('http://mysplash.example.com/execute', headers={b'Content-Type': b'application/json'}, body=b'non-decodable data: \x98\x11\xe7\x17\x8f', @@ -474,7 +477,7 @@ def _get_req(): # first call req = _get_req() req = cookie_mw.process_request(req, spider) or req - req = mw.process_request(req, spider) + req = mw.process_request(req, spider) or req req = cache_mw.process_request(req, spider) or req assert isinstance(req, scrapy.Request) # first call; the cache is empty @@ -498,7 +501,7 @@ def _get_req(): # second call req = _get_req() req = cookie_mw.process_request(req, spider) or req - req = mw.process_request(req, spider) + req = mw.process_request(req, spider) or req cached_resp = cache_mw.process_request(req, spider) or req # response should be from cache: @@ -666,6 +669,7 @@ def test_override_splash_url(): } }) req = mw.process_request(req1, None) + req = mw.process_request(req, None) or req assert req.url == 'http://splash.example.com/render.png' assert json.loads(to_native_str(req.body)) == {'url': req1.url} @@ -677,6 +681,7 @@ def test_url_with_fragment(): 'splash': {'args': {'url': url}} }) req = mw.process_request(req, None) + req = mw.process_request(req, None) or req assert json.loads(to_native_str(req.body)) == {'url': url} @@ -685,6 +690,7 @@ def test_splash_request_url_with_fragment(): url = "http://example.com#id1" req = SplashRequest(url) req = mw.process_request(req, None) + req = mw.process_request(req, None) or req assert json.loads(to_native_str(req.body)) == {'url': url} @@ -740,7 +746,7 @@ def test_slot_policy_per_domain(): def test_slot_policy_scrapy_default(): mw = _get_mw() - req = scrapy.Request("http://example.com", meta = {'splash': { + req = scrapy.Request("http://example.com", meta={'splash': { 'slot_policy': scrapy_splash.SlotPolicy.SCRAPY_DEFAULT }}) req = mw.process_request(req, None) @@ -749,7 +755,7 @@ def test_slot_policy_scrapy_default(): def test_adjust_timeout(): mw = _get_mw() - req1 = scrapy.Request("http://example.com", meta = { + req1 = scrapy.Request("http://example.com", meta={ 'splash': {'args': {'timeout': 60, 'html': 1}}, # download_timeout is always present, @@ -759,9 +765,32 @@ def test_adjust_timeout(): req1 = mw.process_request(req1, None) assert req1.meta['download_timeout'] > 60 - req2 = scrapy.Request("http://example.com", meta = { + req2 = scrapy.Request("http://example.com", meta={ 'splash': {'args': {'html': 1}}, 'download_timeout': 30, }) req2 = mw.process_request(req2, None) assert req2.meta['download_timeout'] == 30 + + +def test_auth(): + def assert_auth_header(user, pwd, header): + mw = _get_mw({'SPLASH_USER': user, 'SPLASH_PASS': pwd}) + req = mw.process_request(SplashRequest("http://example.com"), None) + assert 'Authorization' in req.headers + assert req.headers['Authorization'] == header + 
+ def assert_no_auth_header(user, pwd): + if user is not None or pwd is not None: + mw = _get_mw({'SPLASH_USER': user, 'SPLASH_PASS': pwd}) + else: + mw = _get_mw() + req = mw.process_request(SplashRequest("http://example.com"), None) + assert 'Authorization' not in req.headers + + assert_auth_header('root', '', b'Basic cm9vdDo=') + assert_auth_header('root', 'pwd', b'Basic cm9vdDpwd2Q=') + assert_auth_header('', 'pwd', b'Basic OnB3ZA==') + + assert_no_auth_header('', '') + assert_no_auth_header(None, None) \ No newline at end of file diff --git a/tests/utils.py b/tests/utils.py index dfaa9d8..efd3dc8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -3,11 +3,9 @@ import pytest from pytest_twisted import inlineCallbacks from twisted.internet.defer import returnValue -from twisted.web.resource import Resource from scrapy.crawler import Crawler -from scrapy_splash.utils import to_bytes -from tests.mockserver import MockServer +from .mockserver import MockServer requires_splash = pytest.mark.skipif( @@ -16,23 +14,14 @@ ) -class HtmlResource(Resource): - isLeaf = True - content_type = 'text/html' - html = '' - extra_headers = {} - status_code = 200 - - def render_GET(self, request): - request.setHeader(b'content-type', to_bytes(self.content_type)) - for name, value in self.extra_headers.items(): - request.setHeader(to_bytes(name), to_bytes(value)) - request.setResponseCode(self.status_code) - return to_bytes(self.html) - - @inlineCallbacks -def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None): +def crawl_items( + spider_cls, + resource_cls, + settings, + spider_kwargs=None, + url_path="", +): """ Use spider_cls to crawl resource_cls. URL of the resource is passed to the spider as ``url`` argument. Return ``(items, resource_url, crawler)`` tuple. @@ -40,9 +29,11 @@ def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None): spider_kwargs = {} if spider_kwargs is None else spider_kwargs crawler = make_crawler(spider_cls, settings) with MockServer(resource_cls) as s: - root_url = s.root_url + print("mock server", s.root_url) + root_url = s.root_url + url_path yield crawler.crawl(url=root_url, **spider_kwargs) - result = crawler.spider.collected_items, s.root_url, crawler + items = getattr(crawler.spider, 'collected_items', []) + result = items, root_url, crawler returnValue(result)
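For reference, a minimal sketch of how the new authentication options described above fit together; the spider name, URLs and credentials are placeholders, and ``splash_headers`` overrides the settings-based credentials per request (the behaviour exercised in ``test_protected_splash_settings_auth``)::

    import scrapy
    from w3lib.http import basic_auth_header
    from scrapy_splash import SplashRequest

    class AuthedSplashSpider(scrapy.Spider):
        # Placeholder spider: name, URLs and credentials are illustrative.
        name = 'authed_splash'
        custom_settings = {
            # Credentials for the Splash HTTP API itself; they are sent only
            # to Splash, never to the crawled websites.
            'SPLASH_USER': 'user',
            'SPLASH_PASS': 'userpass',
            # Safe to keep enabled: robots.txt requests don't carry Splash
            # credentials when SPLASH_USER / SPLASH_PASS are used.
            'ROBOTSTXT_OBEY': True,
        }

        def start_requests(self):
            # Default path: SplashMiddleware adds the Authorization header
            # for Splash from the SPLASH_USER / SPLASH_PASS settings.
            yield SplashRequest('http://example.com', self.parse)

            # Per-request override: splash_headers takes precedence over the
            # settings-based credentials for this request only.
            auth = basic_auth_header('user', 'userpass')
            yield SplashRequest('http://example.com/other', self.parse,
                                splash_headers={'Authorization': auth})

        def parse(self, response):
            yield {'url': response.url, 'status': response.status}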