|
1 | 1 | import sys
|
2 | 2 | import asyncio
|
3 | 3 | from io import BytesIO
|
4 |
| - |
5 | 4 | from pyppeteer.errors import PageError, TimeoutError
|
6 | 5 | from scrapy.http import HtmlResponse
|
7 | 6 | import twisted.internet
|
|
11 | 10 | from pyppeteer import launch
|
12 | 11 | from gerapy_pyppeteer.pretend import SCRIPTS as PRETEND_SCRIPTS
|
13 | 12 | from gerapy_pyppeteer.settings import *
|
| 13 | +import urllib.parse |
| 14 | + |
| 15 | +if sys.platform == 'win32': |
| 16 | + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) |
14 | 17 |
|
15 | 18 | reactor = AsyncioSelectorReactor(asyncio.get_event_loop())
|
16 | 19 |
|
@@ -117,6 +120,8 @@ def from_crawler(cls, crawler):
|
117 | 120 | cls.screenshot = settings.get('GERAPY_PYPPETEER_SCREENSHOT', GERAPY_PYPPETEER_SCREENSHOT)
|
118 | 121 | cls.pretend = settings.get('GERAPY_PYPPETEER_PRETEND', GERAPY_PYPPETEER_PRETEND)
|
119 | 122 | cls.sleep = settings.get('GERAPY_PYPPETEER_SLEEP', GERAPY_PYPPETEER_SLEEP)
|
| 123 | + cls.enable_request_interception = settings.getbool('GERAPY_ENABLE_REQUEST_INTERCEPTION', |
| 124 | + GERAPY_ENABLE_REQUEST_INTERCEPTION) |
120 | 125 | cls.retry_enabled = settings.getbool('RETRY_ENABLED')
|
121 | 126 | cls.max_retry_times = settings.getint('RETRY_TIMES')
|
122 | 127 | cls.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
|
@@ -198,16 +203,20 @@ async def _process_request(self, request, spider):
|
198 | 203 | await page.evaluateOnNewDocument(script)
|
199 | 204 |
|
200 | 205 | # set cookies
|
| 206 | + parse_result = urllib.parse.urlsplit(request.url) |
| 207 | + domain = parse_result.hostname |
| 208 | + _cookies = [] |
201 | 209 | if isinstance(request.cookies, dict):
|
202 |
| - await page.setCookie(*[ |
203 |
| - {'name': k, 'value': v} |
204 |
| - for k, v in request.cookies.items() |
205 |
| - ]) |
| 210 | + _cookies = [{'name': k, 'value': v, 'domain': domain} |
| 211 | + for k, v in request.cookies.items()] |
206 | 212 | else:
|
207 |
| - await page.setCookie(request.cookies) |
| 213 | + for _cookie in _cookies: |
| 214 | + if isinstance(_cookie, dict) and 'domain' not in _cookie.keys(): |
| 215 | + _cookie['domain'] = domain |
| 216 | + await page.setCookie(*_cookies) |
208 | 217 |
|
209 | 218 | # the headers must be set using request interception
|
210 |
| - await page.setRequestInterception(True) |
| 219 | + await page.setRequestInterception(self.enable_request_interception) |
211 | 220 |
|
212 | 221 | @page.on('request')
|
213 | 222 | async def _handle_interception(pu_request):
|
|
0 commit comments