Skip to content

Commit 51e3216

Browse files
committed
0010a1
1 parent 7c7c0ba commit 51e3216

File tree

4 files changed

+32
-14
lines changed

4 files changed

+32
-14
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Gerapy Pyppeteer Changelog
22

3+
## 0.0.10 (2020-08-01)
4+
5+
### Features
6+
7+
* Change the priority of `request.meta.get('proxy')` and `pyppeteer_meta.get('proxy')`: the `pyppeteer_meta` proxy now overrides the `request.meta` proxy whenever it is not `None`
8+
* Add `pretend` attribute for `PyppeteerRequest`, which can override `GERAPY_PYPPETEER_PRETEND`
9+
310
## 0.0.9 (2020-07-31)
411

512
### Features

gerapy_pyppeteer/downloadermiddlewares.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -177,18 +177,23 @@ async def _process_request(self, request, spider):
177177
logger.debug('pyppeteer_meta %s', pyppeteer_meta)
178178

179179
# set proxy
180-
proxy = pyppeteer_meta.get('proxy')
181-
if not proxy:
182-
proxy = request.meta.get('proxy')
183-
if proxy: options['args'].append(f'--proxy-server={proxy}')
180+
_proxy = request.meta.get('proxy')
181+
if pyppeteer_meta.get('proxy') is not None:
182+
_proxy = pyppeteer_meta.get('proxy')
183+
if _proxy:
184+
options['args'].append(f'--proxy-server={_proxy}')
184185

185186
logger.debug('set options %s', options)
186187

187188
browser = await launch(options)
188189
page = await browser.newPage()
189190
await page.setViewport({'width': self.window_width, 'height': self.window_height})
190191

191-
if self.pretend:
192+
# pretend to be a normal browser
193+
_pretend = self.pretend
194+
if pyppeteer_meta.get('pretend') is not None:
195+
_pretend = pyppeteer_meta.get('pretend')
196+
if _pretend:
192197
for script in PRETEND_SCRIPTS:
193198
await page.evaluateOnNewDocument(script)
194199

@@ -246,6 +251,7 @@ async def _handle_interception(pu_request):
246251
await browser.close()
247252
return self._retry(request, 504, spider)
248253

254+
# wait for the DOM to be loaded
249255
if pyppeteer_meta.get('wait_for'):
250256
_wait_for = pyppeteer_meta.get('wait_for')
251257
try:

gerapy_pyppeteer/request.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ class PyppeteerRequest(Request):
77
Scrapy ``Request`` subclass providing additional arguments
88
"""
99

10-
def __init__(self, url, callback=None, wait_until=None, wait_for=None, script=None, sleep=None, timeout=None,
11-
proxy=None, ignore_resource_types=None, meta=None, screenshot=None, *args,
10+
def __init__(self, url, callback=None, wait_until=None, wait_for=None, script=None, proxy=None,
11+
sleep=None, timeout=None, ignore_resource_types=None, pretend=None, screenshot=None, meta=None, *args,
1212
**kwargs):
1313
"""
1414
:param url: request url
@@ -17,12 +17,14 @@ def __init__(self, url, callback=None, wait_until=None, wait_for=None, script=No
1717
see https://miyakogi.github.io/pyppeteer/reference.html#pyppeteer.page.Page.goto
1818
:param wait_for: wait for some element to load
1919
:param script: script to execute
20-
:param sleep: time to sleep after loaded
21-
:param timeout: load timeout
22-
:param proxy: use proxy to request
23-
:param ignore_resource_types: ignored resource types
20+
:param proxy: proxy to use for this request, e.g. `http://x.x.x.x:x`
21+
:param sleep: time to sleep after loaded, override `GERAPY_PYPPETEER_SLEEP`
22+
:param timeout: load timeout, override `GERAPY_PYPPETEER_DOWNLOAD_TIMEOUT`
23+
:param ignore_resource_types: ignored resource types, override `GERAPY_PYPPETEER_IGNORE_RESOURCE_TYPES`
24+
:param pretend: pretend to be a normal browser, override `GERAPY_PYPPETEER_PRETEND`
2425
:param screenshot: screenshot options, see
25-
https://miyakogi.github.io/pyppeteer/_modules/pyppeteer/page.html#Page.screenshot
26+
https://miyakogi.github.io/pyppeteer/_modules/pyppeteer/page.html#Page.screenshot,
27+
override `GERAPY_PYPPETEER_SCREENSHOT`
2628
:param args:
2729
:param kwargs:
2830
"""
@@ -36,16 +38,20 @@ def __init__(self, url, callback=None, wait_until=None, wait_for=None, script=No
3638
self.script = pyppeteer_mata.get('script') if pyppeteer_mata.get('script') is not None else script
3739
self.sleep = pyppeteer_mata.get('sleep') if pyppeteer_mata.get('sleep') is not None else sleep
3840
self.proxy = pyppeteer_mata.get('proxy') if pyppeteer_mata.get('proxy') is not None else proxy
41+
self.pretend = pyppeteer_mata.get('pretend') if pyppeteer_mata.get('pretend') is not None else pretend
3942
self.timeout = pyppeteer_mata.get('timeout') if pyppeteer_mata.get('timeout') is not None else timeout
4043
self.ignore_resource_types = pyppeteer_mata.get('ignore_resource_types') if pyppeteer_mata.get(
4144
'ignore_resource_types') is not None else ignore_resource_types
42-
self.screenshot = pyppeteer_mata.get('screenshot') if pyppeteer_mata.get('screenshot') is not None else screenshot
45+
self.screenshot = pyppeteer_mata.get('screenshot') if pyppeteer_mata.get(
46+
'screenshot') is not None else screenshot
47+
4348
pyppeteer_mata = meta.setdefault('pyppeteer', {})
4449
pyppeteer_mata['wait_until'] = self.wait_until
4550
pyppeteer_mata['wait_for'] = self.wait_for
4651
pyppeteer_mata['script'] = self.script
4752
pyppeteer_mata['sleep'] = self.sleep
4853
pyppeteer_mata['proxy'] = self.proxy
54+
pyppeteer_mata['pretend'] = self.pretend
4955
pyppeteer_mata['timeout'] = self.timeout
5056
pyppeteer_mata['screenshot'] = self.screenshot
5157
pyppeteer_mata['ignore_resource_types'] = self.ignore_resource_types

gerapy_pyppeteer/settings.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,5 @@
3636
# ``texttrack``, ``xhr``, ``fetch``, ``eventsource``, ``websocket``,
3737
# ``manifest``, ``other``.
3838
GERAPY_PYPPETEER_IGNORE_RESOURCE_TYPES = []
39-
4039
GERAPY_PYPPETEER_SCREENSHOT = None
4140
GERAPY_PYPPETEER_SLEEP = 1

0 commit comments

Comments
 (0)