
Commit 8962c9f

update aiohttp
1 parent 23d9c06 commit 8962c9f

6 files changed: +54 additions, −33 deletions


example/example/settings.py

Lines changed: 27 additions & 26 deletions

@@ -14,79 +14,80 @@
 SPIDER_MODULES = ['example.spiders']
 NEWSPIDER_MODULE = 'example.spiders'
 
-
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'example (+http://www.yourdomain.com)'
+# USER_AGENT = 'example (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 CONCURRENT_REQUESTS = 3
+DOWNLOAD_TIMEOUT = 10
+RETRY_TIMES = 10
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    'example.middlewares.ExampleSpiderMiddleware': 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
-    'gerapy_proxy.middlewares.ProxyPoolMiddleware': 543,
+    'gerapy_proxy.middlewares.ProxyPoolMiddleware': 543,
 }
 
 GERAPY_PROXY_POOL_URL = 'https://proxypool.scrape.center/random'
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# ITEM_PIPELINES = {
 #    'example.pipelines.ExamplePipeline': 300,
-#}
+# }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
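Taken together, the proxy-related lines in this example settings file boil down to the short configuration below. It is a condensed sketch of what the diff leaves in place, not new functionality; the inline comments are our reading of why the values were chosen, not text from the commit.

# Minimal proxy-pool configuration distilled from the example settings above
CONCURRENT_REQUESTS = 3    # keep concurrency low when routing through pooled proxies
DOWNLOAD_TIMEOUT = 10      # give up quickly on a dead proxy
RETRY_TIMES = 10           # allow more retries than the Scrapy default

DOWNLOADER_MIDDLEWARES = {
    'gerapy_proxy.middlewares.ProxyPoolMiddleware': 543,
}
GERAPY_PROXY_POOL_URL = 'https://proxypool.scrape.center/random'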

example/example/spiders/httpbin.py

Lines changed: 1 addition & 1 deletion

@@ -32,6 +32,6 @@ def parse(self, response):
         :return:
         """
         data = json.loads(response.text)
-        logger.info(f'request from %s successfully, current page %s',
+        logger.info(f'got request from %s successfully, current page %s',
                     data.get('origin'),
                     response.meta.get('page'))
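The log call reads data.get('origin') — the IP httpbin saw, i.e. the proxy actually used — and a page number carried in the request meta. For that meta key to exist, the spider has to set it when scheduling requests; a rough sketch of that request side follows (the URL, page range, and class skeleton are assumptions for illustration, not taken from the diff).

import scrapy

class HttpbinSpider(scrapy.Spider):
    # hypothetical skeleton; only the meta={'page': ...} convention is implied by the diff
    name = 'httpbin'

    def start_requests(self):
        for page in range(1, 11):
            # dont_filter is needed because every request targets the same URL
            yield scrapy.Request('https://httpbin.org/get', callback=self.parse,
                                 meta={'page': page}, dont_filter=True)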

gerapy_proxy/middlewares.py

Lines changed: 12 additions & 5 deletions

@@ -1,9 +1,7 @@
-import requests
 import random
 import logging
 import aiohttp
 from gerapy_proxy.settings import *
-import time
 import asyncio
 import sys
 import twisted.internet
@@ -23,6 +21,16 @@ class ProxyPoolMiddleware(object):
     using proxy pool as proxy
     """
 
+    def _extract_response(self, text):
+        """
+        extract response content
+        :param text:
+        :return:
+        """
+        settings = self.settings
+        extract_func = settings.get('GERAPY_PROXY_EXTRACT_FUNC', GERAPY_PROXY_EXTRACT_FUNC)
+        return extract_func(text)
+
     @classmethod
     def from_crawler(cls, crawler):
         """
@@ -31,6 +39,7 @@ def from_crawler(cls, crawler):
         :return:
         """
         settings = crawler.settings
+        cls.settings = settings
         # proxy pool settings
         cls.proxy_pool_url = settings.get('GERAPY_PROXY_POOL_URL', GERAPY_PROXY_POOL_URL)
         cls.proxy_pool_auth = settings.get('GERAPY_PROXY_POOL_AUTH', GERAPY_PROXY_POOL_AUTH)
@@ -41,7 +50,6 @@ def from_crawler(cls, crawler):
         cls.proxy_pool_random_enable_rate = settings.get('GERAPY_PROXY_POOL_RANDOM_ENABLE_RATE',
                                                          GERAPY_PROXY_POOL_RANDOM_ENABLE_RATE)
         cls.proxy_pool_timeout = settings.get('GERAPY_PROXY_POOL_TIMEOUT', GERAPY_PROXY_POOL_TIMEOUT)
-        cls.proxy_pool_extract_func = lambda _: settings.get('GERAPY_PROXY_EXTRACT_FUNC', GERAPY_PROXY_EXTRACT_FUNC)
         return cls()
 
     async def get_proxy(self):
@@ -50,7 +58,6 @@ async def get_proxy(self):
         :return:
         """
        logger.debug('start to get proxy from proxy pool')
-        await asyncio.sleep(10)
         kwargs = {}
         if self.proxy_pool_auth:
             kwargs['auth'] = aiohttp.BasicAuth(login=self.proxy_pool_username, password=self.proxy_pool_password)
@@ -61,7 +68,7 @@ async def get_proxy(self):
         async with aiohttp.ClientSession() as client:
             response = await client.get(self.proxy_pool_url, **kwargs)
             if response.status == 200:
-                proxy = self.proxy_pool_extract_func()(response.text)
+                proxy = self._extract_response(await response.text())
                 logger.debug('get proxy %s', proxy)
                 return proxy
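The substantive change here is that the proxy fetch now goes through aiohttp rather than requests, so the response body is a coroutine and must be awaited before it reaches the extract function, and that function is looked up from the crawler settings at call time instead of being captured in a lambda. A self-contained sketch of the new fetch flow, with the function name and defaults ours rather than the library's:

import asyncio
import aiohttp

async def fetch_proxy(pool_url, username=None, password=None,
                      extract_func=lambda text: text.strip() if text else None):
    # mirrors ProxyPoolMiddleware.get_proxy: optional basic auth, then GET the pool URL
    kwargs = {}
    if username and password:
        kwargs['auth'] = aiohttp.BasicAuth(login=username, password=password)
    async with aiohttp.ClientSession() as client:
        response = await client.get(pool_url, **kwargs)
        if response.status == 200:
            # aiohttp's response.text() is a coroutine, hence the await before extracting
            return extract_func(await response.text())

# e.g. asyncio.run(fetch_proxy('https://proxypool.scrape.center/random'))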

gerapy_proxy/settings.py

Lines changed: 3 additions & 1 deletion

@@ -1,11 +1,13 @@
+from gerapy_proxy.utils import strip_response
+
 GERAPY_PROXY_POOL_URL = None
 GERAPY_PROXY_POOL_AUTH = False
 GERAPY_PROXY_POOL_USERNAME = None
 GERAPY_PROXY_POOL_PASSWORD = None
 GERAPY_PROXY_POOL_MIN_RETRY_TIMES = 0
 GERAPY_PROXY_POOL_RANDOM_ENABLE_RATE = 1
 GERAPY_PROXY_POOL_TIMEOUT = 5
-GERAPY_PROXY_EXTRACT_FUNC = lambda x: x.strip() if x else None
+GERAPY_PROXY_EXTRACT_FUNC = strip_response
 
 GERAPY_PROXY_TUNNEL_URL = None
 GERAPY_PROXY_TUNNEL_AUTH = False
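Because the middleware now resolves GERAPY_PROXY_EXTRACT_FUNC through crawler.settings.get() at request time, a project can swap in its own parser from its Scrapy settings. A hedged example for a pool that returns JSON — the response shape, key name, and helper function below are hypothetical:

# in a project's settings.py — hypothetical override for a JSON-returning pool
import json

def extract_json_proxy(text):
    # expects something like {"proxy": "127.0.0.1:8080"}; adjust to your pool's format
    if not text:
        return None
    return json.loads(text).get('proxy')

GERAPY_PROXY_EXTRACT_FUNC = extract_json_proxy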

gerapy_proxy/utils.py

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+def strip_response(data):
+    """
+    strip response data
+    :param data:
+    :return:
+    """
+    if not data:
+        return
+    if isinstance(data, str):
+        return data.strip()
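strip_response is the new default extract function: it trims whitespace from a plain-text pool response and returns None for empty or non-string input. A few illustrative calls (the proxy value is made up):

from gerapy_proxy.utils import strip_response

strip_response('  127.0.0.1:8080\n')   # -> '127.0.0.1:8080'
strip_response('')                      # -> None
strip_response(None)                    # -> None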

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -1 +1,2 @@
 scrapy>=2.0.0
+aiohttp>=3.6.2
