
Commit 8962c9f

update aiohttp
1 parent 23d9c06 commit 8962c9f

6 files changed: +54 additions, −33 deletions


example/example/settings.py

Lines changed: 27 additions & 26 deletions

@@ -14,79 +14,80 @@
 SPIDER_MODULES = ['example.spiders']
 NEWSPIDER_MODULE = 'example.spiders'
 
-
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'example (+http://www.yourdomain.com)'
+# USER_AGENT = 'example (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 CONCURRENT_REQUESTS = 3
+DOWNLOAD_TIMEOUT = 10
+RETRY_TIMES = 10
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    'example.middlewares.ExampleSpiderMiddleware': 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
-    'gerapy_proxy.middlewares.ProxyPoolMiddleware': 543,
+    'gerapy_proxy.middlewares.ProxyPoolMiddleware': 543,
 }
 
 GERAPY_PROXY_POOL_URL = 'https://proxypool.scrape.center/random'
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# ITEM_PIPELINES = {
 #    'example.pipelines.ExamplePipeline': 300,
-#}
+# }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
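Taken together, the proxy-related lines in this example settings file boil down to the short configuration below. It is a condensed sketch of what the diff leaves in place, not new functionality; the inline comments are our reading of why the values were chosen, not text from the commit.

# Minimal proxy-pool configuration distilled from the example settings above
CONCURRENT_REQUESTS = 3    # keep concurrency low when routing through pooled proxies
DOWNLOAD_TIMEOUT = 10      # give up quickly on a dead proxy
RETRY_TIMES = 10           # allow more retries than the Scrapy default

DOWNLOADER_MIDDLEWARES = {
    'gerapy_proxy.middlewares.ProxyPoolMiddleware': 543,
}
GERAPY_PROXY_POOL_URL = 'https://proxypool.scrape.center/random'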

example/example/spiders/httpbin.py

Lines changed: 1 addition & 1 deletion

@@ -32,6 +32,6 @@ def parse(self, response):
         :return:
         """
         data = json.loads(response.text)
-        logger.info(f'request from %s successfully, current page %s',
+        logger.info(f'got request from %s successfully, current page %s',
                     data.get('origin'),
                     response.meta.get('page'))
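The log call reads data.get('origin') — the IP httpbin saw, i.e. the proxy actually used — and a page number carried in the request meta. For that meta key to exist, the spider has to set it when scheduling requests; a rough sketch of that request side follows (the URL, page range, and class skeleton are assumptions for illustration, not taken from the diff).

import scrapy

class HttpbinSpider(scrapy.Spider):
    # hypothetical skeleton; only the meta={'page': ...} convention is implied by the diff
    name = 'httpbin'

    def start_requests(self):
        for page in range(1, 11):
            # dont_filter is needed because every request targets the same URL
            yield scrapy.Request('https://httpbin.org/get', callback=self.parse,
                                 meta={'page': page}, dont_filter=True)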

gerapy_proxy/middlewares.py

Lines changed: 12 additions & 5 deletions

@@ -1,9 +1,7 @@
-import requests
 import random
 import logging
 import aiohttp
 from gerapy_proxy.settings import *
-import time
 import asyncio
 import sys
 import twisted.internet
@@ -23,6 +21,16 @@ class ProxyPoolMiddleware(object):
     using proxy pool as proxy
     """
 
+    def _extract_response(self, text):
+        """
+        extract response content
+        :param text:
+        :return:
+        """
+        settings = self.settings
+        extract_func = settings.get('GERAPY_PROXY_EXTRACT_FUNC', GERAPY_PROXY_EXTRACT_FUNC)
+        return extract_func(text)
+
     @classmethod
     def from_crawler(cls, crawler):
         """
@@ -31,6 +39,7 @@ def from_crawler(cls, crawler):
         :return:
         """
         settings = crawler.settings
+        cls.settings = settings
         # proxy pool settings
         cls.proxy_pool_url = settings.get('GERAPY_PROXY_POOL_URL', GERAPY_PROXY_POOL_URL)
         cls.proxy_pool_auth = settings.get('GERAPY_PROXY_POOL_AUTH', GERAPY_PROXY_POOL_AUTH)
@@ -41,7 +50,6 @@ def from_crawler(cls, crawler):
         cls.proxy_pool_random_enable_rate = settings.get('GERAPY_PROXY_POOL_RANDOM_ENABLE_RATE',
                                                          GERAPY_PROXY_POOL_RANDOM_ENABLE_RATE)
         cls.proxy_pool_timeout = settings.get('GERAPY_PROXY_POOL_TIMEOUT', GERAPY_PROXY_POOL_TIMEOUT)
-        cls.proxy_pool_extract_func = lambda _: settings.get('GERAPY_PROXY_EXTRACT_FUNC', GERAPY_PROXY_EXTRACT_FUNC)
         return cls()
 
     async def get_proxy(self):
@@ -50,7 +58,6 @@ async def get_proxy(self):
         :return:
         """
        logger.debug('start to get proxy from proxy pool')
-        await asyncio.sleep(10)
         kwargs = {}
         if self.proxy_pool_auth:
             kwargs['auth'] = aiohttp.BasicAuth(login=self.proxy_pool_username, password=self.proxy_pool_password)
@@ -61,7 +68,7 @@ async def get_proxy(self):
         async with aiohttp.ClientSession() as client:
             response = await client.get(self.proxy_pool_url, **kwargs)
             if response.status == 200:
-                proxy = self.proxy_pool_extract_func()(response.text)
+                proxy = self._extract_response(await response.text())
                 logger.debug('get proxy %s', proxy)
                 return proxy
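The substantive change here is that the proxy fetch now goes through aiohttp rather than requests, so the response body is a coroutine and must be awaited before it reaches the extract function, and that function is looked up from the crawler settings at call time instead of being captured in a lambda. A self-contained sketch of the new fetch flow, with the function name and defaults ours rather than the library's:

import asyncio
import aiohttp

async def fetch_proxy(pool_url, username=None, password=None,
                      extract_func=lambda text: text.strip() if text else None):
    # mirrors ProxyPoolMiddleware.get_proxy: optional basic auth, then GET the pool URL
    kwargs = {}
    if username and password:
        kwargs['auth'] = aiohttp.BasicAuth(login=username, password=password)
    async with aiohttp.ClientSession() as client:
        response = await client.get(pool_url, **kwargs)
        if response.status == 200:
            # aiohttp's response.text() is a coroutine, hence the await before extracting
            return extract_func(await response.text())

# e.g. asyncio.run(fetch_proxy('https://proxypool.scrape.center/random'))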

gerapy_proxy/settings.py

Lines changed: 3 additions & 1 deletion

@@ -1,11 +1,13 @@
+from gerapy_proxy.utils import strip_response
+
 GERAPY_PROXY_POOL_URL = None
 GERAPY_PROXY_POOL_AUTH = False
 GERAPY_PROXY_POOL_USERNAME = None
 GERAPY_PROXY_POOL_PASSWORD = None
 GERAPY_PROXY_POOL_MIN_RETRY_TIMES = 0
 GERAPY_PROXY_POOL_RANDOM_ENABLE_RATE = 1
 GERAPY_PROXY_POOL_TIMEOUT = 5
-GERAPY_PROXY_EXTRACT_FUNC = lambda x: x.strip() if x else None
+GERAPY_PROXY_EXTRACT_FUNC = strip_response
 
 GERAPY_PROXY_TUNNEL_URL = None
 GERAPY_PROXY_TUNNEL_AUTH = False
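Because the middleware now resolves GERAPY_PROXY_EXTRACT_FUNC through crawler.settings.get() at request time, a project can swap in its own parser from its Scrapy settings. A hedged example for a pool that returns JSON — the response shape, key name, and helper function below are hypothetical:

# in a project's settings.py — hypothetical override for a JSON-returning pool
import json

def extract_json_proxy(text):
    # expects something like {"proxy": "127.0.0.1:8080"}; adjust to your pool's format
    if not text:
        return None
    return json.loads(text).get('proxy')

GERAPY_PROXY_EXTRACT_FUNC = extract_json_proxy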

gerapy_proxy/utils.py

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+def strip_response(data):
+    """
+    strip response data
+    :param data:
+    :return:
+    """
+    if not data:
+        return
+    if isinstance(data, str):
+        return data.strip()
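strip_response is the new default extract function: it trims whitespace from a plain-text pool response and returns None for empty or non-string input. A few illustrative calls (the proxy value is made up):

from gerapy_proxy.utils import strip_response

strip_response('  127.0.0.1:8080\n')   # -> '127.0.0.1:8080'
strip_response('')                      # -> None
strip_response(None)                    # -> None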

requirements.txt

Lines changed: 1 addition & 0 deletions

@@ -1 +1,2 @@
 scrapy>=2.0.0
+aiohttp>=3.6.2
