-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Description
I wrote a quick AsyncScraper class below:
import logging, datetime, time
import aiohttp
import asyncio
import uvloop
# asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger.addHandler(logging.StreamHandler())
class AsyncScraper(object):
headers = {"User-Agent" : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'}
def __init__(self, max_connections=1000, timeout=10):
self.max_connections = max_connections
self.timeout = timeout
async def get_response(self, url, session):
with aiohttp.Timeout(timeout=self.timeout):
async with session.get(url, allow_redirects=True, headers=AsyncScraper.headers, timeout=self.timeout) as response:
try:
content = await response.text()
return {'error': "", 'status': response.status, 'url':url, 'content': content, 'timestamp': str(datetime.datetime.utcnow())}
except Exception as err:
return {'error': err, 'status': "", 'url':url, 'content': "", 'timestamp': str(datetime.datetime.utcnow())}
finally:
response.close()
def get_all(self, urls):
loop = asyncio.get_event_loop()
with aiohttp.ClientSession(loop=loop, connector=aiohttp.TCPConnector(keepalive_timeout=10, limit=self.max_connections, verify_ssl=False)) as session:
tasks = asyncio.gather(*[self.get_response(url, session) for url in urls], return_exceptions=True)
results = loop.run_until_complete(tasks)
return results
def chunks(l, n):
for i in range(0, len(l), n):
yield l[i:i + n]
def process_urls(urls, chunk_size=1000):
scraper = AsyncScraper()
results = []
t0 = time.time()
for i, urls_chunk in enumerate(chunks(sorted(set(urls)), chunk_size)):
t1 = time.time()
result = scraper.get_all(urls_chunk)
success_size = len( [_ for _ in result if ((isinstance(_, Exception) is False) and (_['status']==200)) ] )
results.extend(result)
logger.debug("batch {} => success: {} => iteration time: {}s => total time: {}s => total processed {}".format(i+1, success_size, time.time()-t1, time.time()-t0, len(results)))
return results
and I've run into two main issues:
-
If I pass in a flat list of URLs, say 100k (via the get_all method), I get flooded with errors:
2017-04-17 15:50:53,541 - asyncio - ERROR - Fatal error on SSL transport
protocol: <asyncio.sslproto.SSLProtocol object at 0x10d5439b0>
transport: <_SelectorSocketTransport closing fd=612 read=idle write=<idle, bufsize=0>>
Traceback (most recent call last):
File "/Users/vgoklani/anaconda3/lib/python3.6/asyncio/sslproto.py", line 639, in _process_write_backlog
ssldata = self._sslpipe.shutdown(self._finalize)
File "/Users/vgoklani/anaconda3/lib/python3.6/asyncio/sslproto.py", line 151, in shutdown
raise RuntimeError('shutdown in progress')
RuntimeError: shutdown in progress -
I then batched the URLs in chunks of 1,000, and timed the response between batches. And I was clearly able to measure the performance decay over time (see below). Moreover, the number of errors increased over time... What am I doing wrong?
iteration 0 done in 16.991s
iteration 1 done in 39.376s
iteration 2 done in 35.656s
iteration 3 done in 19.716s
iteration 4 done in 29.331s
iteration 5 done in 19.708s
iteration 6 done in 19.572s
iteration 7 done in 29.907s
iteration 8 done in 23.379s
iteration 9 done in 21.762s
iteration 10 done in 22.091s
iteration 11 done in 22.940s
iteration 12 done in 31.285s
iteration 13 done in 24.549s
iteration 14 done in 26.297s
iteration 15 done in 23.816s
iteration 16 done in 29.094s
iteration 17 done in 24.885s
iteration 18 done in 26.456s
iteration 19 done in 27.412s
iteration 20 done in 29.969s
iteration 21 done in 28.503s
iteration 22 done in 28.699s
iteration 23 done in 31.570s
iteration 26 done in 31.898s
iteration 27 done in 33.553s
iteration 28 done in 34.022s
iteration 29 done in 33.866s
iteration 30 done in 36.351s
iteration 31 done in 40.060s
iteration 32 done in 35.523s
iteration 33 done in 36.607s
iteration 34 done in 36.325s
iteration 35 done in 38.425s
iteration 36 done in 39.106s
iteration 37 done in 38.972s
iteration 38 done in 39.845s
iteration 39 done in 40.393s
iteration 40 done in 40.734s
iteration 41 done in 47.799s
iteration 42 done in 43.070s
iteration 43 done in 43.365s
iteration 44 done in 42.081s
iteration 45 done in 44.118s
iteration 46 done in 44.955s
iteration 47 done in 45.400s
iteration 48 done in 45.987s
iteration 49 done in 46.041s
iteration 50 done in 45.899s
iteration 51 done in 49.008s
iteration 52 done in 49.544s
iteration 53 done in 55.432s
iteration 54 done in 52.590s
iteration 55 done in 50.185s
iteration 56 done in 52.858s
iteration 57 done in 52.698s
iteration 58 done in 53.048s
iteration 59 done in 54.120s
iteration 60 done in 54.151s
iteration 61 done in 55.465s
iteration 62 done in 56.889s
iteration 63 done in 56.967s
iteration 64 done in 57.690s
iteration 65 done in 57.052s
iteration 66 done in 67.214s
iteration 67 done in 58.457s
iteration 68 done in 60.882s
iteration 69 done in 58.440s
iteration 70 done in 60.755s
iteration 71 done in 58.043s
iteration 72 done in 65.076s
iteration 73 done in 63.371s
iteration 74 done in 62.800s
iteration 75 done in 62.419s
iteration 76 done in 61.376s
iteration 77 done in 63.164s
iteration 78 done in 65.443s
iteration 79 done in 64.616s
iteration 80 done in 69.544s
iteration 81 done in 68.226s
iteration 82 done in 78.050s
iteration 83 done in 67.871s
iteration 84 done in 69.780s
iteration 85 done in 67.812s
iteration 86 done in 68.895s
iteration 87 done in 71.086s
iteration 88 done in 68.809s
iteration 89 done in 70.945s
iteration 90 done in 72.760s
iteration 91 done in 71.773s
iteration 92 done in 72.522s
The time here corresponds to the iteration time to process 1,000 URLs. Please advise. Thanks