7 | 7 | from scrapy.http import Request |
8 | 8 | from scrapy.conf import settings |
9 | 9 | from scrapy.utils.python import to_unicode |
| 10 | +from scrapy.utils.reqser import request_to_dict, request_from_dict |
10 | 11 |
11 | 12 | import redis |
12 | 13 | import random |
@@ -81,7 +82,7 @@ def __init__(self, server, persist, update_int, timeout, retries, logger, |
81 | 82 | self.ip_update_interval = ip_refresh |
82 | 83 | self.add_type = add_type |
83 | 84 | self.add_ip = add_ip |
84 | | - self.item_retires = retries |
| 85 | + self.item_retries = retries |
85 | 86 | self.logger = logger |
86 | 87 | self.ip_regex = re.compile(ip_regex) |
87 | 88 | self.backlog_blacklist = backlog_blacklist |
@@ -391,7 +392,7 @@ def enqueue_request(self, request): |
391 | 392 | if not request.dont_filter and self.dupefilter.request_seen(request): |
392 | 393 | self.logger.debug("Request not added back to redis") |
393 | 394 | return |
394 | | - req_dict = self.request_to_dict(request) |
| 395 | + req_dict = request_to_dict(request, self.spider) |
395 | 396 |
396 | 397 | if not self.is_blacklisted(req_dict['meta']['appid'], |
397 | 398 | req_dict['meta']['crawlid']): |
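Note: the call above now delegates serialization to scrapy.utils.reqser instead of the hand-rolled request_to_dict removed in the next hunk. A minimal round-trip sketch of those two helpers, not part of the diff (the `spider` instance and its `parse` callback are assumptions made for illustration):

    from scrapy.http import Request
    from scrapy.utils.reqser import request_to_dict, request_from_dict

    # Assumes `spider` is a running spider instance with a bound `parse` method.
    req = Request('http://example.com', callback=spider.parse,
                  meta={'appid': 'demo', 'crawlid': 'abc123'})
    req_dict = request_to_dict(req, spider)         # callback stored by name: 'parse'
    restored = request_from_dict(req_dict, spider)  # name looked up again on the spider

Because request_to_dict emits the same top-level keys the old helper produced (url, method, headers, body, meta, priority, and so on), downstream reads such as req_dict['meta']['appid'] keep working unchanged.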
@@ -436,28 +437,6 @@ def enqueue_request(self, request): |
436 | 437 | .format(appid=req_dict['meta']['appid'], |
437 | 438 | id=req_dict['meta']['crawlid'])) |
438 | 439 |
439 | | - def request_to_dict(self, request): |
440 | | - ''' |
441 | | - Convert Request object to a dict. |
442 | | - modified from scrapy.utils.reqser |
443 | | - ''' |
444 | | - req_dict = { |
445 | | - # urls should be safe (safe_string_url) |
446 | | - 'url': to_unicode(request.url), |
447 | | - 'method': request.method, |
448 | | - 'headers': dict(request.headers), |
449 | | - 'body': request.body, |
450 | | - 'cookies': request.cookies, |
451 | | - 'meta': request.meta, |
452 | | - '_encoding': request._encoding, |
453 | | - 'priority': request.priority, |
454 | | - 'dont_filter': request.dont_filter, |
455 | | - # callback/errback are assumed to be a bound instance of the spider |
456 | | - 'callback': None if request.callback is None else request.callback.__name__, |
457 | | - 'errback': None if request.errback is None else request.errback.__name__, |
458 | | - } |
459 | | - return req_dict |
460 | | - |
461 | 440 | def find_item(self): |
462 | 441 | ''' |
463 | 442 | Finds an item from the throttled queues |
@@ -504,50 +483,46 @@ def next_request(self): |
504 | 483 | if item: |
505 | 484 | self.logger.debug(u"Found url to crawl {url}" \ |
506 | 485 | .format(url=item['url'])) |
507 | | - try: |
508 | | - req = Request(item['url']) |
509 | | - except ValueError: |
510 | | - # need absolute url |
511 | | - # need better url validation here |
512 | | - req = Request('http://' + item['url']) |
513 | | - |
514 | | - try: |
515 | | - if 'callback' in item and item['callback'] is not None: |
516 | | - req.callback = getattr(self.spider, item['callback']) |
517 | | - except AttributeError: |
518 | | - self.logger.warn("Unable to find callback method") |
519 | | - |
520 | | - try: |
521 | | - if 'errback' in item and item['errback'] is not None: |
522 | | - req.errback = getattr(self.spider, item['errback']) |
523 | | - except AttributeError: |
524 | | - self.logger.warn("Unable to find errback method") |
525 | | - |
526 | 486 | if 'meta' in item: |
527 | | - item = item['meta'] |
528 | | - |
529 | | - # defaults not in schema |
530 | | - if 'curdepth' not in item: |
531 | | - item['curdepth'] = 0 |
532 | | - if "retry_times" not in item: |
533 | | - item['retry_times'] = 0 |
534 | | - |
535 | | - for key in list(item.keys()): |
536 | | - req.meta[key] = item[key] |
| 487 | + # item is a serialized request |
| 488 | + req = request_from_dict(item, self.spider) |
| 489 | + else: |
| 490 | + # item is a feed from outside, parse it manually |
| 491 | + req = self.request_from_feed(item) |
537 | 492 |
538 | 493 | # extra check to add items to request |
539 | | - if 'useragent' in item and item['useragent'] is not None: |
540 | | - req.headers['User-Agent'] = item['useragent'] |
541 | | - if 'cookie' in item and item['cookie'] is not None: |
542 | | - if isinstance(item['cookie'], dict): |
543 | | - req.cookies = item['cookie'] |
544 | | - elif isinstance(item['cookie'], basestring): |
545 | | - req.cookies = self.parse_cookie(item['cookie']) |
| 494 | + if 'useragent' in req.meta and req.meta['useragent'] is not None: |
| 495 | + req.headers['User-Agent'] = req.meta['useragent'] |
546 | 496 |
547 | 497 | return req |
548 | 498 |
549 | 499 | return None |
550 | 500 |
| 501 | + def request_from_feed(self, item): |
| 502 | + try: |
| 503 | + req = Request(item['url']) |
| 504 | + except ValueError: |
| 505 | + # need absolute url |
| 506 | + # need better url validation here |
| 507 | + req = Request('http://' + item['url']) |
| 508 | + |
| 509 | + # defaults not in schema |
| 510 | + if 'curdepth' not in item: |
| 511 | + item['curdepth'] = 0 |
| 512 | + if "retry_times" not in item: |
| 513 | + item['retry_times'] = 0 |
| 514 | + |
| 515 | + for key in list(item.keys()): |
| 516 | + req.meta[key] = item[key] |
| 517 | + |
| 518 | + # extra check to add items to request |
| 519 | + if 'cookie' in item and item['cookie'] is not None: |
| 520 | + if isinstance(item['cookie'], dict): |
| 521 | + req.cookies = item['cookie'] |
| 522 | + elif isinstance(item['cookie'], basestring): |
| 523 | + req.cookies = self.parse_cookie(item['cookie']) |
| 524 | + return req |
| 525 | + |
551 | 526 | def parse_cookie(self, string): |
552 | 527 | ''' |
553 | 528 | Parses a cookie string like returned in a Set-Cookie header |
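Note: the new request_from_feed path handles items fed in from outside the cluster rather than serialized requests (no 'meta' key), as the hunk above shows. A rough usage sketch with invented field values, where `scheduler` stands in for an instance of this class:

    # Hypothetical feed item pushed into Redis by an external producer.
    feed_item = {
        'url': 'example.com/page',          # no scheme, so the helper prepends 'http://'
        'appid': 'demo',
        'crawlid': 'abc123',
        'cookie': 'session=1; theme=dark',  # string form, parsed via parse_cookie()
    }
    req = scheduler.request_from_feed(feed_item)
    # req.meta now carries every feed key plus the curdepth/retry_times defaults of 0.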