|  | 
|  | 1 | +# Define here the models for your spider middleware | 
|  | 2 | +# | 
|  | 3 | +# See documentation in: | 
|  | 4 | +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html | 
|  | 5 | + | 
|  | 6 | +from scrapy import signals | 
|  | 7 | + | 
|  | 8 | +# useful for handling different item types with a single interface | 
|  | 9 | +from itemadapter import ItemAdapter | 
|  | 10 | + | 
|  | 11 | + | 
class ExampleSpiderMiddleware:
    """Pass-through spider middleware skeleton.

    Every hook below forwards what it receives unchanged. Not all methods
    need to be defined: if a method is not defined, Scrapy acts as if the
    spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Used by Scrapy to create this middleware (not the spider): build
        # the instance and subscribe its spider_opened handler to the
        # spider_opened signal.
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        # Everything is re-emitted untouched, in the original order.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        # This skeleton does not handle the exception, so it returns None.
        return None

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        # Hand the name to the logger as an argument instead of formatting
        # it eagerly with "%": interpolation is deferred to the logging
        # machinery and cannot fail if the name happens to be a tuple.
        spider.logger.info("Spider opened: %s", spider.name)
|  | 54 | + | 
|  | 55 | + | 
class ExampleDownloaderMiddleware:
    """Pass-through downloader middleware skeleton.

    Every hook below leaves requests, responses and exceptions untouched.
    Not all methods need to be defined: if a method is not defined, Scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Used by Scrapy to create this middleware (not the spider): build
        # the instance and subscribe its spider_opened handler to the
        # spider_opened signal.
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        # This skeleton does not handle the exception, so it returns None.
        return None

    def spider_opened(self, spider):
        # Hand the name to the logger as an argument instead of formatting
        # it eagerly with "%": interpolation is deferred to the logging
        # machinery and cannot fail if the name happens to be a tuple.
        spider.logger.info("Spider opened: %s", spider.name)
0 commit comments