sudo apt-get install python3-dev libxml2-dev libxslt1-dev zlib1g-dev libffi-dev libssl-dev
sudo apt install python3-dev # 17.04
sudo pip install Scrapy
# 创建项目
scrapy startproject <projectname>
cd <projectname>
scrapy genspider <spidername> <url>
# 启动项目
scrapy crawl <spidername>  # 例如: scrapy crawl dmoz
# shell启动
scrapy shell "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/"
response.xpath('//title')
yield scrapy.Request(url, callback, method, headers, cookies)
class MyMiddleware(object):
    """Scrapy spider middleware that logs failed responses and retries them
    with a linear back-off, giving up after 10 attempts.

    NOTE(review): relies on module-level ``import logging``, ``import time``
    and ``import scrapy`` — not shown in this snippet; confirm they exist at
    the top of the real file.
    """

    def process_spider_exception(self, response, exception, spider):
        """Handle an exception raised while processing *response*.

        Logs the failure, bumps a retry counter stored on the spider
        (``spider.secret_key``), sleeps ``60 * retry_count`` seconds, and
        re-yields the request. After 10 retries it logs a final error and
        stops retrying.
        """
        logging.error("无法获取数据")
        logging.error("url: %s" % response.url)
        logging.error("Http状态码: %d" % response.status)
        # Track the retry count on the spider instance itself.
        if not hasattr(spider, 'secret_key'):
            spider.secret_key = 1
        else:
            spider.secret_key += 1
        if spider.secret_key <= 10:
            # BUG FIX: original read the bare name `secret_key` (NameError);
            # the counter is an attribute of the spider.
            time.sleep(60 * spider.secret_key)
            # NOTE(review): Scrapy's dupefilter may drop this retry of an
            # already-seen URL — consider dont_filter=True; verify intent.
            yield scrapy.Request(response.url, callback=spider.parse)
        else:
            logging.error("重复了太多次,依然无法获取数据")
            return None

    def process_start_requests(self, start_requests, spider):
        """Pass the spider's start requests through unchanged."""
        for r in start_requests:
            yield r

    def open_spider(self, item, spider):
        """Return *item* unchanged.

        NOTE(review): the (item, spider) signature matches Scrapy's
        ``process_item`` pipeline hook, not ``open_spider`` (which takes only
        the spider) — confirm which hook this was meant to be.
        """
        return item
- [close_spider] 关闭spider
- [process_item] 处理item