diff --git a/v2ex/scrapy.cfg b/v2ex/scrapy.cfg
new file mode 100644
index 0000000..f5ea966
--- /dev/null
+++ b/v2ex/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = v2ex.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = v2ex
diff --git a/v2ex/v2ex/__init__.py b/v2ex/v2ex/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/v2ex/v2ex/items.py b/v2ex/v2ex/items.py
new file mode 100644
index 0000000..e4df6f4
--- /dev/null
+++ b/v2ex/v2ex/items.py
@@ -0,0 +1,11 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy.item import Item, Field
+
+class v2exItem(Item):
+    # define the fields for your item here like:
+    name = Field()
+
diff --git a/v2ex/v2ex/pipelines.py b/v2ex/v2ex/pipelines.py
new file mode 100644
index 0000000..eca743c
--- /dev/null
+++ b/v2ex/v2ex/pipelines.py
@@ -0,0 +1,52 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+import redis
+
+
+from scrapy import signals
+
+
+import json
+import codecs
+from collections import OrderedDict
+
+
+class JsonWithEncodingPipeline(object):
+
+    def __init__(self):
+        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
+
+    def process_item(self, item, spider):
+        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
+        self.file.write(line)
+        return item
+
+    def close_spider(self, spider):
+        self.file.close()
+
+
+class RedisPipeline(object):
+
+    def __init__(self):
+        self.r = redis.StrictRedis(host='localhost', port=6379)
+
+    def process_item(self, item, spider):
+        if not item['id']:
+            print('no id item!!')
+
+        # Merge with any record already stored under the same id, then write back.
+        str_recorded_item = self.r.get(item['id'])
+        final_item = None
+        if str_recorded_item is None:
+            final_item = item
+        else:
+            ritem = eval(self.r.get(item['id']))
+            final_item = dict(item.items() + ritem.items())
+        self.r.set(item['id'], final_item)
+        return item
+
+    def close_spider(self, spider):
+        return
diff --git a/v2ex/v2ex/settings.py b/v2ex/v2ex/settings.py
new file mode 100644
index 0000000..3076776
--- /dev/null
+++ b/v2ex/v2ex/settings.py
@@ -0,0 +1,36 @@
+# Scrapy settings for v2ex project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+import sys
+import os
+from os.path import dirname
+path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
+sys.path.append(path)
+from misc.log import *
+
+BOT_NAME = 'v2ex'
+
+SPIDER_MODULES = ['v2ex.spiders']
+NEWSPIDER_MODULE = 'v2ex.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'v2ex (+http://www.yourdomain.com)'
+
+DOWNLOADER_MIDDLEWARES = {
+    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
+    'misc.middleware.CustomUserAgentMiddleware': 401,
+}
+
+ITEM_PIPELINES = {
+    'v2ex.pipelines.JsonWithEncodingPipeline': 300,
+    #'v2ex.pipelines.RedisPipeline': 301,
+}
+
+LOG_LEVEL = 'INFO'
+
+DOWNLOAD_DELAY = 1
diff --git a/v2ex/v2ex/spiders/__init__.py b/v2ex/v2ex/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/v2ex/v2ex/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/v2ex/v2ex/spiders/spider.py b/v2ex/v2ex/spiders/spider.py
new file mode 100644
index 0000000..c28c88f
--- /dev/null
+++ b/v2ex/v2ex/spiders/spider.py
@@ -0,0 +1,49 @@
+import re
+import json
+from urlparse import urlparse
+import urllib
+import pdb
+
+
+from scrapy.selector import Selector
+try:
+    from scrapy.spiders import Spider
+except ImportError:
+    # Fall back to the pre-1.0 Scrapy location of the base spider class.
+    from scrapy.spider import BaseSpider as Spider
+from scrapy.utils.response import get_base_url
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
+
+
+from v2ex.items import *
+from misc.log import *
+from misc.spider import CommonSpider
+
+
+class v2exSpider(CommonSpider):
+    name = "v2ex"
+    allowed_domains = ["v2ex.com"]
+    start_urls = [
+        "http://www.v2ex.com/",
+    ]
+    rules = [
+        Rule(sle(allow=("http://www.v2ex.com/$")), callback='parse_1', follow=True),
+    ]
+
+    list_css_rules = {
+        '.cell.item': {
+            'title': '.item_title a::text',
+            'node': '.node::text',
+            'author': '.node+ strong a::text',
+            'reply_count': '.count_livid::text'
+        }
+    }
+
+    def parse_1(self, response):
+        info('Parse '+response.url)
+        # import pdb; pdb.set_trace()
+        x = self.parse_with_rules(response, self.list_css_rules, dict)
+        print(json.dumps(x, ensure_ascii=False, indent=2))
+        #pp.pprint(x)
+        # return self.parse_with_rules(response, self.css_rules, v2exItem)
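
A minimal way to exercise this spider locally (a sketch, assuming Scrapy, redis-py, and this repo's shared misc package are importable) is the standard Scrapy CLI, run from the directory that contains scrapy.cfg:

    cd v2ex
    scrapy crawl v2ex

With the ITEM_PIPELINES setting above, JsonWithEncodingPipeline writes each scraped item as one UTF-8 JSON line to data_utf8.json in the working directory; enabling the commented-out RedisPipeline additionally requires a Redis server reachable at localhost:6379.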