add v2ex list extraction.

johnolson2219 · Jun 12, 2016 · a59d88a · a59d88a
1 parent 562f4f6
commit a59d88a
Show file tree

Hide file tree

Showing 7 changed files with 160 additions and 0 deletions.
diff --git a/v2ex/scrapy.cfg b/v2ex/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = v2ex.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = v2ex
diff --git a/v2ex/v2ex/__init__.py b/v2ex/v2ex/__init__.py
diff --git a/v2ex/v2ex/items.py b/v2ex/v2ex/items.py
@@ -0,0 +1,11 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy.item import Item, Field
+
+class v2exItem(Item):
+    """Scrapy item model for the v2ex project."""
+
+    # The only field declared so far; add further Field() attributes here
+    # as more data is scraped.
+    name = Field()
+
diff --git a/v2ex/v2ex/pipelines.py b/v2ex/v2ex/pipelines.py
@@ -0,0 +1,50 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+import redis
+
+
+from scrapy import signals
+
+
+import json
+import codecs
+from collections import OrderedDict
+
+
+class JsonWithEncodingPipeline(object):
+
+    def __init__(self):
+        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
+
+    def process_item(self, item, spider):
+        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
+        self.file.write(line)
+        return item
+
+    def close_spider(self, spider):
+        self.file.close()
+
+
+class RedisPipeline(object):
+
+    def __init__(self):
+        self.r = redis.StrictRedis(host='localhost', port=6379)
+
+    def process_item(self, item, spider):
+        if not item['id']:
+            print 'no id item!!'
+
+        str_recorded_item = self.r.get(item['id'])
+        final_item = None
+        if str_recorded_item is None:
+            final_item = item
+        else:
+            ritem = eval(self.r.get(item['id']))
+            final_item = dict(item.items() + ritem.items())
+        self.r.set(item['id'], final_item)
+
+    def close_spider(self, spider):
+        return
diff --git a/v2ex/v2ex/settings.py b/v2ex/v2ex/settings.py
@@ -0,0 +1,36 @@
+# Scrapy settings for v2ex project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+import sys
+import os
+from os.path import dirname
+# Add the directory two levels above the one containing this file to
+# sys.path so that the `misc` package imported below can be found
+# (presumably `misc` lives there -- verify against the repository layout).
+path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
+sys.path.append(path)
+from misc.log import *
+
+BOT_NAME = 'v2ex'
+
+# Package where spiders are looked up, and where new ones are created.
+SPIDER_MODULES = ['v2ex.spiders']
+NEWSPIDER_MODULE = 'v2ex.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'v2ex (+http://www.yourdomain.com)'
+
+# Only the custom user-agent middleware is enabled; the proxy middleware
+# entry is left commented out.
+DOWNLOADER_MIDDLEWARES = {
+   # 'misc.middleware.CustomHttpProxyMiddleware': 400,
+    'misc.middleware.CustomUserAgentMiddleware': 401,
+}
+
+# Only the JSON-lines pipeline is active; RedisPipeline is commented out.
+ITEM_PIPELINES = {
+    'v2ex.pipelines.JsonWithEncodingPipeline': 300,
+    #'v2ex.pipelines.RedisPipeline': 301,
+}
+
+LOG_LEVEL = 'INFO'
+
+# Delay between consecutive requests (Scrapy interprets this in seconds).
+DOWNLOAD_DELAY = 1
diff --git a/v2ex/v2ex/spiders/__init__.py b/v2ex/v2ex/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/v2ex/v2ex/spiders/spider.py b/v2ex/v2ex/spiders/spider.py
@@ -0,0 +1,48 @@
+import re
+import json
+from urlparse import urlparse
+import urllib
+import pdb
+
+
+from scrapy.selector import Selector
+try:
+    from scrapy.spiders import Spider
+except:
+    from scrapy.spiders import BaseSpider as Spider
+from scrapy.utils.response import get_base_url
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor as sle
+
+
+from v2ex.items import *
+from misc.log import *
+from misc.spider import CommonSpider
+
+
+class v2exSpider(CommonSpider):
+    name = "v2ex"
+    allowed_domains = ["v2ex.com"]
+    start_urls = [
+        "http://www.v2ex.com/",
+    ]
+    rules = [
+        Rule(sle(allow=("http://www.v2ex.com/$")), callback='parse_1', follow=True),
+    ]
+
+    list_css_rules = { 
+        '.cell.item': {
+            'title': '.item_title a::text',
+            'node': '.node::text',
+            'author': '.node+ strong a::text',
+            'reply_count': '.count_livid::text'
+        }   
+    }   
+
+    def parse_1(self, response):
+        info('Parse '+response.url)
+        # import pdb; pdb.set_trace()
+        x = self.parse_with_rules(response, self.list_css_rules, dict)
+        print(json.dumps(x, ensure_ascii=False, indent=2))
+        #pp.pprint(x)
+        # return self.parse_with_rules(response, self.css_rules, v2exItem)