
Commit 4c7322b

Author: hexing
Commit message: 存储 (storage)
1 parent 0ad70a2 · commit 4c7322b

File tree

5 files changed: +82 -24 lines changed
438 Bytes · Binary file not shown.

spider/items.py

Lines changed: 8 additions & 5 deletions

@@ -7,8 +7,11 @@
 
 import scrapy
 
-
-class SpiderItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
+class SmartContractItem(scrapy.Item):
+
+    # contract token
+    token = scrapy.Field()
+    # contract name
+    name = scrapy.Field()
+    # contract code
+    code = scrapy.Field()
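
Since scrapy.Item fields are read and written like dict keys, and only declared fields are accepted, the new item can be used as in this minimal sketch (all values are made up):

from spider.items import SmartContractItem

item = SmartContractItem()
item['token'] = '0x0000000000000000000000000000000000000000'  # placeholder address
item['name'] = 'ExampleToken'                                  # placeholder name
item['code'] = 'contract Example { }'                          # placeholder source

print(item['name'])      # fields behave like dict keys
# item['symbol'] = 'X'   # would raise KeyError: field not declared on the item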

spider/pipelines.py

Lines changed: 13 additions & 0 deletions

@@ -5,7 +5,20 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 
+import codecs
 
 class SpiderPipeline(object):
     def process_item(self, item, spider):
         return item
+# -*- coding: utf-8 -*-
+
+
+class MypjtPipeline(object):
+    def __init__(self):
+        self.file = codecs.open("D:/Kangbb/data1.txt", "w", encoding="utf-8")
+    def process_item(self, item, spider):
+        l = str(item['title'])+'\n'
+        self.file.write(l)
+        return item
+    def close_spider(self):
+        self.file.close()
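
MypjtPipeline looks copied from another project: it writes item['title'], a field SmartContractItem does not declare, to a hard-coded Windows path, it is not registered in ITEM_PIPELINES, and if it were enabled Scrapy would call close_spider(spider) with an argument that close_spider(self) does not accept. A minimal corrected sketch, assuming the goal is one line per scraped contract (file name and format are placeholders):

import codecs


class SmartContractPipeline(object):
    def __init__(self):
        # open the output file once when the pipeline is created
        self.file = codecs.open("contracts.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # one tab-separated line per contract; missing fields fall back to ''
        self.file.write("%s\t%s\n" % (item.get('token', ''), item.get('name', '')))
        return item

    def close_spider(self, spider):
        self.file.close()

To take effect it would still need an entry such as 'spider.pipelines.SmartContractPipeline': 300 in ITEM_PIPELINES.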

spider/settings.py

Lines changed: 20 additions & 4 deletions

@@ -19,15 +19,15 @@
 #USER_AGENT = 'spider (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 1
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -44,6 +44,11 @@
 # 'Accept-Language': 'en',
 #}
 
+DEFAULT_REQUEST_HEADERS = {
+    'Accept': 'text/html, application/xhtml+xml, application/xml',
+    'Accept-Language': 'zh-CN,zh;q=0.8',
+}
+
 # Enable or disable spider middlewares
 # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
@@ -56,6 +61,14 @@
 # 'spider.middlewares.SpiderDownloaderMiddleware': 543,
 #}
 
+# Enable the custom UserAgent and proxy-IP downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    'spider.useragent.UserAgent': 1,
+    'spider.proxymiddlewares.ProxyMiddleware': 100,
+    'scrapy.downloadermiddleware.useragent.UserAgentMiddleware': None,
+}
+
 # Enable or disable extensions
 # See https://doc.scrapy.org/en/latest/topics/extensions.html
 #EXTENSIONS = {
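
Neither spider.useragent.UserAgent nor spider.proxymiddlewares.ProxyMiddleware is part of this commit, so their implementations are not shown; the sketch below is one guess at what the user-agent middleware could look like (class layout and UA strings are assumptions). Note also that in Scrapy 1.x the built-in middleware lives at 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' (plural 'downloadermiddlewares'), so the singular path registered above does not actually disable the default.

import random


class UserAgent(object):
    """Hypothetical spider/useragent.py: set a random User-Agent on each request."""

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    ]

    def process_request(self, request, spider):
        # downloader-middleware hook, called for every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)
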
@@ -64,9 +77,12 @@
 
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+ITEM_PIPELINES = {
 #    'spider.pipelines.SpiderPipeline': 300,
-#}
+    'scrapy.pipelines.files.FilesPipeline': 2,
+}
+
+FILES_STORE = 'examples_src'
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
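
scrapy.pipelines.files.FilesPipeline only downloads URLs listed in an item's file_urls field and records the results in a files field, saving them under FILES_STORE ('examples_src' here). SmartContractItem declares neither field, so as committed the pipeline is enabled but has nothing to fetch. A sketch of the item shape FilesPipeline expects (ContractFileItem is a hypothetical name):

import scrapy


class ContractFileItem(scrapy.Item):
    # fields consumed and filled by scrapy.pipelines.files.FilesPipeline
    file_urls = scrapy.Field()   # URLs the pipeline should download
    files = scrapy.Field()       # download results, populated by the pipeline
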
Lines changed: 41 additions & 15 deletions

@@ -1,19 +1,45 @@
 import scrapy
 
+from spider.items import SmartContractItem
+
 class SmartContractSpider(scrapy.Spider):
-    name = 'smart-contract'
-
+
+    name = 'smart-contract'
+
+    base_url = "https://etherscan.io/"
+
+    token_max_page = 2
+
     def start_requests(self):
-        urls = [
-            'http://quotes.toscrape.com/page/1/',
-            'http://quotes.toscrape.com/page/2/',
-        ]
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
-
-    def parse(self, response):
-        page = response.url.split("/")[-2]
-        filename = 'quotes-%s.html' % page
-        with open(filename, 'wb') as f:
-            f.write(response.body)
-        self.log('Saved file %s' % filename)
+        for p in range(1, self.token_max_page):
+            url = self.base_url + 'tokens?p=' + str(p)
+            yield scrapy.Request(url, self.parseToken)
+
+    def parseToken(self, response):
+
+        for each in response.xpath('//h5'):
+            token_str = each.xpath('./a/@href').extract_first()
+            token = token_str.split("/")[2]
+            sub_url = self.base_url + '/address/' + token + '#code'
+
+            yield scrapy.Request(sub_url, self.parseCode)
+
+            print(token)
+
+
+    def parseCode(self, response):
+        item = SmartContractItem()
+
+        item['name'] = response.xpath('//*[@id="ContentPlaceHolder1_divSummary"]/div[1]/table/thead/tr/th/font/text()').extract_first()
+        item['token'] = response.xpath('//*[@id="mainaddress"]/text()').extract_first()
+        item['code'] = response.xpath('//*[@id="editor"]/text()').extract_first()
+
+        print(item)
+
+        yield item
+
+        return item
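
parseToken pulls the contract address out of each token link by splitting the href on '/': for a link of the form '/token/<address>', element 2 of the split is the address. A standalone check of that logic (the href value is a made-up example):

# illustration of the href -> token extraction used in parseToken
href = '/token/0x1234567890abcdef1234567890abcdef12345678'  # example value only
token = href.split("/")[2]
assert token == '0x1234567890abcdef1234567890abcdef12345678'

# the follow-up request then targets the contract's code tab
sub_url = "https://etherscan.io/" + '/address/' + token + '#code'
print(sub_url)  # note the double slash: base_url already ends in '/'

Two smaller points: the double slash in sub_url comes from base_url already ending in '/', and the trailing return item after yield item in parseCode is redundant (and a syntax error on Python 2); yielding the item is what hands it to the item pipelines.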
