
Commit 4c7322b

Author: hexing
Commit message: 存储 (storage)
1 parent 0ad70a2 · commit 4c7322b

File tree

5 files changed: +82 -24 lines changed
438 Bytes · Binary file not shown.

spider/items.py

Lines changed: 8 additions & 5 deletions

@@ -7,8 +7,11 @@
 
 import scrapy
 
-
-class SpiderItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
+class SmartContractItem(scrapy.Item):
+
+    # contract token
+    token = scrapy.Field()
+    # contract name
+    name = scrapy.Field()
+    # contract code
+    code = scrapy.Field()
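
Since scrapy.Item fields are read and written like dict keys, and only declared fields are accepted, the new item can be used as in this minimal sketch (all values are made up):

from spider.items import SmartContractItem

item = SmartContractItem()
item['token'] = '0x0000000000000000000000000000000000000000'  # placeholder address
item['name'] = 'ExampleToken'                                  # placeholder name
item['code'] = 'contract Example { }'                          # placeholder source

print(item['name'])      # fields behave like dict keys
# item['symbol'] = 'X'   # would raise KeyError: field not declared on the item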

spider/pipelines.py

Lines changed: 13 additions & 0 deletions

@@ -5,7 +5,20 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 
+import codecs
 
 class SpiderPipeline(object):
     def process_item(self, item, spider):
         return item
+# -*- coding: utf-8 -*-
+
+
+class MypjtPipeline(object):
+    def __init__(self):
+        self.file = codecs.open("D:/Kangbb/data1.txt", "w", encoding="utf-8")
+    def process_item(self, item, spider):
+        l = str(item['title'])+'\n'
+        self.file.write(l)
+        return item
+    def close_spider(self):
+        self.file.close()
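
MypjtPipeline looks copied from another project: it writes item['title'], a field SmartContractItem does not declare, to a hard-coded Windows path, it is not registered in ITEM_PIPELINES, and if it were enabled Scrapy would call close_spider(spider) with an argument that close_spider(self) does not accept. A minimal corrected sketch, assuming the goal is one line per scraped contract (file name and format are placeholders):

import codecs


class SmartContractPipeline(object):
    def __init__(self):
        # open the output file once when the pipeline is created
        self.file = codecs.open("contracts.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # one tab-separated line per contract; missing fields fall back to ''
        self.file.write("%s\t%s\n" % (item.get('token', ''), item.get('name', '')))
        return item

    def close_spider(self, spider):
        self.file.close()

To take effect it would still need an entry such as 'spider.pipelines.SmartContractPipeline': 300 in ITEM_PIPELINES.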

spider/settings.py

Lines changed: 20 additions & 4 deletions

@@ -19,15 +19,15 @@
 #USER_AGENT = 'spider (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 1
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -44,6 +44,11 @@
 # 'Accept-Language': 'en',
 #}
 
+DEFAULT_REQUEST_HEADERS = {
+    'Accept': 'text/html, application/xhtml+xml, application/xml',
+    'Accept-Language': 'zh-CN,zh;q=0.8',
+}
+
 # Enable or disable spider middlewares
 # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
@@ -56,6 +61,14 @@
 # 'spider.middlewares.SpiderDownloaderMiddleware': 543,
 #}
 
+# Enable the custom UserAgent and proxy-IP downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    'spider.useragent.UserAgent': 1,
+    'spider.proxymiddlewares.ProxyMiddleware': 100,
+    'scrapy.downloadermiddleware.useragent.UserAgentMiddleware': None,
+}
+
 # Enable or disable extensions
 # See https://doc.scrapy.org/en/latest/topics/extensions.html
 #EXTENSIONS = {
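
Neither spider.useragent.UserAgent nor spider.proxymiddlewares.ProxyMiddleware is part of this commit, so their implementations are not shown; the sketch below is one guess at what the user-agent middleware could look like (class layout and UA strings are assumptions). Note also that in Scrapy 1.x the built-in middleware lives at 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' (plural 'downloadermiddlewares'), so the singular path registered above does not actually disable the default.

import random


class UserAgent(object):
    """Hypothetical spider/useragent.py: set a random User-Agent on each request."""

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
    ]

    def process_request(self, request, spider):
        # downloader-middleware hook, called for every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)
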
@@ -64,9 +77,12 @@
 
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+ITEM_PIPELINES = {
 #    'spider.pipelines.SpiderPipeline': 300,
-#}
+    'scrapy.pipelines.files.FilesPipeline': 2,
+}
+
+FILES_STORE = 'examples_src'
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
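
scrapy.pipelines.files.FilesPipeline only downloads URLs listed in an item's file_urls field and records the results in a files field, saving them under FILES_STORE ('examples_src' here). SmartContractItem declares neither field, so as committed the pipeline is enabled but has nothing to fetch. A sketch of the item shape FilesPipeline expects (ContractFileItem is a hypothetical name):

import scrapy


class ContractFileItem(scrapy.Item):
    # fields consumed and filled by scrapy.pipelines.files.FilesPipeline
    file_urls = scrapy.Field()   # URLs the pipeline should download
    files = scrapy.Field()       # download results, populated by the pipeline
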
Lines changed: 41 additions & 15 deletions

@@ -1,19 +1,45 @@
 import scrapy
 
+from spider.items import SmartContractItem
+
 class SmartContractSpider(scrapy.Spider):
-    name = 'smart-contract'
-
+
+    name = 'smart-contract'
+
+    base_url = "https://etherscan.io/"
+
+    token_max_page = 2
+
     def start_requests(self):
-        urls = [
-            'http://quotes.toscrape.com/page/1/',
-            'http://quotes.toscrape.com/page/2/',
-        ]
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
-
-    def parse(self, response):
-        page = response.url.split("/")[-2]
-        filename = 'quotes-%s.html' % page
-        with open(filename, 'wb') as f:
-            f.write(response.body)
-        self.log('Saved file %s' % filename)
+        for p in range(1, self.token_max_page):
+            url = self.base_url + 'tokens?p=' + str(p)
+            yield scrapy.Request(url, self.parseToken)
+
+    def parseToken(self, response):
+
+        for each in response.xpath('//h5'):
+            token_str = each.xpath('./a/@href').extract_first()
+            token = token_str.split("/")[2]
+            sub_url = self.base_url + '/address/' + token + '#code'
+
+            yield scrapy.Request(sub_url, self.parseCode)
+
+            print(token)
+
+
+    def parseCode(self, response):
+        item = SmartContractItem()
+
+        item['name'] = response.xpath('//*[@id="ContentPlaceHolder1_divSummary"]/div[1]/table/thead/tr/th/font/text()').extract_first()
+        item['token'] = response.xpath('//*[@id="mainaddress"]/text()').extract_first()
+        item['code'] = response.xpath('//*[@id="editor"]/text()').extract_first()
+
+        print(item)
+
+        yield item
+
+        return item
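
parseToken pulls the contract address out of each token link by splitting the href on '/': for a link of the form '/token/<address>', element 2 of the split is the address. A standalone check of that logic (the href value is a made-up example):

# illustration of the href -> token extraction used in parseToken
href = '/token/0x1234567890abcdef1234567890abcdef12345678'  # example value only
token = href.split("/")[2]
assert token == '0x1234567890abcdef1234567890abcdef12345678'

# the follow-up request then targets the contract's code tab
sub_url = "https://etherscan.io/" + '/address/' + token + '#code'
print(sub_url)  # note the double slash: base_url already ends in '/'

Two smaller points: the double slash in sub_url comes from base_url already ending in '/', and the trailing return item after yield item in parseCode is redundant (and a syntax error on Python 2); yielding the item is what hands it to the item pipelines.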
