Skip to content

Commit a50cc93

Browse files
committed
0716-update
1 parent fb31823 commit a50cc93

File tree

8 files changed

+26
-9
lines changed

8 files changed

+26
-9
lines changed
-33 Bytes
Binary file not shown.
-8 Bytes
Binary file not shown.
-8 Bytes
Binary file not shown.
-8 Bytes
Binary file not shown.

MedicalKG/pipelines.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
client = pymongo.MongoClient('localhost')
1313
db = client.MedicalKG
1414
collection = db.MedicalKG
15-
15+
# print(db)
16+
# print(collection)
1617

1718
class MedicalkgPipeline:
1819
def process_item(self, item, spider):

MedicalKG/spiders/BaikeMedical.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
class BaikemedicalSpider(scrapy.Spider):
88
name = 'BaikeMedical'
99
allowed_domains = ['baike.baidu.com']
10-
start_urls = ['https://baike.baidu.com/wikitag/taglist?tagId=75953']
11-
# 'https://baike.baidu.com/wikitag/taglist?tagId=75954',
12-
# 'https://baike.baidu.com/wikitag/taglist?tagId=75955',
13-
# 'https://baike.baidu.com/wikitag/taglist?tagId=75956']
10+
# Entry pages for the crawl: one wikitag taglist page per tagId (75953-75956).
# Every element needs its own trailing comma; Python silently concatenates
# adjacent string literals, so a missing comma would fuse two URLs into a
# single malformed entry instead of raising an error.
start_urls = [
    'https://baike.baidu.com/wikitag/taglist?tagId=75953',
    'https://baike.baidu.com/wikitag/taglist?tagId=75954',
    'https://baike.baidu.com/wikitag/taglist?tagId=75955',
    'https://baike.baidu.com/wikitag/taglist?tagId=75956',
]
1414
triple_cnt = 0
1515

1616
def start_requests(self):
@@ -26,12 +26,12 @@ def start_requests(self):
2626
while(cur_height < scrollHeight)
2727
do
2828
splash:evaljs("window.scrollTo(0, document.body.scrollHeight)")
29-
splash:wait(0.2)
29+
splash:wait(0.5)
3030
prev_height = cur_height
3131
cur_height = splash:evaljs("document.body.scrollTop")
32-
splash:wait(0.1)
32+
splash:wait(0.2)
3333
scrollHeight = splash:evaljs("document.body.scrollHeight")
34-
splash:wait(0.1)
34+
splash:wait(0.2)
3535
print(cur_height, scrollHeight)
3636
if prev_height == cur_height then
3737
lag_cnt = lag_cnt + 1
@@ -45,6 +45,20 @@ def start_requests(self):
4545
}
4646
end
4747
"""
48+
# scripts = """
49+
# function main(splash, args)
50+
# assert(splash:go(args.url))
51+
# assert(splash:wait(0.5))
52+
# for i=50,1,-1
53+
# do
54+
# splash:evaljs("window.scrollTo(0, document.body.scrollHeight)")
55+
# splash:wait(1)
56+
# end
57+
# return {
58+
# html = splash:html(),
59+
# }
60+
# end
61+
# """
4862
for url in self.start_urls:
4963
yield SplashRequest(url=url,
5064
callback=self.parse,
@@ -55,7 +69,9 @@ def start_requests(self):
5569
})
5670

5771
def parse_second_page(self, response):
58-
page_target = response.xpath('//dl[@class="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-"]/dd/h1/text()').extract_first()
72+
page_target = response.xpath(
73+
'//input[@id="query"]/@data-value').extract_first()
74+
# page_target = response.xpath('//dl[@class="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-"]/dd/h1/text()').extract_first()
5975
blocks = response.xpath('//div[@class="basic-info J-basic-info cmn-clearfix"]/dl')
6076
for block in blocks:
6177
names = block.xpath('./dt/text()').extract()
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)