class BaikemedicalSpider(scrapy.Spider):
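    # Crawls Baidu Baike "wikitag" list pages (rendered by JavaScript, hence
    # Splash) and follows each entry to extract infobox data.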
    name = 'BaikeMedical'
    allowed_domains = ['baike.baidu.com']
-    start_urls = ['https://baike.baidu.com/wikitag/taglist?tagId=75953']
-    # 'https://baike.baidu.com/wikitag/taglist?tagId=75954',
-    # 'https://baike.baidu.com/wikitag/taglist?tagId=75955',
-    # 'https://baike.baidu.com/wikitag/taglist?tagId=75956']
+    start_urls = ['https://baike.baidu.com/wikitag/taglist?tagId=75953',
+                  'https://baike.baidu.com/wikitag/taglist?tagId=75954',
+                  'https://baike.baidu.com/wikitag/taglist?tagId=75955',
+                  'https://baike.baidu.com/wikitag/taglist?tagId=75956']
    triple_cnt = 0

    def start_requests(self):
@@ -26,12 +26,12 @@ def start_requests(self):
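            -- Keep scrolling to the bottom until document.body.scrollHeight
            -- stops growing, so that all lazily loaded entries are rendered.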
            while(cur_height < scrollHeight)
            do
                splash:evaljs("window.scrollTo(0, document.body.scrollHeight)")
-                splash:wait(0.2)
+                splash:wait(0.5)
                prev_height = cur_height
                cur_height = splash:evaljs("document.body.scrollTop")
-                splash:wait(0.1)
+                splash:wait(0.2)
                scrollHeight = splash:evaljs("document.body.scrollHeight")
-                splash:wait(0.1)
+                splash:wait(0.2)
                print(cur_height, scrollHeight)
                if prev_height == cur_height then
                    lag_cnt = lag_cnt + 1
@@ -45,6 +45,20 @@ def start_requests(self):
            }
        end
        """
+        # scripts = """
+        # function main(splash, args)
+        #     assert(splash:go(args.url))
+        #     assert(splash:wait(0.5))
+        #     for i=50,1,-1
+        #     do
+        #         splash:evaljs("window.scrollTo(0, document.body.scrollHeight)")
+        #         splash:wait(1)
+        #     end
+        #     return {
+        #         html = splash:html(),
+        #     }
+        # end
+        # """
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
@@ -55,7 +69,9 @@ def start_requests(self):
                                })

    def parse_second_page(self, response):
-        page_target = response.xpath('//dl[@class="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-"]/dd/h1/text()').extract_first()
+        page_target = response.xpath(
+            '//input[@id="query"]/@data-value').extract_first()
+        # page_target = response.xpath('//dl[@class="lemmaWgt-lemmaTitle lemmaWgt-lemmaTitle-"]/dd/h1/text()').extract_first()
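+        # The old h1-based selector is kept above for reference; the search
+        # box's data-value attribute appears to be a more reliable source for
+        # the lemma title than the page's heading markup.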
        blocks = response.xpath('//div[@class="basic-info J-basic-info cmn-clearfix"]/dl')
        for block in blocks:
            names = block.xpath('./dt/text()').extract()
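            # Each <dl> in the basic-info box holds <dt>/<dd> pairs; the <dt>
            # texts are the attribute names of the infobox entries.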
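Note: SplashRequest (scrapy-splash) only works once the Splash middleware is
wired into the project settings. A minimal sketch, assuming a local Splash
instance on the default port; the project's real settings are not part of
this diff:

# settings.py — scrapy-splash wiring (sketch, values assumed)
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'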