thread_scrape_spider.py

import scrapy  # for scraping
import csv  # for reading the URLs out of the previously scraped data

# Load the post data exported earlier by title_scraper.py.
with open('data.csv', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

# Keep only the URL column, drop the rest.
for i in range(len(data)):
    data[i] = data[i][2]
data.pop(0)  # remove the header row
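
# Assumed layout of data.csv (produced by title_scraper.py): the third column
# holds the relative post URL, e.g.
#   title,author,url
#   Some thread title,/u/someone,/post/abc123
# The column names and sample row above are illustrative only.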


class ThreadScraperSpider(scrapy.Spider):
    # Default Scrapy variables.
    name = 'thread_scraper'
    allowed_domains = ['sample.website']
    number_of_posts = 0
    # Cookies copied from the browser after solving the CAPTCHA once; sending
    # them with every request bypasses the CAPTCHA.
    cookies_ = {'keys': 'values'}

    # Request the page behind every URL in the previously scraped post data.
    def start_requests(self):
        print('Entering start_requests; correct spider file is running.')
        for post_url in data:
            # The scraped URLs are relative; scrapy.Request needs an absolute
            # URL with a scheme, so 'http://' is assumed here.
            absolute_post_url = 'http://sample.website' + post_url
            yield scrapy.Request(absolute_post_url, cookies=self.cookies_, callback=self.parse)

    def parse(self, response):
        self.number_of_posts += 1
        comment_data = []
        # TODO Scrape Post
        # Build the post text as "title : body".
        Post_content = response.xpath('//div[(contains(@class, "postTop"))]/a/text()').extract_first('')
        Post_content += ' : '
        for i in response.xpath('//div[(contains(@class, "postContent"))]/text()').extract():
            Post_content += i
            Post_content += ' '
        author_name = response.xpath('//div[(contains(@class, "postTop"))]//div[(contains(@class, "author"))]/a[(contains(text(), "/u/"))]/text()').extract_first()
        votes = response.xpath('//div[(contains(@class, "voteCount"))]/text()').extract_first()
        timestamp = response.xpath('//div[(contains(@class, "postTop"))]//div[(contains(@class, "author"))]/span/@title')[0].extract()
        # Scrape each top-level comment, then recursively hunt for its sub-comments.
        root = response.xpath('//*[@class="postComments"]/div[(contains(@class, "comment"))]')
        for j in range(len(root)):
            self.scrape_comment(root[j], str(self.number_of_posts) + '.' + str(j + 1), comment_data)
            self.hunt_comments(root[j], str(self.number_of_posts) + '.' + str(j + 1), comment_data)
        for element in comment_data:
            yield {
                'timestamp': element['timestamp'],
                'comment_index': element['comment_index'],
                'comment_body': element['comment_body'],
                'poster_name': element['poster_name'],
                'votes': element['votes']
            }
        # TODO: follow the next page of comments.

    # Recursively look for nested comments and parse each one with scrape_comment().
    def hunt_comments(self, comment_html, comment_index, comment_data):
        new_root = comment_html.xpath('./div[(contains(@id, "c-"))]')
        for i in range(len(new_root)):
            self.scrape_comment(new_root[i], comment_index + '.' + str(i + 1), comment_data)
            self.hunt_comments(new_root[i], comment_index + '.' + str(i + 1), comment_data)
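
    # The comment_index built above encodes the nesting path: for example,
    # '2.1.3' is the third reply to the first top-level comment of the second
    # post this spider has parsed.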

    # Parse a single comment.
    def scrape_comment(self, comment_html, comment_index, comment_data):
        # The slice presumably strips the wrapping <div class="commentBody"> and
        # </div> tags from the raw HTML; the offsets are specific to this markup.
        comment_body = comment_html.xpath('.//div[(contains(@class, "commentBody"))]').extract()[0][25:-6].replace('<br>\r\n', ' ')
        poster_name = comment_html.xpath('.//div/a[(contains(@href, "/u/"))]/@href').extract()[0]
        # Backup XPath for the timestamp:
        # response.xpath('//*[@class="postComments"]/div/div/div[(contains(@class, "commentContent"))]/div/div[(contains(@class, "timestamp"))]/span/@title').extract()
        timestamp = comment_html.xpath('.//div/div[(contains(@class, "timestamp"))]/span/@title').extract()[0]
        votes = comment_html.xpath('.//div/div[(contains(@class, "votes"))]/text()').extract()[0]
        retdict = {
            'timestamp': timestamp,
            'comment_index': comment_index,
            'comment_body': comment_body,
            'poster_name': poster_name,
            'votes': votes,
        }
        comment_data.append(retdict)
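

# A minimal, stand-alone way to run this spider (a sketch, not taken from the
# original repo; the project may instead use `scrapy crawl thread_scraper -o
# comments.csv` inside a Scrapy project). The FEEDS setting needs Scrapy 2.1+,
# and the output filename 'comments.csv' is an arbitrary choice.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        'FEEDS': {'comments.csv': {'format': 'csv'}},  # export yielded items as CSV
    })
    process.crawl(ThreadScraperSpider)
    process.start()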