@@ -16,8 +16,35 @@ class ScrapSpider(scrapy.Spider):
16
16
def start_requests (self ):
17
17
todayFolder (self )
18
18
urls = [
19
- 'http://www.dailyo.in/politics' ,
20
- 'http://www.deccanchronicle.com/opinion' ,
19
+ "http://www.dailyo.in/politics" ,
20
+ "http://www.deccanchronicle.com/opinion" ,
21
+ "http://www.dnaindia.com/analysis" ,
22
+ "http://www.firstpost.com/category/politics" ,
23
+ "http://www.forbesindia.com" ,
24
+ "http://www.frontline.in" ,
25
+ "http://www.hindustantimes.com/opinion" ,
26
+ "http://indiatoday.intoday.in/calendar" ,
27
+ "http://www.livemint.com/opinion" ,
28
+ "http://www.ndtv.com/opinion" ,
29
+ "http://www.news18.com/blogs" ,
30
+ "http://www.outlookindia.com/website" ,
31
+ "http://www.outlookindia.com/magazine" ,
32
+ "http://www.rediff.com/news/interviews10.html" ,
33
+ "http://www.rediff.com/news/columns10.html" ,
34
+ "http://scroll.in" ,
35
+ "https://blogs.economictimes.indiatimes.com" ,
36
+ "http://www.financialexpress.com/print/edits-columns" ,
37
+ "http://www.thehindu.com/opinion" ,
38
+ "http://www.thehindubusinessline.com/opinion" ,
39
+ "http://www.huffingtonpost.in/the-blog" ,
40
+ "http://theindianeconomist.com" ,
41
+ "http://indianexpress.com/opinion" ,
42
+ "http://www.newindianexpress.com/Opinions" ,
43
+ "http://www.dailypioneer.com/columnists" ,
44
+ "http://blogs.timesofindia.indiatimes.com" ,
45
+ "http://www.tribuneindia.com/news/opinion" ,
46
+ "http://hewire.in" ,
47
+ "https://www.telegraphindia.com/opinion" ,
21
48
]
22
49
for url in urls :
23
50
request = scrapy .Request (
@@ -28,8 +55,7 @@ def start_requests(self):
28
55
def parse (self , response ):
29
56
today = datetime .datetime .today ().strftime ('%Y-%m-%d' )
30
57
domain = (response .url ).split ('/' )[2 ]
31
-
32
-
58
+
33
59
# www.deccanchronicle.com parsing
34
60
if (domain == 'www.deccanchronicle.com' ):
35
61
deccanchroniclearray = []
@@ -38,41 +64,37 @@ def parse(self, response):
38
64
res2 = case .css ('div.opnionTopBig' )
39
65
for news in res :
40
66
dcobj = {"title" : news .css ("a > h3::text" ).extract_first (),
41
- "link" : domain + news .css ("a::attr(href)" ).extract_first (),
42
- "source" : domain ,
43
- }
67
+ "link" : domain + news .css ("a::attr(href)" ).extract_first (),
68
+ "source" : domain ,
69
+ }
44
70
# yield deploy
45
71
deccanchroniclearray .append (dcobj .copy ())
46
72
addCounter (domain )
47
73
for news in res2 :
48
74
dcobj = {"title" : news .css ("a > h3::text" ).extract_first (),
49
- "link" : domain + news .css ("a::attr(href)" ).extract_first (),
50
- "source" : domain ,
51
- }
75
+ "link" : domain + news .css ("a::attr(href)" ).extract_first (),
76
+ "source" : domain ,
77
+ }
52
78
# yield deploy
53
79
deccanchroniclearray .append (dcobj .copy ())
54
80
addCounter (domain )
55
- with open ('./jsons/%s/%s.json' % (today ,domain ), 'w' ) as fp :
56
- json .dump (deccanchroniclearray , fp )
57
-
81
+ with open ('./jsons/%s/%s.json' % (today , domain ), 'w' ) as fp :
82
+ json .dump (deccanchroniclearray , fp )
58
83
59
84
# www.dailyo.in parsing
60
85
elif (domain == 'www.dailyo.in' ):
61
86
dailyoarray = []
62
87
case2 = response .css ('div#story_container > div > div.story-list' )
63
88
for news in case2 :
64
- dailyoobj = {"title" : news .css ("div.storybox > div.storytext > h2 > a::text" ).extract_first (),
65
- "link" : domain + news .css ("div.storybox > div.storytext > h2 > a::attr(href)" ).extract_first (),
66
- "source" : domain ,
67
- }
89
+ dailyoobj = {"title" : news .css ("div.storybox > div.storytext > h2 > a::text" ).extract_first (),
90
+ "link" : domain + news .css ("div.storybox > div.storytext > h2 > a::attr(href)" ).extract_first (),
91
+ "source" : domain ,
92
+ }
68
93
# yield deploy
69
94
dailyoarray .append (dailyoobj .copy ())
70
95
addCounter (domain )
71
- with open ('./jsons/%s/%s.json' % (today ,domain ), 'w' ) as fp :
72
- json .dump (dailyoarray , fp )
96
+ with open ('./jsons/%s/%s.json' % (today , domain ), 'w' ) as fp :
97
+ json .dump (dailyoarray , fp )
73
98
# with open('./counters/%s_daily-counters.json' %today, 'w') as fp:
74
99
# self.log(getCounter('daily'))
75
100
# json.dump(getCounter('daily'), fp)
76
-
77
-
78
-
0 commit comments