Skip to content

Commit d40741f

Browse files
file name changes
1 parent 86d3d05 commit d40741f

File tree

4 files changed

+77
-24
lines changed

4 files changed

+77
-24
lines changed

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"python.pythonPath": "/usr/local/bin/python3"
3+
}

countersReport.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,40 @@
11
import datetime
22
import json
33
import os.path
4-
from scheduler import csvExporter
4+
from csvExportFile import csvExporter
55

66
today = datetime.datetime.today().strftime('%Y-%m-%d')
77
dailyZeroArray = {
88
"www.deccanchronicle.com": 0,
9-
"www.dailyo.in": 0
9+
"www.dailyo.in": 0,
10+
"www.dnaindia.com/analysis":0,
11+
"www.firstpost.com/category/politics":0,
12+
"www.forbesindia.com":0,
13+
"www.frontline.in":0,
14+
"www.hindustantimes.com/opinion":0,
15+
"indiatoday.intoday.in/calendar":0,
16+
"www.livemint.com/opinion":0,
17+
"www.ndtv.com/opinion":0,
18+
"www.news18.com/blogs":0,
19+
"www.outlookindia.com/website":0,
20+
"www.outlookindia.com/magazine":0,
21+
"www.rediff.com/news/interviews10.html":0,
22+
"www.rediff.com/news/columns10.html":0,
23+
"scroll.in":0,
24+
"blogs.economictimes.indiatimes.com":0,
25+
"www.financialexpress.com/print/edits-columns":0,
26+
"www.thehindu.com/opinion":0,
27+
"www.thehindubusinessline.com/opinion":0,
28+
"www.huffingtonpost.in/the-blog":0,
29+
"theindianeconomist.com":0,
30+
"indianexpress.com/opinion":0,
31+
"www.newindianexpress.com/Opinions":0,
32+
"www.dailypioneer.com/columnists":0,
33+
"blogs.timesofindia.indiatimes.com":0,
34+
"www.tribuneindia.com/news/opinion":0,
35+
"thewire.in":0,
36+
"www.telegraphindia.com/opinion":0,
37+
1038
}
1139
weeklyZeroArray = {
1240
"www.deccanchronicle.com": 0,

crawlers/spiders/scrapit.py

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,35 @@ class ScrapSpider(scrapy.Spider):
1616
def start_requests(self):
1717
todayFolder(self)
1818
urls = [
19-
'http://www.dailyo.in/politics',
20-
'http://www.deccanchronicle.com/opinion',
19+
"http://www.dailyo.in/politics",
20+
"http://www.deccanchronicle.com/opinion",
21+
"http://www.dnaindia.com/analysis",
22+
"http://www.firstpost.com/category/politics",
23+
"http://www.forbesindia.com",
24+
"http://www.frontline.in",
25+
"http://www.hindustantimes.com/opinion",
26+
"http://indiatoday.intoday.in/calendar",
27+
"http://www.livemint.com/opinion",
28+
"http://www.ndtv.com/opinion",
29+
"http://www.news18.com/blogs",
30+
"http://www.outlookindia.com/website",
31+
"http://www.outlookindia.com/magazine",
32+
"http://www.rediff.com/news/interviews10.html",
33+
"http://www.rediff.com/news/columns10.html",
34+
"http://scroll.in",
35+
"https://blogs.economictimes.indiatimes.com",
36+
"http://www.financialexpress.com/print/edits-columns",
37+
"http://www.thehindu.com/opinion",
38+
"http://www.thehindubusinessline.com/opinion",
39+
"http://www.huffingtonpost.in/the-blog",
40+
"http://theindianeconomist.com",
41+
"http://indianexpress.com/opinion",
42+
"http://www.newindianexpress.com/Opinions",
43+
"http://www.dailypioneer.com/columnists",
44+
"http://blogs.timesofindia.indiatimes.com",
45+
"http://www.tribuneindia.com/news/opinion",
46+
"http://thewire.in",
47+
"https://www.telegraphindia.com/opinion",
2148
]
2249
for url in urls:
2350
request = scrapy.Request(
@@ -28,8 +55,7 @@ def start_requests(self):
2855
def parse(self, response):
2956
today = datetime.datetime.today().strftime('%Y-%m-%d')
3057
domain = (response.url).split('/')[2]
31-
32-
58+
3359
# www.deccanchronicle.com parsing
3460
if (domain == 'www.deccanchronicle.com'):
3561
deccanchroniclearray = []
@@ -38,41 +64,37 @@ def parse(self, response):
3864
res2 = case.css('div.opnionTopBig')
3965
for news in res:
4066
dcobj = {"title": news.css("a > h3::text").extract_first(),
41-
"link": domain + news.css("a::attr(href)").extract_first(),
42-
"source": domain,
43-
}
67+
"link": domain + news.css("a::attr(href)").extract_first(),
68+
"source": domain,
69+
}
4470
# yield deploy
4571
deccanchroniclearray.append(dcobj.copy())
4672
addCounter(domain)
4773
for news in res2:
4874
dcobj = {"title": news.css("a > h3::text").extract_first(),
49-
"link": domain + news.css("a::attr(href)").extract_first(),
50-
"source": domain,
51-
}
75+
"link": domain + news.css("a::attr(href)").extract_first(),
76+
"source": domain,
77+
}
5278
# yield deploy
5379
deccanchroniclearray.append(dcobj.copy())
5480
addCounter(domain)
55-
with open('./jsons/%s/%s.json' %(today,domain), 'w') as fp:
56-
json.dump(deccanchroniclearray, fp)
57-
81+
with open('./jsons/%s/%s.json' % (today, domain), 'w') as fp:
82+
json.dump(deccanchroniclearray, fp)
5883

5984
# www.dailyo.in parsing
6085
elif (domain == 'www.dailyo.in'):
6186
dailyoarray = []
6287
case2 = response.css('div#story_container > div > div.story-list')
6388
for news in case2:
64-
dailyoobj = {"title": news.css("div.storybox > div.storytext > h2 > a::text").extract_first(),
65-
"link": domain + news.css("div.storybox > div.storytext > h2 > a::attr(href)").extract_first(),
66-
"source": domain,
67-
}
89+
dailyoobj = {"title": news.css("div.storybox > div.storytext > h2 > a::text").extract_first(),
90+
"link": domain + news.css("div.storybox > div.storytext > h2 > a::attr(href)").extract_first(),
91+
"source": domain,
92+
}
6893
# yield deploy
6994
dailyoarray.append(dailyoobj.copy())
7095
addCounter(domain)
71-
with open('./jsons/%s/%s.json' %(today,domain), 'w') as fp:
72-
json.dump(dailyoarray, fp)
96+
with open('./jsons/%s/%s.json' % (today, domain), 'w') as fp:
97+
json.dump(dailyoarray, fp)
7398
# with open('./counters/%s_daily-counters.json' %today, 'w') as fp:
7499
# self.log(getCounter('daily'))
75100
# json.dump(getCounter('daily'), fp)
76-
77-
78-
File renamed without changes.

0 commit comments

Comments
 (0)