import csv
import io
import logging
import os
import re
import zipfile

import requests
import scrapy

from newsplease.crawler.spiders.newsplease_spider import NewspleaseSpider

# to improve performance, regex statements are compiled only once per module
re_export = re.compile(r'.*?(http.*?export\.CSV\.zip)')
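# An illustrative (hypothetical) sample of a GDELT last-update feed line this
# regex is meant to capture; the group extracts the url of the latest export zip:
#   <size-in-bytes> <md5-hash> http://data.gdeltproject.org/gdeltv2/<timestamp>.export.CSV.zip
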
class GdeltCrawler(NewspleaseSpider, scrapy.Spider):
name = "GdeltCrawler"
    ignored_allowed_domain = None
start_urls = None
original_url = None
log = None
config = None
helper = None

    def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
self.log = logging.getLogger(__name__)
self.config = config
self.helper = helper
self.original_url = url
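        # the crawl's allowed domain, derived from the start url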
self.ignored_allowed_domain = self.helper.url_extractor \
.get_allowed_domain(url)
self.start_urls = [url] # [self.helper.url_extractor.get_start_url(url)]
super(GdeltCrawler, self).__init__(*args, **kwargs)

    def parse(self, response):
        """
        Parses the GDELT last-update feed (handled like an RSS feed).
        :param obj response: The scrapy response
        """
        return self.rss_parse(response)

    def rss_parse(self, response):
        """
        Extracts all article links from the latest GDELT export and schedules
        them for crawling.
        :param obj response: The scrapy response
        """
# get last_update zip url
        match = re_export.match(response.text)
if match:
last_update_zip_url = match.group(1)
# fetch zip file
r = requests.get(last_update_zip_url)
# unzip
z = zipfile.ZipFile(io.BytesIO(r.content))
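            # each export archive is expected to hold a single tab-delimited CSV file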
extracted = z.namelist()
z.extractall('/tmp')
csv_file_path = '/tmp/%s' % extracted[0]
# read csv to get all urls
urls = set() # set to remove duplicates
with open(csv_file_path) as csv_file:
csv_reader = csv.reader(csv_file, delimiter='\t')
for row in csv_reader:
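                    # the last tab-separated field of a GDELT 2.0 event record
                    # is SOURCEURL, i.e. the url of the source article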
urls.add(row[-1])
# rm the file
os.remove(csv_file_path)
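
            # schedule each article url; the callback hands the response to
            # article_parse together with 'gdelt' as the feed title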
for url in urls:
yield scrapy.Request(url, lambda resp: self.article_parse(
resp, 'gdelt'))

    def article_parse(self, response, rss_title=None):
        """
        Checks whether the given response is an article and, if so,
        passes it on to the pipeline.
        :param obj response: The scrapy response
        :param str rss_title: Feed title passed along with the article
            (here always 'gdelt')
        """
if not self.helper.parse_crawler.content_type(response):
return
yield self.helper.parse_crawler.pass_to_pipeline_if_article(
response, self.ignored_allowed_domain, self.original_url,
rss_title)

    @staticmethod
    def only_extracts_articles():
        """
        Meta-method: tells the heuristic "crawler_contains_only_article_alikes"
        to return True for this crawler.
        """
        return True

    @staticmethod
    def supports_site(url: str, check_certificate: bool = True) -> bool:
        """
        Determines whether this crawler works on the given url; intended to
        accept only valid feed urls (see the TODO below), currently always True.
        :param str url: The url to test
        :param bool check_certificate: Whether to verify the site's certificate
        :return bool: Whether this crawler works on the given url
        """
# TODO: check if the url is a valid RSS feed
return True
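

# A minimal, standalone sketch (not part of news-please) of the flow this spider
# implements: fetch the GDELT last-update feed, pull the latest export-zip url
# with the module's regex, and collect the article urls from the last column of
# the contained CSV. The feed url below is an assumption used for illustration.
if __name__ == '__main__':
    demo_feed_url = 'http://data.gdeltproject.org/gdeltv2/lastupdate.txt'  # assumed feed url
    feed_text = requests.get(demo_feed_url).text
    demo_match = re_export.match(feed_text)
    if demo_match:
        demo_zip = requests.get(demo_match.group(1))
        with zipfile.ZipFile(io.BytesIO(demo_zip.content)) as archive:
            first_entry = archive.namelist()[0]
            with archive.open(first_entry) as raw:
                reader = csv.reader(
                    io.TextIOWrapper(raw, encoding='utf-8', errors='replace'),
                    delimiter='\t')
                article_urls = {row[-1] for row in reader if row}
        print('%d unique article urls in the latest export' % len(article_urls))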