-
Notifications
You must be signed in to change notification settings - Fork 1
/
mobility_to_rdf.py
153 lines (117 loc) · 5.57 KB
/
mobility_to_rdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import urllib.parse
from urllib.request import urlretrieve, urlopen
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas
from rdflib import Graph, Literal, Namespace, URIRef, BNode
from rdflib.namespace import RDF, RDFS, FOAF, OWL, XSD, DC, DCTERMS
import progressbar
# https://stackoverflow.com/questions/35569042/ssl-certificate-verify-failed-with-python3
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
def urify(string):
return urllib.parse.quote_plus(string)
def get_apple_csv():
# scraper to get the link to apple's csv
op = webdriver.ChromeOptions()
op.add_argument('headless')
driver = webdriver.Chrome(
'/usr/lib/chromium-browser/chromedriver', options=op)
driver.get('https://www.apple.com/covid19/mobility')
driver.implicitly_wait(5)
link = driver.find_element_by_xpath('//*[@id="download-card"]/div[2]/a')
href = link.get_attribute('href')
driver.quit()
return href
def get_google_csv():
# scraper to get the link to google's csv
url = 'https://www.google.com/covid19/mobility/'
u = urlopen(url)
try:
html = u.read().decode('utf-8')
finally:
u.close()
soup = BeautifulSoup(html, features="html.parser")
link = soup.select_one('.icon-link')
href = link.get('href')
return href
# produced URIs will start with
BASE_URI = "http://localhost:8000/"
OUTPUT_NAME = "mobility_data"
OUTPUT_FORMAT = "ttl"
# download csv data from apple and google
# google_mobility_trends = pandas.read_csv(get_google_csv())
# apple_mobility_trends = pandas.read_csv(get_apple_csv())
google_mobility_trends = pandas.read_csv('google.csv')
apple_mobility_trends = pandas.read_csv('apple.csv')
g_it_regions = ['Friuli Venezia Giulia', 'Lombardia', 'Sicilia', 'Sardegna', 'Piemonte',
'Valle d\'Aosta', 'Puglia', 'Toscana', 'Trentino Alto Adige']
g_en_regions = ['Friuli-Venezia Giulia', 'Lombardy', 'Sicily', 'Sardinia', 'Piedmont', 'Aosta',
'Apulia', 'Tuscany', 'Trentino-South Tyrol']
a_it_regions = ['Friuli Venezia Giulia', 'Emilia-Romagna']
a_en_regions = ['Friuli venezia Giulia', 'Emilia Romagna']
# keep only italian data
google_mobility_trends = google_mobility_trends[google_mobility_trends['country_region_code'] == 'IT']
apple_mobility_trends = apple_mobility_trends[apple_mobility_trends['country'] == 'Italy']
# create an empty rdf graph and set the proper ontologies
g = Graph()
italy = Namespace(BASE_URI + "italy.ttl#")
obs = Namespace(BASE_URI + "observation.ttl#")
mob = Namespace(BASE_URI + "mobility.ttl#")
g.bind("mob", mob)
g.bind("obs", obs)
g.bind("italy", italy)
for _, google_row in progressbar.progressbar(google_mobility_trends.iterrows(), max_value=len(google_mobility_trends.index)):
if pandas.isnull(google_row.sub_region_1):
continue
# translate google region names to italian
region_name = google_row.sub_region_1
if region_name in g_en_regions:
region_name = g_it_regions[g_en_regions.index(google_row.sub_region_1)]
# and then to apple's english version, so that we can put the two together
apple_region_name = region_name
if region_name in a_it_regions:
apple_region_name = a_en_regions[a_it_regions.index(region_name)]
# keep the apple row(s) with the same google region and date
apple_rows = apple_mobility_trends.loc[(apple_mobility_trends['alternative_name'] == apple_region_name) |
(apple_mobility_trends['region'] == apple_region_name + ' Region')]
apple_rows = apple_rows[['transportation_type', google_row.date]]
# create new instances
uri_region = URIRef(BASE_URI + "region/" +
urify(region_name))
uri_observation = URIRef(BASE_URI + "observation/" +
urify(google_row.date))
g.add([uri_region, RDF.type, italy.Region])
g.add([uri_observation, RDF.type, obs.Observation])
# set data for the given observation
blank = BNode()
g.add([uri_observation, obs.date, Literal(google_row.date)])
g.add([uri_observation, obs.of, blank])
g.add([blank, RDF.type, mob.MobilityObservation])
g.add([blank, mob.place, uri_region])
g.add([blank, mob.retail_recreation, Literal(
google_row.retail_and_recreation_percent_change_from_baseline)])
g.add([blank, mob.grocery_pharmacy, Literal(
google_row.grocery_and_pharmacy_percent_change_from_baseline)])
g.add([blank, mob.parks, Literal(
google_row.parks_percent_change_from_baseline)])
g.add([blank, mob.transit_stations, Literal(
google_row.transit_stations_percent_change_from_baseline)])
g.add([blank, mob.workplaces, Literal(
google_row.workplaces_percent_change_from_baseline)])
g.add([blank, mob.residential, Literal(
google_row.residential_percent_change_from_baseline)])
apple_walking = apple_rows.loc[apple_rows['transportation_type'] == 'walking']
apple_driving = apple_rows.loc[apple_rows['transportation_type'] == 'driving']
apple_transit = apple_rows.loc[apple_rows['transportation_type'] == 'transit']
if apple_walking.empty == False:
g.add([blank, mob.walking, Literal(
apple_walking[google_row.date].values[0])])
if apple_driving.empty == False:
g.add([blank, mob.driving, Literal(
apple_driving[google_row.date].values[0])])
if apple_transit.empty == False:
g.add([blank, mob.transit, Literal(
apple_transit[google_row.date].values[0])])
g.serialize(destination='./' + OUTPUT_NAME + '.' +
OUTPUT_FORMAT, format=OUTPUT_FORMAT)