Skip to content

Commit f047ca3

Browse files
authored
OECD import scripts (datacommonsorg#867)
* oecd scripts * fix * update readme
1 parent ae0ce68 commit f047ca3

File tree

5 files changed

+716
-0
lines changed

5 files changed

+716
-0
lines changed

scripts/oecd/sdmx/README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# OECD Bulk Import
2+
3+
This folder contains scripts for the bulk OECD import. This is currently a schemaless import.
4+
5+
Note: This was a very quick first-pass attempt to get some data in, so it only contains a few hundred of the OECD datasets that follow a specific format.
6+
7+
TODO(nhdiaz): Add tests / get remaining data.
8+
9+
To download data:
10+
```
11+
OPENSSL_CONF=openssl.cnf python3 download.py
12+
```
13+
14+
To process data and generate artifacts:
15+
```
16+
python3 process.py
17+
```
18+
19+
## SDMX
20+
21+
OECD uses the SDMX format for their data. We have translated this into the Data Commons data model as follows:
22+
23+
* The LOCATION dimension is used for observationAbout
24+
* The TIME_PERIOD dimension is used for observationDate
25+
* The TIME_FORMAT attribute is used for observationPeriod
26+
* The UNIT attribute is used for unit
27+
* The POWERCODE attribute is used for scalingFactor
28+
* The OBS_STATUS (observation level) attribute is used for measurementMethod
29+
* All other dimensions and attributes are added to the stat var definition
30+
* The series is used as the parent SVG

scripts/oecd/sdmx/countries

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
ABW
2+
AFG
3+
AFI
4+
AGO
5+
AIA
6+
ALB
7+
AND
8+
ANT
9+
ARE
10+
ARG
11+
ARM
12+
ASM
13+
ATA
14+
ATB
15+
ATF
16+
ATG
17+
ATN
18+
AUS
19+
AUT
20+
AZE
21+
BDI
22+
BEL
23+
BEN
24+
BES
25+
BFA
26+
BGD
27+
BGR
28+
BHR
29+
BHS
30+
BIH
31+
BLM
32+
BLR
33+
BLZ
34+
BMU
35+
BOL
36+
BRA
37+
BRB
38+
BRN
39+
BTN
40+
BVT
41+
BWA
42+
BYS
43+
CAF
44+
CAN
45+
CCK
46+
CHE
47+
CHL
48+
CHN
49+
CIV
50+
CMR
51+
COD
52+
COG
53+
COK
54+
COL
55+
COM
56+
CPV
57+
CRI
58+
CSK
59+
CTE
60+
CUB
61+
CUW
62+
CXR
63+
CYM
64+
CYP
65+
CZE
66+
DDR
67+
DEU
68+
DHY
69+
DJI
70+
DMA
71+
DNK
72+
DOM
73+
DZA
74+
ECU
75+
EGY
76+
ERI
77+
ESH
78+
ESP
79+
EST
80+
ETH
81+
FIN
82+
FJI
83+
FLK
84+
FRA
85+
FRO
86+
FSM
87+
FXX
88+
GAB
89+
GBR
90+
GEL
91+
GEO
92+
GGY
93+
GHA
94+
GIB
95+
GIN
96+
GLP
97+
GMB
98+
GNB
99+
GNQ
100+
GRC
101+
GRD
102+
GRL
103+
GTM
104+
GUF
105+
GUM
106+
GUY
107+
HKG
108+
HMD
109+
HND
110+
HRV
111+
HTI
112+
HUN
113+
HVO
114+
IDN
115+
IMN
116+
IND
117+
IOT
118+
IRL
119+
IRN
120+
IRQ
121+
ISL
122+
ISR
123+
ITA
124+
JAM
125+
JEY
126+
JOR
127+
JPN
128+
KAZ
129+
KEN
130+
KGZ
131+
KHM
132+
KIR
133+
KNA
134+
KOR
135+
KWT
136+
LAO
137+
LBN
138+
LBR
139+
LBY
140+
LCA
141+
LIE
142+
LKA
143+
LSO
144+
LTU
145+
LUX
146+
LVA
147+
MAC
148+
MAF
149+
MAR
150+
MCO
151+
MDA
152+
MDG
153+
MDV
154+
MEX
155+
MHL
156+
MKD
157+
MLI
158+
MLT
159+
MMR
160+
MNE
161+
MNG
162+
MNP
163+
MOZ
164+
MRT
165+
MSR
166+
MTQ
167+
MUS
168+
MWI
169+
MYS
170+
MYT
171+
NAM
172+
NCL
173+
NER
174+
NFK
175+
NGA
176+
NHB
177+
NIC
178+
NIU
179+
NLD
180+
NOR
181+
NPL
182+
NRU
183+
NZL
184+
OMN
185+
PAK
186+
PAN
187+
PCN
188+
PER
189+
PHL
190+
PLW
191+
PNG
192+
POL
193+
PRI
194+
PRK
195+
PRT
196+
PRY
197+
PSE
198+
PYF
199+
QAT
200+
REU
201+
ROU
202+
RUS
203+
RWA
204+
SAU
205+
SCG
206+
SDN
207+
SEN
208+
SGP
209+
SGS
210+
SHN
211+
SJM
212+
SLB
213+
SLE
214+
SLV
215+
SMR
216+
SOM
217+
SPM
218+
SRB
219+
SSD
220+
STP
221+
SUN
222+
SUR
223+
SVK
224+
SVN
225+
SWE
226+
SWZ
227+
SXM
228+
SYC
229+
SYR
230+
TCA
231+
TCD
232+
TGO
233+
THA
234+
TJK
235+
TKL
236+
TKM
237+
TLS
238+
TON
239+
TTO
240+
TUN
241+
TUR
242+
TUV
243+
TWN
244+
TZA
245+
UGA
246+
UKR
247+
UMI
248+
URY
249+
USA
250+
UZB
251+
VAT
252+
VCT
253+
VEN
254+
VGB
255+
VIR
256+
VNM
257+
VUT
258+
WLF
259+
WSM
260+
XKS
261+
YEM
262+
YUG
263+
ZAF
264+
ZMB
265+
ZWE

scripts/oecd/sdmx/download.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
'''
15+
Bulk downloads OECD datasets.
16+
17+
Note: this currently is a "best-effort" single pass and will just skip any
18+
datasets with errors or failures.
19+
20+
Produces:
21+
* identifiers.csv: Series codes and names.
22+
* input/<CODE>.json: Folder of all fetched input datasets.
23+
24+
Usage: OPENSSL_CONF=openssl.cnf python3 download.py
25+
'''
26+
import csv
27+
import json
28+
import requests
29+
import os
30+
from xml.etree import ElementTree
31+
32+
# Leading part of the '{namespace}tag' names that ElementTree reports for
# SDMX 2.0 elements. It is deliberately left unterminated: the schema name and
# tag are appended at the point of use, e.g. PREFIX + 'structure}KeyFamily'.
PREFIX = '{http://www.SDMX.org/resources/SDMXML/schemas/v2_0/'
33+
34+
35+
def get_series(timeout=(10, 300)):
    '''Gets all series.

    Requests the full key family listing from stats.oecd.org and collects,
    for every KeyFamily element that has an id, the text of its
    English-language child element.

    Args:
        timeout: Timeout handed to requests.get, either a number of seconds
            or a (connect, read) tuple. Without one a stalled connection
            would block the script forever.

    Returns:
        Map of series code -> name.

    Raises:
        requests.RequestException: If the request fails, times out or comes
            back with an HTTP error status.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    '''
    lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    key_family_tag = PREFIX + 'structure}KeyFamily'

    response = requests.get(
        'https://stats.oecd.org/RestSDMX/sdmx.ashx/GetKeyFamily/all',
        timeout=timeout)
    # Fail loudly on an HTTP error rather than handing an error page to the
    # XML parser or silently returning an empty map.
    response.raise_for_status()

    series = {}
    root = ElementTree.fromstring(response.content)
    for child in root:
        for key in child:
            if key.tag != key_family_tag or 'id' not in key.attrib:
                continue
            for attr in key:
                # Every English-language child assigns the name, so if a key
                # family has several of them the last one wins.
                if attr.attrib.get(lang_attr) == 'en':
                    series[key.attrib['id']] = attr.text
    return series
53+
54+
55+
if __name__ == '__main__':

    # Fetch the series list before opening identifiers.csv for writing, so a
    # failed request does not leave a truncated file behind.
    series = get_series()
    # newline='' is what the csv module expects for files it reads or writes.
    with open('identifiers.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        for s in sorted(series):
            writer.writerow([s, series[s]])

    os.makedirs('input', exist_ok=True)

    with open('identifiers.csv', newline='') as f_in:
        reader = csv.reader(f_in)
        for row in reader:
            if not row:
                # Ignore blank lines instead of failing on row[0].
                continue
            identifier = row[0]
            print(identifier)

            # Best-effort download: a dataset that cannot be fetched or parsed
            # is reported and skipped. Only the expected failure types are
            # caught so that Ctrl-C still stops the run.
            try:
                result = requests.get(
                    f'http://stats.oecd.org/sdmx-json/data/{identifier}/all/all',
                    timeout=(10, 300))
            except requests.RequestException:
                print('Error fetching data for:', identifier)
                continue

            # Parse before opening the output file so that an unparseable
            # response does not create an empty input/<CODE>.json.
            try:
                data = result.json()
            except ValueError:
                print('Error parsing:', identifier)
                continue

            try:
                with open(f'input/{identifier}.json', 'w') as f_out:
                    f_out.write(json.dumps(data))
            except OSError:
                print('Error fetching data for:', identifier)

scripts/oecd/sdmx/openssl.cnf

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
openssl_conf = openssl_init
2+
3+
[openssl_init]
4+
ssl_conf = ssl_sect
5+
6+
[ssl_sect]
7+
system_default = system_default_sect
8+
9+
[system_default_sect]
10+
Options = UnsafeLegacyRenegotiation

0 commit comments

Comments
 (0)