Skip to content

Commit 32cd836

Browse files
patricklnz and HenrZu authored
1123 population data for different years (#1124)
- Population data can now be downloaded for past years, specified with ref_year.
- If an invalid ref_year is used, the newest data is loaded automatically.

Co-authored-by: HenrZu <69154294+HenrZu@users.noreply.github.com>
1 parent 9b2aed4 commit 32cd836

File tree

3 files changed: 85 additions and 41 deletions.

pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,11 @@ def cli(what):
462462
parser.add_argument(
463463
'--files', nargs="*", default='All'
464464
)
465+
if 'ref_year' in what_list:
466+
parser.add_argument(
467+
'--ref-year', default='newest',
468+
help='Considered year.'
469+
)
465470

466471
# add optional download options
467472
if '--no-progress-indicators' in sys.argv:

pycode/memilio-epidata/memilio/epidata/getPopulationData.py

Lines changed: 54 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -39,21 +39,32 @@
3939
pd.options.mode.copy_on_write = True
4040

4141

42-
def read_population_data():
42+
def read_population_data(ref_year):
4343
"""! Reads Population data from regionalstatistik.de
4444
4545
A request is made to regionalstatistik.de and the StringIO is read in as a csv into the dataframe format.
46+
@param ref_year [Default: None] or year (jjjj) convertible to str. Reference year.
4647
@return DataFrame
4748
"""
48-
49-
download_url = 'https://www.regionalstatistik.de/genesis/online?operation=download&code=12411-02-03-4&option=csv'
50-
req = requests.get(download_url)
51-
df_pop_raw = pd.read_csv(io.StringIO(req.text), sep=';', header=6)
52-
53-
return df_pop_raw
54-
55-
56-
def export_population_dataframe(df_pop: pd.DataFrame, directory: str, file_format: str, merge_eisenach: bool):
49+
if ref_year is not None:
50+
try:
51+
download_url = 'https://www.regionalstatistik.de/genesis/online?operation=download&code=12411-02-03-4&option=csv&zeiten=' + \
52+
str(ref_year)
53+
req = requests.get(download_url)
54+
df_pop_raw = pd.read_csv(io.StringIO(req.text), sep=';', header=6)
55+
except pd.errors.ParserError:
56+
gd.default_print('Warning', 'Data for year '+str(ref_year) +
57+
' is not available; downloading newest data instead.')
58+
ref_year = None
59+
if ref_year is None:
60+
download_url = 'https://www.regionalstatistik.de/genesis/online?operation=download&code=12411-02-03-4&option=csv'
61+
req = requests.get(download_url)
62+
df_pop_raw = pd.read_csv(io.StringIO(req.text), sep=';', header=6)
63+
64+
return df_pop_raw, ref_year
65+
66+
67+
def export_population_dataframe(df_pop: pd.DataFrame, directory: str, file_format: str, merge_eisenach: bool, ref_year):
5768
"""! Writes population dataframe into directory with new column names and age groups
5869
5970
@param df_pop Population data DataFrame to be exported
@@ -62,6 +73,7 @@ def export_population_dataframe(df_pop: pd.DataFrame, directory: str, file_forma
6273
@param merge_eisenach Defines whether the counties 'Wartburgkreis'
6374
and 'Eisenach' are listed separately or
6475
combined as one entity 'Wartburgkreis'.
76+
@param ref_year None or year (jjjj) convertible to str. Reference year.
6577
@return exported DataFrame
6678
"""
6779

@@ -111,19 +123,20 @@ def export_population_dataframe(df_pop: pd.DataFrame, directory: str, file_forma
111123

112124
gd.check_dir(directory)
113125

114-
if len(df_pop_export) == 401:
115-
filename = 'county_current_population_dim401'
116-
gd.write_dataframe(df_pop_export, directory, filename, file_format)
117-
118-
if len(df_pop_export) == 400 or merge_eisenach:
126+
if ref_year is None:
119127
filename = 'county_current_population'
128+
else:
129+
filename = 'county_' + str(ref_year) + '_population'
130+
131+
if len(df_pop_export) == 401:
132+
filename = filename + '_dim401'
120133

121-
# Merge Eisenach and Wartburgkreis
122-
df_pop_export = geoger.merge_df_counties_all(
123-
df_pop_export, sorting=[dd.EngEng["idCounty"]],
124-
columns=dd.EngEng["idCounty"])
134+
# Merge Eisenach and Wartburgkreis
135+
df_pop_export = geoger.merge_df_counties_all(
136+
df_pop_export, sorting=[dd.EngEng["idCounty"]],
137+
columns=dd.EngEng["idCounty"])
125138

126-
gd.write_dataframe(df_pop_export, directory, filename, file_format)
139+
gd.write_dataframe(df_pop_export, directory, filename, file_format)
127140

128141
return df_pop_export
129142

@@ -203,23 +216,20 @@ def test_total_population(df_pop, age_cols):
203216
@param df_pop Population Dataframe with all counties
204217
@param age_cols All age groups in DataFrame"""
205218

206-
total_sum_2020 = 83155031
207-
total_sum_2021 = 83237124
208-
total_sum_2022 = 84358845
219+
total_sum_expect = 84e6
209220
total_sum = df_pop[age_cols].sum().sum()
210221

211-
if total_sum == total_sum_2022:
212-
pass
213-
elif total_sum == total_sum_2021:
214-
warnings.warn('Using data of 2021. Newer data is available.')
215-
elif total_sum == total_sum_2020:
216-
warnings.warn('Using data of 2020. Newer data is available.')
217-
else:
218-
raise gd.DataError('Total Population does not match expectation.')
222+
if not isinstance(total_sum, (int, np.integer)):
223+
raise gd.DataError('Unexpected dtypes in Population Data.')
224+
# check if total population is +-5% accurate to 2024 population
225+
if (total_sum > 1.05*total_sum_expect) or (total_sum < 0.95*total_sum_expect):
226+
gd.default_print(
227+
'Warning', 'Total Population does not match expectation.')
219228

220229

221230
def fetch_population_data(read_data: bool = dd.defaultDict['read_data'],
222231
out_folder: str = dd.defaultDict['out_folder'],
232+
ref_year=None,
223233
**kwargs
224234
) -> pd.DataFrame:
225235
"""! Downloads or reads the population data.
@@ -232,6 +242,7 @@ def fetch_population_data(read_data: bool = dd.defaultDict['read_data'],
232242
downloaded. Default defined in defaultDict.
233243
@param out_folder Path to folder where data is written in folder
234244
out_folder/Germany. Default defined in defaultDict.
245+
@param ref_year [Default: None] or year (jjjj) convertible to str. Reference year.
235246
@return DataFrame with adjusted population data for all ages to current level.
236247
"""
237248
conf = gd.Conf(out_folder, **kwargs)
@@ -245,9 +256,9 @@ def fetch_population_data(read_data: bool = dd.defaultDict['read_data'],
245256
directory = os.path.join(out_folder, 'Germany')
246257
gd.check_dir(directory)
247258

248-
df_pop_raw = read_population_data()
259+
df_pop_raw, ref_year = read_population_data(ref_year)
249260

250-
return df_pop_raw
261+
return df_pop_raw, ref_year
251262

252263

253264
def preprocess_population_data(df_pop_raw: pd.DataFrame,
@@ -310,7 +321,8 @@ def preprocess_population_data(df_pop_raw: pd.DataFrame,
310321
def write_population_data(df_pop: pd.DataFrame,
311322
out_folder: str = dd.defaultDict['out_folder'],
312323
file_format: str = dd.defaultDict['file_format'],
313-
merge_eisenach: bool = True
324+
merge_eisenach: bool = True,
325+
ref_year=None
314326
) -> None or pd.DataFrame:
315327
"""! Write the population data into json files
316328
Three kinds of structuring of the data are done.
@@ -324,19 +336,21 @@ def write_population_data(df_pop: pd.DataFrame,
324336
@param merge_eisenach [Default: True] or False. Defines whether the
325337
counties 'Wartburgkreis' and 'Eisenach' are listed separately or
326338
combined as one entity 'Wartburgkreis'.
339+
@param ref_year [Default: None] or year (jjjj) convertible to str. Reference year.
327340
328341
@return None
329342
"""
330343
directory = os.path.join(out_folder, 'Germany')
331344
df_pop_export = export_population_dataframe(
332-
df_pop, directory, file_format, merge_eisenach)
345+
df_pop, directory, file_format, merge_eisenach, ref_year)
333346
return df_pop_export
334347

335348

336349
def get_population_data(read_data: bool = dd.defaultDict['read_data'],
337350
file_format: str = dd.defaultDict['file_format'],
338351
out_folder: str = dd.defaultDict['out_folder'],
339352
merge_eisenach: bool = True,
353+
ref_year=None,
340354
**kwargs
341355
):
342356
"""! Download age-stratified population data for the German counties.
@@ -369,14 +383,16 @@ def get_population_data(read_data: bool = dd.defaultDict['read_data'],
369383
@param merge_eisenach [Default: True] or False. Defines whether the
370384
counties 'Wartburgkreis' and 'Eisenach' are listed separately or
371385
combined as one entity 'Wartburgkreis'.
386+
@param ref_year [Default: None] or year (jjjj) convertible to str. Reference year.
372387
@param username str. Username to sign in at regionalstatistik.de.
373388
@param password str. Password to sign in at regionalstatistik.de.
374389
@return DataFrame with adjusted population data for all ages to current level.
375390
"""
376-
raw_df = fetch_population_data(
391+
raw_df, ref_year = fetch_population_data(
377392
read_data=read_data,
378393
out_folder=out_folder,
379394
file_format=file_format,
395+
ref_year=ref_year,
380396
**kwargs
381397
)
382398
preprocess_df = preprocess_population_data(
@@ -387,7 +403,8 @@ def get_population_data(read_data: bool = dd.defaultDict['read_data'],
387403
df_pop=preprocess_df,
388404
file_format=file_format,
389405
out_folder=out_folder,
390-
merge_eisenach=True
406+
merge_eisenach=True,
407+
ref_year=ref_year
391408
)
392409
return df_pop_export
393410

pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_population_data.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
# limitations under the License.
1919
#############################################################################
2020
import os
21+
import io
2122
import unittest
22-
import configparser
2323
import json
2424
import pandas as pd
2525

26-
from unittest.mock import patch
26+
from unittest.mock import patch, Mock
2727
from pyfakefs import fake_filesystem_unittest
2828

2929
from memilio.epidata import getPopulationData as gpd
@@ -50,7 +50,7 @@ def setUp(self):
5050
def test_export_population_data(self):
5151

5252
result_df = gpd.export_population_dataframe(
53-
self.df_pop, self.path, 'json', True)
53+
self.df_pop, self.path, 'json', True, 'newest')
5454
# check if one file is written
5555
self.assertEqual(len(os.listdir(self.path)), 1)
5656

@@ -64,12 +64,34 @@ def test_export_population_data(self):
6464
'50-64 years', '65-74 years', '>74 years'])
6565

6666
@patch('memilio.epidata.getPopulationData.read_population_data',
67-
return_value=df_pop_raw)
67+
return_value=(df_pop_raw, None))
6868
@patch('memilio.epidata.getPopulationData.assign_population_data', return_value=df_pop)
6969
@patch('memilio.epidata.getPopulationData.test_total_population')
7070
def test_get_population_data_full(self, mock_test, mock_assign, mock_download):
7171
# should not raise any errors
7272
gpd.get_population_data(out_folder=self.path)
73+
# test ref_year
74+
gpd.get_population_data(out_folder=self.path, ref_year=2013)
75+
76+
@patch('io.StringIO')
77+
@patch('pandas.read_csv')
78+
@patch('requests.get')
79+
def test_read_population_data(self, mock_req, mock_pd, mock_io):
80+
# Test a year that does not have population Data. Function should throw a
81+
# warning, and download the newest data (ref_year = None)
82+
test_year = 2000
83+
# Create a mock response object for requests.get()
84+
mock_response = Mock()
85+
mock_response.text = "mocked csv data"
86+
mock_req.return_value = mock_response
87+
# Mock pandas.read_csv to raise a ParserError on the first call and return a DataFrame on the second
88+
mock_pd.side_effect = [pd.errors.ParserError, pd.DataFrame()]
89+
# Mock io.StringIO to return the StringIO object for pandas.read_csv
90+
mock_io.return_value = io.StringIO("mocked csv data")
91+
df, year = gpd.read_population_data(test_year)
92+
# Test results, ref_year should now be None
93+
self.assertTrue(df.empty) # from mock
94+
self.assertIsNone(year)
7395

7496

7597
if __name__ == '__main__':

0 commit comments

Comments
 (0)