1123 population data for different years (#1124)

patricklnz · HenrZu · web-flow · commit 32cd8362465c · 2024-09-25T13:05:58.000+02:00
- Population data can now be downloaded for past years, selected via `ref_year`.
- If no data can be read for the requested `ref_year`, a warning is printed and the newest data is downloaded instead.

Co-authored-by: HenrZu <69154294+HenrZu@users.noreply.github.com>
diff --git a/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py b/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py
@@ -462,6 +462,11 @@ def cli(what):
         parser.add_argument(
             '--files', nargs="*", default='All'
         )
+    if 'ref_year' in what_list:
+        parser.add_argument(
+            '--ref-year', default='newest',
+            help='Considered year.'
+        )
 
     # add optional download options
     if '--no-progress-indicators' in sys.argv:
diff --git a/pycode/memilio-epidata/memilio/epidata/getPopulationData.py b/pycode/memilio-epidata/memilio/epidata/getPopulationData.py
@@ -39,21 +39,32 @@
 pd.options.mode.copy_on_write = True
 
 
-def read_population_data():
+def read_population_data(ref_year):
     """! Reads Population data from regionalstatistik.de
 
     A request is made to regionalstatistik.de and the StringIO is read in as a csv into the dataframe format.
+    @param ref_year None or year (yyyy) convertible to str. Reference year; None requests the newest data.
     @return DataFrame
     """
-
-    download_url = 'https://www.regionalstatistik.de/genesis/online?operation=download&code=12411-02-03-4&option=csv'
-    req = requests.get(download_url)
-    df_pop_raw = pd.read_csv(io.StringIO(req.text), sep=';', header=6)
-
-    return df_pop_raw
-
-
-def export_population_dataframe(df_pop: pd.DataFrame, directory: str, file_format: str, merge_eisenach: bool):
+    if ref_year is not None:
+        try:
+            download_url = 'https://www.regionalstatistik.de/genesis/online?operation=download&code=12411-02-03-4&option=csv&zeiten=' + \
+                str(ref_year)
+            req = requests.get(download_url)
+            df_pop_raw = pd.read_csv(io.StringIO(req.text), sep=';', header=6)
+        except pd.errors.ParserError:
+            gd.default_print('Warning', 'Data for year '+str(ref_year) +
+                             ' is not available; downloading newest data instead.')
+            ref_year = None
+    if ref_year is None:
+        download_url = 'https://www.regionalstatistik.de/genesis/online?operation=download&code=12411-02-03-4&option=csv'
+        req = requests.get(download_url)
+        df_pop_raw = pd.read_csv(io.StringIO(req.text), sep=';', header=6)
+
+    return df_pop_raw, ref_year
+
+
+def export_population_dataframe(df_pop: pd.DataFrame, directory: str, file_format: str, merge_eisenach: bool, ref_year):
     """! Writes population dataframe into directory with new column names and age groups
 
     @param df_pop Population data DataFrame to be exported
@@ -62,6 +73,7 @@ def export_population_dataframe(df_pop: pd.DataFrame, directory: str, file_forma
     @param merge_eisenach Defines whether the counties 'Wartburgkreis'
         and 'Eisenach' are listed separately or
         combined as one entity 'Wartburgkreis'.
+    @param ref_year None or year (jjjj) convertible to str. Reference year.
     @return exported DataFrame
     """
 
@@ -111,19 +123,20 @@ def export_population_dataframe(df_pop: pd.DataFrame, directory: str, file_forma
 
     gd.check_dir(directory)
 
-    if len(df_pop_export) == 401:
-        filename = 'county_current_population_dim401'
-        gd.write_dataframe(df_pop_export, directory, filename, file_format)
-
-    if len(df_pop_export) == 400 or merge_eisenach:
+    if ref_year is None:
         filename = 'county_current_population'
+    else:
+        filename = 'county_' + str(ref_year) + '_population'
+
+    if len(df_pop_export) == 401:
+        filename = filename + '_dim401'
 
-        # Merge Eisenach and Wartburgkreis
-        df_pop_export = geoger.merge_df_counties_all(
-            df_pop_export, sorting=[dd.EngEng["idCounty"]],
-            columns=dd.EngEng["idCounty"])
+    # Merge Eisenach and Wartburgkreis
+    df_pop_export = geoger.merge_df_counties_all(
+        df_pop_export, sorting=[dd.EngEng["idCounty"]],
+        columns=dd.EngEng["idCounty"])
 
-        gd.write_dataframe(df_pop_export, directory, filename, file_format)
+    gd.write_dataframe(df_pop_export, directory, filename, file_format)
 
     return df_pop_export
 
@@ -203,23 +216,20 @@ def test_total_population(df_pop, age_cols):
     @param df_pop Population Dataframe with all counties
     @param age_cols All age groups in DataFrame"""
 
-    total_sum_2020 = 83155031
-    total_sum_2021 = 83237124
-    total_sum_2022 = 84358845
+    total_sum_expect = 84e6
     total_sum = df_pop[age_cols].sum().sum()
 
-    if total_sum == total_sum_2022:
-        pass
-    elif total_sum == total_sum_2021:
-        warnings.warn('Using data of 2021. Newer data is available.')
-    elif total_sum == total_sum_2020:
-        warnings.warn('Using data of 2020. Newer data is available.')
-    else:
-        raise gd.DataError('Total Population does not match expectation.')
+    if not isinstance(total_sum, (int, np.integer)):
+        raise gd.DataError('Unexpected dtypes in Population Data.')
+    # check if total population is +-5% accurate to 2024 population
+    if (total_sum > 1.05*total_sum_expect) or (total_sum < 0.95*total_sum_expect):
+        gd.default_print(
+            'Warning', 'Total Population does not match expectation.')
 
 
 def fetch_population_data(read_data: bool = dd.defaultDict['read_data'],
                           out_folder: str = dd.defaultDict['out_folder'],
+                          ref_year=None,
                           **kwargs
                           ) -> pd.DataFrame:
     """! Downloads or reads the population data.
@@ -232,6 +242,7 @@ def fetch_population_data(read_data: bool = dd.defaultDict['read_data'],
         downloaded. Default defined in defaultDict.
     @param out_folder Path to folder where data is written in folder
         out_folder/Germany. Default defined in defaultDict.
+    @param ref_year [Default: None] or year (jjjj) convertible to str. Reference year.
     @return DataFrame with adjusted population data for all ages to current level.
     """
     conf = gd.Conf(out_folder, **kwargs)
@@ -245,9 +256,9 @@ def fetch_population_data(read_data: bool = dd.defaultDict['read_data'],
     directory = os.path.join(out_folder, 'Germany')
     gd.check_dir(directory)
 
-    df_pop_raw = read_population_data()
+    df_pop_raw, ref_year = read_population_data(ref_year)
 
-    return df_pop_raw
+    return df_pop_raw, ref_year
 
 
 def preprocess_population_data(df_pop_raw: pd.DataFrame,
@@ -310,7 +321,8 @@ def preprocess_population_data(df_pop_raw: pd.DataFrame,
 def write_population_data(df_pop: pd.DataFrame,
                           out_folder: str = dd.defaultDict['out_folder'],
                           file_format: str = dd.defaultDict['file_format'],
-                          merge_eisenach: bool = True
+                          merge_eisenach: bool = True,
+                          ref_year=None
                           ) -> None or pd.DataFrame:
     """! Write the population data into json files
     Three kinds of structuring of the data are done.
@@ -324,19 +336,21 @@ def write_population_data(df_pop: pd.DataFrame,
     @param merge_eisenach [Default: True] or False. Defines whether the
         counties 'Wartburgkreis' and 'Eisenach' are listed separately or
         combined as one entity 'Wartburgkreis'.
+    @param ref_year [Default: None] or year (jjjj) convertible to str. Reference year.
 
     @return None
     """
     directory = os.path.join(out_folder, 'Germany')
     df_pop_export = export_population_dataframe(
-        df_pop, directory, file_format, merge_eisenach)
+        df_pop, directory, file_format, merge_eisenach, ref_year)
     return df_pop_export
 
 
 def get_population_data(read_data: bool = dd.defaultDict['read_data'],
                         file_format: str = dd.defaultDict['file_format'],
                         out_folder: str = dd.defaultDict['out_folder'],
                         merge_eisenach: bool = True,
+                        ref_year=None,
                         **kwargs
                         ):
     """! Download age-stratified population data for the German counties.
@@ -369,14 +383,16 @@ def get_population_data(read_data: bool = dd.defaultDict['read_data'],
     @param merge_eisenach [Default: True] or False. Defines whether the
         counties 'Wartburgkreis' and 'Eisenach' are listed separately or
         combined as one entity 'Wartburgkreis'.
+    @param ref_year [Default: None] or year (jjjj) convertible to str. Reference year.
     @param username str. Username to sign in at regionalstatistik.de.
     @param password str. Password to sign in at regionalstatistik.de.
     @return DataFrame with adjusted population data for all ages to current level.
     """
-    raw_df = fetch_population_data(
+    raw_df, ref_year = fetch_population_data(
         read_data=read_data,
         out_folder=out_folder,
         file_format=file_format,
+        ref_year=ref_year,
         **kwargs
     )
     preprocess_df = preprocess_population_data(
@@ -387,7 +403,8 @@ def get_population_data(read_data: bool = dd.defaultDict['read_data'],
         df_pop=preprocess_df,
         file_format=file_format,
         out_folder=out_folder,
-        merge_eisenach=True
+        merge_eisenach=True,
+        ref_year=ref_year
     )
     return df_pop_export
 
diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_population_data.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_get_population_data.py
@@ -18,12 +18,12 @@
 # limitations under the License.
 #############################################################################
 import os
+import io
 import unittest
-import configparser
 import json
 import pandas as pd
 
-from unittest.mock import patch
+from unittest.mock import patch, Mock
 from pyfakefs import fake_filesystem_unittest
 
 from memilio.epidata import getPopulationData as gpd
@@ -50,7 +50,7 @@ def setUp(self):
     def test_export_population_data(self):
 
         result_df = gpd.export_population_dataframe(
-            self.df_pop, self.path, 'json', True)
+            self.df_pop, self.path, 'json', True, 'newest')
         # check if one file is written
         self.assertEqual(len(os.listdir(self.path)), 1)
 
@@ -64,12 +64,34 @@ def test_export_population_data(self):
                                                        '50-64 years', '65-74 years', '>74 years'])
 
     @patch('memilio.epidata.getPopulationData.read_population_data',
-           return_value=df_pop_raw)
+           return_value=(df_pop_raw, None))
     @patch('memilio.epidata.getPopulationData.assign_population_data', return_value=df_pop)
     @patch('memilio.epidata.getPopulationData.test_total_population')
     def test_get_population_data_full(self, mock_test, mock_assign, mock_download):
         # should not raise any errors
         gpd.get_population_data(out_folder=self.path)
+        # test ref_year
+        gpd.get_population_data(out_folder=self.path, ref_year=2013)
+
+    @patch('io.StringIO')
+    @patch('pandas.read_csv')
+    @patch('requests.get')
+    def test_read_population_data(self, mock_req, mock_pd, mock_io):
+        # Test a year that does not have population Data. Function should throw a
+        # warning, and download the newest data (ref_year = None)
+        test_year = 2000
+        # Create a mock response object for requests.get()
+        mock_response = Mock()
+        mock_response.text = "mocked csv data"
+        mock_req.return_value = mock_response
+        # Mock pandas.read_csv to raise a ParserError on the first call and return a DataFrame on the second
+        mock_pd.side_effect = [pd.errors.ParserError, pd.DataFrame()]
+        # Mock io.StringIO to return the StringIO object for pandas.read_csv
+        mock_io.return_value = io.StringIO("mocked csv data")
+        df, year = gpd.read_population_data(test_year)
+        # Test results, ref_year should now be None
+        self.assertTrue(df.empty)  # from mock
+        self.assertIsNone(year)
 
 
 if __name__ == '__main__':