Commit 8ddb863

910 Handle pandas read excel engines (#940)
- Newer pandas versions use the calamine engine (which is faster) instead of openpyxl

Co-authored-by: annawendler <106674756+annawendler@users.noreply.github.com>
1 parent 870a8b1 commit 8ddb863
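
The core of the change is a version-gated engine choice that all Excel reads then share. A minimal standalone sketch of that pattern (not MEmilio code; the helper name pick_excel_engine and the sample file name are illustrative only):

import pandas as pd
from pkg_resources import parse_version


def pick_excel_engine():
    # the calamine engine only exists in pandas >= 2.2 (and needs the
    # python-calamine package); older pandas falls back to openpyxl
    if parse_version(pd.__version__) < parse_version('2.2'):
        return 'openpyxl'
    return 'calamine'


# e.g.: df = pd.read_excel('counties.xlsx', sheet_name=1, header=5,
#                          engine=pick_excel_engine())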

8 files changed: +34 -21 lines changed

pycode/memilio-epidata/README.rst

Lines changed: 1 addition & 0 deletions

@@ -50,6 +50,7 @@ Required python packages:
 - matplotlib
 - tables
 - numpy>=1.22,<1.25
+- pyarrow
 - openpyxl
 - xlrd
 - requests

pycode/memilio-epidata/memilio/epidata/download_config.conf

Lines changed: 1 addition & 1 deletion

@@ -31,4 +31,4 @@ path_to_use = default
 no_raw = False
 
 # matplotlib backend to use
-mpl_backend = TkAgg
+mpl_backend = QtAgg

pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py

Lines changed: 1 addition & 1 deletion

@@ -320,7 +320,7 @@ def get_official_county_table():
     file = gd.download_file(url_counties, 1024, None,
                             p.set_progress, verify=False)
     county_table = pd.read_excel(
-        file, sheet_name=1, header=5, engine='openpyxl')
+        file, sheet_name=1, header=5, engine=gd.Conf.excel_engine)
     rename_kreise_deu_dict = {
         1: dd.EngEng['idCounty'],
         '2': "type",  # name not important, column not used so far

pycode/memilio-epidata/memilio/epidata/getCaseDatawithEstimations.py

Lines changed: 6 additions & 6 deletions

@@ -80,14 +80,14 @@ def get_case_data_with_estimations(
 
     # get case data
     gcd.get_case_data(
-        read_data, file_format, out_folder, no_raw, start_date, end_date,
-        impute_dates, moving_average, make_plot, split_berlin,
-        rep_date)
+        read_data=read_data, file_format=file_format, out_folder=out_folder, no_raw=no_raw, start_date=start_date, end_date=end_date,
+        impute_dates=impute_dates, moving_average=moving_average, make_plot=make_plot, split_berlin=split_berlin,
+        rep_date=rep_date)
 
     # get data from John Hopkins University
     gjd.get_jh_data(
-        read_data, file_format, out_folder, no_raw, start_date, end_date,
-        impute_dates, moving_average, make_plot)
+        read_data=read_data, file_format=file_format, out_folder=out_folder, no_raw=no_raw, start_date=start_date, end_date=end_date,
+        impute_dates=impute_dates, moving_average=moving_average, make_plot=make_plot)
 
     # Now we now which data is generated and we can use it
     # read in jh data

@@ -393,7 +393,7 @@ def download_weekly_deaths_numbers(sheet_names, data_path):
     # Since sheet_names is a list of names get file returns a dict
     # with sheet_names as keys and their corresponding dataframes as values.
     df_dict = gd.get_file(filepath=data_path + name_file + '.json', url=url, read_data=False,
-                          param_dict={'sheet_name': sheet_names, 'header': 0, 'engine': 'openpyxl'})
+                          param_dict={'sheet_name': sheet_names, 'header': 0, 'engine': gd.Conf.excel_engine})
 
     return df_dict
 
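
The first hunk above only switches the two calls to keyword arguments; with that many parameters, keywords keep each value bound to the intended parameter even if the signature order ever changes. A toy illustration (hypothetical function, not MEmilio code):

def fetch(read_data, file_format, out_folder, no_raw):
    print(read_data, file_format, out_folder, no_raw)

# positional: breaks silently if the parameter order changes
fetch(False, 'json', 'out/', True)

# keyword: order-independent and self-documenting
fetch(read_data=False, file_format='json', out_folder='out/', no_raw=True)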

pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py

Lines changed: 9 additions & 1 deletion

@@ -41,6 +41,7 @@
 from io import BytesIO
 from zipfile import ZipFile
 from enum import Enum
+from pkg_resources import parse_version
 
 import pandas as pd
 

@@ -63,6 +64,12 @@ class Conf:
 
     v_level = 'Info'
     show_progr = False
+    if parse_version(pd.__version__) < parse_version('2.2'):
+        excel_engine = 'openpyxl'
+    else:
+        # calamine is faster, but cannot be used for pandas < 2.2
+        # also there are issues with pd >= 2.2 and openpyxl engine
+        excel_engine = 'calamine'
 
     def __init__(self, out_folder, **kwargs):
 

@@ -250,7 +257,8 @@ def get_file(
 
     @return pandas dataframe
     """
-    param_dict_excel = {"sheet_name": 0, "header": 0, "engine": 'openpyxl'}
+    param_dict_excel = {"sheet_name": 0,
+                        "header": 0, "engine": Conf.excel_engine}
     param_dict_csv = {"sep": ',', "header": 0, "encoding": None, 'dtype': None}
     param_dict_zip = {}
 
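
Downstream modules then pass the class attribute straight to pandas, as the remaining files in this commit do. A minimal sketch (placeholder file name; the import mirrors the gd alias used in the other memilio.epidata modules):

from memilio.epidata import getDataIntoPandasDataFrame as gd
import pandas as pd

# 'some_table.xlsx' is a placeholder; any .xlsx readable by the chosen engine works
df = pd.read_excel('some_table.xlsx', sheet_name=0, header=0,
                   engine=gd.Conf.excel_engine)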

pycode/memilio-epidata/memilio/epidata/getNPIData.py

Lines changed: 6 additions & 6 deletions

@@ -238,12 +238,12 @@ def read_files(directory, fine_resolution, run_checks):
             df_npis_desc = pd.read_excel(
                 os.path.join(
                     directory, 'datensatzbeschreibung_massnahmen.xlsx'),
-                sheet_name=2, engine='openpyxl')
+                sheet_name=2, engine=gd.Conf.excel_engine)
         else:
             df_npis_desc = pd.read_excel(
                 os.path.join(
                     directory, 'datensatzbeschreibung_massnahmen.xlsx'),
-                sheet_name=3, engine='openpyxl')
+                sheet_name=3, engine=gd.Conf.excel_engine)
     except FileNotFoundError:
         print_manual_download(
             'datensatzbeschreibung_massnahmen.xlsx',

@@ -256,7 +256,7 @@ def read_files(directory, fine_resolution, run_checks):
         if fine_resolution > 0:
             df_npis_combinations_pre = pd.read_excel(
                 os.path.join(
-                    directory, fname), engine='openpyxl')
+                    directory, fname), engine=gd.Conf.excel_engine)
     except FileNotFoundError:
         raise FileNotFoundError('File ' + fname + ' not found.')
 

@@ -693,7 +693,7 @@ def get_npi_data(fine_resolution=2,
             df_in_valid = pd.read_excel(
                 os.path.join(
                     directory, 'combinations_npis_cleanoutput.xlsx'),
-                sheet_name=i, engine='openpyxl')
+                sheet_name=i, engine=gd.Conf.excel_engine)
             if not df_in_valid.drop(columns='Unnamed: 0').equals(df_out):
                 gd.default_print('Error', 'Error in combination matrix.')
             del df_in_valid

@@ -1400,7 +1400,7 @@ def plot_interaction_matrix(filename, directory):
 
     try:
         codelist = pd.ExcelFile(os.path.join(
-            directory, filename + '.xlsx'), engine='openpyxl').sheet_names
+            directory, filename + '.xlsx'), engine=gd.Conf.excel_engine).sheet_names
     except FileNotFoundError:
         raise FileNotFoundError('File ' + filename + ' not found.')
 

@@ -1419,7 +1419,7 @@ def plot_interaction_matrix(filename, directory):
     for code in codelist:
         df = pd.read_excel(
             os.path.join(directory, filename + '.xlsx'),
-            sheet_name=code, engine='openpyxl')
+            sheet_name=code, engine=gd.Conf.excel_engine)
 
         # remove first column and convert to numpy array
         array_exclusion = df.iloc[:, 1:].to_numpy()

pycode/memilio-epidata/memilio/epidata/getTestingData.py

Lines changed: 6 additions & 2 deletions

@@ -51,8 +51,10 @@ def download_testing_data():
     url = 'https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Daten/Testzahlen-gesamt.xlsx?__blob=publicationFile'
     header = {'User-Agent': 'Mozilla/5.0'}
     r = requests.get(url, headers=header)
+    if r.status_code != 200:  # e.g. 404
+        raise requests.exceptions.HTTPError("HTTPError: "+str(r.status_code))
     with io.BytesIO(r.content) as fh:
-        df = pd.io.excel.ExcelFile(fh, engine='openpyxl')
+        df = pd.io.excel.ExcelFile(fh, engine=gd.Conf.excel_engine)
         sheet_names = df.sheet_names
         df_test[0] = pd.read_excel(
             df, sheet_name=sheet_names[1],

@@ -67,8 +69,10 @@ def download_testing_data():
     url = 'https://ars.rki.de/Docs/SARS_CoV2/Daten/data_wochenbericht.xlsx'
     header = {'User-Agent': 'Mozilla/5.0'}
     r = requests.get(url, headers=header)
+    if r.status_code != 200:  # e.g. 404
+        raise requests.exceptions.HTTPError("HTTPError: "+str(r.status_code))
     with io.BytesIO(r.content) as fh:
-        df = pd.io.excel.ExcelFile(fh, engine='openpyxl')
+        df = pd.io.excel.ExcelFile(fh, engine=gd.Conf.excel_engine)
         sheet_names = df.sheet_names
         df_test[1] = pd.read_excel(df, sheet_name=sheet_names[3], header=[4],
                                    dtype={'Anteil positiv': float})
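
The added status check fails fast instead of handing an HTML error page to the Excel reader. For reference, requests ships a built-in helper with the same effect; a sketch with a placeholder URL, not part of the commit:

import requests

r = requests.get('https://example.org/data.xlsx',
                 headers={'User-Agent': 'Mozilla/5.0'})
r.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx responses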

pycode/memilio-epidata/setup.py

Lines changed: 4 additions & 4 deletions

@@ -77,10 +77,8 @@ def run(self):
     long_description='',
     test_suite='memilio.epidata_test',
     install_requires=[
-        # smaller pandas versions contain a bug that sometimes prevents reading
-        # some excel files (e.g. population or twitter data)
-        # Has to use less than 2.2.0, see Issue #910
-        'pandas>=2.0.0,<2.2.0',
+        # pandas 2.0 is minimum for CoW
+        'pandas>=2.0.0',
         # FutureWarning of pandas that pyarrow will be required in a future release
         'pyarrow',
         'matplotlib',

@@ -94,6 +92,8 @@ def run(self):
         'pyxlsb',
         'wget',
         'twill==3.1',
+        'PyQt6',
+        'python-calamine',
         pymagic
     ],
     extras_require={
