Commit 8ddb863

910 Handle pandas read excel engines (#940)
- Newer pandas versions use the calamine engine (which is faster) instead of openpyxl

Co-authored-by: annawendler <106674756+annawendler@users.noreply.github.com>
1 parent 870a8b1 commit 8ddb863
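
The core of the change is a version-gated engine choice that all Excel reads then share. A minimal standalone sketch of that pattern (not MEmilio code; the helper name pick_excel_engine and the sample file name are illustrative only):

import pandas as pd
from pkg_resources import parse_version


def pick_excel_engine():
    # the calamine engine only exists in pandas >= 2.2 (and needs the
    # python-calamine package); older pandas falls back to openpyxl
    if parse_version(pd.__version__) < parse_version('2.2'):
        return 'openpyxl'
    return 'calamine'


# e.g.: df = pd.read_excel('counties.xlsx', sheet_name=1, header=5,
#                          engine=pick_excel_engine())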

8 files changed: +34 -21 lines changed

pycode/memilio-epidata/README.rst

Lines changed: 1 addition & 0 deletions

@@ -50,6 +50,7 @@ Required python packages:
 - matplotlib
 - tables
 - numpy>=1.22,<1.25
+- pyarrow
 - openpyxl
 - xlrd
 - requests

pycode/memilio-epidata/memilio/epidata/download_config.conf

Lines changed: 1 addition & 1 deletion

@@ -31,4 +31,4 @@ path_to_use = default
 no_raw = False
 
 # matplotlib backend to use
-mpl_backend = TkAgg
+mpl_backend = QtAgg

pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py

Lines changed: 1 addition & 1 deletion

@@ -320,7 +320,7 @@ def get_official_county_table():
     file = gd.download_file(url_counties, 1024, None,
                             p.set_progress, verify=False)
     county_table = pd.read_excel(
-        file, sheet_name=1, header=5, engine='openpyxl')
+        file, sheet_name=1, header=5, engine=gd.Conf.excel_engine)
     rename_kreise_deu_dict = {
         1: dd.EngEng['idCounty'],
         '2': "type",  # name not important, column not used so far

pycode/memilio-epidata/memilio/epidata/getCaseDatawithEstimations.py

Lines changed: 6 additions & 6 deletions

@@ -80,14 +80,14 @@ def get_case_data_with_estimations(
 
     # get case data
     gcd.get_case_data(
-        read_data, file_format, out_folder, no_raw, start_date, end_date,
-        impute_dates, moving_average, make_plot, split_berlin,
-        rep_date)
+        read_data=read_data, file_format=file_format, out_folder=out_folder, no_raw=no_raw, start_date=start_date, end_date=end_date,
+        impute_dates=impute_dates, moving_average=moving_average, make_plot=make_plot, split_berlin=split_berlin,
+        rep_date=rep_date)
 
     # get data from John Hopkins University
     gjd.get_jh_data(
-        read_data, file_format, out_folder, no_raw, start_date, end_date,
-        impute_dates, moving_average, make_plot)
+        read_data=read_data, file_format=file_format, out_folder=out_folder, no_raw=no_raw, start_date=start_date, end_date=end_date,
+        impute_dates=impute_dates, moving_average=moving_average, make_plot=make_plot)
 
     # Now we now which data is generated and we can use it
     # read in jh data

@@ -393,7 +393,7 @@ def download_weekly_deaths_numbers(sheet_names, data_path):
     # Since sheet_names is a list of names get file returns a dict
     # with sheet_names as keys and their corresponding dataframes as values.
     df_dict = gd.get_file(filepath=data_path + name_file + '.json', url=url, read_data=False,
-                          param_dict={'sheet_name': sheet_names, 'header': 0, 'engine': 'openpyxl'})
+                          param_dict={'sheet_name': sheet_names, 'header': 0, 'engine': gd.Conf.excel_engine})
 
     return df_dict
 
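
The first hunk above only switches the two calls to keyword arguments; with that many parameters, keywords keep each value bound to the intended parameter even if the signature order ever changes. A toy illustration (hypothetical function, not MEmilio code):

def fetch(read_data, file_format, out_folder, no_raw):
    print(read_data, file_format, out_folder, no_raw)

# positional: breaks silently if the parameter order changes
fetch(False, 'json', 'out/', True)

# keyword: order-independent and self-documenting
fetch(read_data=False, file_format='json', out_folder='out/', no_raw=True)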

pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py

Lines changed: 9 additions & 1 deletion

@@ -41,6 +41,7 @@
 from io import BytesIO
 from zipfile import ZipFile
 from enum import Enum
+from pkg_resources import parse_version
 
 import pandas as pd
 

@@ -63,6 +64,12 @@ class Conf:
 
     v_level = 'Info'
     show_progr = False
+    if parse_version(pd.__version__) < parse_version('2.2'):
+        excel_engine = 'openpyxl'
+    else:
+        # calamine is faster, but cannot be used for pandas < 2.2
+        # also there are issues with pd >= 2.2 and openpyxl engine
+        excel_engine = 'calamine'
 
     def __init__(self, out_folder, **kwargs):
 

@@ -250,7 +257,8 @@ def get_file(
 
     @return pandas dataframe
     """
-    param_dict_excel = {"sheet_name": 0, "header": 0, "engine": 'openpyxl'}
+    param_dict_excel = {"sheet_name": 0,
+                        "header": 0, "engine": Conf.excel_engine}
     param_dict_csv = {"sep": ',', "header": 0, "encoding": None, 'dtype': None}
     param_dict_zip = {}
 
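
Downstream modules then pass the class attribute straight to pandas, as the remaining files in this commit do. A minimal sketch (placeholder file name; the import mirrors the gd alias used in the other memilio.epidata modules):

from memilio.epidata import getDataIntoPandasDataFrame as gd
import pandas as pd

# 'some_table.xlsx' is a placeholder; any .xlsx readable by the chosen engine works
df = pd.read_excel('some_table.xlsx', sheet_name=0, header=0,
                   engine=gd.Conf.excel_engine)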

pycode/memilio-epidata/memilio/epidata/getNPIData.py

Lines changed: 6 additions & 6 deletions

@@ -238,12 +238,12 @@ def read_files(directory, fine_resolution, run_checks):
             df_npis_desc = pd.read_excel(
                 os.path.join(
                     directory, 'datensatzbeschreibung_massnahmen.xlsx'),
-                sheet_name=2, engine='openpyxl')
+                sheet_name=2, engine=gd.Conf.excel_engine)
         else:
             df_npis_desc = pd.read_excel(
                 os.path.join(
                     directory, 'datensatzbeschreibung_massnahmen.xlsx'),
-                sheet_name=3, engine='openpyxl')
+                sheet_name=3, engine=gd.Conf.excel_engine)
     except FileNotFoundError:
         print_manual_download(
             'datensatzbeschreibung_massnahmen.xlsx',

@@ -256,7 +256,7 @@ def read_files(directory, fine_resolution, run_checks):
         if fine_resolution > 0:
             df_npis_combinations_pre = pd.read_excel(
                 os.path.join(
-                    directory, fname), engine='openpyxl')
+                    directory, fname), engine=gd.Conf.excel_engine)
     except FileNotFoundError:
         raise FileNotFoundError('File ' + fname + ' not found.')
 

@@ -693,7 +693,7 @@ def get_npi_data(fine_resolution=2,
             df_in_valid = pd.read_excel(
                 os.path.join(
                     directory, 'combinations_npis_cleanoutput.xlsx'),
-                sheet_name=i, engine='openpyxl')
+                sheet_name=i, engine=gd.Conf.excel_engine)
             if not df_in_valid.drop(columns='Unnamed: 0').equals(df_out):
                 gd.default_print('Error', 'Error in combination matrix.')
             del df_in_valid

@@ -1400,7 +1400,7 @@ def plot_interaction_matrix(filename, directory):
 
     try:
         codelist = pd.ExcelFile(os.path.join(
-            directory, filename + '.xlsx'), engine='openpyxl').sheet_names
+            directory, filename + '.xlsx'), engine=gd.Conf.excel_engine).sheet_names
     except FileNotFoundError:
         raise FileNotFoundError('File ' + filename + ' not found.')
 

@@ -1419,7 +1419,7 @@ def plot_interaction_matrix(filename, directory):
     for code in codelist:
         df = pd.read_excel(
             os.path.join(directory, filename + '.xlsx'),
-            sheet_name=code, engine='openpyxl')
+            sheet_name=code, engine=gd.Conf.excel_engine)
 
         # remove first column and convert to numpy array
         array_exclusion = df.iloc[:, 1:].to_numpy()

pycode/memilio-epidata/memilio/epidata/getTestingData.py

Lines changed: 6 additions & 2 deletions

@@ -51,8 +51,10 @@ def download_testing_data():
     url = 'https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Daten/Testzahlen-gesamt.xlsx?__blob=publicationFile'
     header = {'User-Agent': 'Mozilla/5.0'}
     r = requests.get(url, headers=header)
+    if r.status_code != 200:  # e.g. 404
+        raise requests.exceptions.HTTPError("HTTPError: "+str(r.status_code))
     with io.BytesIO(r.content) as fh:
-        df = pd.io.excel.ExcelFile(fh, engine='openpyxl')
+        df = pd.io.excel.ExcelFile(fh, engine=gd.Conf.excel_engine)
         sheet_names = df.sheet_names
         df_test[0] = pd.read_excel(
             df, sheet_name=sheet_names[1],

@@ -67,8 +69,10 @@ def download_testing_data():
     url = 'https://ars.rki.de/Docs/SARS_CoV2/Daten/data_wochenbericht.xlsx'
     header = {'User-Agent': 'Mozilla/5.0'}
     r = requests.get(url, headers=header)
+    if r.status_code != 200:  # e.g. 404
+        raise requests.exceptions.HTTPError("HTTPError: "+str(r.status_code))
     with io.BytesIO(r.content) as fh:
-        df = pd.io.excel.ExcelFile(fh, engine='openpyxl')
+        df = pd.io.excel.ExcelFile(fh, engine=gd.Conf.excel_engine)
         sheet_names = df.sheet_names
         df_test[1] = pd.read_excel(df, sheet_name=sheet_names[3], header=[4],
                                    dtype={'Anteil positiv': float})
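
The added status check fails fast instead of handing an HTML error page to the Excel reader. For reference, requests ships a built-in helper with the same effect; a sketch with a placeholder URL, not part of the commit:

import requests

r = requests.get('https://example.org/data.xlsx',
                 headers={'User-Agent': 'Mozilla/5.0'})
r.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx responses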

pycode/memilio-epidata/setup.py

Lines changed: 4 additions & 4 deletions

@@ -77,10 +77,8 @@ def run(self):
     long_description='',
     test_suite='memilio.epidata_test',
     install_requires=[
-        # smaller pandas versions contain a bug that sometimes prevents reading
-        # some excel files (e.g. population or twitter data)
-        # Has to use less than 2.2.0, see Issue #910
-        'pandas>=2.0.0,<2.2.0',
+        # pandas 2.0 is minimum for CoW
+        'pandas>=2.0.0',
         # FutureWarning of pandas that pyarrow will be required in a future release
         'pyarrow',
         'matplotlib',

@@ -94,6 +92,8 @@ def run(self):
         'pyxlsb',
         'wget',
         'twill==3.1',
+        'PyQt6',
+        'python-calamine',
         pymagic
     ],
     extras_require={
