Skip to content

Commit 2592849

Browse files
authored
Fix indices when reading Excel files in parallel (#2526)
Signed-off-by: Vasilij Litvinov <vasilij.n.litvinov@intel.com>
1 parent e76366b commit 2592849

File tree

3 files changed, with 15 additions and 0 deletions.

modin/engines/base/io/text/excel_dispatcher.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ def _read(cls, io, **kwargs):
  76   76    # close only if it were us who opened the object
  77   77    io_file.close()
  78   78
       79 +  pandas_kw = dict(kwargs) # preserve original kwargs
  79   80    with ZipFile(io) as z:
  80   81    from io import BytesIO
  81   82

@@ -129,6 +130,13 @@ def _read(cls, io, **kwargs):
 129  130    # Remove column names that are specified as `index_col`
 130  131    if index_col is not None:
 131  132    column_names = column_names.drop(column_names[index_col])
      133 +
      134 +  if not all(column_names):
      135 +  # some column names are empty, use pandas reader to take the names from it
      136 +  pandas_kw["nrows"] = 1
      137 +  df = pandas.read_excel(io, **pandas_kw)
      138 +  column_names = df.columns
      139 +
 132  140    # Compute partition metadata upfront so it is uniform for all partitions
 133  141    chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)
 134  142    num_splits = min(len(column_names), num_partitions)
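modin/pandas/test/data/test_emptyline.xlsx (file name inferred from the new test below; the original heading was lost in extraction)

10.5 KB
Binary file not shown.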

modin/pandas/test/test_io.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1325,6 +1325,13 @@ def test_from_excel_sheetname_title():
1325 1325    df_equals(modin_df, pandas_df)
1326 1326
1327 1327
     1328 +  @check_file_leaks
     1329 +  def test_excel_empty_line():
     1330 +  path = "modin/pandas/test/data/test_emptyline.xlsx"
     1331 +  modin_df = pd.read_excel(path)
     1332 +  assert str(modin_df)
     1333 +
     1334 +
1328 1335    @pytest.mark.parametrize(
1329 1336    "sheet_name",
1330 1337    ["Sheet1", "AnotherSpecialName", "SpecialName", "SecondSpecialName", 0, 1, 2, 3],

0 commit comments

Comments
 (0)