Skip to content

Commit d710a16

Browse files
authored
Ensure excel reader closes file if it is passed as path (#2514)
Signed-off-by: Vasilij Litvinov <vasilij.n.litvinov@intel.com>
1 parent 031f444 commit d710a16

File tree

4 files changed

+75
-8
lines changed

4 files changed

+75
-8
lines changed

modin/config/envvars.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# governing permissions and limitations under the License.
1313

1414
import os
15+
import sys
1516
from textwrap import dedent
1617
import warnings
1718
from packaging import version
@@ -208,6 +209,18 @@ class TestDatasetSize(EnvironmentVariable, type=str):
208209
choices = ("Small", "Normal", "Big")
209210

210211

212+
class TrackFileLeaks(EnvironmentVariable, type=bool):
213+
"""
214+
Whether to track for open file handles leakage during testing
215+
"""
216+
217+
varname = "MODIN_TEST_TRACK_FILE_LEAKS"
218+
# Turn off tracking on Windows by default because
219+
# psutil's open_files() can be extremely slow on Windows (up to adding a few hours).
220+
# see https://github.com/giampaolo/psutil/pull/597
221+
default = sys.platform != "win32"
222+
223+
211224
def _check_vars():
212225
"""
213226
Look out for any environment variables that start with "MODIN_" prefix

modin/engines/base/io/text/excel_dispatcher.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ def _read(cls, io, **kwargs):
4141
return cls.single_worker_read(io, **kwargs)
4242

4343
from zipfile import ZipFile
44-
from openpyxl import load_workbook
4544
from openpyxl.worksheet.worksheet import Worksheet
4645
from openpyxl.worksheet._reader import WorksheetReader
4746
from openpyxl.reader.excel import ExcelReader
@@ -59,12 +58,23 @@ def _read(cls, io, **kwargs):
5958
"Parallel `read_excel` is a new feature! Please email "
6059
"bug_reports@modin.org if you run into any problems."
6160
)
62-
wb = load_workbook(filename=io, read_only=True)
63-
# Get shared strings
64-
ex = ExcelReader(io, read_only=True)
65-
ex.read_manifest()
66-
ex.read_strings()
67-
ws = Worksheet(wb)
61+
62+
# NOTE: ExcelReader() in read-only mode does not close file handle by itself
63+
# work around that by passing file object if we received some path
64+
io_file = open(io, "rb") if isinstance(io, str) else io
65+
try:
66+
ex = ExcelReader(io_file, read_only=True)
67+
ex.read()
68+
wb = ex.wb
69+
70+
# Get shared strings
71+
ex.read_manifest()
72+
ex.read_strings()
73+
ws = Worksheet(wb)
74+
finally:
75+
if isinstance(io, str):
76+
# close only if it were us who opened the object
77+
io_file.close()
6878

6979
with ZipFile(io) as z:
7080
from io import BytesIO

modin/pandas/test/test_io.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import csv
2929

3030
from .utils import (
31+
check_file_leaks,
3132
df_equals,
3233
json_short_string,
3334
json_short_bytes,
@@ -1260,6 +1261,7 @@ def test_from_clipboard():
12601261

12611262

12621263
@pytest.mark.xfail(reason="read_excel is broken for now, see #1733 for details")
1264+
@check_file_leaks
12631265
def test_from_excel():
12641266
setup_excel_file(NROWS)
12651267

@@ -1271,6 +1273,7 @@ def test_from_excel():
12711273
teardown_excel_file()
12721274

12731275

1276+
@check_file_leaks
12741277
def test_from_excel_engine():
12751278
setup_excel_file(NROWS)
12761279

@@ -1283,6 +1286,7 @@ def test_from_excel_engine():
12831286
teardown_excel_file()
12841287

12851288

1289+
@check_file_leaks
12861290
def test_from_excel_index_col():
12871291
setup_excel_file(NROWS)
12881292

@@ -1295,6 +1299,7 @@ def test_from_excel_index_col():
12951299
teardown_excel_file()
12961300

12971301

1302+
@check_file_leaks
12981303
def test_from_excel_all_sheets():
12991304
setup_excel_file(NROWS)
13001305

@@ -1312,6 +1317,7 @@ def test_from_excel_all_sheets():
13121317
teardown_excel_file()
13131318

13141319

1320+
@check_file_leaks
13151321
def test_from_excel_sheetname_title():
13161322
path = "modin/pandas/test/data/excel_sheetname_title.xlsx"
13171323
modin_df = pd.read_excel(path)
@@ -1323,6 +1329,7 @@ def test_from_excel_sheetname_title():
13231329
"sheet_name",
13241330
["Sheet1", "AnotherSpecialName", "SpecialName", "SecondSpecialName", 0, 1, 2, 3],
13251331
)
1332+
@check_file_leaks
13261333
def test_from_excel_sheet_name(sheet_name):
13271334
fname = "modin/pandas/test/data/modin_error_book.xlsx"
13281335
modin_df = pd.read_excel(fname, sheet_name=sheet_name)

modin/pandas/test/utils.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@
2323
)
2424
import modin.pandas as pd
2525
from modin.utils import to_pandas
26-
from modin.config import TestDatasetSize
26+
from modin.config import TestDatasetSize, TrackFileLeaks
2727
from io import BytesIO
2828
import os
2929
from string import ascii_letters
3030
import csv
31+
import psutil
32+
import functools
3133

3234
random_state = np.random.RandomState(seed=42)
3335

@@ -1005,3 +1007,38 @@ def insert_lines_to_csv(
10051007
**csv_reader_writer_params,
10061008
)
10071009
writer.writerows(lines)
1010+
1011+
1012+
def _get_open_files():
1013+
"""
1014+
psutil open_files() can return a lot of extra information that we can allow to
1015+
be different, like file position; for simplicity we care about path and fd only.
1016+
"""
1017+
return sorted((info.path, info.fd) for info in psutil.Process().open_files())
1018+
1019+
1020+
def check_file_leaks(func):
1021+
"""
1022+
A decorator that ensures that no *newly* opened file handles are left
1023+
after decorated function is finished.
1024+
"""
1025+
if not TrackFileLeaks.get():
1026+
return func
1027+
1028+
@functools.wraps(func)
1029+
def check(*a, **kw):
1030+
fstart = _get_open_files()
1031+
try:
1032+
return func(*a, **kw)
1033+
finally:
1034+
leaks = []
1035+
for item in _get_open_files():
1036+
try:
1037+
fstart.remove(item)
1038+
except ValueError:
1039+
leaks.append(item)
1040+
assert (
1041+
not leaks
1042+
), f"Unexpected open handles left for: {', '.join(item[0] for item in leaks)}"
1043+
1044+
return check

0 commit comments

Comments
 (0)