-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Better inference of spreadsheet formats. #38522
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a2ce319
4266368
998a778
29d1dd0
163285c
de99f1a
9aea6b9
21b43e1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
import abc | ||
import datetime | ||
import inspect | ||
from io import BufferedIOBase, BytesIO, RawIOBase | ||
import os | ||
from textwrap import fill | ||
from typing import Any, Dict, Mapping, Union, cast | ||
from typing import Any, BinaryIO, Dict, Mapping, Union, cast | ||
import warnings | ||
from zipfile import ZipFile | ||
|
||
from pandas._config import config | ||
|
||
|
@@ -888,32 +888,77 @@ def close(self): | |
return content | ||
|
||
|
||
def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: | ||
def _peek_stream( | ||
stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20 | ||
) -> bytes: | ||
""" | ||
Check if the stream is an OpenDocument Spreadsheet (.ods) file | ||
|
||
It uses magic values inside the stream | ||
Return the specified number of bytes from the start of the stream | ||
and seek back to the start of the stream afterwards. | ||
|
||
Parameters | ||
---------- | ||
stream : Union[BufferedIOBase, RawIOBase] | ||
IO stream with data which might be an ODS file | ||
size: int | ||
The number of bytes to read. | ||
|
||
Returns | ||
------- | ||
is_ods : bool | ||
Boolean indication that this is indeed an ODS file or not | ||
content : bytes | ||
The bytes founds. | ||
""" | ||
stream.seek(0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if stream doesn't support seek? I think this is possible with compressed files. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have a feeling we'd have seen reports of this either in the pandas tracker or xlrd's tracker, since, unless I'm mistaken this type of seeking has been used in xlrd for many years now. |
||
is_ods = False | ||
if stream.read(4) == b"PK\003\004": | ||
stream.seek(30) | ||
is_ods = ( | ||
stream.read(54) == b"mimetype" | ||
b"application/vnd.oasis.opendocument.spreadsheet" | ||
) | ||
content = stream.read(size) | ||
stream.seek(0) | ||
return is_ods | ||
return content | ||
|
||
|
||
_XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" | ||
_ZIP_SIGNATURE = b"PK\x03\x04" | ||
_PEEK_SIZE = max(len(_XLS_SIGNATURE), len(_ZIP_SIGNATURE)) | ||
|
||
|
||
def _engine_from_content(stream: Union[BufferedIOBase, RawIOBase, BinaryIO]) -> str: | ||
""" | ||
Use the content of a stream to try and figure out which engine to use. | ||
|
||
It uses magic values inside the stream. | ||
|
||
Parameters | ||
---------- | ||
stream : Union[BufferedIOBase, RawIOBase] | ||
IO stream with data which might contain spreadsheet data. | ||
|
||
Returns | ||
------- | ||
engine : Optional[engine] | ||
The string engine if it can be confidently inferred. | ||
""" | ||
engine = None | ||
peek = _peek_stream(stream, _PEEK_SIZE) | ||
cjw296 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if peek.startswith(_XLS_SIGNATURE): | ||
engine = "xlrd" | ||
|
||
elif peek.startswith(_ZIP_SIGNATURE): | ||
zf = ZipFile(stream) | ||
|
||
# Workaround for some third party files that use forward slashes and | ||
# lower case names. We map the expected name in lowercase to the | ||
# actual filename in the zip container. | ||
component_names = { | ||
name.replace("\\", "/").lower(): name for name in zf.namelist() | ||
} | ||
|
||
stream.seek(0) | ||
|
||
if "xl/workbook.xml" in component_names: | ||
engine = "openpyxl" | ||
if "xl/workbook.bin" in component_names: | ||
engine = "pyxlsb" | ||
if "content.xml" in component_names: | ||
engine = "odf" | ||
|
||
return engine | ||
|
||
|
||
class ExcelFile: | ||
|
@@ -970,21 +1015,39 @@ class ExcelFile: | |
"pyxlsb": PyxlsbReader, | ||
} | ||
|
||
_ext_to_engine: Mapping[str, str] = { | ||
".ods": "odf", | ||
".xls": "xlrd", | ||
".xlsx": "openpyxl", | ||
} | ||
|
||
def __init__( | ||
self, path_or_buffer, engine=None, storage_options: StorageOptions = None | ||
): | ||
if engine is None: | ||
# Determine ext and use odf for ods stream/file | ||
|
||
ext = peek = None | ||
|
||
if isinstance(path_or_buffer, bytes): | ||
path_or_buffer = BytesIO(path_or_buffer) | ||
|
||
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): | ||
ext = None | ||
if _is_ods_stream(path_or_buffer): | ||
engine = "odf" | ||
else: | ||
engine = _engine_from_content(path_or_buffer) | ||
peek = _peek_stream(path_or_buffer) | ||
|
||
elif isinstance(path_or_buffer, (str, os.PathLike)): | ||
ext = os.path.splitext(str(path_or_buffer))[-1] | ||
if ext == ".ods": | ||
engine = "odf" | ||
handles = get_handle( | ||
stringify_path(path_or_buffer), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe the user expansion that |
||
"rb", | ||
storage_options=storage_options, | ||
is_text=False, | ||
) | ||
with handles: | ||
engine = _engine_from_content(handles.handle) | ||
peek = _peek_stream(handles.handle) | ||
|
||
if ( | ||
elif ( | ||
import_optional_dependency( | ||
"xlrd", raise_on_missing=False, on_version="ignore" | ||
) | ||
|
@@ -995,38 +1058,16 @@ def __init__( | |
if isinstance(path_or_buffer, Book): | ||
engine = "xlrd" | ||
|
||
# GH 35029 - Prefer openpyxl except for xls files | ||
# Couldn't tell for definite, so guess based on extension: | ||
if engine is None: | ||
if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": | ||
engine = "xlrd" | ||
elif ( | ||
import_optional_dependency( | ||
"openpyxl", raise_on_missing=False, on_version="ignore" | ||
) | ||
is not None | ||
): | ||
engine = "openpyxl" | ||
else: | ||
caller = inspect.stack()[1] | ||
if ( | ||
caller.filename.endswith("pandas/io/excel/_base.py") | ||
and caller.function == "read_excel" | ||
): | ||
stacklevel = 4 | ||
else: | ||
stacklevel = 2 | ||
warnings.warn( | ||
"The xlrd engine is no longer maintained and is not " | ||
"supported when using pandas with python >= 3.9. However, " | ||
"the engine xlrd will continue to be allowed for the " | ||
"indefinite future. Beginning with pandas 1.2.0, the " | ||
"openpyxl engine will be used if it is installed and the " | ||
"engine argument is not specified. Either install openpyxl " | ||
"or specify engine='xlrd' to silence this warning.", | ||
FutureWarning, | ||
stacklevel=stacklevel, | ||
) | ||
engine = "xlrd" | ||
engine = self._ext_to_engine.get(ext) | ||
|
||
if engine is None: | ||
raise ValueError( | ||
f"Could not find engine for {repr(path_or_buffer)}, content was " | ||
f"{repr(peek)}" | ||
) | ||
|
||
if engine not in self._engines: | ||
raise ValueError(f"Unknown engine: {engine}") | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -508,10 +508,10 @@ def test_reader_spaces(self, read_ext): | |
def test_read_excel_ods_nested_xml(self, read_ext, basename, expected): | ||
# see gh-35802 | ||
engine = pd.read_excel.keywords["engine"] | ||
if engine != "odf": | ||
if engine not in ("odf", None): | ||
pytest.skip(f"Skipped for engine: {engine}") | ||
|
||
actual = pd.read_excel(basename + read_ext) | ||
actual = pd.read_excel(basename + ".ods") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can now run these tests on the |
||
tm.assert_frame_equal(actual, expected) | ||
|
||
def test_reading_all_sheets(self, read_ext): | ||
|
@@ -624,7 +624,6 @@ def test_read_from_http_url(self, read_ext): | |
local_table = pd.read_excel("test1" + read_ext) | ||
tm.assert_frame_equal(url_table, local_table) | ||
|
||
@td.skip_if_not_us_locale | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does anyone know what the intention was in this skip? I'm in the UK and wanted to make sure I hadn't broken these tests. I couldn't think of any reason why this needed to be in, but have removed it in a separate commit which I have no problem dropping out of this PR if people would prefer. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was added in #21814. But no idea why (the test doesn't seem to have anything locale specific (like eg date parsing)). But we can see what CI says. |
||
def test_read_from_s3_url(self, read_ext, s3_resource, s3so): | ||
# Bucket "pandas-test" created in tests/io/conftest.py | ||
with open("test1" + read_ext, "rb") as f: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Tests that don't need or don't work with the autouse fixtures in test_readers.py | ||
import pytest | ||
|
||
import pandas as pd | ||
|
||
|
||
def test_unreadable_bytes(): | ||
with pytest.raises( | ||
ValueError, match=r"Could not find engine for .+, content was b'rubbish'" | ||
): | ||
pd.read_excel(b"rubbish") | ||
|
||
|
||
def test_unreadable_file(tmp_path): | ||
bad = tmp_path / "bad" | ||
bad.write_bytes(b"rubbish") | ||
with pytest.raises( | ||
ValueError, match=r"Could not find engine for .+, content was b'rubbish'" | ||
): | ||
pd.read_excel(bad) | ||
|
||
|
||
def test_non_existent_path(tmp_path): | ||
bad = tmp_path / "bad" | ||
with pytest.raises(FileNotFoundError): | ||
pd.read_excel(bad) |
Uh oh!
There was an error while loading. Please reload this page.