-
Notifications
You must be signed in to change notification settings - Fork 4
/
pandas.py
136 lines (111 loc) · 4.49 KB
/
pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from __future__ import annotations
from datetime import date, datetime, time, timedelta
from importlib.metadata import version
from typing import TYPE_CHECKING, Union, cast
import pandas as pd
from packaging.version import Version, parse
from pandas._typing import Scalar
from pandas.compat._optional import import_optional_dependency
from pandas.core.shared_docs import _shared_docs
from pandas.io.excel import ExcelFile
from pandas.io.excel._base import ( # type:ignore[attr-defined] # missing in pandas-stubs
BaseExcelReader,
)
from pandas.util._decorators import ( # type:ignore[attr-defined] # missing in pandas-stubs
doc,
)
if TYPE_CHECKING:
from pandas._typing import FilePath, ReadBuffer, StorageOptions
from python_calamine import CalamineSheet, CalamineWorkbook
# Python types used to annotate the raw cell values in the rows returned by
# ``sheet.to_python(...)`` before they are converted to pandas scalars.
_CellValueT = Union[int, float, str, bool, time, date, datetime, timedelta]
# Installed pandas version, parsed once at import time. It is compared against
# release thresholds in ``CalamineExcelReader.__init__`` to decide how (and
# whether) the base reader is initialised.
PANDAS_VERSION = parse(version("pandas"))
class CalamineExcelReader(BaseExcelReader):
    """pandas Excel reader that loads workbooks through ``python_calamine``."""

    # Annotation only: the workbook type produced by ``load_workbook``.
    book: CalamineWorkbook

    # NOTE: ``@doc`` runs ``str.format`` on this docstring, so it must not
    # contain any braces other than the ``storage_options`` placeholder.
    @doc(storage_options=_shared_docs["storage_options"])
    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        """
        Reader using calamine engine (xlsx/xls/xlsb/ods).

        Parameters
        ----------
        filepath_or_buffer : str, path to be parsed or
            an open readable stream.
        {storage_options}
        engine_kwargs : dict, optional
            Arbitrary keyword arguments passed to excel engine.

        Raises
        ------
        ValueError
            If the installed pandas is 2.2.0 or newer, or older than 2.0.0.
        """
        import_optional_dependency("python_calamine")
        if PANDAS_VERSION >= Version("2.2.0"):
            raise ValueError("Pandas >= 2.2.0 has builtin support of calamine")
        elif PANDAS_VERSION >= Version("2.1.0"):
            # From 2.1.0 on, engine_kwargs is forwarded to the base reader.
            super().__init__(
                filepath_or_buffer,
                storage_options=storage_options,
                engine_kwargs=engine_kwargs,
            )
        elif PANDAS_VERSION >= Version("2.0.0"):
            # For 2.0.x the base reader is initialised without engine_kwargs.
            super().__init__(
                filepath_or_buffer,
                storage_options=storage_options,
            )
        else:
            raise ValueError("Pandas >= 2 is only supported")

    @property
    def _workbook_class(self) -> type[CalamineWorkbook]:
        """Return the ``CalamineWorkbook`` class (imported lazily)."""
        from python_calamine import CalamineWorkbook

        return CalamineWorkbook

    def load_workbook(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        engine_kwargs: dict | None = None,
    ) -> CalamineWorkbook:
        """
        Open a workbook with ``python_calamine.load_workbook``.

        ``engine_kwargs`` (treated as empty when ``None``) is expanded into
        keyword arguments of that call.
        """
        from python_calamine import load_workbook

        return load_workbook(filepath_or_buffer, **(engine_kwargs or {}))

    @property
    def sheet_names(self) -> list[str]:
        """Names of the sheets in ``book.sheets_metadata`` typed as worksheets."""
        from python_calamine import SheetTypeEnum

        return [
            sheet.name
            for sheet in self.book.sheets_metadata
            if sheet.typ == SheetTypeEnum.WorkSheet
        ]

    def get_sheet_by_name(self, name: str) -> CalamineSheet:
        """Return the sheet called ``name`` after the inherited name check."""
        self.raise_if_bad_sheet_by_name(name)
        return self.book.get_sheet_by_name(name)

    def get_sheet_by_index(self, index: int) -> CalamineSheet:
        """Return the sheet at ``index`` after the inherited index check."""
        self.raise_if_bad_sheet_by_index(index)
        return self.book.get_sheet_by_index(index)

    def get_sheet_data(
        self, sheet: CalamineSheet, file_rows_needed: int | None = None
    ) -> list[list[Scalar]]:
        """
        Convert the cells of ``sheet`` into pandas-friendly scalars.

        Parameters
        ----------
        sheet : CalamineSheet
            Sheet whose ``to_python(skip_empty_area=False)`` rows are read.
        file_rows_needed : int, optional
            Stop once at least this many rows have been collected. The check
            runs after a row is appended, so a non-empty sheet always yields
            at least one row.

        Returns
        -------
        list of list of Scalar
            One inner list per row. Integral floats become ``int``,
            ``date``/``datetime`` become ``pd.Timestamp``, ``timedelta``
            becomes ``pd.Timedelta``; everything else is passed through.
        """

        def _convert_cell(value: _CellValueT) -> Scalar:
            if isinstance(value, float):
                # ``is_integer()`` is False for NaN and +/-inf, so those are
                # passed through instead of crashing in ``int(value)``
                # (ValueError / OverflowError).
                return int(value) if value.is_integer() else value
            elif isinstance(value, date):
                # Also matches ``datetime``, which subclasses ``date``.
                return pd.Timestamp(value)
            elif isinstance(value, timedelta):
                return pd.Timedelta(value)
            elif isinstance(value, time):
                # cast needed here because Scalar doesn't include datetime.time
                return cast(Scalar, value)
            return value

        rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False)
        data: list[list[Scalar]] = []
        for row in rows:
            data.append([_convert_cell(cell) for cell in row])
            if file_rows_needed is not None and len(data) >= file_rows_needed:
                break
        return data
def pandas_monkeypatch() -> None:
    """
    Register ``CalamineExcelReader`` as the ``"calamine"`` engine of ``ExcelFile``.

    Engines already registered on ``ExcelFile`` are kept unchanged; if a
    ``"calamine"`` entry already exists there, it takes precedence over the
    reader defined in this module.
    """
    engines = {"calamine": CalamineExcelReader}
    # Existing registrations are applied last so that they win on key clashes.
    engines.update(
        ExcelFile._engines  # type:ignore[attr-defined] # missing in pandas-stubs
    )
    ExcelFile._engines = engines  # type:ignore[attr-defined] # missing in pandas-stubs