Skip to content

Commit cce18b2

Browse files
authored
fix: deal with chartsheets in workbooks (#2433)
* fix(xlsx): deal with chartsheets in workbooks
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
* tests(xlsx): align test file names
  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---------
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
1 parent f11f8c0 commit cce18b2

14 files changed

+718
-11
lines changed

docling/backend/msexcel_backend.py

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
TableData,
1919
)
2020
from openpyxl import load_workbook
21+
from openpyxl.chartsheet.chartsheet import Chartsheet
2122
from openpyxl.drawing.image import Image
2223
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
2324
from openpyxl.worksheet.worksheet import Worksheet
@@ -186,18 +187,18 @@ def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
186187

187188
if self.workbook is not None:
188189
# Iterate over all sheets
189-
for sheet_name in self.workbook.sheetnames:
190-
_log.info(f"Processing sheet: {sheet_name}")
190+
for idx, name in enumerate(self.workbook.sheetnames):
191+
_log.info(f"Processing sheet {idx}: {name}")
191192

192-
sheet = self.workbook[sheet_name]
193-
page_no = self.workbook.index(sheet) + 1
193+
sheet = self.workbook[name]
194+
page_no = idx + 1
194195
# do not rely on sheet.max_column, sheet.max_row if there are images
195196
page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
196197

197198
self.parents[0] = doc.add_group(
198199
parent=None,
199200
label=GroupLabel.SECTION,
200-
name=f"sheet: {sheet_name}",
201+
name=f"sheet: {name}",
201202
content_layer=self._get_sheet_content_layer(sheet),
202203
)
203204
doc = self._convert_sheet(doc, sheet)
@@ -208,7 +209,9 @@ def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
208209

209210
return doc
210211

211-
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
212+
def _convert_sheet(
213+
self, doc: DoclingDocument, sheet: Union[Worksheet, Chartsheet]
214+
) -> DoclingDocument:
212215
"""Parse an Excel worksheet and attach its structure to a DoclingDocument
213216
214217
Args:
@@ -218,10 +221,11 @@ def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocum
218221
Returns:
219222
The updated DoclingDocument.
220223
"""
224+
if isinstance(sheet, Worksheet):
225+
doc = self._find_tables_in_sheet(doc, sheet)
226+
doc = self._find_images_in_sheet(doc, sheet)
221227

222-
doc = self._find_tables_in_sheet(doc, sheet)
223-
224-
doc = self._find_images_in_sheet(doc, sheet)
228+
# TODO: parse charts in sheet
225229

226230
return doc
227231

File renamed without changes.
File renamed without changes.
Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
item-0 at level 0: unspecified: group _root_
2+
item-1 at level 1: section: group sheet: Duck Observations
3+
item-2 at level 2: table with [7x3]
4+
item-3 at level 1: section: group sheet: Duck Chart

0 commit comments

Comments
 (0)