Skip to content

Commit 3f21dbc

Browse files
committed
fix: comfort mypy
1 parent 35c80fa commit 3f21dbc

File tree

4 files changed

+67
-76
lines changed

4 files changed

+67
-76
lines changed

parsers/base_models.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
import logging
22
from abc import ABC, abstractmethod
33
from enum import Enum
4-
from typing import Any
54
from pathlib import Path
5+
from typing import Any
6+
67
from pydantic import BaseModel, Field
78

89
logger = logging.getLogger(__name__)

parsers/docx_parser.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import logging
1010
import time
1111
from pathlib import Path
12+
1213
from docling.datamodel.base_models import InputFormat
1314
from docling.document_converter import DocumentConverter, WordFormatOption
1415
from docling.pipeline.simple_pipeline import SimplePipeline
@@ -30,10 +31,10 @@
3031
ChunkType,
3132
DocumentData,
3233
DocumentParser,
33-
TableDataItem,
34+
FormulaDataItem,
3435
ImageDataItem,
36+
TableDataItem,
3537
TextDataItem,
36-
FormulaDataItem
3738
)
3839
from parsers.parser_registry import register_parser
3940

@@ -108,8 +109,10 @@ def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
108109
Returns:
109110
List[ChunkData]: 图片列表
110111
"""
111-
image_items = []
112+
image_items: list[ChunkData] = []
112113
for idx, picture in enumerate(pictures):
114+
if not picture.image:
115+
continue
113116
image_uri = str(picture.image.uri)
114117
caption = [caption.cref for caption in picture.captions]
115118
footnote = [footnote.cref for footnote in picture.footnotes]

parsers/excel_parser.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,9 +24,9 @@
2424
ChunkType,
2525
DocumentData,
2626
DocumentParser,
27+
ImageDataItem,
2728
TableDataItem,
2829
TextDataItem,
29-
ImageDataItem
3030
)
3131
from parsers.parser_registry import register_parser
3232

parsers/pdf_parser.py

Lines changed: 58 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -6,41 +6,41 @@
66
"""
77

88
import asyncio
9-
from typing import Any, LiteralString
10-
import logging
11-
import time
12-
import re
13-
import shutil
14-
import os
159
import base64
16-
import json
10+
import os
11+
import re
1712
import shutil
13+
import time
1814
from pathlib import Path
19-
from urllib.parse import urljoin, urlparse
20-
from loguru import logger
21-
from bs4 import BeautifulSoup
15+
from typing import Any
2216

23-
from mineru.cli.common import prepare_env, read_fn
24-
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
25-
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
26-
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
27-
from mineru.utils.enum_class import MakeMode
28-
from mineru.data.data_reader_writer import FileBasedDataWriter
17+
from bs4 import BeautifulSoup
18+
from loguru import logger
19+
from mineru.backend.pipeline.model_json_to_middle_json import (
20+
result_to_middle_json as pipeline_result_to_middle_json, # type: ignore
21+
)
22+
from mineru.backend.pipeline.pipeline_analyze import (
23+
doc_analyze as pipeline_doc_analyze, # type: ignore
24+
)
25+
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
26+
union_make as pipeline_union_make, # type: ignore
27+
)
28+
from mineru.cli.common import prepare_env, read_fn # type: ignore
29+
from mineru.data.data_reader_writer import FileBasedDataWriter # type: ignore
30+
from mineru.utils.enum_class import MakeMode # type: ignore
2931

3032
from parsers.base_models import (
3133
ChunkData,
3234
ChunkType,
3335
DocumentData,
3436
DocumentParser,
35-
TableDataItem,
37+
FormulaDataItem,
3638
ImageDataItem,
39+
TableDataItem,
3740
TextDataItem,
38-
FormulaDataItem
3941
)
4042
from parsers.parser_registry import register_parser
4143

42-
logger = logging.getLogger(__name__)
43-
4444

4545
@register_parser(['.pdf'])
4646
class PdfDocumentParser(DocumentParser):
@@ -72,23 +72,31 @@ async def parse(self, file_path: Path) -> DocumentData:
7272
local_image_dir, _ = prepare_env(self.output_dir, pdf_file_name, self.parse_method)
7373
loop = asyncio.get_event_loop()
7474
content_list = await loop.run_in_executor(
75-
None,
75+
None,
7676
self._parse_pdf_to_content_list,
7777
file_path, local_image_dir, self.lang, self.parse_method, self.formula_enable, self.table_enable
7878
)
7979
for idx, item in enumerate(content_list):
8080
if item["type"] == "image":
81-
images_chunks.append(self._process_image(idx, item))
81+
image_chunk = self._process_image(idx, item)
82+
if image_chunk:
83+
images_chunks.append(image_chunk)
8284
elif item["type"] == "table":
83-
tables_chunks.append(self._process_table(idx, item))
85+
table_chunk = self._process_table(idx, item)
86+
if table_chunk:
87+
tables_chunks.append(table_chunk)
8488
elif item["type"] == "equation":
85-
formulas_chunks.append(self._process_formula(idx, item))
89+
formula_chunk = self._process_formula(idx, item)
90+
if formula_chunk:
91+
formulas_chunks.append(formula_chunk)
8692
elif item["type"] == "text":
8793
if item.get("text_level") == 1:
8894
title = item.get("text", "")
8995
continue
90-
texts_chunks.append(self._process_text(idx, item))
91-
96+
text_chunk = self._process_text(idx, item)
97+
if text_chunk:
98+
texts_chunks.append(text_chunk)
99+
92100
shutil.rmtree(local_image_dir, ignore_errors=True)
93101
processing_time = time.time() - start_time
94102
logger.info(f"Successfully parsed DOCX: {file_path} (took {processing_time:.2f}s)")
@@ -120,7 +128,7 @@ def _parse_pdf_to_content_list(
120128
parse_method: str = "auto",
121129
formula_enable: bool = True,
122130
table_enable: bool = True,
123-
) -> LiteralString|list[dict[str, Any]]:
131+
) -> list[dict[str, Any]]:
124132

125133
# 1. 读取 PDF bytes
126134
try:
@@ -162,10 +170,12 @@ def _parse_pdf_to_content_list(
162170
except Exception as e:
163171
logger.error(f"Failed in pipeline_union_make: {e}")
164172
raise
165-
return content_list
173+
return list(content_list)
166174

167-
def _process_image(self, idx:int,image:dict[str, Any]) -> ChunkData:
168-
image_path = Path(image.get("img_path"))
175+
def _process_image(self, idx:int,image:dict[str, Any]) -> ChunkData|None:
176+
if not image.get("img_path") or not os.path.exists(str(image.get("img_path"))):
177+
return None
178+
image_path = Path(str(image.get("img_path")))
169179
with open(image_path, "rb") as img_file:
170180
img_data = img_file.read()
171181
base64_data = base64.b64encode(img_data).decode("utf-8")
@@ -185,18 +195,20 @@ def _process_image(self, idx:int,image:dict[str, Any]) -> ChunkData:
185195
)
186196
)
187197

188-
def _process_table(self, idx:int,table:dict[str, Any]) -> ChunkData:
198+
def _process_table(self, idx:int,table:dict[str, Any]) -> ChunkData|None:
189199
html_str = table.get("table_body", "")
190200
soup = BeautifulSoup(html_str, 'html.parser')
191201
table_body = soup.find('table')
202+
if not table_body:
203+
return None
192204
# 使用网格处理 rowspan 和 colspan
193-
grid = []
205+
grid: list[list[str]] = []
194206
max_col = 0
195207

196-
for row_idx, tr in enumerate(table_body.find_all('tr')):
208+
for row_idx, tr in enumerate(table_body.find_all('tr')): # type: ignore
197209
while len(grid) <= row_idx:
198210
grid.append([])
199-
current_row = grid[row_idx]
211+
current_row: list[str] = grid[row_idx]
200212
col_idx = 0
201213

202214
# 跳过被 rowspan 占据的位置
@@ -218,7 +230,7 @@ def _process_table(self, idx:int,table:dict[str, Any]) -> ChunkData:
218230

219231
# 扩展行
220232
while len(current_row) < col_idx + colspan:
221-
current_row.append(None)
233+
current_row.append("")
222234

223235
# 填入内容
224236
for r in range(rowspan):
@@ -227,7 +239,7 @@ def _process_table(self, idx:int,table:dict[str, Any]) -> ChunkData:
227239
grid.append([])
228240
actual_row = grid[actual_row_idx]
229241
while len(actual_row) < col_idx + colspan:
230-
actual_row.append(None)
242+
actual_row.append("")
231243
for c in range(colspan):
232244
actual_row[col_idx + c] = text
233245

@@ -254,51 +266,26 @@ def _process_table(self, idx:int,table:dict[str, Any]) -> ChunkData:
254266
)
255267

256268

257-
def _process_formula(self, idx:int,formula:dict[str, Any]) -> ChunkData:
269+
def _process_formula(self, idx:int,formula:dict[str, Any]) -> ChunkData|None:
270+
if not formula.get("text") or formula.get("text") == "":
271+
return None
258272
return ChunkData(
259273
type=ChunkType.FORMULA,
260274
name=f"#/formulas/{idx}",
261275
content=FormulaDataItem(
262-
text=formula.get("text", ""),
263-
text_format=formula.get("text_format", "")
276+
text=str(formula.get("text")),
277+
text_format=formula.get("text_format")
264278
)
265279
)
266280

267-
def _process_text(self, idx:int,text:dict[str, Any]) -> ChunkData:
281+
def _process_text(self, idx:int,text:dict[str, Any]) -> ChunkData|None:
282+
if not text.get("text") or text.get("text") == "":
283+
return None
268284
return ChunkData(
269285
type=ChunkType.TEXT,
270286
name=f"#/texts/{idx}",
271287
content=TextDataItem(
272-
text=text.get("text", ""),
273-
text_level=text.get("text_level", None)
288+
text=str(text.get("text")),
289+
text_level=int(text.get("text_level", 0))
274290
)
275291
)
276-
277-
# ==================== 使用示例 ====================
278-
279-
if __name__ == "__main__":
280-
import asyncio
281-
# 参数设置
282-
__dir__ = Path(__file__).parent
283-
pdf_path = Path(__file__).parent.parent / "examples" /"data"/ "1.pdf" # 替换为你的 PDF 文件
284-
output_dir = __dir__ / "output"
285-
output_dir.mkdir(exist_ok=True)
286-
287-
# 解析并获取 content_list
288-
try:
289-
content_list = asyncio.run(PdfDocumentParser().parse(
290-
file_path=pdf_path
291-
))
292-
293-
# 打印结果(四种模态)
294-
print(content_list.title)
295-
print(content_list.tables)
296-
print(content_list.texts)
297-
print(len(content_list.images))
298-
print(content_list.formulas)
299-
300-
# 如果你想查看完整结构
301-
# print(json.dumps(content_list, ensure_ascii=False, indent=2))
302-
303-
except Exception as e:
304-
logger.exception("Failed to parse PDF")

0 commit comments

Comments
 (0)