6
6
"""
7
7
8
8
import asyncio
9
- from typing import Any , LiteralString
10
- import logging
11
- import time
12
- import re
13
- import shutil
14
- import os
15
9
import base64
16
- import json
10
+ import os
11
+ import re
17
12
import shutil
13
+ import time
18
14
from pathlib import Path
19
- from urllib .parse import urljoin , urlparse
20
- from loguru import logger
21
- from bs4 import BeautifulSoup
15
+ from typing import Any
22
16
23
- from mineru .cli .common import prepare_env , read_fn
24
- from mineru .backend .pipeline .pipeline_analyze import doc_analyze as pipeline_doc_analyze
25
- from mineru .backend .pipeline .model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
26
- from mineru .backend .pipeline .pipeline_middle_json_mkcontent import union_make as pipeline_union_make
27
- from mineru .utils .enum_class import MakeMode
28
- from mineru .data .data_reader_writer import FileBasedDataWriter
17
+ from bs4 import BeautifulSoup
18
+ from loguru import logger
19
+ from mineru .backend .pipeline .model_json_to_middle_json import (
20
+ result_to_middle_json as pipeline_result_to_middle_json , # type: ignore
21
+ )
22
+ from mineru .backend .pipeline .pipeline_analyze import (
23
+ doc_analyze as pipeline_doc_analyze , # type: ignore
24
+ )
25
+ from mineru .backend .pipeline .pipeline_middle_json_mkcontent import (
26
+ union_make as pipeline_union_make , # type: ignore
27
+ )
28
+ from mineru .cli .common import prepare_env , read_fn # type: ignore
29
+ from mineru .data .data_reader_writer import FileBasedDataWriter # type: ignore
30
+ from mineru .utils .enum_class import MakeMode # type: ignore
29
31
30
32
from parsers .base_models import (
31
33
ChunkData ,
32
34
ChunkType ,
33
35
DocumentData ,
34
36
DocumentParser ,
35
- TableDataItem ,
37
+ FormulaDataItem ,
36
38
ImageDataItem ,
39
+ TableDataItem ,
37
40
TextDataItem ,
38
- FormulaDataItem
39
41
)
40
42
from parsers .parser_registry import register_parser
41
43
42
- logger = logging .getLogger (__name__ )
43
-
44
44
45
45
@register_parser (['.pdf' ])
46
46
class PdfDocumentParser (DocumentParser ):
@@ -72,23 +72,31 @@ async def parse(self, file_path: Path) -> DocumentData:
72
72
local_image_dir , _ = prepare_env (self .output_dir , pdf_file_name , self .parse_method )
73
73
loop = asyncio .get_event_loop ()
74
74
content_list = await loop .run_in_executor (
75
- None ,
75
+ None ,
76
76
self ._parse_pdf_to_content_list ,
77
77
file_path , local_image_dir , self .lang , self .parse_method , self .formula_enable , self .table_enable
78
78
)
79
79
for idx , item in enumerate (content_list ):
80
80
if item ["type" ] == "image" :
81
- images_chunks .append (self ._process_image (idx , item ))
81
+ image_chunk = self ._process_image (idx , item )
82
+ if image_chunk :
83
+ images_chunks .append (image_chunk )
82
84
elif item ["type" ] == "table" :
83
- tables_chunks .append (self ._process_table (idx , item ))
85
+ table_chunk = self ._process_table (idx , item )
86
+ if table_chunk :
87
+ tables_chunks .append (table_chunk )
84
88
elif item ["type" ] == "equation" :
85
- formulas_chunks .append (self ._process_formula (idx , item ))
89
+ formula_chunk = self ._process_formula (idx , item )
90
+ if formula_chunk :
91
+ formulas_chunks .append (formula_chunk )
86
92
elif item ["type" ] == "text" :
87
93
if item .get ("text_level" ) == 1 :
88
94
title = item .get ("text" , "" )
89
95
continue
90
- texts_chunks .append (self ._process_text (idx , item ))
91
-
96
+ text_chunk = self ._process_text (idx , item )
97
+ if text_chunk :
98
+ texts_chunks .append (text_chunk )
99
+
92
100
shutil .rmtree (local_image_dir , ignore_errors = True )
93
101
processing_time = time .time () - start_time
94
102
logger .info (f"Successfully parsed DOCX: { file_path } (took { processing_time :.2f} s)" )
@@ -120,7 +128,7 @@ def _parse_pdf_to_content_list(
120
128
parse_method : str = "auto" ,
121
129
formula_enable : bool = True ,
122
130
table_enable : bool = True ,
123
- ) -> LiteralString | list [dict [str , Any ]]:
131
+ ) -> list [dict [str , Any ]]:
124
132
125
133
# 1. 读取 PDF bytes
126
134
try :
@@ -162,10 +170,12 @@ def _parse_pdf_to_content_list(
162
170
except Exception as e :
163
171
logger .error (f"Failed in pipeline_union_make: { e } " )
164
172
raise
165
- return content_list
173
+ return list ( content_list )
166
174
167
- def _process_image (self , idx :int ,image :dict [str , Any ]) -> ChunkData :
168
- image_path = Path (image .get ("img_path" ))
175
+ def _process_image (self , idx :int ,image :dict [str , Any ]) -> ChunkData | None :
176
+ if not image .get ("img_path" ) or not os .path .exists (str (image .get ("img_path" ))):
177
+ return None
178
+ image_path = Path (str (image .get ("img_path" )))
169
179
with open (image_path , "rb" ) as img_file :
170
180
img_data = img_file .read ()
171
181
base64_data = base64 .b64encode (img_data ).decode ("utf-8" )
@@ -185,18 +195,20 @@ def _process_image(self, idx:int,image:dict[str, Any]) -> ChunkData:
185
195
)
186
196
)
187
197
188
- def _process_table (self , idx :int ,table :dict [str , Any ]) -> ChunkData :
198
+ def _process_table (self , idx :int ,table :dict [str , Any ]) -> ChunkData | None :
189
199
html_str = table .get ("table_body" , "" )
190
200
soup = BeautifulSoup (html_str , 'html.parser' )
191
201
table_body = soup .find ('table' )
202
+ if not table_body :
203
+ return None
192
204
# 使用网格处理 rowspan 和 colspan
193
- grid = []
205
+ grid : list [ list [ str ]] = []
194
206
max_col = 0
195
207
196
- for row_idx , tr in enumerate (table_body .find_all ('tr' )):
208
+ for row_idx , tr in enumerate (table_body .find_all ('tr' )): # type: ignore
197
209
while len (grid ) <= row_idx :
198
210
grid .append ([])
199
- current_row = grid [row_idx ]
211
+ current_row : list [ str ] = grid [row_idx ]
200
212
col_idx = 0
201
213
202
214
# 跳过被 rowspan 占据的位置
@@ -218,7 +230,7 @@ def _process_table(self, idx:int,table:dict[str, Any]) -> ChunkData:
218
230
219
231
# 扩展行
220
232
while len (current_row ) < col_idx + colspan :
221
- current_row .append (None )
233
+ current_row .append ("" )
222
234
223
235
# 填入内容
224
236
for r in range (rowspan ):
@@ -227,7 +239,7 @@ def _process_table(self, idx:int,table:dict[str, Any]) -> ChunkData:
227
239
grid .append ([])
228
240
actual_row = grid [actual_row_idx ]
229
241
while len (actual_row ) < col_idx + colspan :
230
- actual_row .append (None )
242
+ actual_row .append ("" )
231
243
for c in range (colspan ):
232
244
actual_row [col_idx + c ] = text
233
245
@@ -254,51 +266,26 @@ def _process_table(self, idx:int,table:dict[str, Any]) -> ChunkData:
254
266
)
255
267
256
268
257
- def _process_formula (self , idx :int ,formula :dict [str , Any ]) -> ChunkData :
269
+ def _process_formula (self , idx :int ,formula :dict [str , Any ]) -> ChunkData | None :
270
+ if not formula .get ("text" ) or formula .get ("text" ) == "" :
271
+ return None
258
272
return ChunkData (
259
273
type = ChunkType .FORMULA ,
260
274
name = f"#/formulas/{ idx } " ,
261
275
content = FormulaDataItem (
262
- text = formula .get ("text" , "" ),
263
- text_format = formula .get ("text_format" , "" )
276
+ text = str ( formula .get ("text" ) ),
277
+ text_format = formula .get ("text_format" )
264
278
)
265
279
)
266
280
267
- def _process_text (self , idx :int ,text :dict [str , Any ]) -> ChunkData :
281
+ def _process_text (self , idx :int ,text :dict [str , Any ]) -> ChunkData | None :
282
+ if not text .get ("text" ) or text .get ("text" ) == "" :
283
+ return None
268
284
return ChunkData (
269
285
type = ChunkType .TEXT ,
270
286
name = f"#/texts/{ idx } " ,
271
287
content = TextDataItem (
272
- text = text .get ("text" , "" ),
273
- text_level = text .get ("text_level" , None )
288
+ text = str ( text .get ("text" ) ),
289
+ text_level = int ( text .get ("text_level" , 0 ) )
274
290
)
275
291
)
276
-
277
- # ==================== 使用示例 ====================
278
-
279
- if __name__ == "__main__" :
280
- import asyncio
281
- # 参数设置
282
- __dir__ = Path (__file__ ).parent
283
- pdf_path = Path (__file__ ).parent .parent / "examples" / "data" / "1.pdf" # 替换为你的 PDF 文件
284
- output_dir = __dir__ / "output"
285
- output_dir .mkdir (exist_ok = True )
286
-
287
- # 解析并获取 content_list
288
- try :
289
- content_list = asyncio .run (PdfDocumentParser ().parse (
290
- file_path = pdf_path
291
- ))
292
-
293
- # 打印结果(四种模态)
294
- print (content_list .title )
295
- print (content_list .tables )
296
- print (content_list .texts )
297
- print (len (content_list .images ))
298
- print (content_list .formulas )
299
-
300
- # 如果你想查看完整结构
301
- # print(json.dumps(content_list, ensure_ascii=False, indent=2))
302
-
303
- except Exception as e :
304
- logger .exception ("Failed to parse PDF" )
0 commit comments