5
5
"""
6
6
7
7
import logging
8
- from abc import ABC , abstractmethod
9
8
from collections .abc import Callable
10
9
from pathlib import Path
11
10
12
- from .base_models import DocumentData
11
+ from .base_models import DocumentParser
13
12
14
13
logger = logging .getLogger (__name__ )
15
14
16
15
# Global parser registry: maps a file suffix to the parser class that handles it
17
- PARSER_REGISTRY : dict [str , type [' DocumentParser' ]] = {}
16
+ PARSER_REGISTRY : dict [str , type [DocumentParser ]] = {}
18
17
19
18
20
class DocumentParser(ABC):
    """Abstract base class that every document parser must derive from."""

    @abstractmethod
    async def parse(self, file_path: str) -> DocumentData:
        """
        Parse a document.

        Concrete subclasses must override this coroutine; the base
        implementation does nothing.

        Args:
            file_path: Path of the document to parse.

        Returns:
            DocumentData: The parsed document, as produced by the subclass.
        """
27
-
28
-
29
- def register_parser (suffixes : list [str ]) -> Callable [[type ['DocumentParser' ]], type ['DocumentParser' ]]:
19
+ def register_parser (suffixes : list [str ]) -> Callable [[type [DocumentParser ]], type [DocumentParser ]]:
30
20
"""
31
21
解析器注册装饰器
32
22
@@ -41,7 +31,7 @@ def register_parser(suffixes: list[str]) -> Callable[[type['DocumentParser']], t
41
31
class DocxDocumentParser(DocumentParser):
42
32
...
43
33
"""
44
- def decorator (cls : type [' DocumentParser' ]) -> type [' DocumentParser' ]:
34
+ def decorator (cls : type [DocumentParser ]) -> type [DocumentParser ]:
45
35
# 验证类是否继承自 DocumentParser
46
36
if not issubclass (cls , DocumentParser ):
47
37
raise TypeError (f"解析器类 { cls .__name__ } 必须继承自 DocumentParser" )
@@ -59,7 +49,7 @@ def decorator(cls: type['DocumentParser']) -> type['DocumentParser']:
59
49
return decorator
60
50
61
51
62
- def get_parser (file_path : str ) -> ' DocumentParser' | None :
52
+ def get_parser (file_path : str ) -> DocumentParser | None :
63
53
"""
64
54
根据文件路径获取合适的解析器实例
65
55
@@ -83,22 +73,6 @@ def get_parser(file_path: str) -> 'DocumentParser' | None:
83
73
logger .error (f"创建解析器实例失败: { parser_class .__name__ } , 错误: { e } " )
84
74
return None
85
75
86
-
87
def can_parse(file_path: str) -> bool:
    """
    Tell whether a parser is registered for the given file.

    Only the file extension is inspected; it is lower-cased before the
    lookup, so the check is case-insensitive with respect to the extension.

    Args:
        file_path: Path of the file to check.

    Returns:
        bool: True when the lower-cased extension is a key of PARSER_REGISTRY.
    """
    extension = Path(file_path).suffix.lower()
    is_supported = extension in PARSER_REGISTRY
    return is_supported
100
-
101
-
102
76
def get_supported_formats () -> list [str ]:
103
77
"""
104
78
获取所有支持的文件格式
@@ -109,7 +83,7 @@ def get_supported_formats() -> list[str]:
109
83
return list (PARSER_REGISTRY .keys ())
110
84
111
85
112
- def get_parser_class (suffix : str ) -> type [' DocumentParser' ] | None :
86
+ def get_parser_class (suffix : str ) -> type [DocumentParser ] | None :
113
87
"""
114
88
根据文件扩展名获取解析器类
115
89
0 commit comments