Skip to content

Commit 78ac128

Browse files
committed
feat: add parser registry
1 parent 9a7eccd commit 78ac128

File tree

7 files changed

+156
-76
lines changed

7 files changed

+156
-76
lines changed

parsers/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,13 @@
11
# Parsers package
2+
3+
from .parser_registry import (
4+
PARSER_REGISTRY,
5+
DocumentParser,
6+
can_parse,
7+
get_parser,
8+
get_supported_formats,
9+
list_registered_parsers,
10+
register_parser,
11+
)
12+
13+
__all__ = ['PARSER_REGISTRY', 'register_parser', 'DocumentParser', 'get_parser', 'can_parse', 'get_supported_formats', 'list_registered_parsers']

parsers/base_models.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import logging
2-
from abc import ABC, abstractmethod
32
from enum import Enum
43
from typing import Any
54

@@ -40,19 +39,3 @@ class DocumentData(BaseModel):
4039
processing_time: float = 0
4140
success: bool
4241
error_message: str | None = None
43-
44-
class DocumentParser(ABC):
45-
"""文档解析器基类"""
46-
47-
def __init__(self) -> None:
48-
self.supported_formats: list[str] = Field(default_factory=list)
49-
50-
@abstractmethod
51-
async def parse(self, file_path: str) -> DocumentData:
52-
"""解析文档"""
53-
pass
54-
55-
@abstractmethod
56-
def can_parse(self, file_path: str) -> bool:
57-
"""检查是否可以解析该文件"""
58-
return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)

parsers/document_parser.py

Lines changed: 0 additions & 29 deletions
This file was deleted.

parsers/docx_parser.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,14 @@
2929
ChunkData,
3030
ChunkType,
3131
DocumentData,
32-
DocumentParser,
3332
TableDataItem,
3433
)
34+
from parsers.parser_registry import DocumentParser, register_parser
3535

3636
logger = logging.getLogger(__name__)
3737

3838

39+
@register_parser(['.docx'])
3940
class DocxDocumentParser(DocumentParser):
4041
"""DOCX文档解析器
4142
@@ -46,32 +47,20 @@ class DocxDocumentParser(DocumentParser):
4647
def __init__(self) -> None:
4748
"""初始化解析器"""
4849
super().__init__()
49-
self.supported_formats = [".docx"]
5050
self._converter = DocumentConverter(
5151
format_options={InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline)},
5252
allowed_formats=[InputFormat.DOCX]
5353
)
5454
logger.debug("DocxDocumentParser initialized with SimplePipeline")
5555

56-
def can_parse(self, file_path: str) -> bool:
57-
"""检查是否可以解析该文件
58-
59-
Args:
60-
file_path: 文件路径
61-
62-
Returns:
63-
bool: 是否支持该文件格式
64-
"""
65-
return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
66-
6756
async def parse(self, file_path: str) -> DocumentData:
6857
"""异步解析DOCX文件
6958
7059
Args:
7160
file_path: DOCX文件路径
7261
7362
Returns:
74-
ParseResult: 解析结果,包含标题、内容、处理时间和错误信息
63+
DocumentData: 解析结果,包含标题、内容、处理时间和错误信息
7564
"""
7665
start_time = time.time()
7766
try:

parsers/excel_parser.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323
ChunkData,
2424
ChunkType,
2525
DocumentData,
26-
DocumentParser,
2726
TableDataItem,
2827
)
28+
from parsers.parser_registry import DocumentParser, register_parser
2929

3030
# 忽略 openpyxl 的特定警告
3131
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
@@ -49,6 +49,7 @@ class ExcelParseError(Exception):
4949
pass
5050

5151

52+
@register_parser(['.xlsx', '.xls'])
5253
class ExcelParser(DocumentParser):
5354
"""Excel文件解析器类"""
5455

@@ -61,17 +62,6 @@ def __init__(self, config: ExcelParseConfig | None = None):
6162
super().__init__()
6263
self.config: ExcelParseConfig = config or ExcelParseConfig()
6364
self.image_index: int = 0
64-
self.supported_formats: list[str] = ['.xlsx', '.xls']
65-
66-
def can_parse(self, file_path: str) -> bool:
67-
"""
68-
验证输入文件
69-
Args:
70-
file_path: 文件路径
71-
Returns:
72-
bool: 是否支持解析
73-
"""
74-
return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
7565

7666
async def parse(self, excel_path: str) -> DocumentData:
7767
"""
@@ -183,7 +173,7 @@ def _process_image_object(self, img_obj: Image) -> ChunkData | None:
183173
Args:
184174
img_obj: 图片对象
185175
Returns:
186-
Optional[DocumentData]: 图片信息,处理失败时返回None
176+
ChunkData|None: 图片信息,处理失败时返回None
187177
"""
188178
try:
189179
# 获取图片数据

parsers/parser_registry.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
"""
2+
解析器注册器模块
3+
4+
提供基于装饰器的解析器自动注册机制,支持多种文件格式的解析器注册和查找。
5+
"""
6+
7+
import logging
8+
from abc import ABC, abstractmethod
9+
from collections.abc import Callable
10+
from pathlib import Path
11+
12+
from .base_models import DocumentData
13+
14+
logger = logging.getLogger(__name__)
15+
16+
# Global parser registry: maps a lower-cased file suffix (e.g. '.docx') to the parser class registered for it
17+
PARSER_REGISTRY: dict[str, type['DocumentParser']] = {}
18+
19+
20+
class DocumentParser(ABC):
    """Abstract base class for document parsers.

    Subclasses must implement :meth:`parse`; because ``parse`` is abstract,
    this class cannot be instantiated directly.
    """

    @abstractmethod
    async def parse(self, file_path: str) -> DocumentData:
        """Parse the document at ``file_path`` and return a ``DocumentData``."""
        ...
27+
28+
29+
def register_parser(suffixes: list[str]) -> Callable[[type['DocumentParser']], type['DocumentParser']]:
    """Build a class decorator that registers a parser for the given suffixes.

    Args:
        suffixes: File extensions handled by the decorated parser, e.g.
            ``['.docx', '.doc']``. Each suffix is lower-cased before it is
            stored. A single string is also accepted and treated as a
            one-element list.

    Returns:
        A decorator that stores the class in ``PARSER_REGISTRY`` under every
        suffix and returns the class unchanged.

    Raises:
        TypeError: Raised by the returned decorator when the decorated object
            is not a subclass of ``DocumentParser``.

    Example:
        @register_parser(['.docx'])
        class DocxDocumentParser(DocumentParser):
            ...
    """
    # A bare string would otherwise be iterated character by character and
    # register one bogus entry per character.
    if isinstance(suffixes, str):
        suffixes = [suffixes]
    # Normalise once, up front; this also snapshots the caller's list so that
    # mutating it later cannot change what gets registered.
    normalized = [suffix.lower() for suffix in suffixes]

    def decorator(cls: type['DocumentParser']) -> type['DocumentParser']:
        # Check isinstance first: issubclass() raises its own, unrelated
        # TypeError when handed something that is not a class at all.
        if not (isinstance(cls, type) and issubclass(cls, DocumentParser)):
            name = getattr(cls, '__name__', repr(cls))
            raise TypeError(f"解析器类 {name} 必须继承自 DocumentParser")

        for suffix in normalized:
            # Last registration wins; warn so accidental overrides are visible.
            if suffix in PARSER_REGISTRY:
                logger.warning("覆盖已存在的解析器: %s -> %s", suffix, cls.__name__)
            PARSER_REGISTRY[suffix] = cls
            logger.info("注册解析器: %s -> %s", suffix, cls.__name__)

        return cls

    return decorator
60+
61+
62+
def get_parser(file_path: str) -> 'DocumentParser | None':
    """Return a new parser instance suited to the file's extension.

    The return annotation is written as a single string on purpose: the
    expression ``'DocumentParser' | None`` applies ``|`` to a ``str`` and
    ``None`` when the ``def`` statement runs, which raises ``TypeError`` on
    interpreters that evaluate annotations eagerly.

    Args:
        file_path: Path of the file to parse. Only its final suffix is used,
            compared case-insensitively against ``PARSER_REGISTRY``.

    Returns:
        An instance of the registered parser class, or ``None`` when the path
        is empty, no parser is registered for the suffix, or the parser's
        constructor raises.
    """
    # Path(None) raises TypeError; treat a missing path like an unsupported file.
    if not file_path:
        logger.warning("文件路径为空")
        return None

    suffix = Path(file_path).suffix.lower()
    parser_class = PARSER_REGISTRY.get(suffix)
    if parser_class is None:
        logger.warning("未找到支持 %s 格式的解析器", suffix)
        return None

    try:
        return parser_class()
    except Exception as e:
        # Deliberate best-effort boundary: callers receive None rather than an
        # exception. exc_info keeps the traceback so the failure is diagnosable.
        logger.error(
            "创建解析器实例失败: %s, 错误: %s",
            parser_class.__name__,
            e,
            exc_info=True,
        )
        return None
85+
86+
87+
def can_parse(file_path: str) -> bool:
    """Report whether a parser is registered for the file's extension.

    Args:
        file_path: Path of the file to check; only its final suffix is
            examined, after lower-casing.

    Returns:
        bool: True when the lower-cased suffix is a key of ``PARSER_REGISTRY``.
    """
    return Path(file_path).suffix.lower() in PARSER_REGISTRY
100+
101+
102+
def get_supported_formats() -> list[str]:
    """Return the file extensions that currently have a registered parser.

    Returns:
        A new list containing the keys of ``PARSER_REGISTRY``.
    """
    return [*PARSER_REGISTRY]
110+
111+
112+
def get_parser_class(suffix: str) -> type['DocumentParser'] | None:
    """Look up the parser class registered for a file extension.

    Args:
        suffix: File extension such as ``'.docx'``; it is lower-cased before
            the lookup.

    Returns:
        The registered parser class, or ``None`` when the suffix has no entry
        in ``PARSER_REGISTRY``.
    """
    try:
        return PARSER_REGISTRY[suffix.lower()]
    except KeyError:
        return None
123+
124+
125+
def list_registered_parsers() -> dict[str, str]:
    """List every registered parser by name.

    Returns:
        A new dict mapping each suffix in ``PARSER_REGISTRY`` to the
        ``__name__`` of the parser class registered for it.
    """
    names: dict[str, str] = {}
    for extension, parser_cls in PARSER_REGISTRY.items():
        names[extension] = parser_cls.__name__
    return names

worker.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44
from sanic import Sanic
55

66
from enhancers.information_enhancer import InformationEnhancerFactory
7+
from parsers import get_parser
78
from parsers.base_models import ChunkData
8-
from parsers.document_parser import DocumentParserFactory
99

1010

1111
async def worker(app: Sanic) -> dict[str, Any]:
1212
# 使用工厂获取合适的解析器
13-
parser_factory = DocumentParserFactory()
13+
1414
enhancer_factory = InformationEnhancerFactory()
1515
redis = app.ctx.redis
1616
while True:
@@ -19,7 +19,10 @@ async def worker(app: Sanic) -> dict[str, Any]:
1919
await asyncio.sleep(1)
2020
continue
2121
file_path = task.get("file_path")
22-
parse_result = await parser_factory.parse_document(file_path)
22+
parser = get_parser(file_path)
23+
if not parser:
24+
continue
25+
parse_result = await parser.parse(file_path)
2326
if not parse_result.success:
2427
continue
2528
chunk_list = parse_result.texts + parse_result.tables + parse_result.images + parse_result.formulas

0 commit comments

Comments
 (0)