Skip to content

Commit 2ee14a8

Browse files
committed
feat: add parser registry
1 parent 78ac128 commit 2ee14a8

File tree

8 files changed

+46
-44
lines changed

8 files changed

+46
-44
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -174,3 +174,4 @@ cython_debug/
174174
# PyPI configuration file
175175
.pypirc
176176

177+
examples/

parsers/__init__.py

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,27 @@
11
# Parsers package
22

3+
from .base_models import DocumentData, DocumentParser
34
from .parser_registry import (
45
PARSER_REGISTRY,
5-
DocumentParser,
6-
can_parse,
76
get_parser,
87
get_supported_formats,
98
list_registered_parsers,
109
register_parser,
1110
)
1211

13-
__all__ = ['PARSER_REGISTRY', 'register_parser', 'DocumentParser', 'get_parser', 'can_parse', 'get_supported_formats', 'list_registered_parsers']
12+
__all__ = [
13+
'DocumentData',
14+
'DocumentParser',
15+
'PARSER_REGISTRY',
16+
'register_parser',
17+
'get_parser',
18+
'get_supported_formats',
19+
'list_registered_parsers',
20+
'load_all_parsers',
21+
]
22+
23+
def load_all_parsers() -> list[str]:
24+
"""加载所有解析器"""
25+
from .docx_parser import DocxDocumentParser
26+
from .excel_parser import ExcelParser
27+
return [DocxDocumentParser.__name__, ExcelParser.__name__]

parsers/base_models.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from abc import ABC, abstractmethod
23
from enum import Enum
34
from typing import Any
45

@@ -39,3 +40,11 @@ class DocumentData(BaseModel):
3940
processing_time: float = 0
4041
success: bool
4142
error_message: str | None = None
43+
44+
class DocumentParser(ABC):
45+
"""文档解析器基类"""
46+
47+
@abstractmethod
48+
async def parse(self, file_path: str) -> DocumentData:
49+
"""解析文档"""
50+
pass

parsers/docx_parser.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,9 +29,10 @@
2929
ChunkData,
3030
ChunkType,
3131
DocumentData,
32+
DocumentParser,
3233
TableDataItem,
3334
)
34-
from parsers.parser_registry import DocumentParser, register_parser
35+
from parsers.parser_registry import register_parser
3536

3637
logger = logging.getLogger(__name__)
3738

parsers/excel_parser.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,10 @@
2323
ChunkData,
2424
ChunkType,
2525
DocumentData,
26+
DocumentParser,
2627
TableDataItem,
2728
)
28-
from parsers.parser_registry import DocumentParser, register_parser
29+
from parsers.parser_registry import register_parser
2930

3031
# 忽略 openpyxl 的特定警告
3132
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
@@ -63,7 +64,7 @@ def __init__(self, config: ExcelParseConfig | None = None):
6364
self.config: ExcelParseConfig = config or ExcelParseConfig()
6465
self.image_index: int = 0
6566

66-
async def parse(self, excel_path: str) -> DocumentData:
67+
async def parse(self, file_path: str) -> DocumentData:
6768
"""
6869
将Excel文件转换为JSON格式
6970
Args:
@@ -81,7 +82,7 @@ async def parse(self, excel_path: str) -> DocumentData:
8182
images: list[ChunkData] = []
8283

8384
# 加载工作簿
84-
workbook = self._load_workbook(excel_path)
85+
workbook = self._load_workbook(file_path)
8586

8687
# 处理每个工作表
8788
for sheet_index, sheet_name in enumerate(workbook.sheetnames):
@@ -109,7 +110,7 @@ async def parse(self, excel_path: str) -> DocumentData:
109110
))
110111
processing_time = time.time() - start_time
111112
return DocumentData(
112-
title=Path(excel_path).stem,
113+
title=Path(file_path).stem,
113114
texts=texts,
114115
tables=tables,
115116
images=images,

parsers/parser_registry.py

Lines changed: 6 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -5,28 +5,18 @@
55
"""
66

77
import logging
8-
from abc import ABC, abstractmethod
98
from collections.abc import Callable
109
from pathlib import Path
1110

12-
from .base_models import DocumentData
11+
from .base_models import DocumentParser
1312

1413
logger = logging.getLogger(__name__)
1514

1615
# 全局解析器注册表
17-
PARSER_REGISTRY: dict[str, type['DocumentParser']] = {}
16+
PARSER_REGISTRY: dict[str, type[DocumentParser]] = {}
1817

1918

20-
class DocumentParser(ABC):
21-
"""文档解析器基类"""
22-
23-
@abstractmethod
24-
async def parse(self, file_path: str) -> DocumentData:
25-
"""解析文档"""
26-
pass
27-
28-
29-
def register_parser(suffixes: list[str]) -> Callable[[type['DocumentParser']], type['DocumentParser']]:
19+
def register_parser(suffixes: list[str]) -> Callable[[type[DocumentParser]], type[DocumentParser]]:
3020
"""
3121
解析器注册装饰器
3222
@@ -41,7 +31,7 @@ def register_parser(suffixes: list[str]) -> Callable[[type['DocumentParser']], t
4131
class DocxDocumentParser(DocumentParser):
4232
...
4333
"""
44-
def decorator(cls: type['DocumentParser']) -> type['DocumentParser']:
34+
def decorator(cls: type[DocumentParser]) -> type[DocumentParser]:
4535
# 验证类是否继承自 DocumentParser
4636
if not issubclass(cls, DocumentParser):
4737
raise TypeError(f"解析器类 {cls.__name__} 必须继承自 DocumentParser")
@@ -59,7 +49,7 @@ def decorator(cls: type['DocumentParser']) -> type['DocumentParser']:
5949
return decorator
6050

6151

62-
def get_parser(file_path: str) -> 'DocumentParser' | None:
52+
def get_parser(file_path: str) -> DocumentParser | None:
6353
"""
6454
根据文件路径获取合适的解析器实例
6555
@@ -83,22 +73,6 @@ def get_parser(file_path: str) -> 'DocumentParser' | None:
8373
logger.error(f"创建解析器实例失败: {parser_class.__name__}, 错误: {e}")
8474
return None
8575

86-
87-
def can_parse(file_path: str) -> bool:
88-
"""
89-
检查文件是否可以被解析
90-
91-
Args:
92-
file_path: 文件路径
93-
94-
Returns:
95-
bool: 是否支持该文件格式
96-
"""
97-
file = Path(file_path)
98-
suffix = file.suffix.lower()
99-
return suffix in PARSER_REGISTRY
100-
101-
10276
def get_supported_formats() -> list[str]:
10377
"""
10478
获取所有支持的文件格式
@@ -109,7 +83,7 @@ def get_supported_formats() -> list[str]:
10983
return list(PARSER_REGISTRY.keys())
11084

11185

112-
def get_parser_class(suffix: str) -> type['DocumentParser'] | None:
86+
def get_parser_class(suffix: str) -> type[DocumentParser] | None:
11387
"""
11488
根据文件扩展名获取解析器类
11589

pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,8 @@ warn_unreachable = true
8383
strict_equality = true
8484
exclude = [
8585
"tests/",
86-
"test_*.py"
86+
"test_*.py",
87+
"examples/"
8788
]
8889
# 解决模块路径冲突
8990
explicit_package_bases = true
@@ -94,7 +95,8 @@ target-version = "py312"
9495
line-length = 88
9596
exclude = [
9697
"tests/",
97-
"test_*.py"
98+
"test_*.py",
99+
"examples/"
98100
]
99101

100102
[tool.ruff.lint]

worker.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,13 @@
44
from sanic import Sanic
55

66
from enhancers.information_enhancer import InformationEnhancerFactory
7-
from parsers import get_parser
7+
from parsers import get_parser, load_all_parsers
88
from parsers.base_models import ChunkData
99

1010

1111
async def worker(app: Sanic) -> dict[str, Any]:
1212
# 使用工厂获取合适的解析器
13-
13+
load_all_parsers()
1414
enhancer_factory = InformationEnhancerFactory()
1515
redis = app.ctx.redis
1616
while True:

0 commit comments

Comments
 (0)