feat: add parser registry

liningping · liningping · commit 2ee14a88282e · 2025-08-20T17:02:44.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -174,3 +174,4 @@ cython_debug/
 # PyPI configuration file
 .pypirc
 
+examples/
diff --git a/parsers/__init__.py b/parsers/__init__.py
@@ -1,13 +1,27 @@
 # Parsers package
 
+from .base_models import DocumentData, DocumentParser
 from .parser_registry import (
     PARSER_REGISTRY,
-    DocumentParser,
-    can_parse,
     get_parser,
     get_supported_formats,
     list_registered_parsers,
     register_parser,
 )
 
-__all__ = ['PARSER_REGISTRY', 'register_parser', 'DocumentParser', 'get_parser', 'can_parse', 'get_supported_formats', 'list_registered_parsers']
+__all__ = [
+    'DocumentData',
+    'DocumentParser',
+    'PARSER_REGISTRY',
+    'register_parser',
+    'get_parser',
+    'get_supported_formats',
+    'list_registered_parsers',
+    'load_all_parsers',
+]
+
+def load_all_parsers() -> list[str]:
+    """Import the bundled parser modules and return the parser class names."""
+    from .docx_parser import DocxDocumentParser
+    from .excel_parser import ExcelParser
+    return [DocxDocumentParser.__name__, ExcelParser.__name__]
diff --git a/parsers/base_models.py b/parsers/base_models.py
@@ -1,4 +1,5 @@
 import logging
+from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Any
 
@@ -39,3 +40,11 @@ class DocumentData(BaseModel):
     processing_time: float = 0
     success: bool
     error_message: str | None = None
+
+class DocumentParser(ABC):
+    """Abstract base class for document parsers."""
+
+    @abstractmethod
+    async def parse(self, file_path: str) -> DocumentData:
+        """Parse the document at ``file_path`` and return a ``DocumentData``."""
+        pass
diff --git a/parsers/docx_parser.py b/parsers/docx_parser.py
@@ -29,9 +29,10 @@
     ChunkData,
     ChunkType,
     DocumentData,
+    DocumentParser,
     TableDataItem,
 )
-from parsers.parser_registry import DocumentParser, register_parser
+from parsers.parser_registry import register_parser
 
 logger = logging.getLogger(__name__)
 
diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py
@@ -23,9 +23,10 @@
     ChunkData,
     ChunkType,
     DocumentData,
+    DocumentParser,
     TableDataItem,
 )
-from parsers.parser_registry import DocumentParser, register_parser
+from parsers.parser_registry import register_parser
 
 # 忽略 openpyxl 的特定警告
 warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
@@ -63,7 +64,7 @@ def __init__(self, config: ExcelParseConfig | None = None):
         self.config: ExcelParseConfig = config or ExcelParseConfig()
         self.image_index: int = 0
 
-    async def parse(self, excel_path: str) -> DocumentData:
+    async def parse(self, file_path: str) -> DocumentData:
         """
         将Excel文件转换为JSON格式
         Args:
@@ -81,7 +82,7 @@ async def parse(self, excel_path: str) -> DocumentData:
             images: list[ChunkData] = []
 
             # 加载工作簿
-            workbook = self._load_workbook(excel_path)
+            workbook = self._load_workbook(file_path)
 
             # 处理每个工作表
             for sheet_index, sheet_name in enumerate(workbook.sheetnames):
@@ -109,7 +110,7 @@ async def parse(self, excel_path: str) -> DocumentData:
                 ))
             processing_time = time.time() - start_time
             return DocumentData(
-                title=Path(excel_path).stem,
+                title=Path(file_path).stem,
                 texts=texts,
                 tables=tables,
                 images=images,
diff --git a/parsers/parser_registry.py b/parsers/parser_registry.py
@@ -5,28 +5,18 @@
 """
 
 import logging
-from abc import ABC, abstractmethod
 from collections.abc import Callable
 from pathlib import Path
 
-from .base_models import DocumentData
+from .base_models import DocumentParser
 
 logger = logging.getLogger(__name__)
 
 # 全局解析器注册表
-PARSER_REGISTRY: dict[str, type['DocumentParser']] = {}
+PARSER_REGISTRY: dict[str, type[DocumentParser]] = {}
 
 
-class DocumentParser(ABC):
-    """文档解析器基类"""
-
-    @abstractmethod
-    async def parse(self, file_path: str) -> DocumentData:
-        """解析文档"""
-        pass
-
-
-def register_parser(suffixes: list[str]) -> Callable[[type['DocumentParser']], type['DocumentParser']]:
+def register_parser(suffixes: list[str]) -> Callable[[type[DocumentParser]], type[DocumentParser]]:
     """
     解析器注册装饰器
 
@@ -41,7 +31,7 @@ def register_parser(suffixes: list[str]) -> Callable[[type['DocumentParser']], t
         class DocxDocumentParser(DocumentParser):
             ...
     """
-    def decorator(cls: type['DocumentParser']) -> type['DocumentParser']:
+    def decorator(cls: type[DocumentParser]) -> type[DocumentParser]:
         # 验证类是否继承自 DocumentParser
         if not issubclass(cls, DocumentParser):
             raise TypeError(f"解析器类 {cls.__name__} 必须继承自 DocumentParser")
@@ -59,7 +49,7 @@ def decorator(cls: type['DocumentParser']) -> type['DocumentParser']:
     return decorator
 
 
-def get_parser(file_path: str) -> 'DocumentParser' | None:
+def get_parser(file_path: str) -> DocumentParser | None:
     """
     根据文件路径获取合适的解析器实例
 
@@ -83,22 +73,6 @@ def get_parser(file_path: str) -> 'DocumentParser' | None:
         logger.error(f"创建解析器实例失败: {parser_class.__name__}, 错误: {e}")
         return None
 
-
-def can_parse(file_path: str) -> bool:
-    """
-    检查文件是否可以被解析
-
-    Args:
-        file_path: 文件路径
-
-    Returns:
-        bool: 是否支持该文件格式
-    """
-    file = Path(file_path)
-    suffix = file.suffix.lower()
-    return suffix in PARSER_REGISTRY
-
-
 def get_supported_formats() -> list[str]:
     """
     获取所有支持的文件格式
@@ -109,7 +83,7 @@ def get_supported_formats() -> list[str]:
     return list(PARSER_REGISTRY.keys())
 
 
-def get_parser_class(suffix: str) -> type['DocumentParser'] | None:
+def get_parser_class(suffix: str) -> type[DocumentParser] | None:
     """
     根据文件扩展名获取解析器类
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -83,7 +83,8 @@ warn_unreachable = true
 strict_equality = true
 exclude = [
     "tests/",
-    "test_*.py"
+    "test_*.py",
+    "examples/"
 ]
 # 解决模块路径冲突
 explicit_package_bases = true
@@ -94,7 +95,8 @@ target-version = "py312"
 line-length = 88
 exclude = [
     "tests/",
-    "test_*.py"
+    "test_*.py",
+    "examples/"
 ]
 
 [tool.ruff.lint]
diff --git a/worker.py b/worker.py
@@ -4,13 +4,13 @@
 from sanic import Sanic
 
 from enhancers.information_enhancer import InformationEnhancerFactory
-from parsers import get_parser
+from parsers import get_parser, load_all_parsers
 from parsers.base_models import ChunkData
 
 
 async def worker(app: Sanic) -> dict[str, Any]:
     # 使用工厂获取合适的解析器
-
+    load_all_parsers()
     enhancer_factory = InformationEnhancerFactory()
     redis = app.ctx.redis
     while True:

Original file line number	Diff line number	Diff line change
`@@ -174,3 +174,4 @@ cython_debug/`
`174`	`174`	`# PyPI configuration file`
`175`	`175`	`.pypirc`
`176`	`176`
	`177`	`+examples/`