|
| 1 | +""" |
| 2 | +解析器注册器模块 |
| 3 | +
|
| 4 | +提供基于装饰器的解析器自动注册机制,支持多种文件格式的解析器注册和查找。 |
| 5 | +""" |
| 6 | + |
| 7 | +import logging |
| 8 | +from abc import ABC, abstractmethod |
| 9 | +from collections.abc import Callable |
| 10 | +from pathlib import Path |
| 11 | + |
| 12 | +from .base_models import DocumentData |
| 13 | + |
logger = logging.getLogger(__name__)

# Global parser registry: maps a file suffix (e.g. '.docx') to the parser class
# registered for it. register_parser lower-cases each suffix before storing it,
# and the lookup helpers in this module lower-case before reading.
PARSER_REGISTRY: dict[str, type['DocumentParser']] = {}
| 18 | + |
| 19 | + |
class DocumentParser(ABC):
    """Abstract base class for document parsers.

    Concrete parsers must implement :meth:`parse`; the class cannot be
    instantiated until that abstract method is overridden.
    """

    @abstractmethod
    async def parse(self, file_path: str) -> DocumentData:
        """Parse the document located at ``file_path``.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The parsed document as a ``DocumentData`` object.
        """
        ...
| 27 | + |
| 28 | + |
def register_parser(suffixes: list[str]) -> Callable[[type['DocumentParser']], type['DocumentParser']]:
    """Build a class decorator that registers a parser for the given suffixes.

    Each suffix is lower-cased and used as a key in ``PARSER_REGISTRY``. If a
    suffix is already registered, a warning is logged and the previous entry
    is replaced by the decorated class.

    Args:
        suffixes: File extensions handled by the parser, e.g.
            ``['.docx', '.doc']``.

    Returns:
        A decorator that registers the class and returns it unchanged.

    Raises:
        TypeError: Raised by the returned decorator when the decorated class
            does not derive from ``DocumentParser``.

    Example:
        @register_parser(['.docx'])
        class DocxDocumentParser(DocumentParser):
            ...
    """

    def _decorate(parser_cls: type['DocumentParser']) -> type['DocumentParser']:
        # Only DocumentParser subclasses may enter the registry.
        if issubclass(parser_cls, DocumentParser):
            for raw_suffix in suffixes:
                # Keys are stored lower-cased so lookups are case-insensitive.
                key = raw_suffix.lower()
                if key in PARSER_REGISTRY:
                    logger.warning(f"覆盖已存在的解析器: {key} -> {parser_cls.__name__}")
                PARSER_REGISTRY[key] = parser_cls
                logger.info(f"注册解析器: {key} -> {parser_cls.__name__}")
            return parser_cls

        raise TypeError(f"解析器类 {parser_cls.__name__} 必须继承自 DocumentParser")

    return _decorate
| 60 | + |
| 61 | + |
def get_parser(file_path: str) -> "DocumentParser | None":
    """Return a parser instance suited to ``file_path``, based on its suffix.

    The suffix is taken from ``Path(file_path).suffix`` and lower-cased before
    being looked up in ``PARSER_REGISTRY``.

    Args:
        file_path: Path of the file that needs to be parsed.

    Returns:
        A new instance of the registered parser class, or ``None`` when no
        parser is registered for the suffix or the parser could not be
        constructed (both cases are logged).
    """
    # NOTE: the return annotation is one quoted string on purpose. Writing
    # `'DocumentParser' | None` applies `|` to a str and None, which raises
    # TypeError when the annotation is evaluated.
    suffix = Path(file_path).suffix.lower()

    parser_class = PARSER_REGISTRY.get(suffix)
    if parser_class is None:
        logger.warning(f"未找到支持 {suffix} 格式的解析器")
        return None

    try:
        return parser_class()
    except Exception as e:
        # Deliberate best-effort: a failing parser constructor must not crash
        # the caller. Log with traceback so the failure is still diagnosable.
        logger.error(
            f"创建解析器实例失败: {parser_class.__name__}, 错误: {e}",
            exc_info=True,
        )
        return None
| 85 | + |
| 86 | + |
def can_parse(file_path: str) -> bool:
    """Tell whether a parser is registered for the file's suffix.

    Args:
        file_path: Path of the file to check.

    Returns:
        ``True`` when the lower-cased suffix of ``file_path`` is a key of
        ``PARSER_REGISTRY``, ``False`` otherwise.
    """
    return Path(file_path).suffix.lower() in PARSER_REGISTRY
| 100 | + |
| 101 | + |
def get_supported_formats() -> list[str]:
    """Return every file suffix that currently has a registered parser.

    Returns:
        A new list holding the keys of ``PARSER_REGISTRY``.
    """
    # Iterating a dict yields its keys, so this copies the registered suffixes.
    return list(PARSER_REGISTRY)
| 110 | + |
| 111 | + |
def get_parser_class(suffix: str) -> type['DocumentParser'] | None:
    """Look up the parser class registered for a file suffix.

    Args:
        suffix: File extension such as ``'.docx'``; it is lower-cased before
            the lookup.

    Returns:
        The registered parser class, or ``None`` when the suffix has no entry
        in ``PARSER_REGISTRY``.
    """
    normalized = suffix.lower()
    return PARSER_REGISTRY.get(normalized)
| 123 | + |
| 124 | + |
def list_registered_parsers() -> dict[str, str]:
    """Summarize the registry as suffix -> parser class name.

    Returns:
        A new dict mapping each registered suffix to the ``__name__`` of the
        parser class stored for it in ``PARSER_REGISTRY``.
    """
    names_by_suffix: dict[str, str] = {}
    for ext, parser_cls in PARSER_REGISTRY.items():
        names_by_suffix[ext] = parser_cls.__name__
    return names_by_suffix
0 commit comments