USTC-KnowledgeComputingLab · liningping · Aug 20, 2025 · Aug 13, 2025 · Aug 13, 2025 · Aug 14, 2025
diff --git a/enhancers/__init__.py b/enhancers/__init__.py
diff --git a/enhancers/information_enhancer.py b/enhancers/information_enhancer.py
@@ -0,0 +1,62 @@
+from abc import ABC, abstractmethod
+
+from parsers.base_models import ChunkData, ChunkType
+
+
+class InformationEnhancer(ABC):
+    """Abstract base class for information enhancers."""
+    @abstractmethod
+    async def enhance(self, information: ChunkData) -> ChunkData:
+        """Enhance the given chunk and return a ChunkData; subclasses must override."""
+        ...
+
+class TableInformationEnhancer(InformationEnhancer):
+    """Information enhancer for table chunks."""
+
+    async def enhance(self, information: ChunkData) -> ChunkData:
+        """Return `information` unchanged; no enhancement is applied here."""
+        return information
+
+class FormulasInformationEnhancer(InformationEnhancer):
+    """Information enhancer for formula chunks."""
+
+    async def enhance(self, information: ChunkData) -> ChunkData:
+        """Return `information` unchanged; no enhancement is applied here."""
+        return information
+
+class ImageInformationEnhancer(InformationEnhancer):
+    """Information enhancer for image chunks."""
+
+    async def enhance(self, information: ChunkData) -> ChunkData:
+        """Return `information` unchanged; no enhancement is applied here."""
+        return information
+
+class InformationEnhancerFactory:
+    """Factory that selects the InformationEnhancer matching a chunk's type."""
+
+    def __init__(self) -> None:
+        # Built once; get_enhancer hands out these shared instances.
+        self._table = TableInformationEnhancer()
+        self._formula = FormulasInformationEnhancer()
+        self._image = ImageInformationEnhancer()
+        self.enhancers = [self._table, self._formula, self._image]
+
+    def get_enhancer(self, information: ChunkData) -> InformationEnhancer|None:
+        """Return the shared enhancer for `information.type`, or None if unsupported."""
+        match information.type:
+            case ChunkType.TABLE:
+                return self._table
+            case ChunkType.FORMULA:
+                return self._formula
+            case ChunkType.IMAGE:
+                return self._image
+            case _:
+                return None
+
+    async def enhance_information(self, information: ChunkData) -> ChunkData:
+        """Enhance `information`; raise ValueError when its type has no enhancer."""
+        enhancer = self.get_enhancer(information)
+        if enhancer is None:
+            raise ValueError(f"不支持的模态类型: {information.type}")
+        return await enhancer.enhance(information)
+
diff --git a/parsers/base_models.py b/parsers/base_models.py
@@ -0,0 +1,58 @@
+import logging
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+logger = logging.getLogger(__name__)
+
+
+class ChunkType(str, Enum):
+    """Kinds of chunk; members are also plain strings via the str mixin."""
+    TEXT = "text"
+    IMAGE = "image"
+    TABLE = "table"
+    FORMULA = "formula"
+
+class TableDataItem(BaseModel):
+    """Table data model."""
+    rows: int  # number of rows
+    columns: int  # number of columns
+    row_headers: list[Any] = Field(default_factory=list)  # row headers
+    column_headers: list[Any] = Field(default_factory=list)  # column headers
+    data: list[list[str]] = Field(default_factory=list)  # table data — presumably one inner list per row; TODO confirm
+
+class ChunkData(BaseModel):
+    """Data model for a single chunk."""
+    type: ChunkType  # kind of chunk (one of the ChunkType members)
+    name: str
+    content: str|TableDataItem = ""  # a string or a TableDataItem; defaults to the empty string
+    description: str = ""
+
+class DocumentData(BaseModel):
+    """Parse-result model: chunks grouped into per-kind lists plus status fields."""
+    title: str = ""
+    texts: list[ChunkData] = Field(default_factory=list)
+    tables: list[ChunkData] = Field(default_factory=list)
+    images: list[ChunkData] = Field(default_factory=list)
+    formulas: list[ChunkData] = Field(default_factory=list)
+    processing_time: float = 0  # unit not shown here — presumably seconds; TODO confirm
+    success: bool  # required: the only field without a default
+    error_message: str | None = None
+
+class DocumentParser(ABC):
+    """Abstract base class for document parsers."""
+
+    def __init__(self) -> None:
+        self.supported_formats: list[str] = []  # plain list: pydantic Field() only has meaning on BaseModel fields
+
+    @abstractmethod
+    async def parse(self, file_path: str) -> DocumentData:
+        """Parse the document at `file_path` and return a DocumentData."""
+        pass
+
+    @abstractmethod
+    def can_parse(self, file_path: str) -> bool:
+        """Return True if the lowercased `file_path` ends with any entry of `supported_formats`."""
+        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
diff --git a/parsers/document_parser.py b/parsers/document_parser.py
@@ -1,114 +1,16 @@
 import logging
-from abc import ABC, abstractmethod
-from typing import Any
 
-logger = logging.getLogger(__name__)
-
-class DocumentParser(ABC):
-    """文档解析器基类"""
-
-    def __init__(self) -> None:
-        self.supported_formats: list[str] = []
-
-    @abstractmethod
-    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
-        """解析文档"""
-        pass
-
-    @abstractmethod
-    def can_parse(self, file_path: str) -> bool:
-        """检查是否可以解析该文件"""
-        pass
-
-class PDFParser(DocumentParser):
-    """PDF文档解析器"""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.supported_formats = ['.pdf']
-
-    def can_parse(self, file_path: str) -> bool:
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
+from parsers.base_models import DocumentData, DocumentParser
+from parsers.excel_parser import ExcelParser
 
-    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
-        """解析PDF文档"""
-        try:
-            # 这里应该使用mineru库
-            # 暂时返回模拟数据
-            return {
-                "type": "pdf",
-                "text": f"PDF文档内容: {file_path}",
-                "pages": 1,
-                "images": [],
-                "tables": [],
-                "formulas": []
-            }
-        except Exception as e:
-            logger.error(f"解析PDF失败: {e}")
-            raise
-
-class DOCXParser(DocumentParser):
-    """DOCX文档解析器"""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.supported_formats = ['.docx','.doc']
-
-    def can_parse(self, file_path: str) -> bool:
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
-        """解析DOCX文档"""
-        try:
-            # 这里应该使用docling库
-            # 暂时返回模拟数据
-            return {
-                "type": "docx",
-                "text": f"DOCX文档内容: {file_path}",
-                "pages": 1,
-                "images": [],
-                "tables": [],
-                "formulas": []
-            }
-        except Exception as e:
-            logger.error(f"解析DOCX失败: {e}")
-            raise
-
-class XLSXParser(DocumentParser):
-    """XLSX文档解析器"""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.supported_formats = ['.xlsx']
-
-    def can_parse(self, file_path: str) -> bool:
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
-        """解析XLSX文档"""
-        try:
-            # 这里应该使用docling库
-            # 暂时返回模拟数据
-            return {
-                "type": "xlsx",
-                "text": f"XLSX文档内容: {file_path}",
-                "pages": 1,
-                "images": [],
-                "tables": [],
-                "formulas": []
-            }
-        except Exception as e:
-            logger.error(f"解析XLSX失败: {e}")
-            raise
+logger = logging.getLogger(__name__)
 
 class DocumentParserFactory:
     """文档解析器工厂"""
 
     def __init__(self) -> None:
-        self.parsers = [
-            PDFParser(),
-            DOCXParser(),
-            XLSXParser()
+        self.parsers: list[DocumentParser] = [
+            ExcelParser()
         ]
 
     def get_parser(self, file_path: str) -> DocumentParser | None:
@@ -118,10 +20,10 @@ def get_parser(self, file_path: str) -> DocumentParser | None:
                 return parser
         return None
 
-    async def parse_document(self, file_path: str, file_content: bytes) -> dict[str, Any]:
+    async def parse_document(self, file_path: str) -> DocumentData:
         """解析文档"""
         parser = self.get_parser(file_path)
         if not parser:
             raise ValueError(f"不支持的文件格式: {file_path}")
 
-        return await parser.parse(file_path, file_content)
+        return await parser.parse(file_path)