USTC-KnowledgeComputingLab
diff --git a/enhancers/information_enhancer.py
Lines changed: 13 additions & 24 deletions b/enhancers/information_enhancer.py
Lines changed: 13 additions & 24 deletions
diff --git a/parsers/document_parser.py
Lines changed: 18 additions & 108 deletions b/parsers/document_parser.py
Lines changed: 18 additions & 108 deletions
diff --git a/parsers/document_parser_factory.py
Lines changed: 29 additions & 0 deletions b/parsers/document_parser_factory.py
Lines changed: 29 additions & 0 deletions
@@ -1,44 +1,33 @@
 from abc import ABC, abstractmethod
-from typing import Any
 
-class InformationEnhancer(ABC):
-    """信息增强器基类"""
+from parsers.document_parser import DocumentData
 
-    def __init__(self) -> None:
-        pass
 
+class InformationEnhancer(ABC):
+    """信息增强器基类"""
     @abstractmethod
-    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+    async def enhance(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         pass
 
 class TableInformationEnhancer(InformationEnhancer):
     """表格信息增强器"""
 
-    def __init__(self) -> None:
-        super().__init__()
-
-    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+    async def enhance(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         return information
 
 class FormulasInformationEnhancer(InformationEnhancer):
     """公式信息增强器"""
 
-    def __init__(self) -> None:
-        super().__init__()
-
-    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+    async def enhance(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         return information
 
 class ImageInformationEnhancer(InformationEnhancer):
     """图片信息增强器"""
 
-    def __init__(self) -> None:
-        super().__init__()
-
-    async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
+    async def enhance(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         return information
 
@@ -52,9 +41,9 @@ def __init__(self) -> None:
             ImageInformationEnhancer()
         ]
 
-    def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer:
+    def get_enhancer(self, information: DocumentData) -> InformationEnhancer|None:
         """获取信息增强器"""
-        match information.get("type"):
+        match information.type:
             case "table":
                 return TableInformationEnhancer()
             case "formulas":
@@ -63,11 +52,11 @@ def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer:
                 return ImageInformationEnhancer()
             case _:
                 return None
-    
-    async def enhance_information(self, information: dict[str, Any]) -> dict[str, Any]:
+
+    async def enhance_information(self, information: DocumentData) -> DocumentData:
         """增强信息"""
         enhancer = self.get_enhancer(information)
         if not enhancer:
-            raise ValueError(f"不支持的模态类型: {information.get('type')}")
+            raise ValueError(f"不支持的模态类型: {information.type}")
         return await enhancer.enhance(information)
-        
+
@@ -1,127 +1,37 @@
 import logging
 from abc import ABC, abstractmethod
-from typing import Any
+
+from pydantic import BaseModel
 
 logger = logging.getLogger(__name__)
 
+class DocumentData(BaseModel):
+    """Pydantic model for one document item: its type, name, content and description (all str)."""
+    type: str
+    name: str
+    content: str
+    description: str
+
+class ParseResult(BaseModel):
+    """Pydantic model for a parse outcome: title, DocumentData items, timing, success flag, optional error."""
+    title: str
+    document: list[DocumentData]
+    processing_time: float
+    success: bool
+    error_message: str | None = None
+
 class DocumentParser(ABC):
     """文档解析器基类"""
 
     def __init__(self) -> None:
         self.supported_formats: list[str] = []
 
     @abstractmethod
-    async def parse(self, file_path: str) -> list[dict[str, Any]]:
+    async def parse(self, file_path: str) -> ParseResult:
         """解析文档"""
         pass
 
     @abstractmethod
     def can_parse(self, file_path: str) -> bool:
         """检查是否可以解析该文件"""
         pass
-
-class PDFParser(DocumentParser):
-    """PDF文档解析器"""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.supported_formats = ['.pdf']
-
-    def can_parse(self, file_path: str) -> bool:
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    async def parse(self, file_path: str) -> list[dict[str, Any]]:
-        """解析PDF文档"""
-        try:
-            # 这里应该使用mineru库
-            # 暂时返回模拟数据
-            return [{
-                "type": "pdf",
-                "text": f"PDF文档内容: {file_path}",
-                "pages": 1,
-                "images": [],
-                "tables": [],
-                "formulas": []
-            }]
-        except Exception as e:
-            logger.error(f"解析PDF失败: {e}")
-            raise
-
-class DOCXParser(DocumentParser):
-    """DOCX文档解析器"""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.supported_formats = ['.docx','.doc']
-
-    def can_parse(self, file_path: str) -> bool:
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    async def parse(self, file_path: str) -> list[dict[str, Any]]:
-        """解析DOCX文档"""
-        try:
-            # 这里应该使用docling库
-            # 暂时返回模拟数据
-            return [{
-                "type": "docx",
-                "text": f"DOCX文档内容: {file_path}",
-                "pages": 1,
-                "images": [],
-                "tables": [],
-                "formulas": []
-            }]
-        except Exception as e:
-            logger.error(f"解析DOCX失败: {e}")
-            raise
-
-class XLSXParser(DocumentParser):
-    """XLSX文档解析器"""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.supported_formats = ['.xlsx']
-
-    def can_parse(self, file_path: str) -> bool:
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    async def parse(self, file_path: str) -> list[dict[str, Any]]:
-        """解析XLSX文档"""
-        try:
-            # 这里应该使用docling库
-            # 暂时返回模拟数据
-            return [{
-                "type": "xlsx",
-                "text": f"XLSX文档内容: {file_path}",
-                "pages": 1,
-                "images": [],
-                "tables": [],
-                "formulas": []
-            }]
-        except Exception as e:
-            logger.error(f"解析XLSX失败: {e}")
-            raise
-
-class DocumentParserFactory:
-    """文档解析器工厂"""
-
-    def __init__(self) -> None:
-        self.parsers = [
-            PDFParser(),
-            DOCXParser(),
-            XLSXParser()
-        ]
-
-    def get_parser(self, file_path: str) -> DocumentParser | None:
-        """根据文件路径获取合适的解析器"""
-        for parser in self.parsers:
-            if parser.can_parse(file_path):
-                return parser
-        return None
-
-    async def parse_document(self, file_path: str) -> list[dict[str, Any]]:
-        """解析文档"""
-        parser = self.get_parser(file_path)
-        if not parser:
-            raise ValueError(f"不支持的文件格式: {file_path}")
-
-        return await parser.parse(file_path)
@@ -0,0 +1,29 @@
+import logging
+
+from parsers.document_parser import DocumentParser, ParseResult
+from parsers.excel_parser import ExcelParser
+
+logger = logging.getLogger(__name__)
+
+class DocumentParserFactory:
+    """Factory that keeps a list of parser instances and dispatches a file path to one of them."""
+
+    def __init__(self) -> None:
+        self.parsers: list[DocumentParser] = [
+            ExcelParser()
+        ]
+
+    def get_parser(self, file_path: str) -> DocumentParser | None:
+        """Return the first registered parser whose can_parse() accepts file_path, or None if none does."""
+        for parser in self.parsers:
+            if parser.can_parse(file_path):
+                return parser
+        return None
+
+    async def parse_document(self, file_path: str) -> ParseResult:
+        """Parse file_path with the matching parser; raise ValueError when no parser accepts it."""
+        parser = self.get_parser(file_path)
+        if not parser:
+            raise ValueError(f"不支持的文件格式: {file_path}")
+
+        return await parser.parse(file_path)