|
1 | 1 | import logging
|
2 | 2 | from abc import ABC, abstractmethod
|
3 |
| -from typing import Any |
| 3 | + |
| 4 | +from pydantic import BaseModel |
4 | 5 |
|
5 | 6 | logger = logging.getLogger(__name__)
|
6 | 7 |
|
class DocumentData(BaseModel):
    """Pydantic model for one document entry produced by a parser.

    All four fields are required strings; the model itself defines no
    defaults and no custom validation.
    """

    # Label for the kind of entry (presumably a format/category tag --
    # confirm against the parsers that populate it).
    type: str
    # Name of the entry.
    name: str
    # Textual content of the entry.
    content: str
    # Free-form description of the entry.
    description: str
| 14 | + |
class ParseResult(BaseModel):
    """Pydantic model describing the outcome of parsing one document.

    Every field except ``error_message`` is required.
    """

    # Title associated with the parsed document.
    title: str
    # Parsed entries, each a ``DocumentData`` instance.
    document: list[DocumentData]
    # Time spent processing (unit is not fixed here -- presumably seconds;
    # confirm with the code that fills it in).
    processing_time: float
    # Whether parsing succeeded.
    success: bool
    # Optional error text; defaults to ``None`` when not supplied.
    error_message: str | None = None
| 22 | + |
class DocumentParser(ABC):
    """Abstract base class for document parsers.

    Concrete parsers must override both :meth:`parse` and :meth:`can_parse`.
    ``supported_formats`` starts out empty; it is intended to hold lowercase
    file suffixes (for example ``".pdf"``) that a subclass assigns in its own
    ``__init__`` after calling ``super().__init__()``.
    """

    def __init__(self) -> None:
        """Initialise the parser with an empty list of supported suffixes."""
        self.supported_formats: list[str] = []

    @abstractmethod
    async def parse(self, file_path: str) -> ParseResult:
        """Parse the document at ``file_path``.

        Args:
            file_path: Path of the document to parse.

        Returns:
            A ``ParseResult`` produced by the concrete parser. The base
            implementation has no body of its own and must be overridden.
        """

    @abstractmethod
    def can_parse(self, file_path: str) -> bool:
        """Check whether this parser can handle ``file_path``.

        The method is still abstract, so every subclass must define it, but a
        subclass may simply delegate with
        ``return super().can_parse(file_path)`` to reuse the shared,
        case-insensitive suffix check against ``supported_formats``.

        Args:
            file_path: Path or file name of the document.

        Returns:
            ``True`` if the lowercased path ends with one of the entries in
            ``supported_formats``; ``False`` otherwise, including when the
            list is empty.
        """
        # str.endswith accepts a tuple of candidates; an empty tuple never
        # matches, so an unconfigured parser rejects every path.
        return file_path.lower().endswith(tuple(self.supported_formats))
|
22 |
| - |
23 |
| -class PDFParser(DocumentParser): |
24 |
| - """PDF文档解析器""" |
25 |
| - |
26 |
| - def __init__(self) -> None: |
27 |
| - super().__init__() |
28 |
| - self.supported_formats = ['.pdf'] |
29 |
| - |
30 |
| - def can_parse(self, file_path: str) -> bool: |
31 |
| - return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) |
32 |
| - |
33 |
| - async def parse(self, file_path: str) -> list[dict[str, Any]]: |
34 |
| - """解析PDF文档""" |
35 |
| - try: |
36 |
| - # 这里应该使用mineru库 |
37 |
| - # 暂时返回模拟数据 |
38 |
| - return [{ |
39 |
| - "type": "pdf", |
40 |
| - "text": f"PDF文档内容: {file_path}", |
41 |
| - "pages": 1, |
42 |
| - "images": [], |
43 |
| - "tables": [], |
44 |
| - "formulas": [] |
45 |
| - }] |
46 |
| - except Exception as e: |
47 |
| - logger.error(f"解析PDF失败: {e}") |
48 |
| - raise |
49 |
| - |
50 |
| -class DOCXParser(DocumentParser): |
51 |
| - """DOCX文档解析器""" |
52 |
| - |
53 |
| - def __init__(self) -> None: |
54 |
| - super().__init__() |
55 |
| - self.supported_formats = ['.docx','.doc'] |
56 |
| - |
57 |
| - def can_parse(self, file_path: str) -> bool: |
58 |
| - return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) |
59 |
| - |
60 |
| - async def parse(self, file_path: str) -> list[dict[str, Any]]: |
61 |
| - """解析DOCX文档""" |
62 |
| - try: |
63 |
| - # 这里应该使用docling库 |
64 |
| - # 暂时返回模拟数据 |
65 |
| - return [{ |
66 |
| - "type": "docx", |
67 |
| - "text": f"DOCX文档内容: {file_path}", |
68 |
| - "pages": 1, |
69 |
| - "images": [], |
70 |
| - "tables": [], |
71 |
| - "formulas": [] |
72 |
| - }] |
73 |
| - except Exception as e: |
74 |
| - logger.error(f"解析DOCX失败: {e}") |
75 |
| - raise |
76 |
| - |
77 |
| -class XLSXParser(DocumentParser): |
78 |
| - """XLSX文档解析器""" |
79 |
| - |
80 |
| - def __init__(self) -> None: |
81 |
| - super().__init__() |
82 |
| - self.supported_formats = ['.xlsx'] |
83 |
| - |
84 |
| - def can_parse(self, file_path: str) -> bool: |
85 |
| - return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats) |
86 |
| - |
87 |
| - async def parse(self, file_path: str) -> list[dict[str, Any]]: |
88 |
| - """解析XLSX文档""" |
89 |
| - try: |
90 |
| - # 这里应该使用docling库 |
91 |
| - # 暂时返回模拟数据 |
92 |
| - return [{ |
93 |
| - "type": "xlsx", |
94 |
| - "text": f"XLSX文档内容: {file_path}", |
95 |
| - "pages": 1, |
96 |
| - "images": [], |
97 |
| - "tables": [], |
98 |
| - "formulas": [] |
99 |
| - }] |
100 |
| - except Exception as e: |
101 |
| - logger.error(f"解析XLSX失败: {e}") |
102 |
| - raise |
103 |
| - |
104 |
| -class DocumentParserFactory: |
105 |
| - """文档解析器工厂""" |
106 |
| - |
107 |
| - def __init__(self) -> None: |
108 |
| - self.parsers = [ |
109 |
| - PDFParser(), |
110 |
| - DOCXParser(), |
111 |
| - XLSXParser() |
112 |
| - ] |
113 |
| - |
114 |
| - def get_parser(self, file_path: str) -> DocumentParser | None: |
115 |
| - """根据文件路径获取合适的解析器""" |
116 |
| - for parser in self.parsers: |
117 |
| - if parser.can_parse(file_path): |
118 |
| - return parser |
119 |
| - return None |
120 |
| - |
121 |
| - async def parse_document(self, file_path: str) -> list[dict[str, Any]]: |
122 |
| - """解析文档""" |
123 |
| - parser = self.get_parser(file_path) |
124 |
| - if not parser: |
125 |
| - raise ValueError(f"不支持的文件格式: {file_path}") |
126 |
| - |
127 |
| - return await parser.parse(file_path) |
0 commit comments