Skip to content

Commit 480a717

Browse files
committed
feat: add excel parser test
1 parent 2c1b805 commit 480a717

File tree

8 files changed

+751
-144
lines changed

8 files changed

+751
-144
lines changed

enhancers/information_enhancer.py

Lines changed: 13 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,44 +1,33 @@
11
from abc import ABC, abstractmethod
2-
from typing import Any
32

4-
class InformationEnhancer(ABC):
5-
"""信息增强器基类"""
3+
from parsers.document_parser import DocumentData
64

7-
def __init__(self) -> None:
8-
pass
95

6+
class InformationEnhancer(ABC):
7+
"""信息增强器基类"""
108
@abstractmethod
11-
async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
9+
async def enhance(self, information: DocumentData) -> DocumentData:
1210
"""增强信息"""
1311
pass
1412

1513
class TableInformationEnhancer(InformationEnhancer):
1614
"""表格信息增强器"""
1715

18-
def __init__(self) -> None:
19-
super().__init__()
20-
21-
async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
16+
async def enhance(self, information: DocumentData) -> DocumentData:
2217
"""增强信息"""
2318
return information
2419

2520
class FormulasInformationEnhancer(InformationEnhancer):
2621
"""公式信息增强器"""
2722

28-
def __init__(self) -> None:
29-
super().__init__()
30-
31-
async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
23+
async def enhance(self, information: DocumentData) -> DocumentData:
3224
"""增强信息"""
3325
return information
3426

3527
class ImageInformationEnhancer(InformationEnhancer):
3628
"""图片信息增强器"""
3729

38-
def __init__(self) -> None:
39-
super().__init__()
40-
41-
async def enhance(self, information: dict[str, Any]) -> dict[str, Any]:
30+
async def enhance(self, information: DocumentData) -> DocumentData:
4231
"""增强信息"""
4332
return information
4433

@@ -52,9 +41,9 @@ def __init__(self) -> None:
5241
ImageInformationEnhancer()
5342
]
5443

55-
def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer:
44+
def get_enhancer(self, information: DocumentData) -> InformationEnhancer|None:
5645
"""获取信息增强器"""
57-
match information.get("type"):
46+
match information.type:
5847
case "table":
5948
return TableInformationEnhancer()
6049
case "formulas":
@@ -63,11 +52,11 @@ def get_enhancer(self, information: dict[str, Any]) -> InformationEnhancer:
6352
return ImageInformationEnhancer()
6453
case _:
6554
return None
66-
67-
async def enhance_information(self, information: dict[str, Any]) -> dict[str, Any]:
55+
56+
async def enhance_information(self, information: DocumentData) -> DocumentData:
6857
"""增强信息"""
6958
enhancer = self.get_enhancer(information)
7059
if not enhancer:
71-
raise ValueError(f"不支持的模态类型: {information.get('type')}")
60+
raise ValueError(f"不支持的模态类型: {information.type}")
7261
return await enhancer.enhance(information)
73-
62+

parsers/document_parser.py

Lines changed: 18 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1,127 +1,37 @@
11
import logging
22
from abc import ABC, abstractmethod
3-
from typing import Any
3+
4+
from pydantic import BaseModel
45

56
logger = logging.getLogger(__name__)
67

8+
class DocumentData(BaseModel):
9+
"""文档数据类"""
10+
type: str
11+
name: str
12+
content: str
13+
description: str
14+
15+
class ParseResult(BaseModel):
16+
"""解析结果类"""
17+
title: str
18+
document: list[DocumentData]
19+
processing_time: float
20+
success: bool
21+
error_message: str | None = None
22+
723
class DocumentParser(ABC):
824
"""文档解析器基类"""
925

1026
def __init__(self) -> None:
1127
self.supported_formats: list[str] = []
1228

1329
@abstractmethod
14-
async def parse(self, file_path: str) -> list[dict[str, Any]]:
30+
async def parse(self, file_path: str) -> ParseResult:
1531
"""解析文档"""
1632
pass
1733

1834
@abstractmethod
1935
def can_parse(self, file_path: str) -> bool:
2036
"""检查是否可以解析该文件"""
2137
pass
22-
23-
class PDFParser(DocumentParser):
24-
"""PDF文档解析器"""
25-
26-
def __init__(self) -> None:
27-
super().__init__()
28-
self.supported_formats = ['.pdf']
29-
30-
def can_parse(self, file_path: str) -> bool:
31-
return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
32-
33-
async def parse(self, file_path: str) -> list[dict[str, Any]]:
34-
"""解析PDF文档"""
35-
try:
36-
# 这里应该使用mineru库
37-
# 暂时返回模拟数据
38-
return [{
39-
"type": "pdf",
40-
"text": f"PDF文档内容: {file_path}",
41-
"pages": 1,
42-
"images": [],
43-
"tables": [],
44-
"formulas": []
45-
}]
46-
except Exception as e:
47-
logger.error(f"解析PDF失败: {e}")
48-
raise
49-
50-
class DOCXParser(DocumentParser):
51-
"""DOCX文档解析器"""
52-
53-
def __init__(self) -> None:
54-
super().__init__()
55-
self.supported_formats = ['.docx','.doc']
56-
57-
def can_parse(self, file_path: str) -> bool:
58-
return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
59-
60-
async def parse(self, file_path: str) -> list[dict[str, Any]]:
61-
"""解析DOCX文档"""
62-
try:
63-
# 这里应该使用docling库
64-
# 暂时返回模拟数据
65-
return [{
66-
"type": "docx",
67-
"text": f"DOCX文档内容: {file_path}",
68-
"pages": 1,
69-
"images": [],
70-
"tables": [],
71-
"formulas": []
72-
}]
73-
except Exception as e:
74-
logger.error(f"解析DOCX失败: {e}")
75-
raise
76-
77-
class XLSXParser(DocumentParser):
78-
"""XLSX文档解析器"""
79-
80-
def __init__(self) -> None:
81-
super().__init__()
82-
self.supported_formats = ['.xlsx']
83-
84-
def can_parse(self, file_path: str) -> bool:
85-
return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
86-
87-
async def parse(self, file_path: str) -> list[dict[str, Any]]:
88-
"""解析XLSX文档"""
89-
try:
90-
# 这里应该使用docling库
91-
# 暂时返回模拟数据
92-
return [{
93-
"type": "xlsx",
94-
"text": f"XLSX文档内容: {file_path}",
95-
"pages": 1,
96-
"images": [],
97-
"tables": [],
98-
"formulas": []
99-
}]
100-
except Exception as e:
101-
logger.error(f"解析XLSX失败: {e}")
102-
raise
103-
104-
class DocumentParserFactory:
105-
"""文档解析器工厂"""
106-
107-
def __init__(self) -> None:
108-
self.parsers = [
109-
PDFParser(),
110-
DOCXParser(),
111-
XLSXParser()
112-
]
113-
114-
def get_parser(self, file_path: str) -> DocumentParser | None:
115-
"""根据文件路径获取合适的解析器"""
116-
for parser in self.parsers:
117-
if parser.can_parse(file_path):
118-
return parser
119-
return None
120-
121-
async def parse_document(self, file_path: str) -> list[dict[str, Any]]:
122-
"""解析文档"""
123-
parser = self.get_parser(file_path)
124-
if not parser:
125-
raise ValueError(f"不支持的文件格式: {file_path}")
126-
127-
return await parser.parse(file_path)

parsers/document_parser_factory.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import logging
2+
3+
from parsers.document_parser import DocumentParser, ParseResult
4+
from parsers.excel_parser import ExcelParser
5+
6+
logger = logging.getLogger(__name__)
7+
8+
class DocumentParserFactory:
9+
"""文档解析器工厂"""
10+
11+
def __init__(self) -> None:
12+
self.parsers: list[DocumentParser] = [
13+
ExcelParser()
14+
]
15+
16+
def get_parser(self, file_path: str) -> DocumentParser | None:
17+
"""根据文件路径获取合适的解析器"""
18+
for parser in self.parsers:
19+
if parser.can_parse(file_path):
20+
return parser
21+
return None
22+
23+
async def parse_document(self, file_path: str) -> ParseResult:
24+
"""解析文档"""
25+
parser = self.get_parser(file_path)
26+
if not parser:
27+
raise ValueError(f"不支持的文件格式: {file_path}")
28+
29+
return await parser.parse(file_path)

0 commit comments

Comments
 (0)