Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added enhancers/__init__.py
Empty file.
62 changes: 62 additions & 0 deletions enhancers/information_enhancer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from abc import ABC, abstractmethod

from parsers.base_models import ChunkData, ChunkType


class InformationEnhancer(ABC):
    """Abstract base class for information enhancers.

    Concrete subclasses provide :meth:`enhance`.
    """

    @abstractmethod
    async def enhance(self, information: ChunkData) -> ChunkData:
        """Enhance a chunk.

        Subclasses must override this coroutine. The base body does nothing
        and therefore yields ``None`` if it is awaited directly.

        Args:
            information: The chunk to enhance.

        Returns:
            The enhanced chunk (in subclasses).
        """

class TableInformationEnhancer(InformationEnhancer):
    """Information enhancer for table chunks."""

    async def enhance(self, information: ChunkData) -> ChunkData:
        """Return the given chunk unchanged.

        The current body performs no enhancement: the very object passed in
        is handed back to the caller.

        Args:
            information: The chunk to enhance.

        Returns:
            The same ``information`` object.
        """
        return information

class FormulasInformationEnhancer(InformationEnhancer):
    """Information enhancer for formula chunks."""

    async def enhance(self, information: ChunkData) -> ChunkData:
        """Return the given chunk unchanged.

        The current body performs no enhancement: the very object passed in
        is handed back to the caller.

        Args:
            information: The chunk to enhance.

        Returns:
            The same ``information`` object.
        """
        return information

class ImageInformationEnhancer(InformationEnhancer):
    """Information enhancer for image chunks."""

    async def enhance(self, information: ChunkData) -> ChunkData:
        """Return the given chunk unchanged.

        The current body performs no enhancement: the very object passed in
        is handed back to the caller.

        Args:
            information: The chunk to enhance.

        Returns:
            The same ``information`` object.
        """
        return information

class InformationEnhancerFactory:
    """Factory that selects the enhancer matching a chunk's type.

    One enhancer per supported chunk type (table, formula, image) is created
    when the factory is constructed and reused for every lookup.
    """

    def __init__(self) -> None:
        """Create the enhancer instances and index them by chunk type."""
        self._enhancers_by_type: dict[ChunkType, InformationEnhancer] = {
            ChunkType.TABLE: TableInformationEnhancer(),
            ChunkType.FORMULA: FormulasInformationEnhancer(),
            ChunkType.IMAGE: ImageInformationEnhancer(),
        }
        # Public list kept for callers that read ``enhancers`` directly;
        # it holds the same instances, in table/formula/image order.
        self.enhancers = list(self._enhancers_by_type.values())

    def get_enhancer(self, information: ChunkData) -> InformationEnhancer | None:
        """Return the enhancer registered for ``information.type``.

        Args:
            information: The chunk whose ``type`` selects the enhancer.

        Returns:
            The shared enhancer instance for table, formula and image
            chunks, or ``None`` for any other type.
        """
        try:
            # Normalise through the enum so that a raw value such as "table"
            # resolves exactly like ChunkType.TABLE (enum members hash by
            # name, so a plain-string key would otherwise miss the dict).
            chunk_type = ChunkType(information.type)
        except ValueError:
            return None
        return self._enhancers_by_type.get(chunk_type)

    async def enhance_information(self, information: ChunkData) -> ChunkData:
        """Enhance a chunk with the enhancer registered for its type.

        Args:
            information: The chunk to enhance.

        Returns:
            Whatever the selected enhancer's ``enhance`` coroutine returns.

        Raises:
            ValueError: If no enhancer is registered for ``information.type``.
        """
        enhancer = self.get_enhancer(information)
        if enhancer is None:
            raise ValueError(f"不支持的模态类型: {information.type}")
        return await enhancer.enhance(information)

58 changes: 58 additions & 0 deletions parsers/base_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import logging
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)


class ChunkType(str, Enum):
    """Chunk type.

    Members are also ``str`` instances (``str`` mixin), each carrying the
    lowercase string value listed below.
    """
    TEXT = "text"
    IMAGE = "image"
    TABLE = "table"
    FORMULA = "formula"

class TableDataItem(BaseModel):
    """Table data model."""
    rows: int # number of rows (required)
    columns: int # number of columns (required)
    row_headers: list[Any] = Field(default_factory=list) # row headers; defaults to an empty list
    column_headers: list[Any] = Field(default_factory=list) # column headers; defaults to an empty list
    data: list[list[str]] = Field(default_factory=list) # data as a list of lists of strings; defaults to an empty list

class ChunkData(BaseModel):
    """Chunk data model."""
    # Chunk type (required).
    type: ChunkType
    # Chunk name (required).
    name: str
    # Either a plain string or a TableDataItem; defaults to the empty string.
    content: str|TableDataItem = ""
    # Description text; defaults to the empty string.
    description: str = ""

class DocumentData(BaseModel):
    """Parse result model."""
    # Title; defaults to the empty string.
    title: str = ""
    # Four lists of ChunkData, each defaulting to a fresh empty list.
    texts: list[ChunkData] = Field(default_factory=list)
    tables: list[ChunkData] = Field(default_factory=list)
    images: list[ChunkData] = Field(default_factory=list)
    formulas: list[ChunkData] = Field(default_factory=list)
    # Defaults to 0. NOTE(review): unit is not stated here (presumably seconds) — confirm.
    processing_time: float = 0
    # Required: no default is provided.
    success: bool
    # Optional error text; None by default.
    error_message: str | None = None

class DocumentParser(ABC):
    """Abstract base class for document parsers.

    Attributes:
        supported_formats: File-name suffixes (for example ``".pdf"``) that
            the parser accepts. Empty by default; subclasses are expected to
            assign their own list after calling ``super().__init__()``.
    """

    def __init__(self) -> None:
        """Initialise the parser with an empty list of supported formats."""
        # Must be a real list: this class is not a pydantic BaseModel, so
        # Field(default_factory=list) would store a FieldInfo object here,
        # which can_parse() cannot iterate.
        self.supported_formats: list[str] = []

    @abstractmethod
    async def parse(self, file_path: str) -> DocumentData:
        """Parse the document at ``file_path``.

        Subclasses must override this coroutine; the base body does nothing.

        Args:
            file_path: Path of the document to parse.

        Returns:
            The parse result (in subclasses).
        """

    @abstractmethod
    def can_parse(self, file_path: str) -> bool:
        """Check whether this parser can handle ``file_path``.

        The method is abstract, so subclasses must override it, but they may
        delegate to this default via ``super().can_parse(file_path)``.

        Args:
            file_path: Path or file name to test. It is lowercased before
                being compared, so the entries of ``supported_formats``
                should be lowercase.

        Returns:
            ``True`` if the lowercased path ends with any entry of
            ``supported_formats``; ``False`` otherwise, including when the
            list is empty.
        """
        # str.endswith accepts a tuple of candidate suffixes.
        return file_path.lower().endswith(tuple(self.supported_formats))
112 changes: 7 additions & 105 deletions parsers/document_parser.py
Original file line number Diff line number Diff line change
@@ -1,114 +1,16 @@
import logging
from abc import ABC, abstractmethod
from typing import Any

logger = logging.getLogger(__name__)

class DocumentParser(ABC):
    """Abstract base class for document parsers."""

    def __init__(self) -> None:
        """Start with an empty list of supported formats."""
        self.supported_formats: list[str] = []

    @abstractmethod
    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
        """Parse a document.

        Subclasses must override this coroutine; the base body does nothing.

        Args:
            file_path: Path of the document.
            file_content: Raw bytes of the document.

        Returns:
            A dictionary describing the parsed document (in subclasses).
        """

    @abstractmethod
    def can_parse(self, file_path: str) -> bool:
        """Check whether this parser can handle the given file.

        Subclasses must override this method; the base body does nothing.
        """

class PDFParser(DocumentParser):
    """Parser for PDF documents."""

    def __init__(self) -> None:
        """Register the ``.pdf`` suffix as the only supported format."""
        super().__init__()
        self.supported_formats = [".pdf"]

    def can_parse(self, file_path: str) -> bool:
        """Return True when the lowercased path ends with a supported suffix."""
        suffixes = tuple(self.supported_formats)
        return file_path.lower().endswith(suffixes)
from parsers.base_models import DocumentData, DocumentParser
from parsers.excel_parser import ExcelParser

async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
    """Parse a PDF document.

    The body currently returns mock data: ``file_content`` is not read and
    ``file_path`` is only echoed into the ``"text"`` entry.

    Args:
        file_path: Path of the PDF document.
        file_content: Raw bytes of the document (unused by this body).

    Returns:
        A dictionary with the keys ``type``, ``text``, ``pages``, ``images``,
        ``tables`` and ``formulas``.

    Raises:
        Exception: Anything raised while building the result is logged and
            re-raised unchanged.
    """
    try:
        # The mineru library should be used here;
        # mock data is returned for the time being.
        mock_result: dict[str, Any] = {
            "type": "pdf",
            "text": f"PDF文档内容: {file_path}",
            "pages": 1,
            "images": [],
            "tables": [],
            "formulas": [],
        }
    except Exception as exc:
        logger.error(f"解析PDF失败: {exc}")
        raise
    return mock_result

class DOCXParser(DocumentParser):
    """Parser for DOCX documents."""

    def __init__(self) -> None:
        """Register the ``.docx`` and ``.doc`` suffixes as supported formats."""
        super().__init__()
        self.supported_formats = [".docx", ".doc"]

    def can_parse(self, file_path: str) -> bool:
        """Return True when the lowercased path ends with a supported suffix."""
        suffixes = tuple(self.supported_formats)
        return file_path.lower().endswith(suffixes)

    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
        """Parse a DOCX document.

        The body currently returns mock data: ``file_content`` is not read
        and ``file_path`` is only echoed into the ``"text"`` entry.

        Args:
            file_path: Path of the DOCX document.
            file_content: Raw bytes of the document (unused by this body).

        Returns:
            A dictionary with the keys ``type``, ``text``, ``pages``,
            ``images``, ``tables`` and ``formulas``.

        Raises:
            Exception: Anything raised while building the result is logged
                and re-raised unchanged.
        """
        try:
            # The docling library should be used here;
            # mock data is returned for the time being.
            mock_result: dict[str, Any] = {
                "type": "docx",
                "text": f"DOCX文档内容: {file_path}",
                "pages": 1,
                "images": [],
                "tables": [],
                "formulas": [],
            }
        except Exception as exc:
            logger.error(f"解析DOCX失败: {exc}")
            raise
        return mock_result

class XLSXParser(DocumentParser):
    """Parser for XLSX documents."""

    def __init__(self) -> None:
        """Register the ``.xlsx`` suffix as the only supported format."""
        super().__init__()
        self.supported_formats = [".xlsx"]

    def can_parse(self, file_path: str) -> bool:
        """Return True when the lowercased path ends with a supported suffix."""
        suffixes = tuple(self.supported_formats)
        return file_path.lower().endswith(suffixes)

    async def parse(self, file_path: str, file_content: bytes) -> dict[str, Any]:
        """Parse an XLSX document.

        The body currently returns mock data: ``file_content`` is not read
        and ``file_path`` is only echoed into the ``"text"`` entry.

        Args:
            file_path: Path of the XLSX document.
            file_content: Raw bytes of the document (unused by this body).

        Returns:
            A dictionary with the keys ``type``, ``text``, ``pages``,
            ``images``, ``tables`` and ``formulas``.

        Raises:
            Exception: Anything raised while building the result is logged
                and re-raised unchanged.
        """
        try:
            # The docling library should be used here;
            # mock data is returned for the time being.
            mock_result: dict[str, Any] = {
                "type": "xlsx",
                "text": f"XLSX文档内容: {file_path}",
                "pages": 1,
                "images": [],
                "tables": [],
                "formulas": [],
            }
        except Exception as exc:
            logger.error(f"解析XLSX失败: {exc}")
            raise
        return mock_result
logger = logging.getLogger(__name__)

class DocumentParserFactory:
"""文档解析器工厂"""

def __init__(self) -> None:
self.parsers = [
PDFParser(),
DOCXParser(),
XLSXParser()
self.parsers: list[DocumentParser] = [
ExcelParser()
]

def get_parser(self, file_path: str) -> DocumentParser | None:
Expand All @@ -118,10 +20,10 @@ def get_parser(self, file_path: str) -> DocumentParser | None:
return parser
return None

async def parse_document(self, file_path: str, file_content: bytes) -> dict[str, Any]:
async def parse_document(self, file_path: str) -> DocumentData:
"""解析文档"""
parser = self.get_parser(file_path)
if not parser:
raise ValueError(f"不支持的文件格式: {file_path}")

return await parser.parse(file_path, file_content)
return await parser.parse(file_path)
Loading
Loading