Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,7 @@ TASK_TIMEOUT=3600
# 调试模式
DEBUG=false
# 日志级别
LOG_LEVEL=INFO
LOG_LEVEL=INFO

# PDF解析
MINERU_MODEL_SOURCE=local
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,5 @@ cython_debug/
# PyPI configuration file
.pypirc

examples/
examples/
models/
37 changes: 37 additions & 0 deletions mineru.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"bucket_info": {
"bucket-name-1": [
"ak",
"sk",
"endpoint"
],
"bucket-name-2": [
"ak",
"sk",
"endpoint"
]
},
"latex-delimiter-config": {
"display": {
"left": "$$",
"right": "$$"
},
"inline": {
"left": "$",
"right": "$"
}
},
"llm-aided-config": {
"title_aided": {
"api_key": "your_api_key",
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"model": "qwen2.5-32b-instruct",
"enable": false
}
},
"models-dir": {
"pipeline": "models",
"vlm": ""
},
"config_version": "1.3.0"
}
24 changes: 22 additions & 2 deletions parsers/base_models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from abc import ABC, abstractmethod
from enum import Enum
from pathlib import Path
from typing import Any

from pydantic import BaseModel, Field
Expand All @@ -19,15 +20,34 @@ class TableDataItem(BaseModel):
"""表格数据类"""
rows: int # 行数
columns: int # 列数
grid: list[list[str]] = Field(default_factory=list) # 网格数据
row_headers: list[Any] = Field(default_factory=list) # 行头
column_headers: list[Any] = Field(default_factory=list) # 列头
data: list[list[str]] = Field(default_factory=list) # 数据
caption: list[str] = Field(default_factory=list) # 表格标题
footnote: list[str] = Field(default_factory=list) # 表格注脚

class TextDataItem(BaseModel):
    """Text chunk payload extracted from a document."""
    text: str  # raw text content of the chunk
    text_level: int|None = None  # presumably a heading level; None for plain body text — TODO confirm against producer

class ImageDataItem(BaseModel):
    """Image chunk payload extracted from a document."""
    uri: str|None = None  # image URI (stringified from the source document's image reference)
    caption: list[str] = Field(default_factory=list)  # caption entries (cref strings in the visible docx caller)
    footnote: list[str] = Field(default_factory=list)  # footnote entries (cref strings in the visible docx caller)

class FormulaDataItem(BaseModel):
    """Formula chunk payload extracted from a document."""
    text: str  # formula source text
    text_format: str|None = None  # NOTE(review): format identifier (e.g. latex?) — the visible docx caller leaves it None; confirm

class ChunkData(BaseModel):
"""块数据类"""
type: ChunkType
name: str|None = None
content: str|TableDataItem|None = None
content: TableDataItem|TextDataItem|ImageDataItem|FormulaDataItem
description: str|None = None

class DocumentData(BaseModel):
Expand All @@ -45,6 +65,6 @@ class DocumentParser(ABC):
"""文档解析器基类"""

@abstractmethod
async def parse(self, file_path: str) -> DocumentData:
async def parse(self, file_path: Path) -> DocumentData:
"""解析文档"""
pass
164 changes: 100 additions & 64 deletions parsers/docx_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import asyncio
import logging
import time
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, WordFormatOption
Expand All @@ -30,7 +31,10 @@
ChunkType,
DocumentData,
DocumentParser,
FormulaDataItem,
ImageDataItem,
TableDataItem,
TextDataItem,
)
from parsers.parser_registry import register_parser

Expand All @@ -54,7 +58,7 @@ def __init__(self) -> None:
)
logger.debug("DocxDocumentParser initialized with SimplePipeline")

async def parse(self, file_path: str) -> DocumentData:
async def parse(self, file_path: Path) -> DocumentData:
"""异步解析DOCX文件

Args:
Expand All @@ -70,41 +74,64 @@ async def parse(self, file_path: str) -> DocumentData:
result = await loop.run_in_executor(None, self._converter.convert, file_path)
doc_data = result.document

# 确保文档数据包含所有必要的属性
if not hasattr(doc_data, 'name'):
doc_data.name = 'Unknown Document'
if not hasattr(doc_data, 'texts'):
doc_data.texts = []
if not hasattr(doc_data, 'pictures'):
doc_data.pictures = []
if not hasattr(doc_data, 'tables'):
doc_data.tables = []

title = self._extract_title(doc_data)
images = self._extract_images(doc_data.pictures)
tables = self._extract_tables(doc_data.tables)
texts = self._extract_texts(doc_data.texts)
# 并行处理不同类型的内容
document_data = await self._process_content_parallel(doc_data)

processing_time = time.time() - start_time
document_data.processing_time = processing_time
logger.info(f"Successfully parsed DOCX: {file_path} (took {processing_time:.2f}s)")
return DocumentData(
title=title,
texts=texts,
tables=tables,
images=images,
processing_time=processing_time,
success=True
)
return document_data

except Exception as e:
processing_time = time.time() - start_time
error_msg = f"Failed to parse DOCX file {file_path}: {type(e).__name__}: {e}"
logger.exception(error_msg) # 记录完整堆栈
return DocumentData(
success=False,
error_message=str(e),
processing_time=processing_time
)
raise Exception(f"Failed to parse DOCX file {file_path}") from e

async def _process_content_parallel(self, doc_data: DoclingDocument) -> DocumentData:
    """Extract images, tables and texts from the converted document concurrently.

    Args:
        doc_data: the docling document produced by the converter.

    Returns:
        DocumentData: aggregated chunks plus the extracted title, with
        success=True. A failure in one extraction category is logged and
        yields an empty list instead of failing the whole parse.
    """
    # Pair each extraction coroutine with a stable key so results can be
    # matched back positionally. The previous approach inspected
    # result[0].type, which misclassifies a texts list whose first chunk
    # happens to be a FORMULA (as _extract_texts can produce) and would
    # silently drop that whole category.
    keyed_tasks: list[tuple[str, Any]] = []
    if doc_data.pictures:
        keyed_tasks.append(("images", self._extract_images_async(doc_data.pictures)))
    if doc_data.tables:
        keyed_tasks.append(("tables", self._extract_tables_async(doc_data.tables)))
    if doc_data.texts:
        keyed_tasks.append(("texts", self._extract_texts_async(doc_data.texts)))

    results = await asyncio.gather(
        *(coro for _, coro in keyed_tasks), return_exceptions=True
    )

    extracted: dict[str, list[ChunkData]] = {"images": [], "tables": [], "texts": []}
    for (key, _), result in zip(keyed_tasks, results):
        if isinstance(result, BaseException):
            # gather(return_exceptions=True) hands back the raised
            # exception object in place of the result.
            logger.error(f"Error extracting {key}: {result}")
            continue
        extracted[key] = result

    # Extract the title from the document
    title = self._extract_title(doc_data)

    return DocumentData(
        title=title,
        texts=extracted["texts"],
        tables=extracted["tables"],
        images=extracted["images"],
        success=True
    )

def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
"""提取文档中的图片
Expand All @@ -115,22 +142,22 @@ def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
Returns:
List[ChunkData]: 图片列表
"""
image_items = []
image_items: list[ChunkData] = []
for idx, picture in enumerate(pictures):
image_uri = ""
if hasattr(picture, 'image') and picture.image and hasattr(picture.image, 'uri'):
image_uri = str(picture.image.uri)

caption = ""
if hasattr(picture, 'captions') and picture.captions:
caption = str(picture.captions[0])

if not picture.image:
continue
image_uri = str(picture.image.uri)
caption = [caption.cref for caption in picture.captions]
footnote = [footnote.cref for footnote in picture.footnotes]
image_items.append(
ChunkData(
type=ChunkType.IMAGE,
name=getattr(picture, 'self_ref', None) or f"#/pictures/{idx}",
content=image_uri,
description=caption
name=f"#/pictures/{idx}",
content=ImageDataItem(
uri=image_uri,
caption=caption,
footnote=footnote
)
)
)

Expand All @@ -145,32 +172,22 @@ def _extract_tables(self, tables: list[TableItem]) -> list[ChunkData]:
Returns:
List[ChunkData]: 表格列表
"""
# 添加安全检查,确保 tables 参数存在且可迭代
if not tables or not hasattr(tables, '__iter__'):
return []

table_items: list[ChunkData] = []
for table in tables:
if not hasattr(table, 'data') or not hasattr(table.data, 'grid'):
continue
if len(table.data.grid) == 0:
continue

table_cells = table.data.grid
row_headers = [cell.text for cell in table_cells[0] if cell.row_header]
column_headers = [cell.text for cell in table_cells[0] if cell.column_header]
data = [[cell.text for cell in row] for row in table_cells[1:]]
caption = [caption.cref for caption in table.captions]
footnote = [footnote.cref for footnote in table.footnotes]
grid = [[cell.text if cell.text else '' for cell in row] for row in table.data.grid]
table_data = TableDataItem(
rows=table.data.num_rows,
columns=table.data.num_cols,
row_headers=row_headers,
column_headers=column_headers,
data=data
grid=grid,
caption=caption,
footnote=footnote
)
table_items.append(
ChunkData(
type=ChunkType.TABLE,
name=getattr(table, 'self_ref', None) or f"table-{len(table_items)}",
name=f"#/tables/{len(table_items)}",
content=table_data
)
)
Expand Down Expand Up @@ -212,16 +229,35 @@ def _extract_texts(self, texts:list[TitleItem|SectionHeaderItem|ListItem|CodeIte
text_items.append(
ChunkData(
type=ChunkType.FORMULA,
name=item.self_ref or f"formula-{len(text_items)}",
content=item.text
name=f"formula-{len(text_items)}",
content=FormulaDataItem(
text=item.text
)
)
)
case _:
text_items.append(
ChunkData(
type=ChunkType.TEXT,
name=f"text-{len(text_items)}",
content=item.text
name=f"#/texts/{len(text_items)}",
content=TextDataItem(
text=item.text
)
)
)
return text_items

async def _extract_images_async(self, pictures: list[PictureItem]) -> list[ChunkData]:
    """Run the blocking image extraction in the default thread-pool executor.

    Uses asyncio.get_running_loop() instead of the deprecated
    asyncio.get_event_loop() — inside a coroutine a loop is guaranteed
    to be running, and get_event_loop() emits a DeprecationWarning there
    on modern Python.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, self._extract_images, pictures)

async def _extract_tables_async(self, tables: list[TableItem]) -> list[ChunkData]:
    """Run the blocking table extraction in the default thread-pool executor.

    Uses asyncio.get_running_loop() instead of the deprecated
    asyncio.get_event_loop(), which is the correct API inside a coroutine.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, self._extract_tables, tables)

async def _extract_texts_async(self, texts: list[TitleItem|SectionHeaderItem|ListItem|CodeItem|FormulaItem|TextItem]) -> list[ChunkData]:
    """Run the blocking text extraction in the default thread-pool executor.

    Uses asyncio.get_running_loop() instead of the deprecated
    asyncio.get_event_loop(), which is the correct API inside a coroutine.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, self._extract_texts, texts)
Loading
Loading