Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,7 @@ TASK_TIMEOUT=3600
# 调试模式
DEBUG=false
# 日志级别
LOG_LEVEL=INFO
LOG_LEVEL=INFO

# PDF解析
MINERU_MODEL_SOURCE=local
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,5 @@ cython_debug/
# PyPI configuration file
.pypirc

examples/
examples/
models/
37 changes: 37 additions & 0 deletions mineru.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"bucket_info": {
"bucket-name-1": [
"ak",
"sk",
"endpoint"
],
"bucket-name-2": [
"ak",
"sk",
"endpoint"
]
},
"latex-delimiter-config": {
"display": {
"left": "$$",
"right": "$$"
},
"inline": {
"left": "$",
"right": "$"
}
},
"llm-aided-config": {
"title_aided": {
"api_key": "your_api_key",
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"model": "qwen2.5-32b-instruct",
"enable": false
}
},
"models-dir": {
"pipeline": "models",
"vlm": ""
},
"config_version": "1.3.0"
}
24 changes: 22 additions & 2 deletions parsers/base_models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from abc import ABC, abstractmethod
from enum import Enum
from pathlib import Path
from typing import Any

from pydantic import BaseModel, Field
Expand All @@ -19,15 +20,34 @@ class TableDataItem(BaseModel):
"""表格数据类"""
rows: int # 行数
columns: int # 列数
grid: list[list[str]] = Field(default_factory=list) # 网格数据
row_headers: list[Any] = Field(default_factory=list) # 行头
column_headers: list[Any] = Field(default_factory=list) # 列头
data: list[list[str]] = Field(default_factory=list) # 数据
caption: list[str] = Field(default_factory=list) # 表格标题
footnote: list[str] = Field(default_factory=list) # 表格注脚

class TextDataItem(BaseModel):
    """Text chunk payload extracted from a document."""
    text: str  # raw text content of the chunk
    text_level: int|None = None  # presumably a heading level; None for plain body text — TODO confirm against producer

class ImageDataItem(BaseModel):
    """Image chunk payload extracted from a document."""
    uri: str|None = None  # image URI (stringified from the source document's image reference)
    caption: list[str] = Field(default_factory=list)  # caption entries (cref strings in the visible docx caller)
    footnote: list[str] = Field(default_factory=list)  # footnote entries (cref strings in the visible docx caller)

class FormulaDataItem(BaseModel):
    """Formula chunk payload extracted from a document."""
    text: str  # formula source text
    text_format: str|None = None  # NOTE(review): format identifier (e.g. latex?) — the visible docx caller leaves it None; confirm

class ChunkData(BaseModel):
"""块数据类"""
type: ChunkType
name: str|None = None
content: str|TableDataItem|None = None
content: TableDataItem|TextDataItem|ImageDataItem|FormulaDataItem
description: str|None = None

class DocumentData(BaseModel):
Expand All @@ -45,6 +65,6 @@ class DocumentParser(ABC):
"""文档解析器基类"""

@abstractmethod
async def parse(self, file_path: str) -> DocumentData:
async def parse(self, file_path: Path) -> DocumentData:
"""解析文档"""
pass
164 changes: 100 additions & 64 deletions parsers/docx_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import asyncio
import logging
import time
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, WordFormatOption
Expand All @@ -30,7 +31,10 @@
ChunkType,
DocumentData,
DocumentParser,
FormulaDataItem,
ImageDataItem,
TableDataItem,
TextDataItem,
)
from parsers.parser_registry import register_parser

Expand All @@ -54,7 +58,7 @@ def __init__(self) -> None:
)
logger.debug("DocxDocumentParser initialized with SimplePipeline")

async def parse(self, file_path: str) -> DocumentData:
async def parse(self, file_path: Path) -> DocumentData:
"""异步解析DOCX文件

Args:
Expand All @@ -70,41 +74,64 @@ async def parse(self, file_path: str) -> DocumentData:
result = await loop.run_in_executor(None, self._converter.convert, file_path)
doc_data = result.document

# 确保文档数据包含所有必要的属性
if not hasattr(doc_data, 'name'):
doc_data.name = 'Unknown Document'
if not hasattr(doc_data, 'texts'):
doc_data.texts = []
if not hasattr(doc_data, 'pictures'):
doc_data.pictures = []
if not hasattr(doc_data, 'tables'):
doc_data.tables = []

title = self._extract_title(doc_data)
images = self._extract_images(doc_data.pictures)
tables = self._extract_tables(doc_data.tables)
texts = self._extract_texts(doc_data.texts)
# 并行处理不同类型的内容
document_data = await self._process_content_parallel(doc_data)

processing_time = time.time() - start_time
document_data.processing_time = processing_time
logger.info(f"Successfully parsed DOCX: {file_path} (took {processing_time:.2f}s)")
return DocumentData(
title=title,
texts=texts,
tables=tables,
images=images,
processing_time=processing_time,
success=True
)
return document_data

except Exception as e:
processing_time = time.time() - start_time
error_msg = f"Failed to parse DOCX file {file_path}: {type(e).__name__}: {e}"
logger.exception(error_msg) # 记录完整堆栈
return DocumentData(
success=False,
error_message=str(e),
processing_time=processing_time
)
raise Exception(f"Failed to parse DOCX file {file_path}") from e

async def _process_content_parallel(self, doc_data: DoclingDocument) -> DocumentData:
    """Extract images, tables and texts from the converted document concurrently.

    Args:
        doc_data: the docling document produced by the converter.

    Returns:
        DocumentData: aggregated chunks plus the extracted title, with
        success=True. A failure in one extraction category is logged and
        yields an empty list instead of failing the whole parse.
    """
    # Pair each extraction coroutine with a stable key so results can be
    # matched back positionally. The previous approach inspected
    # result[0].type, which misclassifies a texts list whose first chunk
    # happens to be a FORMULA (as _extract_texts can produce) and would
    # silently drop that whole category.
    keyed_tasks: list[tuple[str, Any]] = []
    if doc_data.pictures:
        keyed_tasks.append(("images", self._extract_images_async(doc_data.pictures)))
    if doc_data.tables:
        keyed_tasks.append(("tables", self._extract_tables_async(doc_data.tables)))
    if doc_data.texts:
        keyed_tasks.append(("texts", self._extract_texts_async(doc_data.texts)))

    results = await asyncio.gather(
        *(coro for _, coro in keyed_tasks), return_exceptions=True
    )

    extracted: dict[str, list[ChunkData]] = {"images": [], "tables": [], "texts": []}
    for (key, _), result in zip(keyed_tasks, results):
        if isinstance(result, BaseException):
            # gather(return_exceptions=True) hands back the raised
            # exception object in place of the result.
            logger.error(f"Error extracting {key}: {result}")
            continue
        extracted[key] = result

    # Extract the title from the document
    title = self._extract_title(doc_data)

    return DocumentData(
        title=title,
        texts=extracted["texts"],
        tables=extracted["tables"],
        images=extracted["images"],
        success=True
    )

def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
"""提取文档中的图片
Expand All @@ -115,22 +142,22 @@ def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
Returns:
List[ChunkData]: 图片列表
"""
image_items = []
image_items: list[ChunkData] = []
for idx, picture in enumerate(pictures):
image_uri = ""
if hasattr(picture, 'image') and picture.image and hasattr(picture.image, 'uri'):
image_uri = str(picture.image.uri)

caption = ""
if hasattr(picture, 'captions') and picture.captions:
caption = str(picture.captions[0])

if not picture.image:
continue
image_uri = str(picture.image.uri)
caption = [caption.cref for caption in picture.captions]
footnote = [footnote.cref for footnote in picture.footnotes]
image_items.append(
ChunkData(
type=ChunkType.IMAGE,
name=getattr(picture, 'self_ref', None) or f"#/pictures/{idx}",
content=image_uri,
description=caption
name=f"#/pictures/{idx}",
content=ImageDataItem(
uri=image_uri,
caption=caption,
footnote=footnote
)
)
)

Expand All @@ -145,32 +172,22 @@ def _extract_tables(self, tables: list[TableItem]) -> list[ChunkData]:
Returns:
List[ChunkData]: 表格列表
"""
# 添加安全检查,确保 tables 参数存在且可迭代
if not tables or not hasattr(tables, '__iter__'):
return []

table_items: list[ChunkData] = []
for table in tables:
if not hasattr(table, 'data') or not hasattr(table.data, 'grid'):
continue
if len(table.data.grid) == 0:
continue

table_cells = table.data.grid
row_headers = [cell.text for cell in table_cells[0] if cell.row_header]
column_headers = [cell.text for cell in table_cells[0] if cell.column_header]
data = [[cell.text for cell in row] for row in table_cells[1:]]
caption = [caption.cref for caption in table.captions]
footnote = [footnote.cref for footnote in table.footnotes]
grid = [[cell.text if cell.text else '' for cell in row] for row in table.data.grid]
table_data = TableDataItem(
rows=table.data.num_rows,
columns=table.data.num_cols,
row_headers=row_headers,
column_headers=column_headers,
data=data
grid=grid,
caption=caption,
footnote=footnote
)
table_items.append(
ChunkData(
type=ChunkType.TABLE,
name=getattr(table, 'self_ref', None) or f"table-{len(table_items)}",
name=f"#/tables/{len(table_items)}",
content=table_data
)
)
Expand Down Expand Up @@ -212,16 +229,35 @@ def _extract_texts(self, texts:list[TitleItem|SectionHeaderItem|ListItem|CodeIte
text_items.append(
ChunkData(
type=ChunkType.FORMULA,
name=item.self_ref or f"formula-{len(text_items)}",
content=item.text
name=f"formula-{len(text_items)}",
content=FormulaDataItem(
text=item.text
)
)
)
case _:
text_items.append(
ChunkData(
type=ChunkType.TEXT,
name=f"text-{len(text_items)}",
content=item.text
name=f"#/texts/{len(text_items)}",
content=TextDataItem(
text=item.text
)
)
)
return text_items

async def _extract_images_async(self, pictures: list[PictureItem]) -> list[ChunkData]:
    """Run the blocking image extraction in the default thread-pool executor.

    Uses asyncio.get_running_loop() instead of the deprecated
    asyncio.get_event_loop() — inside a coroutine a loop is guaranteed
    to be running, and get_event_loop() emits a DeprecationWarning there
    on modern Python.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, self._extract_images, pictures)

async def _extract_tables_async(self, tables: list[TableItem]) -> list[ChunkData]:
    """Run the blocking table extraction in the default thread-pool executor.

    Uses asyncio.get_running_loop() instead of the deprecated
    asyncio.get_event_loop(), which is the correct API inside a coroutine.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, self._extract_tables, tables)

async def _extract_texts_async(self, texts: list[TitleItem|SectionHeaderItem|ListItem|CodeItem|FormulaItem|TextItem]) -> list[ChunkData]:
    """Run the blocking text extraction in the default thread-pool executor.

    Uses asyncio.get_running_loop() instead of the deprecated
    asyncio.get_event_loop(), which is the correct API inside a coroutine.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, self._extract_texts, texts)
Loading
Loading