Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions runtime/datamate-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,26 @@ python -m venv .venv
source .venv/bin/activate
```

3. 安装依赖:
3. 安装依赖
由于项目使用poetry管理依赖,你可以使用以下命令安装:

```bash
pip install -r requirements.txt
pip install poetry
poetry install
```
或者直接使用pip安装(如果poetry不可用):

4. 准备环境变量(示例)

创建 `.env` 并设置必要的变量,例如:
```bash
pip install -e .
```

- DATABASE_URL(或根据项目配置使用具体变量)
- LABEL_STUDIO_BASE_URL
- LABEL_STUDIO_USER_TOKEN
4. 配置环境变量
复制环境变量示例文件并配置:

(具体变量请参考 `.env.example`)
```bash
cp .env.example .env
```
编辑.env文件,设置必要的环境变量,如数据库连接、Label Studio配置等。

5. 数据库迁移(开发环境):

Expand Down
Empty file.
4 changes: 2 additions & 2 deletions runtime/datamate-python/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class Config:

host: str = "0.0.0.0"
port: int = 18000

# CORS
# allowed_origins: List[str] = ["*"]
# allowed_methods: List[str] = ["*"]
Expand All @@ -36,7 +36,7 @@ class Config:
mysql_database: str = "datamate"

database_url: str = "" # Will be overridden by build_database_url() if not provided

@model_validator(mode='after')
def build_database_url(self):
"""如果没有提供 database_url,则根据 MySQL 配置构建"""
Expand Down
197 changes: 197 additions & 0 deletions runtime/datamate-python/app/db/models/data_synthesis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
import uuid
from xml.etree.ElementTree import tostring

from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, ForeignKey, func
from sqlalchemy.orm import relationship

from app.db.session import Base
from app.module.generation.schema.generation import CreateSynthesisTaskRequest


async def save_synthesis_task(db_session, synthesis_task: CreateSynthesisTaskRequest):
"""保存数据合成任务。"""
# 转换为模型实例
gid = str(uuid.uuid4())
synthesis_task_instance = DataSynthesisInstance(
id=gid,
name=synthesis_task.name,
description=synthesis_task.description,
status="pending",
model_id=synthesis_task.model_id,
synthesis_type=synthesis_task.synthesis_type.value,
progress=0,
result_data_location=f"/dataset/synthesis_results/{gid}/",
text_split_config=synthesis_task.text_split_config.model_dump(),
synthesis_config=synthesis_task.synthesis_config.model_dump(),
source_file_id=synthesis_task.source_file_id,
total_files=len(synthesis_task.source_file_id),
processed_files=0,
total_chunks=0,
processed_chunks=0,
total_synthesis_data=0,
created_at=func.now(),
updated_at=func.now(),
created_by="system",
updated_by="system"
)
db_session.add(synthesis_task_instance)
await db_session.commit()
await db_session.refresh(synthesis_task_instance)
return synthesis_task_instance


class DataSynthesisInstance(Base):
"""数据合成任务表,对应表 t_data_synthesis_instances

create table if not exists t_data_synthesis_instances
(
id VARCHAR(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci PRIMARY KEY COMMENT 'UUID',
name VARCHAR(255) NOT NULL COMMENT '任务名称',
description TEXT COMMENT '任务描述',
status VARCHAR(20) COMMENT '任务状态',
synthesis_type VARCHAR(20) NOT NULL COMMENT '合成类型',
model_id VARCHAR(255) NOT NULL COMMENT '模型ID',
progress INT DEFAULT 0 COMMENT '任务进度(百分比)',
result_data_location VARCHAR(1000) COMMENT '结果数据存储位置',
text_split_config JSON NOT NULL COMMENT '文本切片配置',
synthesis_config JSON NOT NULL COMMENT '合成配置',
source_file_id JSON NOT NULL COMMENT '原始文件ID列表',
total_files INT DEFAULT 0 COMMENT '总文件数',
processed_files INT DEFAULT 0 COMMENT '已处理文件数',
total_chunks INT DEFAULT 0 COMMENT '总文本块数',
processed_chunks INT DEFAULT 0 COMMENT '已处理文本块数',
total_synthesis_data INT DEFAULT 0 COMMENT '总合成数据量',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
updated_by VARCHAR(255) COMMENT '更新者'
) COMMENT='数据合成任务表(UUID 主键)';
"""

__tablename__ = "t_data_synthesis_instances"

id = Column(String(36), primary_key=True, index=True, comment="UUID")
name = Column(String(255), nullable=False, comment="任务名称")
description = Column(Text, nullable=True, comment="任务描述")
status = Column(String(20), nullable=True, comment="任务状态")
synthesis_type = Column(String(20), nullable=False, comment="合成类型")
model_id = Column(String(255), nullable=False, comment="模型ID")
progress = Column(Integer, nullable=False, default=0, comment="任务进度(百分比)")
result_data_location = Column(String(1000), nullable=True, comment="结果数据存储位置")
text_split_config = Column(JSON, nullable=False, comment="文本切片配置")
synthesis_config = Column(JSON, nullable=False, comment="合成配置")
source_file_id = Column(JSON, nullable=False, comment="原始文件ID列表")
total_files = Column(Integer, nullable=False, default=0, comment="总文件数")
processed_files = Column(Integer, nullable=False, default=0, comment="已处理文件数")
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")
total_synthesis_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")

created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), nullable=True, comment="更新时间")
created_by = Column(String(255), nullable=True, comment="创建者")
updated_by = Column(String(255), nullable=True, comment="更新者")


class DataSynthesisFileInstance(Base):
"""数据合成文件任务表,对应表 t_data_synthesis_file_instances

create table if not exists t_data_synthesis_file_instances (
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
synthesis_instance_id VARCHAR(36) COMMENT '数据合成任务ID',
file_name VARCHAR(255) NOT NULL COMMENT '文件名',
source_file_id VARCHAR(255) NOT NULL COMMENT '原始文件ID',
target_file_location VARCHAR(1000) NOT NULL COMMENT '目标文件存储位置',
status VARCHAR(20) COMMENT '任务状态',
total_chunks INT DEFAULT 0 COMMENT '总文本块数',
processed_chunks INT DEFAULT 0 COMMENT '已处理文本块数',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
updated_by VARCHAR(255) COMMENT '更新者'
) COMMENT='数据合成文件任务表(UUID 主键)';
"""

__tablename__ = "t_data_synthesis_file_instances"

id = Column(String(36), primary_key=True, index=True, comment="UUID")
synthesis_instance_id = Column(
String(36),
nullable=False,
comment="数据合成任务ID",
index=True,
)
file_name = Column(String(255), nullable=False, comment="文件名")
source_file_id = Column(String(255), nullable=False, comment="原始文件ID")
target_file_location = Column(String(1000), nullable=False, comment="目标文件存储位置")
status = Column(String(20), nullable=True, comment="任务状态")
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")

created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间")
updated_at = Column(
TIMESTAMP,
server_default=func.current_timestamp(),
onupdate=func.current_timestamp(),
nullable=True,
comment="更新时间",
)
created_by = Column(String(255), nullable=True, comment="创建者")
updated_by = Column(String(255), nullable=True, comment="更新者")


class DataSynthesisChunkInstance(Base):
"""数据合成分块任务表,对应表 t_data_synthesis_chunk_instances

create table if not exists t_data_synthesis_chunk_instances (
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
synthesis_file_instance_id VARCHAR(36) COMMENT '数据合成文件任务ID',
chunk_index INT COMMENT '分块索引',
chunk_content TEXT COMMENT '分块内容',
metadata JSON COMMENT '分块元数据'
) COMMENT='数据合成分块任务表(UUID 主键)';
"""

__tablename__ = "t_data_synthesis_chunk_instances"

id = Column(String(36), primary_key=True, index=True, comment="UUID")
synthesis_file_instance_id = Column(
String(36),
nullable=False,
comment="数据合成文件任务ID",
index=True,
)
chunk_index = Column(Integer, nullable=True, comment="分块索引")
chunk_content = Column(Text, nullable=True, comment="分块内容")
# SQLAlchemy Declarative 保留了属性名 'metadata',这里使用 chunk_metadata 作为属性名,
# 底层列名仍为 'metadata' 以保持与表结构兼容。
chunk_metadata = Column("metadata", JSON, nullable=True, comment="分块元数据")


class SynthesisData(Base):
"""数据合成结果表,对应表 t_synthesis_data

create table if not exists t_synthesis_data (
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
data json COMMENT '合成的数据',
synthesis_file_instance_id VARCHAR(36) COMMENT '数据合成文件任务ID',
chunk_instance_id VARCHAR(36) COMMENT '分块任务ID'
) COMMENT='数据合成任务队列表(UUID 主键)';
"""

__tablename__ = "t_data_synthesis_data"

id = Column(String(36), primary_key=True, index=True, comment="UUID")
data = Column(JSON, nullable=True, comment="合成的数据")
synthesis_file_instance_id = Column(
String(36),
nullable=False,
comment="数据合成文件任务ID",
index=True,
)
chunk_instance_id = Column(
String(36),
nullable=False,
comment="分块任务ID",
index=True,
)
57 changes: 57 additions & 0 deletions runtime/datamate-python/app/db/models/model_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from sqlalchemy import Column, String, Integer, TIMESTAMP, select

from app.db.session import Base


async def get_model_by_id(db_session, model_id: str):
"""根据 ID 获取单个模型配置。"""
result =await db_session.execute(select(ModelConfig).where(ModelConfig.id == model_id))
model_config = result.scalar_one_or_none()
return model_config

class ModelConfig(Base):
"""模型配置表,对应表 t_model_config

CREATE TABLE IF NOT EXISTS t_model_config (
id VARCHAR(36) PRIMARY KEY COMMENT '主键ID',
model_name VARCHAR(100) NOT NULL COMMENT '模型名称(如 qwen2)',
provider VARCHAR(50) NOT NULL COMMENT '模型提供商(如 Ollama、OpenAI、DeepSeek)',
base_url VARCHAR(255) NOT NULL COMMENT 'API 基础地址',
api_key VARCHAR(512) DEFAULT '' COMMENT 'API 密钥(无密钥则为空)',
type VARCHAR(50) NOT NULL COMMENT '模型类型(如 chat、embedding)',
is_enabled TINYINT DEFAULT 1 COMMENT '是否启用:1-启用,0-禁用',
is_default TINYINT DEFAULT 0 COMMENT '是否默认:1-默认,0-非默认',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
updated_by VARCHAR(255) COMMENT '更新者',
UNIQUE KEY uk_model_provider (model_name, provider)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4 COMMENT ='模型配置表';
"""

__tablename__ = "t_model_config"

id = Column(String(36), primary_key=True, index=True, comment="主键ID")
model_name = Column(String(100), nullable=False, comment="模型名称(如 qwen2)")
provider = Column(String(50), nullable=False, comment="模型提供商(如 Ollama、OpenAI、DeepSeek)")
base_url = Column(String(255), nullable=False, comment="API 基础地址")
api_key = Column(String(512), nullable=False, default="", comment="API 密钥(无密钥则为空)")
type = Column(String(50), nullable=False, comment="模型类型(如 chat、embedding)")

# 使用 Integer 存储 TINYINT,后续可在业务层将 0/1 转为 bool
is_enabled = Column(Integer, nullable=False, default=1, comment="是否启用:1-启用,0-禁用")
is_default = Column(Integer, nullable=False, default=0, comment="是否默认:1-默认,0-非默认")

created_at = Column(TIMESTAMP, nullable=True, comment="创建时间")
updated_at = Column(TIMESTAMP, nullable=True, comment="更新时间")
created_by = Column(String(255), nullable=True, comment="创建者")
updated_by = Column(String(255), nullable=True, comment="更新者")

__table_args__ = (
# 与 DDL 中的 uk_model_provider 保持一致
{
"mysql_engine": "InnoDB",
"mysql_charset": "utf8mb4",
"comment": "模型配置表",
},
)
5 changes: 2 additions & 3 deletions runtime/datamate-python/app/db/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

# 创建会话工厂
AsyncSessionLocal = async_sessionmaker(
engine,
class_=AsyncSession,
engine,
class_=AsyncSession,
expire_on_commit=False
)

Expand All @@ -29,4 +29,3 @@ async def get_db() -> AsyncGenerator[AsyncSession, None]:
yield session
finally:
await session.close()

8 changes: 4 additions & 4 deletions runtime/datamate-python/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

@asynccontextmanager
async def lifespan(app: FastAPI):

# @startup
logger.info("DataMate Python Backend starting...")

Expand All @@ -43,7 +43,7 @@ async def lifespan(app: FastAPI):
logger.info(f"Label Studio: {settings.label_studio_base_url}")

yield

# @shutdown
logger.info("DataMate Python Backend shutting down ...\n\n")

Expand Down Expand Up @@ -105,11 +105,11 @@ async def root():

if __name__ == "__main__":
import uvicorn

uvicorn.run(
"app.main:app",
host=settings.host,
port=settings.port,
reload=settings.debug,
log_level=settings.log_level.lower()
)
)
2 changes: 2 additions & 0 deletions runtime/datamate-python/app/module/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .system.interface import router as system_router
from .annotation.interface import router as annotation_router
from .synthesis.interface import router as ratio_router
from .generation.interface import router as generation_router

router = APIRouter(
prefix="/api"
Expand All @@ -11,5 +12,6 @@
router.include_router(system_router)
router.include_router(annotation_router)
router.include_router(ratio_router)
router.include_router(generation_router)

__all__ = ["router"]
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from fastapi import APIRouter

router = APIRouter(
prefix="/synth",
tags = ["synth"]
)

# Include sub-routers
from .generation_api import router as generation_router_router

router.include_router(generation_router_router)
Loading