Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: 模块内部管控调用迁移到 V2 #853

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
22 changes: 13 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ print(resp["result"])
```python
from qianfan.dataset import Dataset

ds = Dataset.load(qianfan_dataset_id="your_dataset_id")
ds = Dataset.load(qianfan_dataset_version_id="your_dataset_id")
```

且千帆 Python SDK 集成了一系列本地的数据处理功能,允许用户在本地对来自多个数据源的数据进行增删改查等操作,详见[Dataset 框架](./docs/dataset.md)。
Expand All @@ -192,34 +192,38 @@ from qianfan.dataset import Dataset
# 从本地文件导入
ds = Dataset.load(data_file="path/to/dataset_file.json")


def filter_func(row: Dict[str, Any]) -> bool:
return "answer" in row.keys()
return "answer" in row.keys()


def map_func(row: Dict[str, Any]) -> Dict[str, Any]:
return {
"prompt": row["question"],
"response": row["answer"],
}
return {
"prompt": row["question"],
"response": row["answer"],
}


# 链式调用处理数据
ds.filter(filter_func).map(map_func).pack()

# 上传到千帆
# 数据集只有上传到千帆后才可以用于训练
# 请确保你的数据集格式符合要求
ds.save(qianfan_dataset_id="your_dataset_id")
ds.save(qianfan_dataset_version_id="your_dataset_id")
```

#### Trainer

千帆 Python SDK 以Pipeline为基础串联整个模型训练的流程,同时允许用户更好的把控训练流程状态 [Trainer 框架](./docs/trainer.md)。
以下是一个快速实现ERNIE-Speed-8K fine-tuning的例子:

```python
from qianfan.dataset import Dataset
from qianfan.trainer import Finetune

# 加载千帆平台上的数据集
ds: Dataset = Dataset.load(qianfan_dataset_id="ds-xxx")
ds: Dataset = Dataset.load(qianfan_dataset_version_id="ds-xxx")

# 新建trainer LLMFinetune,最少传入train_type和dataset
# 注意fine-tune任务需要指定的数据集类型要求为有标注的非排序对话数据集。
Expand All @@ -242,7 +246,7 @@ trainer.run()
from qianfan.model import Model
from qianfan.dataset import Dataset

ds = Dataset.load(qianfan_dataset_id="ds-xx")
ds = Dataset.load(qianfan_dataset_version_id="ds-xx")
m = Model(version_id="amv-xx")

m.batch_inference(dataset=ds)
Expand Down
2 changes: 1 addition & 1 deletion docs/batch_inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ Dataset 还支持使用平台上预置的模型或者用户训练完成的模型

```python
# 加载千帆平台上的数据集
qianfan_ds = Dataset.load(qianfan_dataset_id=cloud_dataset_id)
qianfan_ds = Dataset.load(qianfan_dataset_version_id=cloud_dataset_id)

result = qianfan_ds.test_using_llm(model_version_id="amv-qb8ijukaish3")
print(result[0])
Expand Down
34 changes: 17 additions & 17 deletions docs/dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ from qianfan.dataset import Dataset
ds = Dataset.load(data_file="path/to/dataset_file.jsonl")

# 从千帆导入
ds = Dataset.load(qianfan_dataset_id="your_dataset_id")
ds = Dataset.load(qianfan_dataset_version_id="your_dataset_id")
```

### 处理数据集
Expand Down Expand Up @@ -107,7 +107,7 @@ print(ds.filter(filter_func).map(map_func).list())
new_ds = ds.save(data_file="path/to/local_file.csv")

# 导出到千帆平台
new_ds = ds.save(qianfan_dataset_id="your_dataset_id")
new_ds = ds.save(qianfan_dataset_version_id="your_dataset_id")

# 或者导出到它导入的地方
new_ds = ds.save()
Expand Down Expand Up @@ -269,7 +269,7 @@ new_ds = ds.save(data_file="file.json", batch_size=100)
```python
from qianfan.dataset import Dataset

ds_qianfan = Dataset.load(qianfan_dataset_id="your_dataset_id")
ds_qianfan = Dataset.load(qianfan_dataset_version_id="your_dataset_id")
print(ds_qianfan.list())
```

Expand All @@ -280,7 +280,7 @@ print(ds_qianfan.list())
```python
from qianfan.dataset import Dataset

ds_local = Dataset.load(qianfan_dataset_id="your_dataset_id").save(data_file="your_file_path")
ds_local = Dataset.load(qianfan_dataset_version_id="your_dataset_id").save(data_file="your_file_path")
print(ds_local.list())
```

Expand Down Expand Up @@ -312,7 +312,7 @@ ds_qianfan.save(
+ 另一种导出方式是增量导出到已经存在的数据集当中:填写 `save` 函数的 `qianfan_dataset_version_id` 参数(和 `load` 方法一致)。如果是导出到原本导入的数据集,则可以忽略 `qianfan_dataset_version_id` 参数。

```python
ds_qianfan.save(qianfan_dataset_id="your_dataset_id")
ds_qianfan.save(qianfan_dataset_version_id="your_dataset_id")
# 如果是导出到原本导入的数据集,可以忽略该参数
ds_qianfan.save()
```
Expand Down Expand Up @@ -412,7 +412,7 @@ ds_pyarrow_table = Dataset.create_from_pyarrow_table(Table.from_pandas(...))
除此之外,当用户以 jsonl \ txt 格式导入类数组形式文件,或者导入的是千帆平台的数据集时,SDK 支持传入 `organize_data_as_group` 参数,来指定将数据集组织成 SDK 内部的二维表格形式。这种格式包含了分组信息。并且可以通过 `pack()` 与 `unpack()` 函数进行格式之间的互相转换。

```python
ds = Dataset.load(qianfan_dataset_id="your_dataset_id", organize_data_as_group=True)
ds = Dataset.load(qianfan_dataset_version_id="your_dataset_id", organize_data_as_group=True)
```

设置 `organize_data_as_group=True` 或使用 `unpack()` 函数得到的千帆平台的数据集格式如下所示
Expand Down Expand Up @@ -548,7 +548,7 @@ print(ds[["column_name1", "column_name3"]])
```python
from qianfan.dataset import Dataset

ds_qianfan = Dataset.load(qianfan_dataset_id="your_dataset_id")
ds_qianfan = Dataset.load(qianfan_dataset_version_id="your_dataset_id")

# 单独检视某一实体
print(ds_qianfan[0])
Expand Down Expand Up @@ -605,19 +605,19 @@ ds = ds \
```python
from qianfan.dataset import Dataset
from qianfan.dataset.qianfan_data_operators import (
RemoveInvisibleCharacter,
FilterCheckNumberWords,
DeduplicationSimhash,
ReplaceEmails,
RemoveInvisibleCharacter,
FilterCheckNumberWords,
DeduplicationSimhash,
ReplaceEmails,
)

ds_qianfan = Dataset.load(qianfan_dataset_id="your_dataset_id")
ds_qianfan = Dataset.load(qianfan_dataset_version_id="your_dataset_id")

ds_qianfan.online_data_process([
RemoveInvisibleCharacter(),
FilterCheckNumberWords(number_words_max_cutoff=1024),
DeduplicationSimhash(distance=5),
ReplaceEmails()
RemoveInvisibleCharacter(),
FilterCheckNumberWords(number_words_max_cutoff=1024),
DeduplicationSimhash(distance=5),
ReplaceEmails()
])
```

Expand All @@ -632,7 +632,7 @@ from qianfan.dataset.schema import QianfanNonSortedConversation
schema = QianfanNonSortedConversation()

# 在 load 时使用
ds_qianfan = Dataset.load(qianfan_dataset_id="your_dataset_id", schema=schema)
ds_qianfan = Dataset.load(qianfan_dataset_version_id="your_dataset_id", schema=schema)

# 在 save 时使用
# 如果在 load 时就已经传入了 schema ,
Expand Down
7 changes: 4 additions & 3 deletions docs/evaluation.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@
from qianfan.dataset import Dataset
from qianfan.evaluation import EvaluationManager
from qianfan.evaluation.evaluator import QianfanRuleEvaluator, QianfanRefereeEvaluator
from qianfan.evaluation.consts import QianfanRefereeEvaluatorDefaultMetrics, QianfanRefereeEvaluatorDefaultSteps, QianfanRefereeEvaluatorDefaultMaxScore
from qianfan.evaluation.consts import (
    QianfanRefereeEvaluatorDefaultMetrics,
    QianfanRefereeEvaluatorDefaultSteps,
    QianfanRefereeEvaluatorDefaultMaxScore,
)
from qianfan.model import Model

your_qianfan_dataset_id = "your_dataset_id"
ds = Dataset.load(qianfan_dataset_id=your_qianfan_dataset_id)
ds = Dataset.load(qianfan_dataset_version_id=your_qianfan_dataset_id)

user_app_id = 123

Expand Down Expand Up @@ -117,7 +118,7 @@ result = em.eval([your_service], ds)

```python
your_qianfan_dataset_id = "your_dataset_id"
ds = Dataset.load(qianfan_dataset_id=your_qianfan_dataset_id)
ds = Dataset.load(qianfan_dataset_version_id=your_qianfan_dataset_id)

em = EvaluationManager(local_evaluators=local_evaluators)
result = em.eval([Model(version_id="amv-qb8ijukaish3")], ds)
Expand Down
4 changes: 2 additions & 2 deletions docs/model.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ m = trainer.output["model"]
from qianfan.model import Model
from qianfan.dataset import Dataset

ds = Dataset.load(qianfan_dataset_id="ds-xx")
ds = Dataset.load(qianfan_dataset_version_id="ds-xx")
m = Model(version_id="amv-xx")

m.batch_inference(dataset=ds)
Expand All @@ -47,7 +47,7 @@ from qianfan.model import Model
from qianfan.evaluation.evaluator import QianfanRuleEvaluator
from qianfan.evaluation import EvaluationManager

ds = Dataset.load(qianfan_dataset_id="ds-xxx")
ds = Dataset.load(qianfan_dataset_version_id="ds-xxx")
m = Model(version_id="amv-xx")

# 千帆平台规则评估器:
Expand Down
35 changes: 17 additions & 18 deletions docs/trainer.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
以下以LLMFinetune(对应千帆平台 SFT语言大模型)为例,介绍如何使用`Trainer`进行训练。

```python
import os
import os

os.environ["QIANFAN_ACCESS_KEY"] = "your_ak"
os.environ["QIANFAN_SECRET_KEY"] = "your_sk"
Expand All @@ -19,13 +19,13 @@ from qianfan.dataset import Dataset
from qianfan.trainer import LLMFinetune

# 加载千帆平台上的数据集
ds: Dataset = Dataset.load(qianfan_dataset_id="111")
ds: Dataset = Dataset.load(qianfan_dataset_version_id="111")

# 新建trainer LLMFinetune,最少传入train_type和dataset
# 注意fine-tune任务需要指定的数据集类型要求为有标注的非排序对话数据集。
trainer = LLMFinetune(
train_type="ERNIE-Speed",
dataset=ds,
dataset=ds,
)

trainer.run()
Expand All @@ -41,7 +41,7 @@ from qianfan.trainer.consts import PeftType
from qianfan.dataset import Dataset

# 泛文本 数据集
ds = Dataset.load(qianfan_dataset_id="ds-ag138")
ds = Dataset.load(qianfan_dataset_version_id="ds-ag138")

# postpretrain
trainer = PostPreTrain(
Expand All @@ -52,9 +52,8 @@ trainer.run()
# 这一步可以拿到训练完成的PostPretrain任务信息:
print(trainer.output)


# sft数据集
sft_ds = Dataset.load(qianfan_dataset_id="ds-47j7ztjxfz60wb8x")
sft_ds = Dataset.load(qianfan_dataset_version_id="ds-47j7ztjxfz60wb8x")
ppt_sft_trainer = LLMFinetune(
train_type="ERNIE-Speed",
dataset=sft_ds,
Expand All @@ -65,7 +64,7 @@ ppt_sft_trainer = LLMFinetune(
peft_type=PeftType.ALL,
),
name="qianfantrainer01",
previous_trainer=trainer,
previous_trainer = trainer,
)

ppt_sft_trainer.run()
Expand All @@ -75,8 +74,9 @@ print(ppt_sft_trainer.output)

### 自定义训练参数
如果需要自定义训练参数,可以根据不同的模型传入不同的TrainConfig 以指定训练过程中的参数,需要注意的是不同模型支持的参数不同,具体以API文档为准。

```python
import os
import os

os.environ["QIANFAN_ACCESS_KEY"] = "your_ak"
os.environ["QIANFAN_SECRET_KEY"] = "your_sk"
Expand All @@ -86,17 +86,16 @@ from qianfan.trainer import LLMFinetune
from qianfan.trainer.configs import TrainConfig, DatasetConfig, CorpusConfig, CorpusConfigItem, PeftType, ResourceConfig
from qianfan.resources.console import consts as console_consts


ds = Dataset.load(qianfan_dataset_id="ds-pt19ixpeqrhtgc92")
ds = Dataset.load(qianfan_dataset_version_id="ds-pt19ixpeqrhtgc92")
trainer = LLMFinetune(
train_type="ERNIE-Speed-8K",
dataset=DatasetConfig(
datasets=[ds],
eval_split_ratio=10,
eval_split_ratio=10,
sampling_rate=1,
),
train_config=TrainConfig(
peft_type=PeftType.LoRA, # 必传,指定SFT or LoRA
peft_type=PeftType.LoRA, # 必传,指定SFT or LoRA
epoch=1,
learning_rate=0.0003,
max_seq_len=4096,
Expand All @@ -110,22 +109,22 @@ trainer = LLMFinetune(
node_num=4,
),
corpus_config=CorpusConfig(
data_copy=False, # 仅一言语料使用,如果为True,则当语料库不足以混入时,则拷贝重复数据混入
data_copy=False, # 仅一言语料使用,如果为True,则当语料库不足以混入时,则拷贝重复数据混入
corpus_configs=[
# CorpusConfigItem( # 千帆通用语料
# corpus_type=console_consts.FinetuneCorpusType.QianfanCommon,
# corpus_proportion="1%", # 总通用语料共n条,混入比例的取值范围x%为[0-100]%, 则混入n * x%
# ),
CorpusConfigItem( # 一言垂类
CorpusConfigItem( # 一言垂类
corpus_labels=["文本创作"],
corpus_type=console_consts.FinetuneCorpusType.YiyanVertical,
corpus_proportion="1:2", # 1:x 表示一条用户数据对应x条一言语料数据
corpus_proportion="1:2", # 1:x 表示一条用户数据对应x条一言语料数据
),
CorpusConfigItem( # 一言通用
CorpusConfigItem( # 一言通用
corpus_type=console_consts.FinetuneCorpusType.YiyanCommon,
corpus_proportion="1:1", # 1:x 表示一条用户数据对应x条一言语料数据
corpus_proportion="1:1", # 1:x 表示一条用户数据对应x条一言语料数据
),

],
)
)
Expand Down
4 changes: 2 additions & 2 deletions python/qianfan/common/cli/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def load_dataset(path: str, **kwargs: Any) -> Dataset:
"""Load dataset from platform or local file based on the format of path."""
qianfan_dataset_id = extract_id_from_path(path)
if qianfan_dataset_id:
return Dataset.load(qianfan_dataset_id=qianfan_dataset_id, **kwargs)
return Dataset.load(qianfan_dataset_version_id=qianfan_dataset_id, **kwargs)
return Dataset.load(data_file=path, **kwargs)


Expand Down Expand Up @@ -171,7 +171,7 @@ def save(
region = client_utils.bos_bucket_region(bucket)
with console.status("Saving dataset to platform..."):
src_dataset.save(
qianfan_dataset_id=dst_dataset_id,
qianfan_dataset_version_id=dst_dataset_id,
sup_storage_id=bucket,
sup_storage_path=path,
sup_storage_region=region,
Expand Down
8 changes: 4 additions & 4 deletions python/qianfan/common/cli/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,10 @@ def run(
enable_referee_evaluator: bool = typer.Option(
False, help="Enable referee evaluator."
),
app_id: Optional[int] = typer.Option(
api_name: Optional[str] = typer.Option(
None,
help=(
"The appid to which the model belongs to. The model will be used to"
"The api_name to which the model belongs. The model will be used to"
" evaluate the results."
),
rich_help_panel=REFEREE_EVALUATOR_PANEL,
Expand Down Expand Up @@ -215,12 +215,12 @@ def run(
)
)
if enable_referee_evaluator:
if app_id is None:
if api_name is None:
print_error_msg("api_name is required for referee evaluator.")
raise typer.Exit(1)
evaluators.append(
QianfanRefereeEvaluator(
app_id=app_id,
api_name=api_name,
prompt_metrics=prompt_metrics,
prompt_steps=prompt_steps,
prompt_max_score=prompt_max_score,
Expand Down
Loading
Loading