From 19711e51a49b0ca2a2dae103e1a98d1b77664ff3 Mon Sep 17 00:00:00 2001 From: yinhaofeng <1841837261@qq.com> Date: Tue, 27 Oct 2020 06:34:53 +0000 Subject: [PATCH] Fix data process bug --- models/rank/deepfm/data/get_slot_data.py | 1 + models/rank/dnn/README.md | 28 ++++++++++++++++++- models/rank/dnn/config.yaml | 8 +++--- models/rank/dnn/data/get_slot_data.py | 1 + models/rank/fm/data/get_slot_data.py | 1 + .../logistic_regression/data/get_slot_data.py | 1 + 6 files changed, 35 insertions(+), 5 deletions(-) diff --git a/models/rank/deepfm/data/get_slot_data.py b/models/rank/deepfm/data/get_slot_data.py index 1d4c5c47d..191428397 100755 --- a/models/rank/deepfm/data/get_slot_data.py +++ b/models/rank/deepfm/data/get_slot_data.py @@ -80,6 +80,7 @@ def data_iter(): for j in v: s += " " + k + ":" + str(j) print(s.strip()) # add print for data preprocessing + yield None return data_iter diff --git a/models/rank/dnn/README.md b/models/rank/dnn/README.md index 9b5c14eff..8f0b309d0 100644 --- a/models/rank/dnn/README.md +++ b/models/rank/dnn/README.md @@ -176,7 +176,6 @@ click:0 dense_feature:0.05 dense_feature:0.00663349917081 dense_feature:0.05 den ... ``` -# ## 模型组网 ### 数据输入声明 正如数据准备章节所介绍,Criteo数据集中,分为连续数据与离散(稀疏)数据,所以整体而言,CTR-DNN模型的数据输入层包括三个,分别是:`dense_input`用于输入连续数据,维度由超参数`dense_input_dim`指定,数据类型是归一化后的浮点型数据。`sparse_inputs`用于记录离散数据,在Criteo数据集中,共有26个slot,所以我们创建了名为`1~26`的26个稀疏参数输入,数据类型为整数;最后是每条样本的`label`,代表了是否被点击,数据类型是整数,0代表负样例,1代表正样例。 @@ -259,6 +258,33 @@ avg_cost = fluid.layers.reduce_mean(cost) ``` 完成上述组网后,我们最终可以通过训练拿到`BATCH_AUC`与`auc`两个重要指标。 + +### 效果复现 +为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。 在全量数据下模型的指标如下: +| 模型 | auc | batch_size | thread_num| epoch_num| Time of each epoch | +| :------| :------ | :------| :------ | :------| :------ | +| dnn | 0.7748 | 512 | 10 | 4 | 约3.5小时 | + +1. 确认您当前所在目录为PaddleRec/models/rank/dnn +2. 在data目录下运行数据一键处理脚本,处理时间较长,请耐心等待。命令如下: +``` +cd data +sh run.sh +cd .. +``` +3. 退回dnn目录中,打开文件config.yaml,更改其中的参数 +将workspace改为您当前的绝对路径。(可用pwd命令获取绝对路径) +将dataloader_train中的batch_size从2改为512 +将dataloader_train中的data_path改为{workspace}/data/slot_train_data_full +将dataset_infer中的batch_size从2改为512 +将dataset_infer中的data_path改为{workspace}/data/slot_test_data_full +根据自己的需求调整phase中的线程数 +4. 运行命令,模型会进行四个epoch的训练,然后预测第四个epoch,并获得相应auc指标 +``` +python -m paddlerec.run -m ./config.yaml +``` +5. 经过全量数据训练后,执行预测的结果示例如下: + ``` PaddleRec: Runner single_cpu_infer Begin Executor Mode: infer diff --git a/models/rank/dnn/config.yaml b/models/rank/dnn/config.yaml index 75826684d..4428faa9e 100755 --- a/models/rank/dnn/config.yaml +++ b/models/rank/dnn/config.yaml @@ -22,19 +22,19 @@ dataset: type: DataLoader # or QueueDataset data_path: "{workspace}/data/sample_data/train" sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26" - dense_slots: "dense_var:13" + dense_slots: "dense_feature:13" - name: dataset_train # name of dataset to distinguish different datasets batch_size: 2 type: QueueDataset # or DataLoader data_path: "{workspace}/data/sample_data/train" sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26" - dense_slots: "dense_var:13" + dense_slots: "dense_feature:13" - name: dataset_infer # name batch_size: 2 type: DataLoader # or QueueDataset data_path: "{workspace}/data/sample_data/train" sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26" - dense_slots: "dense_var:13" + dense_slots: "dense_feature:13" # hyper parameters of user-defined network hyper_parameters: @@ -77,7 +77,7 @@ runner: epochs: 1 # device to run training or infer device: cpu - init_model_path: "increment_dnn" # load model path + init_model_path: "increment_dnn/3" # load model path phases: [phase2] - name: ps_cluster diff --git a/models/rank/dnn/data/get_slot_data.py b/models/rank/dnn/data/get_slot_data.py index 105a01db3..a50f74a49 100755 --- a/models/rank/dnn/data/get_slot_data.py +++ b/models/rank/dnn/data/get_slot_data.py @@ -62,6 +62,7 @@ def reader(): for i in range(1, 1 + len(categorical_range_)): s += " " + str(i) + ":" + str(sparse_feature[i - 1][0]) print(s.strip()) # add print for data preprocessing + yield None return reader diff --git a/models/rank/fm/data/get_slot_data.py b/models/rank/fm/data/get_slot_data.py index f9605d50b..e9c25fd00 100644 --- a/models/rank/fm/data/get_slot_data.py +++ b/models/rank/fm/data/get_slot_data.py @@ -80,6 +80,7 @@ def data_iter(): for j in v: s += " " + k + ":" + str(j) print(s.strip()) # add print for data preprocessing + yield None return data_iter diff --git a/models/rank/logistic_regression/data/get_slot_data.py b/models/rank/logistic_regression/data/get_slot_data.py index f9605d50b..e9c25fd00 100644 --- a/models/rank/logistic_regression/data/get_slot_data.py +++ b/models/rank/logistic_regression/data/get_slot_data.py @@ -80,6 +80,7 @@ def data_iter(): for j in v: s += " " + k + ":" + str(j) print(s.strip()) # add print for data preprocessing + yield None return data_iter