From 34326456ab886b610d0eabf12a2923520e41a1bc Mon Sep 17 00:00:00 2001 From: MrChengmo Date: Sun, 1 Nov 2020 19:56:52 +0800 Subject: [PATCH] ctr benchmark --- models/benchmark/ctr_dnn/backend.yaml | 0 models/benchmark/ctr_dnn/config.yaml | 99 +++++++++++++++ models/benchmark/ctr_dnn/dataset_generator.py | 64 ++++++++++ models/benchmark/ctr_dnn/download_data.sh | 13 ++ models/benchmark/ctr_dnn/model.py | 117 ++++++++++++++++++ models/benchmark/simnet_bow/config.yaml | 96 ++++++++++++++ models/benchmark/word2vec/config.yaml | 96 ++++++++++++++ 7 files changed, 485 insertions(+) create mode 100644 models/benchmark/ctr_dnn/backend.yaml create mode 100644 models/benchmark/ctr_dnn/config.yaml create mode 100644 models/benchmark/ctr_dnn/dataset_generator.py create mode 100644 models/benchmark/ctr_dnn/download_data.sh create mode 100644 models/benchmark/ctr_dnn/model.py create mode 100644 models/benchmark/simnet_bow/config.yaml create mode 100644 models/benchmark/word2vec/config.yaml diff --git a/models/benchmark/ctr_dnn/backend.yaml b/models/benchmark/ctr_dnn/backend.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/models/benchmark/ctr_dnn/config.yaml b/models/benchmark/ctr_dnn/config.yaml new file mode 100644 index 000000000..302e4f9f2 --- /dev/null +++ b/models/benchmark/ctr_dnn/config.yaml @@ -0,0 +1,99 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +workspace: "./" + +hyper_parameters: + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + sparse_inputs_slots: 27 + sparse_feature_number: 1000001 + sparse_feature_dim: 10 + dense_feature_dim: 13 + fc_sizes: [400, 400, 400] + +mode: [local_train] +runner: +- name: ps_cpu + class: cluster_train + epochs: 10 + device: cpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + print_interval: 1 + phases: [phase1] + +- name: ps_gpu + class: cluster_train + epochs: 10 + device: gpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + print_interval: 1 + phases: [phase1] + +- name: ps_heter + class: cluster_train + epochs: 10 + device: gpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + print_interval: 1 + phases: [phase1] + +- name: local_infer + class: infer + epochs: 1 + device: cpu + init_model_path: "" + phases: [phase2] + +- name: local_train + class: train + epochs: 1 + device: cpu + phases: [phase2] + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: dataloader_train + thread_num: 1 + +- name: phase2 + model: "{workspace}/model.py" + dataset_name: dataset_infer + thread_num: 1 + +dataset: +- name: dataloader_train + batch_size: 2 + type: DataLoader + data_converter: "{workspace}/dataset_generator.py" + data_path: "{workspace}/train_data" +- name: dataset_train + batch_size: 2 + type: QueueDataset + data_converter: "{workspace}/dataset_generator.py" + data_path: "{workspace}/train_data" +- name: dataset_infer + batch_size: 2 + type: DataLoader + data_converter: "{workspace}/dataset_generator.py" + data_path: "{workspace}/test_data" diff --git a/models/benchmark/ctr_dnn/dataset_generator.py b/models/benchmark/ctr_dnn/dataset_generator.py new file mode 100644 index 000000000..9ac9bd719 --- /dev/null +++ b/models/benchmark/ctr_dnn/dataset_generator.py @@ -0,0 +1,64 @@ +# Copyright (c) 2019 PaddlePaddle Authors. 
class Reader(ReaderBase):
    """Turn tab-separated CTR log lines into named feature slots.

    Column layout read by this class: column 0 is the label, columns 1-13
    are continuous features and columns 14-39 are categorical features.
    """

    def init(self):
        """Set normalisation constants, column ranges and the hash space."""
        # Per-column offsets/scales for the 13 continuous features.
        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.cont_max_ = [
            20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
        ]
        self.cont_diff_ = [
            20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
        ]
        # Categorical values are bucketed into [0, hash_dim_).
        self.hash_dim_ = 1000001
        self.continuous_range_ = range(1, 14)
        self.categorical_range_ = range(14, 40)

    def generate_sample(self, line):
        """Build a generator function that parses one input line.

        Args:
            line: one raw text line, fields separated by tabs.

        Returns:
            A zero-argument generator function. It yields a single list of
            ``(slot_name, values)`` pairs ordered as ``dense_feature``,
            ``C1`` .. ``C26``, ``label``.
        """
        # Imported here because this block cannot extend the module's
        # import section; a cached import is only a dictionary lookup.
        import zlib

        def reader():
            """Parse ``line`` and yield its slots."""
            features = line.rstrip('\n').split('\t')

            # Empty continuous fields become 0.0, others are shifted and
            # scaled with the per-column constants set in init().
            dense_feature = []
            for idx in self.continuous_range_:
                raw = features[idx]
                if raw == "":
                    dense_feature.append(0.0)
                else:
                    dense_feature.append(
                        (float(raw) - self.cont_min_[idx - 1]) /
                        self.cont_diff_[idx - 1])

            # Builtin hash() on str is salted per process on Python 3, so it
            # would assign different ids to the same feature in different
            # trainer/infer processes. CRC32 is deterministic; the mask
            # keeps the value non-negative on every Python version.
            sparse_feature = []
            for idx in self.categorical_range_:
                key = (str(idx) + features[idx]).encode("utf-8")
                bucket = (zlib.crc32(key) & 0xffffffff) % self.hash_dim_
                sparse_feature.append([bucket])

            label = [int(features[0])]

            feature_name = ["dense_feature"]
            feature_name.extend(
                "C" + str(idx - 13) for idx in self.categorical_range_)
            feature_name.append("label")

            # Materialise the pairs: on Python 3 zip() is a one-shot
            # iterator, which breaks consumers that type-check the sample
            # or walk it more than once.
            yield list(
                zip(feature_name, [dense_feature] + sparse_feature + [label]))

        return reader
class Model(ModelBase):
    """CTR-DNN network: pooled sparse embeddings + dense input -> MLP."""

    def __init__(self, config):
        ModelBase.__init__(self, config)

    def _init_hyper_parameters(self):
        """Read the hyper-parameters this model needs from the global env."""
        self.dense_feature_dim = envs.get_global_env(
            "hyper_parameters.dense_feature_dim")
        self.sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number")
        self.sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim")
        self.learning_rate = envs.get_global_env(
            "hyper_parameters.optimizer.learning_rate")

    def input_data(self, is_infer=False, **kwargs):
        """Declare the input variables.

        Returns:
            A list ordered as ``[dense_input, C1 .. C26, label]``.
        """
        # NOTE(review): none of these shapes carries an explicit batch
        # dimension - confirm that the targeted Paddle version adds one.
        dense_input = paddle.data(
            name="dense_input",
            shape=[self.dense_feature_dim],
            dtype="float32")

        sparse_input_ids = [
            paddle.data(
                name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
            for i in range(1, 27)
        ]

        # The label is consumed as a class index by both the AUC metric and
        # the cross-entropy loss in net(), so it is declared as int64
        # (it was float32).
        label = paddle.data(name="label", shape=[1], dtype="int64")

        return [dense_input] + sparse_input_ids + [label]

    def net(self, input, is_infer=False):
        """Build the forward graph, the AUC metrics and the training loss.

        Args:
            input: variables in the order produced by ``input_data``.
            is_infer: when True only the AUC results are registered and no
                loss is built.
        """
        self.dense_input = input[0]
        # Fixed: the slice was stored as `sparse_input` but read back as
        # `sparse_inputs`; the old name is kept as an alias.
        self.sparse_inputs = input[1:-1]
        self.sparse_input = self.sparse_inputs
        # Fixed: the label comes from the `input` argument, not `self.input`.
        self.label_input = input[-1]

        # `is_distributed` is not initialised anywhere in this class; use it
        # if something else has set it, otherwise default to False.
        is_distributed = getattr(self, "is_distributed", False)

        def embedding_layer(ids):
            """Look up `ids` in the shared table and sum-pool the sequence."""
            emb = paddle.static.nn.embedding(
                input=ids,
                is_sparse=True,
                is_distributed=is_distributed,
                size=[self.sparse_feature_number, self.sparse_feature_dim],
                param_attr=paddle.ParamAttr(
                    name="SparseFeatFactors",
                    initializer=fluid.initializer.Uniform()))
            return fluid.layers.sequence_pool(input=emb, pool_type='sum')

        sparse_embed_seq = [
            embedding_layer(ids) for ids in self.sparse_inputs
        ]
        concated = paddle.concat(sparse_embed_seq + [self.dense_input], axis=1)

        # Stack of ReLU layers; each weight initialiser is scaled by
        # 1 / sqrt(width of the layer's input).
        fcs = [concated]
        hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")

        for size in hidden_layers:
            output = paddle.static.nn.fc(
                input=fcs[-1],
                size=size,
                act='relu',
                param_attr=paddle.ParamAttr(
                    initializer=fluid.initializer.Normal(
                        scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
            fcs.append(output)

        # Two-way softmax output.
        predict = paddle.static.nn.fc(
            input=fcs[-1],
            size=2,
            act="softmax",
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                scale=1 / math.sqrt(fcs[-1].shape[1]))))

        self.predict = predict

        auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
                                             label=self.label_input,
                                             num_thresholds=2**12,
                                             slide_steps=20)
        if is_infer:
            self._infer_results["AUC"] = auc
            self._infer_results["BATCH_AUC"] = batch_auc
            return

        self._metrics["AUC"] = auc
        self._metrics["BATCH_AUC"] = batch_auc
        cost = paddle.nn.functional.cross_entropy(
            input=self.predict, label=self.label_input)
        avg_cost = fluid.layers.reduce_mean(cost)
        self._cost = avg_cost

    def optimizer(self):
        """Return an Adam optimizer using the configured learning rate."""
        optimizer = paddle.optimizer.Adam(self.learning_rate, lazy_mode=True)
        return optimizer

    def infer_net(self):
        """No separate inference graph; net(is_infer=True) covers it."""
        pass
+ +workspace: "./" + +hyper_parameters: + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + sparse_inputs_slots: 27 + sparse_feature_number: 1000001 + sparse_feature_dim: 10 + dense_input_dim: 13 + fc_sizes: [400, 400, 400] + +mode: [ps_cpu] +runner: +- name: ps_cpu + class: cluster_train + epochs: 10 + device: cpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + print_interval: 1 + phases: [phase1] + +- name: ps_gpu + class: cluster_train + epochs: 10 + device: gpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + print_interval: 1 + phases: [phase1] + +- name: ps_heter + class: cluster_train + epochs: 10 + device: gpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + print_interval: 1 + phases: [phase1] + +- name: local_infer + class: infer + epochs: 1 + device: cpu + init_model_path: "" + phases: [phase2] + +- name: local_train + class: infer + epochs: 1 + device: cpu + phases: [phase2] + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: dataloader_train + thread_num: 1 + +- name: phase2 + model: "{workspace}/model.py" + dataset_name: dataset_infer + thread_num: 1 + +dataset: +- name: dataloader_train + batch_size: 2 + type: DataLoader + data_path: "{workspace}/data/sample_data/train" +- name: dataset_train + batch_size: 2 + type: QueueDataset + data_path: "{workspace}/data/sample_data/train" +- name: dataset_infer + batch_size: 2 + type: DataLoader + data_path: "{workspace}/data/sample_data/train" diff --git a/models/benchmark/word2vec/config.yaml b/models/benchmark/word2vec/config.yaml new file mode 100644 index 000000000..f675df890 --- /dev/null +++ b/models/benchmark/word2vec/config.yaml @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +workspace: "./" + +hyper_parameters: + optimizer: + class: Adam + learning_rate: 0.001 + strategy: async + sparse_inputs_slots: 27 + sparse_feature_number: 1000001 + sparse_feature_dim: 10 + dense_input_dim: 13 + fc_sizes: [400, 400, 400] + +mode: [ps_cpu] +runner: +- name: ps_cpu + class: cluster_train + epochs: 10 + device: cpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + print_interval: 1 + phases: [phase1] + +- name: ps_gpu + class: cluster_train + epochs: 10 + device: gpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + print_interval: 1 + phases: [phase1] + +- name: ps_heter + class: cluster_train + epochs: 10 + device: gpu + fleet_mode: ps + save_checkpoint_interval: 1 + save_checkpoint_path: "increment_dnn" + print_interval: 1 + phases: [phase1] + +- name: local_infer + class: infer + epochs: 1 + device: cpu + init_model_path: "" + phases: [phase2] + +- name: local_train + class: infer + epochs: 1 + device: cpu + phases: [phase2] + +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: dataloader_train + thread_num: 1 + +- name: phase2 + model: "{workspace}/model.py" + dataset_name: dataset_infer + thread_num: 1 + +dataset: +- name: dataloader_train + batch_size: 2 + type: DataLoader + data_path: "{workspace}/data/sample_data/train" +- name: dataset_train + batch_size: 2 + type: QueueDataset + data_path: "{workspace}/data/sample_data/train" +- name: dataset_infer + batch_size: 2 + type: DataLoader + data_path: "{workspace}/data/sample_data/train"