From 34326456ab886b610d0eabf12a2923520e41a1bc Mon Sep 17 00:00:00 2001
From: MrChengmo <cmchengmo@163.com>
Date: Sun, 1 Nov 2020 19:56:52 +0800
Subject: [PATCH] ctr benchmark

---
 models/benchmark/ctr_dnn/backend.yaml         |   0
 models/benchmark/ctr_dnn/config.yaml          |  99 +++++++++++++++
 models/benchmark/ctr_dnn/dataset_generator.py |  64 ++++++++++
 models/benchmark/ctr_dnn/download_data.sh     |  13 ++
 models/benchmark/ctr_dnn/model.py             | 117 ++++++++++++++++++
 models/benchmark/simnet_bow/config.yaml       |  96 ++++++++++++++
 models/benchmark/word2vec/config.yaml         |  96 ++++++++++++++
 7 files changed, 485 insertions(+)
 create mode 100644 models/benchmark/ctr_dnn/backend.yaml
 create mode 100644 models/benchmark/ctr_dnn/config.yaml
 create mode 100644 models/benchmark/ctr_dnn/dataset_generator.py
 create mode 100644 models/benchmark/ctr_dnn/download_data.sh
 create mode 100644 models/benchmark/ctr_dnn/model.py
 create mode 100644 models/benchmark/simnet_bow/config.yaml
 create mode 100644 models/benchmark/word2vec/config.yaml

diff --git a/models/benchmark/ctr_dnn/backend.yaml b/models/benchmark/ctr_dnn/backend.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/models/benchmark/ctr_dnn/config.yaml b/models/benchmark/ctr_dnn/config.yaml
new file mode 100644
index 000000000..302e4f9f2
--- /dev/null
+++ b/models/benchmark/ctr_dnn/config.yaml
@@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+workspace: "./"  # presumably what {workspace} in the paths below expands to — confirm
+
+hyper_parameters:
+  optimizer:
+    class: Adam
+    learning_rate: 0.001  # read by model.py as hyper_parameters.optimizer.learning_rate
+    strategy: async
+  sparse_inputs_slots: 27  # not read by model.py, which hard-codes feeds C1..C26 with range(1, 27)
+  sparse_feature_number: 1000001  # embedding table rows in model.py; same value as hash_dim_ in dataset_generator.py
+  sparse_feature_dim: 10  # embedding width in model.py
+  dense_feature_dim: 13  # width of the dense_input feed in model.py
+  fc_sizes: [400, 400, 400]  # one relu fc layer per entry in model.py
+
+mode: [local_train]  # names one of the runner entries below
+runner:
+- name: ps_cpu
+  class: cluster_train
+  epochs: 10
+  device: cpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1
+  save_checkpoint_path: "increment_dnn" 
+  print_interval: 1
+  phases: [phase1]  # phase names are defined under `phase:` further down
+
+- name: ps_gpu
+  class: cluster_train
+  epochs: 10
+  device: gpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1 
+  save_checkpoint_path: "increment_dnn" 
+  print_interval: 1
+  phases: [phase1]
+
+- name: ps_heter  # NOTE(review): every setting below equals ps_gpu — confirm nothing heter-specific is missing
+  class: cluster_train
+  epochs: 10
+  device: gpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1 
+  save_checkpoint_path: "increment_dnn" 
+  print_interval: 1
+  phases: [phase1]
+
+- name: local_infer
+  class: infer
+  epochs: 1
+  device: cpu
+  init_model_path: ""  # NOTE(review): empty — presumably needs a saved model path before inference; confirm
+  phases: [phase2]
+
+- name: local_train  # the runner named in `mode` above
+  class: train
+  epochs: 1
+  device: cpu
+  phases: [phase1]  # was phase2, whose dataset_infer reads {workspace}/test_data; train on train_data instead
+
+phase:
+- name: phase1  # referenced from the runners' `phases` lists above
+  model: "{workspace}/model.py" 
+  dataset_name: dataloader_train  # an entry under `dataset:` below
+  thread_num: 1
+
+- name: phase2
+  model: "{workspace}/model.py" 
+  dataset_name: dataset_infer  # an entry under `dataset:` below
+  thread_num: 1
+
+dataset:
+- name: dataloader_train 
+  batch_size: 2
+  type: DataLoader 
+  data_converter: "{workspace}/dataset_generator.py"  # the file in this patch that defines the Reader class
+  data_path: "{workspace}/train_data"  # sample directory created by download_data.sh
+- name: dataset_train  # NOTE(review): no phase in this file refers to this dataset
+  batch_size: 2
+  type: QueueDataset 
+  data_converter: "{workspace}/dataset_generator.py"
+  data_path: "{workspace}/train_data"
+- name: dataset_infer
+  batch_size: 2
+  type: DataLoader
+  data_converter: "{workspace}/dataset_generator.py"
+  data_path: "{workspace}/test_data"  # sample directory created by download_data.sh
diff --git a/models/benchmark/ctr_dnn/dataset_generator.py b/models/benchmark/ctr_dnn/dataset_generator.py
new file mode 100644
index 000000000..9ac9bd719
--- /dev/null
+++ b/models/benchmark/ctr_dnn/dataset_generator.py
@@ -0,0 +1,64 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlerec.core.reader import ReaderBase
+
+
+class Reader(ReaderBase):
+    """
+    DacDataset: inheritance MultiSlotDataGeneratior, Implement data reading
+    Help document: http://wiki.baidu.com/pages/viewpage.action?pageId=728820675
+    """
+
+    def init(self):
+        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        self.cont_max_ = [
+            20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
+        ]
+        self.cont_diff_ = [
+            20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
+        ]
+        self.hash_dim_ = 1000001
+        self.continuous_range_ = range(1, 14)
+        self.categorical_range_ = range(14, 40)
+
+    def generate_sample(self, line):
+        """
+        Read the data line by line and process it as a dictionary
+        """
+
+        def reader():
+            """
+            This function needs to be implemented by the user, based on data format
+            """
+            features = line.rstrip('\n').split('\t')
+            dense_feature = []
+            sparse_feature = []
+            for idx in self.continuous_range_:
+                if features[idx] == "":
+                    dense_feature.append(0.0)
+                else:
+                    dense_feature.append(
+                        (float(features[idx]) - self.cont_min_[idx - 1]) /
+                        self.cont_diff_[idx - 1])
+            for idx in self.categorical_range_:
+                sparse_feature.append(
+                    [hash(str(idx) + features[idx]) % self.hash_dim_])
+            label = [int(features[0])]
+            feature_name = ["dense_feature"]
+            for idx in self.categorical_range_:
+                feature_name.append("C" + str(idx - 13))
+            feature_name.append("label")
+            yield zip(feature_name, [dense_feature] + sparse_feature + [label])
+
+        return reader
diff --git a/models/benchmark/ctr_dnn/download_data.sh b/models/benchmark/ctr_dnn/download_data.sh
new file mode 100644
index 000000000..56816f7d6
--- /dev/null
+++ b/models/benchmark/ctr_dnn/download_data.sh
@@ -0,0 +1,13 @@
+set -e  # abort on the first failed step instead of reporting success anyway
+wget --no-check-certificate https://fleet.bj.bcebos.com/ctr_data.tar.gz
+tar -zxvf ctr_data.tar.gz
+mv ./raw_data ./train_data_full
+mv ./test_data ./test_data_full
+mkdir -p train_data test_data  # -p tolerates re-runs; no cd below, so a failed cp cannot strand later steps
+cp ./train_data_full/part-0 ./train_data_full/part-1 ./train_data/
+cp ./test_data_full/part-220 ./test_data/
+echo "Complete data download."
+echo "Full Train data stored in ./train_data_full "
+echo "Full Test data stored in ./test_data_full "
+echo "Rapid Verification train data stored in ./train_data "
+echo "Rapid Verification test data stored in ./test_data "
diff --git a/models/benchmark/ctr_dnn/model.py b/models/benchmark/ctr_dnn/model.py
new file mode 100644
index 000000000..498d77b1a
--- /dev/null
+++ b/models/benchmark/ctr_dnn/model.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+import paddle.fluid as fluid
+
+from paddlerec.core.utils import envs
+from paddlerec.core.model import ModelBase
+
+
+class Model(ModelBase):
+    def __init__(self, config):
+        ModelBase.__init__(self, config)
+
+    def _init_hyper_parameters(self):
+        self.dense_feature_dim = envs.get_global_env(
+            "hyper_parameters.dense_feature_dim")
+        self.sparse_feature_number = envs.get_global_env(
+            "hyper_parameters.sparse_feature_number")
+        self.sparse_feature_dim = envs.get_global_env(
+            "hyper_parameters.sparse_feature_dim")
+        self.learning_rate = envs.get_global_env(
+            "hyper_parameters.optimizer.learning_rate")
+
+    def input_data(self, is_infer=False, **kwargs):
+        dense_input = paddle.data(
+            name="dense_input",
+            shape=[self.dense_feature_dim],
+            dtype="float32")
+
+        sparse_input_ids = [
+            paddle.data(
+                name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
+            for i in range(1, 27)
+        ]
+
+        label = paddle.data(name="label", shape=[1], dtype="float32")
+
+        inputs = [dense_input] + sparse_input_ids + [label]
+        return inputs
+
+    def net(self, input, is_infer=False):
+        self.dense_input = input[0]
+        self.sparse_input = input[1:-1]
+        self.label_input = self.input[-1]
+
+        def embedding_layer(input):
+            emb = paddle.static.nn.embedding(
+                input=input,
+                is_sparse=True,
+                is_distributed=self.is_distributed,
+                size=[self.sparse_feature_number, self.sparse_feature_dim],
+                param_attr=paddle.ParamAttr(
+                    name="SparseFeatFactors",
+                    initializer=fluid.initializer.Uniform()))
+            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+            return emb_sum
+
+        sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
+        concated = paddle.concat(sparse_embed_seq + [self.dense_input], axis=1)
+
+        fcs = [concated]
+        hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")
+
+        for size in hidden_layers:
+            output = paddle.static.nn.fc(
+                input=fcs[-1],
+                size=size,
+                act='relu',
+                param_attr=paddle.ParamAttr(
+                    initializer=fluid.initializer.Normal(
+                        scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
+            fcs.append(output)
+
+        predict = paddle.static.nn.fc(
+            input=fcs[-1],
+            size=2,
+            act="softmax",
+            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
+                scale=1 / math.sqrt(fcs[-1].shape[1]))))
+
+        self.predict = predict
+
+        auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
+                                             label=self.label_input,
+                                             num_thresholds=2**12,
+                                             slide_steps=20)
+        if is_infer:
+            self._infer_results["AUC"] = auc
+            self._infer_results["BATCH_AUC"] = batch_auc
+            return
+
+        self._metrics["AUC"] = auc
+        self._metrics["BATCH_AUC"] = batch_auc
+        cost = paddle.nn.functional.cross_entropy(
+            input=self.predict, label=self.label_input)
+        avg_cost = fluid.layers.reduce_mean(cost)
+        self._cost = avg_cost
+
+    def optimizer(self):
+        optimizer = paddle.optimizer.Adam(self.learning_rate, lazy_mode=True)
+        return optimizer
+
+    def infer_net(self):
+        pass
diff --git a/models/benchmark/simnet_bow/config.yaml b/models/benchmark/simnet_bow/config.yaml
new file mode 100644
index 000000000..f675df890
--- /dev/null
+++ b/models/benchmark/simnet_bow/config.yaml
@@ -0,0 +1,96 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+workspace: "./"  # NOTE(review): this file and word2vec/config.yaml in this patch are the same blob (f675df890) — confirm both are placeholders
+
+hyper_parameters:  # NOTE(review): same values as ctr_dnn/config.yaml, but the dense key is dense_input_dim here — confirm they fit this model
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  sparse_inputs_slots: 27
+  sparse_feature_number: 1000001
+  sparse_feature_dim: 10
+  dense_input_dim: 13
+  fc_sizes: [400, 400, 400]
+
+mode: [ps_cpu]  # names one of the runner entries below
+runner:
+- name: ps_cpu
+  class: cluster_train
+  epochs: 10
+  device: cpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1
+  save_checkpoint_path: "increment_dnn" 
+  print_interval: 1
+  phases: [phase1]  # phase names are defined under `phase:` further down
+
+- name: ps_gpu
+  class: cluster_train
+  epochs: 10
+  device: gpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1 
+  save_checkpoint_path: "increment_dnn" 
+  print_interval: 1
+  phases: [phase1]
+
+- name: ps_heter  # NOTE(review): every setting below equals ps_gpu — confirm nothing heter-specific is missing
+  class: cluster_train
+  epochs: 10
+  device: gpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1 
+  save_checkpoint_path: "increment_dnn" 
+  print_interval: 1
+  phases: [phase1]
+
+- name: local_infer
+  class: infer
+  epochs: 1
+  device: cpu
+  init_model_path: ""  # NOTE(review): empty — presumably needs a saved model path before inference; confirm
+  phases: [phase2]
+
+- name: local_train
+  class: train  # was infer (a copy of local_infer above); ctr_dnn/config.yaml uses train for this runner
+  epochs: 1
+  device: cpu
+  phases: [phase1]  # the training phase, as used by the cluster_train runners above
+
+phase:
+- name: phase1  # referenced from the runners' `phases` lists above
+  model: "{workspace}/model.py"  # NOTE(review): this patch adds no model.py under simnet_bow — confirm it exists
+  dataset_name: dataloader_train  # an entry under `dataset:` below
+  thread_num: 1
+
+- name: phase2
+  model: "{workspace}/model.py" 
+  dataset_name: dataset_infer  # an entry under `dataset:` below
+  thread_num: 1
+
+dataset:  # NOTE(review): no data_converter here (unlike ctr_dnn/config.yaml) and all three entries read the same path
+- name: dataloader_train 
+  batch_size: 2
+  type: DataLoader 
+  data_path: "{workspace}/data/sample_data/train"
+- name: dataset_train  # NOTE(review): no phase in this file refers to this dataset
+  batch_size: 2
+  type: QueueDataset 
+  data_path: "{workspace}/data/sample_data/train"
+- name: dataset_infer
+  batch_size: 2
+  type: DataLoader
+  data_path: "{workspace}/data/sample_data/train"
diff --git a/models/benchmark/word2vec/config.yaml b/models/benchmark/word2vec/config.yaml
new file mode 100644
index 000000000..f675df890
--- /dev/null
+++ b/models/benchmark/word2vec/config.yaml
@@ -0,0 +1,96 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+workspace: "./"  # NOTE(review): this file and simnet_bow/config.yaml in this patch are the same blob (f675df890) — confirm both are placeholders
+
+hyper_parameters:  # NOTE(review): same values as ctr_dnn/config.yaml, but the dense key is dense_input_dim here — confirm they fit this model
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  sparse_inputs_slots: 27
+  sparse_feature_number: 1000001
+  sparse_feature_dim: 10
+  dense_input_dim: 13
+  fc_sizes: [400, 400, 400]
+
+mode: [ps_cpu]  # names one of the runner entries below
+runner:
+- name: ps_cpu
+  class: cluster_train
+  epochs: 10
+  device: cpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1
+  save_checkpoint_path: "increment_dnn" 
+  print_interval: 1
+  phases: [phase1]  # phase names are defined under `phase:` further down
+
+- name: ps_gpu
+  class: cluster_train
+  epochs: 10
+  device: gpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1 
+  save_checkpoint_path: "increment_dnn" 
+  print_interval: 1
+  phases: [phase1]
+
+- name: ps_heter  # NOTE(review): every setting below equals ps_gpu — confirm nothing heter-specific is missing
+  class: cluster_train
+  epochs: 10
+  device: gpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1 
+  save_checkpoint_path: "increment_dnn" 
+  print_interval: 1
+  phases: [phase1]
+
+- name: local_infer
+  class: infer
+  epochs: 1
+  device: cpu
+  init_model_path: ""  # NOTE(review): empty — presumably needs a saved model path before inference; confirm
+  phases: [phase2]
+
+- name: local_train
+  class: train  # was infer (a copy of local_infer above); ctr_dnn/config.yaml uses train for this runner
+  epochs: 1
+  device: cpu
+  phases: [phase1]  # the training phase, as used by the cluster_train runners above
+
+phase:
+- name: phase1  # referenced from the runners' `phases` lists above
+  model: "{workspace}/model.py"  # NOTE(review): this patch adds no model.py under word2vec — confirm it exists
+  dataset_name: dataloader_train  # an entry under `dataset:` below
+  thread_num: 1
+
+- name: phase2
+  model: "{workspace}/model.py" 
+  dataset_name: dataset_infer  # an entry under `dataset:` below
+  thread_num: 1
+
+dataset:  # NOTE(review): no data_converter here (unlike ctr_dnn/config.yaml) and all three entries read the same path
+- name: dataloader_train 
+  batch_size: 2
+  type: DataLoader 
+  data_path: "{workspace}/data/sample_data/train"
+- name: dataset_train  # NOTE(review): no phase in this file refers to this dataset
+  batch_size: 2
+  type: QueueDataset 
+  data_path: "{workspace}/data/sample_data/train"
+- name: dataset_infer
+  batch_size: 2
+  type: DataLoader
+  data_path: "{workspace}/data/sample_data/train"