ctr benchmark
MrChengmo committed Nov 1, 2020
1 parent f8a9b21 commit 3432645
Showing 7 changed files with 485 additions and 0 deletions.
Empty file.
99 changes: 99 additions & 0 deletions models/benchmark/ctr_dnn/config.yaml
@@ -0,0 +1,99 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

workspace: "./"

hyper_parameters:
  optimizer:
    class: Adam
    learning_rate: 0.001
    strategy: async
  sparse_inputs_slots: 27
  sparse_feature_number: 1000001
  sparse_feature_dim: 10
  dense_feature_dim: 13
  fc_sizes: [400, 400, 400]

mode: [local_train]
runner:
  - name: ps_cpu
    class: cluster_train
    epochs: 10
    device: cpu
    fleet_mode: ps
    save_checkpoint_interval: 1
    save_checkpoint_path: "increment_dnn"
    print_interval: 1
    phases: [phase1]

  - name: ps_gpu
    class: cluster_train
    epochs: 10
    device: gpu
    fleet_mode: ps
    save_checkpoint_interval: 1
    save_checkpoint_path: "increment_dnn"
    print_interval: 1
    phases: [phase1]

  - name: ps_heter
    class: cluster_train
    epochs: 10
    device: gpu
    fleet_mode: ps
    save_checkpoint_interval: 1
    save_checkpoint_path: "increment_dnn"
    print_interval: 1
    phases: [phase1]

  - name: local_infer
    class: infer
    epochs: 1
    device: cpu
    init_model_path: ""
    phases: [phase2]

  - name: local_train
    class: train
    epochs: 1
    device: cpu
    phases: [phase2]

phase:
  - name: phase1
    model: "{workspace}/model.py"
    dataset_name: dataloader_train
    thread_num: 1

  - name: phase2
    model: "{workspace}/model.py"
    dataset_name: dataset_infer
    thread_num: 1

dataset:
  - name: dataloader_train
    batch_size: 2
    type: DataLoader
    data_converter: "{workspace}/dataset_generator.py"
    data_path: "{workspace}/train_data"
  - name: dataset_train
    batch_size: 2
    type: QueueDataset
    data_converter: "{workspace}/dataset_generator.py"
    data_path: "{workspace}/train_data"
  - name: dataset_infer
    batch_size: 2
    type: DataLoader
    data_converter: "{workspace}/dataset_generator.py"
    data_path: "{workspace}/test_data"
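
The config wires things together by name: each entry in `mode` selects a runner, each runner lists the phases it executes, and each phase binds a model file to a dataset. A minimal sketch of that resolution (not PaddleRec's actual loader; assumes PyYAML is installed and the file above is saved as config.yaml):

    # Sketch: resolve mode -> runner -> phases -> dataset from the YAML above.
    import yaml

    with open("config.yaml") as f:
        conf = yaml.safe_load(f)

    runners = {r["name"]: r for r in conf["runner"]}
    phases = {p["name"]: p for p in conf["phase"]}
    datasets = {d["name"]: d for d in conf["dataset"]}

    for mode in conf["mode"]:                  # e.g. ["local_train"]
        runner = runners[mode]
        print(f"runner={mode} class={runner['class']} device={runner['device']}")
        for phase_name in runner["phases"]:    # e.g. ["phase2"]
            phase = phases[phase_name]
            dataset = datasets[phase["dataset_name"]]
            print(f"  phase={phase_name} model={phase['model']} "
                  f"type={dataset['type']} batch_size={dataset['batch_size']}")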
64 changes: 64 additions & 0 deletions models/benchmark/ctr_dnn/dataset_generator.py
@@ -0,0 +1,64 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlerec.core.reader import ReaderBase


class Reader(ReaderBase):
    """
    DacDataset: inherits MultiSlotDataGenerator and implements data reading.
    Help document: http://wiki.baidu.com/pages/viewpage.action?pageId=728820675
    """

    def init(self):
        # Per-feature min values and value ranges used to min-max normalize
        # the 13 continuous (dense) features of the Criteo dataset.
        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.cont_max_ = [
            20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
        ]
        self.cont_diff_ = [
            20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50
        ]
        # Categorical ids are hashed into [0, hash_dim_).
        self.hash_dim_ = 1000001
        # Column 0 is the label; columns 1-13 are continuous features,
        # columns 14-39 are categorical features.
        self.continuous_range_ = range(1, 14)
        self.categorical_range_ = range(14, 40)

    def generate_sample(self, line):
        """
        Read the data line by line and process it into a dictionary.
        """

        def reader():
            """
            This function needs to be implemented by the user, based on the
            data format.
            """
            features = line.rstrip('\n').split('\t')
            dense_feature = []
            sparse_feature = []
            for idx in self.continuous_range_:
                if features[idx] == "":
                    dense_feature.append(0.0)
                else:
                    # Min-max normalize each continuous feature.
                    dense_feature.append(
                        (float(features[idx]) - self.cont_min_[idx - 1]) /
                        self.cont_diff_[idx - 1])
            for idx in self.categorical_range_:
                # Hash "<slot id> + <raw value>" into the embedding range.
                sparse_feature.append(
                    [hash(str(idx) + features[idx]) % self.hash_dim_])
            label = [int(features[0])]
            feature_name = ["dense_feature"]
            for idx in self.categorical_range_:
                feature_name.append("C" + str(idx - 13))
            feature_name.append("label")
            yield zip(feature_name,
                      [dense_feature] + sparse_feature + [label])

        return reader
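
For concreteness, a standalone sketch of the transformation generate_sample applies to one Criteo-formatted record (label, 13 continuous values, 26 categorical values, tab-separated); the input line here is synthetic:

    # Sketch: replicate the Reader's per-line processing on a fake record.
    line = "\t".join(["1"] + ["5"] * 13 + ["a1b2"] * 26)  # hypothetical sample

    cont_min = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    cont_diff = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
    hash_dim = 1000001

    features = line.rstrip("\n").split("\t")
    dense = [(float(features[i]) - cont_min[i - 1]) / cont_diff[i - 1]
             for i in range(1, 14)]            # 13 normalized floats
    sparse = [[hash(str(i) + features[i]) % hash_dim]
              for i in range(14, 40)]          # 26 single-id slots
    sample = dict([("dense_feature", dense)] +
                  [("C" + str(i - 13), s)
                   for i, s in zip(range(14, 40), sparse)] +
                  [("label", [int(features[0])])])
    print(sample["dense_feature"][:2], sample["C1"], sample["label"])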
13 changes: 13 additions & 0 deletions models/benchmark/ctr_dnn/download_data.sh
@@ -0,0 +1,13 @@
wget --no-check-certificate https://fleet.bj.bcebos.com/ctr_data.tar.gz
tar -zxvf ctr_data.tar.gz
mv ./raw_data ./train_data_full
mkdir train_data && cd train_data
cp ../train_data_full/part-0 ../train_data_full/part-1 ./ && cd ..
mv ./test_data ./test_data_full
mkdir test_data && cd test_data
cp ../test_data_full/part-220 ./ && cd ..
echo "Complete data download."
echo "Full Train data stored in ./train_data_full "
echo "Full Test data stored in ./test_data_full "
echo "Rapid Verification train data stored in ./train_data "
echo "Rapid Verification test data stored in ./test_data "
117 changes: 117 additions & 0 deletions models/benchmark/ctr_dnn/model.py
@@ -0,0 +1,117 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import paddle
import paddle.fluid as fluid

from paddlerec.core.utils import envs
from paddlerec.core.model import ModelBase


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def _init_hyper_parameters(self):
        self.dense_feature_dim = envs.get_global_env(
            "hyper_parameters.dense_feature_dim")
        self.sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number")
        self.sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim")
        self.learning_rate = envs.get_global_env(
            "hyper_parameters.optimizer.learning_rate")
        # NOTE: embedding_layer below reads self.is_distributed, which is not
        # in config.yaml; default it to False (distributed embedding is
        # typically only needed for PSLIB-style training).
        self.is_distributed = False

    def input_data(self, is_infer=False, **kwargs):
        dense_input = paddle.data(
            name="dense_input",
            shape=[self.dense_feature_dim],
            dtype="float32")

        sparse_input_ids = [
            paddle.data(
                name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
            for i in range(1, 27)
        ]

        # auc and cross_entropy below expect an int64 label.
        label = paddle.data(name="label", shape=[1], dtype="int64")

        inputs = [dense_input] + sparse_input_ids + [label]
        return inputs

    def net(self, input, is_infer=False):
        self.dense_input = input[0]
        self.sparse_inputs = input[1:-1]
        self.label_input = input[-1]

        def embedding_layer(input):
            emb = paddle.static.nn.embedding(
                input=input,
                is_sparse=True,
                is_distributed=self.is_distributed,
                size=[self.sparse_feature_number, self.sparse_feature_dim],
                param_attr=paddle.ParamAttr(
                    name="SparseFeatFactors",
                    initializer=fluid.initializer.Uniform()))
            # Sum-pool the variable-length embedding sequence of each slot.
            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
            return emb_sum

        sparse_embed_seq = list(map(embedding_layer, self.sparse_inputs))
        concated = paddle.concat(
            sparse_embed_seq + [self.dense_input], axis=1)

        fcs = [concated]
        hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")

        for size in hidden_layers:
            output = paddle.static.nn.fc(
                input=fcs[-1],
                size=size,
                act='relu',
                param_attr=paddle.ParamAttr(
                    initializer=fluid.initializer.Normal(
                        scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
            fcs.append(output)

        predict = paddle.static.nn.fc(
            input=fcs[-1],
            size=2,
            act="softmax",
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                scale=1 / math.sqrt(fcs[-1].shape[1]))))

        self.predict = predict

        auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
                                             label=self.label_input,
                                             num_thresholds=2**12,
                                             slide_steps=20)
        if is_infer:
            self._infer_results["AUC"] = auc
            self._infer_results["BATCH_AUC"] = batch_auc
            return

        self._metrics["AUC"] = auc
        self._metrics["BATCH_AUC"] = batch_auc
        # self.predict already went through a softmax, so use the
        # probability-based cross entropy rather than a softmax+CE fusion.
        cost = fluid.layers.cross_entropy(
            input=self.predict, label=self.label_input)
        avg_cost = fluid.layers.reduce_mean(cost)
        self._cost = avg_cost

    def optimizer(self):
        optimizer = paddle.optimizer.Adam(self.learning_rate, lazy_mode=True)
        return optimizer

    def infer_net(self):
        pass
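
To make the tower's dimensions concrete, a numpy-only sketch (random weights, illustrative shapes only) of the forward pass net() builds: 26 sum-pooled 10-dim slot embeddings plus 13 dense features give a 273-dim input, passed through FC sizes [400, 400, 400] into a 2-way softmax:

    import numpy as np

    rng = np.random.default_rng(0)
    batch = 2
    emb = rng.normal(size=(batch, 26 * 10))   # concatenated slot embeddings
    dense = rng.normal(size=(batch, 13))      # normalized dense features
    x = np.concatenate([emb, dense], axis=1)  # (2, 273)

    for size in [400, 400, 400]:
        w = rng.normal(scale=1.0 / np.sqrt(x.shape[1]), size=(x.shape[1], size))
        x = np.maximum(x @ w, 0.0)            # fc + relu

    logits = x @ rng.normal(scale=1.0 / np.sqrt(x.shape[1]),
                            size=(x.shape[1], 2))
    predict = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    print(x.shape, predict.shape)             # (2, 400) (2, 2)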
96 changes: 96 additions & 0 deletions models/benchmark/simnet_bow/config.yaml
@@ -0,0 +1,96 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

workspace: "./"

hyper_parameters:
  optimizer:
    class: Adam
    learning_rate: 0.001
    strategy: async
  sparse_inputs_slots: 27
  sparse_feature_number: 1000001
  sparse_feature_dim: 10
  dense_input_dim: 13
  fc_sizes: [400, 400, 400]

mode: [ps_cpu]
runner:
  - name: ps_cpu
    class: cluster_train
    epochs: 10
    device: cpu
    fleet_mode: ps
    save_checkpoint_interval: 1
    save_checkpoint_path: "increment_dnn"
    print_interval: 1
    phases: [phase1]

  - name: ps_gpu
    class: cluster_train
    epochs: 10
    device: gpu
    fleet_mode: ps
    save_checkpoint_interval: 1
    save_checkpoint_path: "increment_dnn"
    print_interval: 1
    phases: [phase1]

  - name: ps_heter
    class: cluster_train
    epochs: 10
    device: gpu
    fleet_mode: ps
    save_checkpoint_interval: 1
    save_checkpoint_path: "increment_dnn"
    print_interval: 1
    phases: [phase1]

  - name: local_infer
    class: infer
    epochs: 1
    device: cpu
    init_model_path: ""
    phases: [phase2]

  - name: local_train
    class: train
    epochs: 1
    device: cpu
    phases: [phase2]

phase:
  - name: phase1
    model: "{workspace}/model.py"
    dataset_name: dataloader_train
    thread_num: 1

  - name: phase2
    model: "{workspace}/model.py"
    dataset_name: dataset_infer
    thread_num: 1

dataset:
  - name: dataloader_train
    batch_size: 2
    type: DataLoader
    data_path: "{workspace}/data/sample_data/train"
  - name: dataset_train
    batch_size: 2
    type: QueueDataset
    data_path: "{workspace}/data/sample_data/train"
  - name: dataset_infer
    batch_size: 2
    type: DataLoader
    data_path: "{workspace}/data/sample_data/train"