Skip to content

Commit

Permalink
add ctr backend
Browse files Browse the repository at this point in the history
  • Loading branch information
MrChengmo committed Nov 6, 2020
1 parent 741110d commit 529d4cc
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 9 deletions.
42 changes: 42 additions & 0 deletions benchmark/ctr_dnn/backend.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# PaddleCloud submission config for the CTR-DNN benchmark job.
backend: "PaddleCloud"
cluster_type: mpi # "k8s" is also available

config:
paddle_version: "1.8.2"

# hdfs/afs filesystem connection settings
fs_name: "afs://yinglong.afs.baidu.com:9902"
fs_ugi: "paddle,paddle"

# Remote output directory: for a full address like afs:/user/your/path/,
# write only the path part, e.g. /user/your/path
output_path: "/user/paddle/chengmo/heter_ps/ctr_dnn/output"

# for mpi
# Remote training-data location: for afs:/user/your/path/ write /user/your/path
train_data_path: "/user/paddle/benchmark/ctr/train_data_paddle"
# test_data_path: ""
# thirdparty_path: ""

submit:
# PaddleCloud personal credentials: access key (AK) and secret key (SK)
# NOTE(review): real-looking AK/SK values are committed here. Credentials
# must not live in version control — rotate these keys and inject them at
# submit time (environment variable / secret store) instead.
ak: "3acae1baa7505ca1b074e7c93b81e2d2"
sk: "9a4262b4de7557c086e4ddfa778ae5a3"

# Job scheduling priority; defaults to "high"
priority: "high"

# Job name shown on PaddleCloud
job_name: "PaddleRec_benchmark_CTR"

# Resource group the training job is billed/scheduled under
group: "paddle"

# Command executed on each node to launch training
start_cmd: "python -m paddlerec.run -m ./config.yaml"

# Local files to upload into each node's working directory
files: ./*.py ./*.yaml

# for mpi ps-cpu
# Number of nodes when running in mpi parameter-server mode
nodes: 2
6 changes: 3 additions & 3 deletions benchmark/ctr_dnn/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ hyper_parameters:
dense_feature_dim: 13
fc_sizes: [400, 400, 400]

mode: [collective]
mode: [ps_cpu]
runner:
- name: ps_cpu
class: local_cluster_train
class: cluster_train
epochs: 1
device: cpu
fleet_mode: ps
Expand Down Expand Up @@ -72,7 +72,7 @@ phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: dataloader_train
thread_num: 1
thread_num: 2

- name: phase2
model: "{workspace}/model.py"
Expand Down
4 changes: 4 additions & 0 deletions benchmark/ctr_dnn/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ def embedding_layer(input):
emb = paddle.reshape(emb, [-1, self.sparse_feature_dim])
return emb

slr = fluid.global_scope().find_var("var")
data = np.array(slr.get_selected_rows().get_tensor())
rows = np.array(slr.get_selected_rows().rows())

sparse_embed_seq = list(map(embedding_layer, self.sparse_input))
concated = paddle.concat(sparse_embed_seq + [self.dense_input], axis=1)

Expand Down
12 changes: 6 additions & 6 deletions core/engine/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,19 @@ def __init_impl__(self):

def start_worker_procs(self):
if (envs.get_runtime_environ("fleet_mode") == "COLLECTIVE"):
#trainer_ports = os.getenv("TRAINER_PORTS", None).split(",")
#trainer_ports = os.getenv("TRAINER_PORTS", None).split(",")
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
if cuda_visible_devices is None or cuda_visible_devices == "":
selected_gpus = range(int(os.getenv("TRAINER_GPU_CARD_COUNT")))
else:
# change selected_gpus into relative values
# e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
# therefore selected_gpus=0,1,2,3
# change selected_gpus into relative values
# e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
# therefore selected_gpus=0,1,2,3
cuda_visible_devices_list = cuda_visible_devices.split(',')
for x in range(int(os.getenv("TRAINER_GPU_CARD_COUNT"))):
assert x in cuda_visible_devices_list, "Can't find "\
"your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
% (x, cuda_visible_devices)
"your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\
% (x, cuda_visible_devices)
selected_gpus = [cuda_visible_devices_list.index(x)]
print("selected_gpus:{}".format(selected_gpus))

Expand Down

0 comments on commit 529d4cc

Please sign in to comment.