Skip to content

Conversation

MARD1NO
Copy link
Contributor

@MARD1NO MARD1NO commented Jul 26, 2022


fp32: 
================ Test Evaluation ================
Rank[0], Epoch 7, Step 75000, AUC 0.802074, LogLoss 0.125882, Eval_time 19.48 s, Metrics_time 6.48 s, Eval_samples 89140000, GPU_Memory 15074 MiB, Host_Memory 10726 MiB, 2022-07-27 11:03:38

对应HugeCTR脚本:

import hugectr
from mpi4py import MPI

# Dataset root; the train/ and test/ file lists are resolved relative to it.
data_dir = "/RAID0/liujuncheng/criteo1t_parquet_40M_long"

# Print the training file-list path.
print(f"{data_dir}/train/_file_list.txt")

# Solver configuration: batch sizes, learning-rate schedule, numeric
# precision and GPU layout for the run.
solver = hugectr.CreateSolver(
    # --- batch sizes ---
    batchsize=55296,  # 55296 or 69120
    batchsize_eval=55296,  # real value
    # --- learning-rate schedule ---
    lr=0.0025,  # value aligned with the counterpart configuration (TODO confirm which)
    warmup_steps=2750,
    decay_start=40000,
    decay_steps=40000,
    decay_power=2.0,
    end_lr=1e-6,
    # --- precision ---
    # Mixed precision is left disabled for this run; the alternative that was
    # tried is `use_mixed_precision = True, scaler = 1024`.
    enable_tf32_compute=True,
    # --- devices / data ---
    # NOTE(review): the original note here said "8 gpus", but only four
    # device ids (0-3) are listed -- confirm which one is intended.
    vvgpu=[[0, 1, 2, 3]],
    repeat_dataset=True,
    use_algorithm_search=False,
    i64_input_key=True,  # use False for int32 keys
)

# Parquet data reader. Training input is passed as a list of file-list paths,
# evaluation input as a single file-list path; both live under data_dir.
reader = hugectr.DataReaderParams(
    data_reader_type=hugectr.DataReaderType_t.Parquet,
    source=[f"{data_dir}/train/_file_list.txt"],
    eval_source=f"{data_dir}/test/_file_list.txt",
    # One size per slot, 39 entries in total ("real value" per the author).
    slot_size_array=[
        62774, 8001, 2901, 74279, 7513, 3369, 1392, 21627, 7919, 21,
        276, 1231236, 9643, 39873199, 38853, 17240, 7421, 20263, 3, 7103,
        1540, 63, 38457188, 2929249, 400771, 10, 2209, 11910, 152, 4,
        976, 14, 39976779, 25414584, 39639858, 583095, 12929, 108, 36,
    ],
    check_type=hugectr.Check_t.Non,
)
# Adam optimizer, shared by the model and by the sparse embedding layer.
optimizer = hugectr.CreateOptimizer(
    optimizer_type=hugectr.Optimizer_t.Adam,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-8,
    update_type=hugectr.Update_t.Local,  # may affect performance
)

# Dropout probability applied after every hidden layer of the deep branch.
dropout_rate = 0.05

# Network layout:
#   sparse input -> embedding -> reshape -+-> MultiCross (4 layers) ----------+
#                                         +-> 5 x (FC 1000 -> ReLU -> Dropout) +-> concat -> FC 1 -> BCE loss
model = hugectr.Model(solver, reader, optimizer)

# Input: one label, no dense features (dense_dim = 0), one sparse input
# named "data1".
# NOTE(review): the original author left a question about what `2` and
# `False` mean. Per the HugeCTR Python API the positional arguments of
# DataReaderSparseParam are (top_name, nnz_per_slot, is_fixed_length,
# slot_num) -- confirm against the installed HugeCTR version.
model.add(
    hugectr.Input(
        label_dim=1,
        label_name="labels",
        dense_dim=0,
        dense_name="dense",
        data_reader_sparse_param_array=[
            hugectr.DataReaderSparseParam("data1", 2, False, 39),
        ],
    )
)

# Embedding over the 39 slots, 16-dimensional vectors combined by sum.
model.add(
    hugectr.SparseEmbedding(
        # The author notes there are three embedding types to choose from.
        embedding_type=hugectr.Embedding_t.LocalizedSlotSparseEmbeddingHash,
        workspace_size_per_gpu_in_mb=15000,  # chosen to be large enough
        embedding_vec_size=16,
        combiner="sum",
        sparse_embedding_name="sparse_embedding1",
        bottom_name="data1",
        slot_size_array=[
            62774, 8001, 2901, 74279, 7513, 3369, 1392, 21627, 7919, 21,
            276, 1231236, 9643, 39873199, 38853, 17240, 7421, 20263, 3, 7103,
            1540, 63, 38457188, 2929249, 400771, 10, 2209, 11910, 152, 4,
            976, 14, 39976779, 25414584, 39639858, 583095, 12929, 108, 36,
        ],
        optimizer=optimizer,
    )
)

# Reshape the embedding output with leading_dim = 16 * 39
# (embedding_vec_size times the number of slots).
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Reshape,
        bottom_names=["sparse_embedding1"],
        top_names=["reshape_sparse_embedding"],
        leading_dim=16 * 39,
    )
)

# Cross branch.
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.MultiCross,
        bottom_names=["reshape_sparse_embedding"],
        top_names=["multicross1"],
        num_layers=4,
    )
)

# Deep branch: layers 1..5, each InnerProduct(1000) -> ReLU -> Dropout.
# Layer 1 reads the reshaped embedding; every later layer reads the previous
# layer's dropout output. Tensor names are fc<i>, relu<i>, dropout<i>.
mlp_input = "reshape_sparse_embedding"
for layer_idx in range(1, 6):
    fc_name = f"fc{layer_idx}"
    relu_name = f"relu{layer_idx}"
    dropout_name = f"dropout{layer_idx}"

    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=[mlp_input],
            top_names=[fc_name],
            num_output=1000,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.ReLU,
            bottom_names=[fc_name],
            top_names=[relu_name],
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.Dropout,
            bottom_names=[relu_name],
            top_names=[dropout_name],
            dropout_rate=dropout_rate,
        )
    )
    mlp_input = dropout_name

# Join the deep and cross branches, project to a single logit and attach the
# binary cross-entropy loss against "labels".
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Concat,
        bottom_names=["dropout5", "multicross1"],
        top_names=["concat2"],
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.InnerProduct,
        bottom_names=["concat2"],
        top_names=["fc6"],
        num_output=1,
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
        bottom_names=["fc6", "labels"],
        top_names=["loss"],
    )
)
# Finalize the graph and print its summary before training.
model.compile()
model.summary()

# Full training run. A shorter configuration used previously was
# max_iter = 2300, display = 200, eval_interval = 1000 (same snapshot
# settings).
model.fit(
    max_iter=75000,
    display=1000,
    eval_interval=4999,
    snapshot=1000000,
    snapshot_prefix="dcn",
)

[HCTR][02:21:10.156][INFO][RK0][main]: Evaluation, AUC: 0.804863
[HCTR][02:21:10.156][INFO][RK0][main]: Eval Time for 100 iters: 3.16018s

@MARD1NO MARD1NO changed the title from "DCN 40M" to "DCN 40M fp32" Jul 27, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant