Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#423 from wangzhen38/wz38-stuff
Browse files Browse the repository at this point in the history
【Upgrade】xdeepfm
  • Loading branch information
frankwhzhang authored Apr 26, 2021
2 parents 5c53d1b + 851cb3e commit 9b286ef
Show file tree
Hide file tree
Showing 15 changed files with 860 additions and 4 deletions.
4 changes: 2 additions & 2 deletions models/rank/ffm/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ runner:
use_gpu: False
use_auc: True
train_batch_size: 2
epochs: 3
epochs: 1
print_interval: 2
#model_init_path: "output_model/0" # init model
model_save_path: "output_model_ffm"
Expand All @@ -27,7 +27,7 @@ runner:
infer_batch_size: 5
infer_load_path: "output_model_ffm"
infer_start_epoch: 0
infer_end_epoch: 3
infer_end_epoch: 1

# hyper parameters of user-defined network
hyper_parameters:
Expand Down
4 changes: 2 additions & 2 deletions models/rank/ffm/config_bigdata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ runner:
use_gpu: True
use_auc: True
train_batch_size: 4096
epochs: 10
epochs: 1
print_interval: 10
#model_init_path: "output_model/0" # init model
model_save_path: "output_model_all_ffm"
Expand All @@ -26,7 +26,7 @@ runner:
infer_batch_size: 512
infer_load_path: "output_model_all_ffm"
infer_start_epoch: 0
infer_end_epoch: 4
infer_end_epoch: 1

hyper_parameters:
# optimizer config
Expand Down
13 changes: 13 additions & 0 deletions models/rank/xdeepfm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
49 changes: 49 additions & 0 deletions models/rank/xdeepfm/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Runner settings for the small bundled sample data set.
runner:
  train_data_dir: "data/sample_data/train"
  train_reader_path: "criteo_reader" # importlib format
  use_gpu: True
  use_auc: True
  train_batch_size: 5
  epochs: 1
  print_interval: 2
  #model_init_path: "output_model/0" # init model
  model_save_path: "output_model_deepfm"
  # The sample run evaluates on the same directory it trains on.
  test_data_dir: "data/sample_data/train"
  infer_reader_path: "criteo_reader" # importlib format
  infer_batch_size: 5
  # Must point at the directory written via model_save_path above.
  infer_load_path: "output_model_deepfm"
  infer_start_epoch: 0
  # Keep in step with `epochs`: the inference range should not extend past
  # the epochs that were actually trained (presumably one saved model per
  # epoch in [infer_start_epoch, infer_end_epoch) -- confirm against the runner).
  infer_end_epoch: 1


# hyper parameters of user-defined network
hyper_parameters:
  # optimizer config
  optimizer:
    class: Adam
    learning_rate: 0.001
    strategy: async
  # user-defined <key, value> pairs
  # criteo_reader.py lists 27 sparse slot names ("click" plus 1-26) and one
  # dense slot of width 13; the next values agree with that layout.
  sparse_inputs_slots: 27
  sparse_feature_number: 1000001
  sparse_feature_dim: 9
  dense_input_dim: 13
  fc_sizes: [512, 256, 128, 32]
  layer_sizes_dnn: [512, 256, 128]
  layer_sizes_cin: [128, 32]
  distributed_embedding: 0
49 changes: 49 additions & 0 deletions models/rank/xdeepfm/config_bigdata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# global settings

# Runner settings for the full Criteo data set (large batches, GPU enabled).
runner:
# Data paths are relative and climb three directory levels to reach datasets/criteo.
train_data_dir: "../../../datasets/criteo/slot_train_data_full"
train_reader_path: "criteo_reader" # importlib format
use_gpu: True
use_auc: False
train_batch_size: 4096
epochs: 1
print_interval: 10
#model_init_path: "output_model/0" # init model
# NOTE(review): the directory name says "deepfm" although this config lives
# under models/rank/xdeepfm -- confirm the name is intended.
model_save_path: "output_model_bigdata_deepfm_dy"
test_data_dir: "../../../datasets/criteo/slot_test_data_full"
infer_reader_path: "criteo_reader" # importlib format
infer_batch_size: 4096
# Same directory as model_save_path, so inference reads what training wrote.
infer_load_path: "output_model_bigdata_deepfm_dy"
# The inference epoch range matches `epochs: 1` above.
infer_start_epoch: 0
infer_end_epoch: 1

# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adam
learning_rate: 0.001
strategy: async
# user-defined <key, value> pairs
# criteo_reader.py lists 27 sparse slot names ("click" plus 1-26) and one
# dense slot of width 13; the slot count and dense dim below agree with it.
sparse_inputs_slots: 27
sparse_feature_number: 1000001
sparse_feature_dim: 9
dense_input_dim: 13
# NOTE(review): fc_sizes and layer_sizes_dnn both look like layer-width lists;
# which of them the xdeepfm network reads is not visible here -- confirm.
fc_sizes: [400, 400, 400]
distributed_embedding: 0
layer_sizes_dnn: [400, 400]
layer_sizes_cin: [200, 200, 200]
81 changes: 81 additions & 0 deletions models/rank/xdeepfm/criteo_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import numpy as np

from paddle.io import IterableDataset


class RecDataset(IterableDataset):
    """Iterable dataset that parses slot-formatted text files line by line.

    Each non-blank line is a space-separated list of ``slot:value`` tokens.
    Tokens whose slot name is not one of ``self.slots`` are ignored. For every
    line the dataset yields a list of numpy arrays, one per slot in
    ``self.slots`` order: ``int64`` arrays for the sparse slots followed by a
    single ``float32`` array for the dense slot (the last slot).
    """

    def __init__(self, file_list, config):
        """Store the input files and build the slot tables.

        Args:
            file_list: iterable of paths of the text files to read.
            config: accepted for interface compatibility; not read here.
        """
        super(RecDataset, self).__init__()
        self.file_list = file_list
        self.init()

    def init(self):
        """Define the slot layout and the padding value used for absent slots."""
        padding = 0
        sparse_slots = "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
        self.sparse_slots = sparse_slots.strip().split(" ")
        self.dense_slots = ["dense_feature"]
        self.dense_slots_shape = [13]
        # Output order: all sparse slots first, the dense slot last.
        self.slots = self.sparse_slots + self.dense_slots
        self.slot2index = {slot: idx for idx, slot in enumerate(self.slots)}
        # Kept for backward compatibility; __iter__ tracks visited slots in a
        # per-line local set, so these flags stay False.
        self.visit = {slot: False for slot in self.slots}
        self.padding = padding

    def __iter__(self):
        """Yield one list of per-slot numpy arrays for each non-blank line.

        Slots absent from a line are padded with ``self.padding``: a single
        value for a sparse slot, and as many values as the slot's entry in
        ``self.dense_slots_shape`` for a dense slot.

        Raises:
            ValueError: if a value of a known slot cannot be parsed as a number.
            IndexError: if a token of a known slot has no ``:value`` part.
        """
        self.data = []
        sparse_names = set(self.sparse_slots)
        # Width of each dense slot, keyed by slot name. The position of a dense
        # slot in self.slots is NOT a valid index into dense_slots_shape, so the
        # width has to be looked up by name.
        dense_width = dict(zip(self.dense_slots, self.dense_slots_shape))
        for file in self.file_list:
            with open(file, "r") as rf:
                for raw_line in rf:
                    stripped = raw_line.strip()
                    if not stripped:
                        # A blank line carries no sample; skip it instead of
                        # emitting an all-padding record.
                        continue
                    output = [(slot, []) for slot in self.slots]
                    seen = set()
                    for token in stripped.split(" "):
                        slot_feasign = token.split(":")
                        slot = slot_feasign[0]
                        index = self.slot2index.get(slot)
                        if index is None:
                            # Unknown slot (or empty token from repeated spaces).
                            continue
                        if slot in sparse_names:
                            feasign = int(slot_feasign[1])
                        else:
                            feasign = float(slot_feasign[1])
                        output[index][1].append(feasign)
                        seen.add(slot)
                    # Pad every slot that did not occur on this line.
                    for slot in self.slots:
                        if slot in seen:
                            continue
                        width = dense_width.get(slot, 1)
                        output[self.slot2index[slot]][1].extend(
                            [self.padding] * width)
                    # sparse
                    output_list = [
                        np.array(values).astype('int64')
                        for _, values in output[:-1]
                    ]
                    # dense
                    output_list.append(
                        np.array(output[-1][1]).astype("float32"))
                    # list
                    yield output_list
Loading

0 comments on commit 9b286ef

Please sign in to comment.