Merge pull request PaddlePaddle#423 from wangzhen38/wz38-stuff

【Upgrade】xdeepfm
sqsxwj520 · Apr 26, 2021 · 9b286ef · 9b286ef
2 parents 5c53d1b + 851cb3e
commit 9b286ef
Show file tree

Hide file tree

Showing 15 changed files with 860 additions and 4 deletions.
diff --git a/models/rank/ffm/config.yaml b/models/rank/ffm/config.yaml
@@ -18,7 +18,7 @@ runner:
   use_gpu: False 
   use_auc: True
   train_batch_size: 2
-  epochs: 3
+  epochs: 1
   print_interval: 2
   #model_init_path: "output_model/0" # init model
   model_save_path: "output_model_ffm"
@@ -27,7 +27,7 @@ runner:
   infer_batch_size: 5
   infer_load_path: "output_model_ffm"
   infer_start_epoch: 0
-  infer_end_epoch: 3
+  infer_end_epoch: 1
 
 # hyper parameters of user-defined network
 hyper_parameters:

diff --git a/models/rank/ffm/config_bigdata.yaml b/models/rank/ffm/config_bigdata.yaml
@@ -17,7 +17,7 @@ runner:
   use_gpu: True
   use_auc: True
   train_batch_size: 4096
-  epochs: 10
+  epochs: 1
   print_interval: 10
   #model_init_path: "output_model/0" # init model
   model_save_path: "output_model_all_ffm"
@@ -26,7 +26,7 @@ runner:
   infer_batch_size: 512
   infer_load_path: "output_model_all_ffm"
   infer_start_epoch: 0
-  infer_end_epoch: 4
+  infer_end_epoch: 1
 
 hyper_parameters:
   # optimizer config

diff --git a/models/rank/xdeepfm/__init__.py b/models/rank/xdeepfm/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/rank/xdeepfm/config.yaml b/models/rank/xdeepfm/config.yaml
@@ -0,0 +1,49 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Run configuration for the bundled sample data: data locations, batch sizes
+# and checkpoint paths used for training and inference.
+runner:
+  train_data_dir: "data/sample_data/train"
+  train_reader_path: "criteo_reader" # importlib format
+  use_gpu: True
+  use_auc: True
+  train_batch_size: 5
+  epochs: 1
+  print_interval: 2
+  #model_init_path: "output_model/0" # init model
+  # NOTE(review): the directory name says "deepfm" although this config lives
+  # under xdeepfm -- confirm the name is intentional.
+  model_save_path: "output_model_deepfm"
+  # The sample config evaluates on the same directory it trains on.
+  test_data_dir: "data/sample_data/train"
+  infer_reader_path: "criteo_reader" # importlib format
+  infer_batch_size: 5
+  # Must point at model_save_path so inference finds the saved checkpoints.
+  infer_load_path: "output_model_deepfm"
+  infer_start_epoch: 0
+  # Only one epoch is trained (epochs: 1), so the inference range is kept
+  # aligned with it, as in the other configs touched by this change.
+  infer_end_epoch: 1
+
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  # criteo_reader.py emits 27 sparse slots: "click" plus slots 1..26.
+  sparse_inputs_slots: 27
+  sparse_feature_number: 1000001
+  sparse_feature_dim: 9
+  # Matches dense_slots_shape ([13]) in criteo_reader.py.
+  dense_input_dim: 13
+  fc_sizes: [512, 256, 128, 32]
+  layer_sizes_dnn: [512, 256, 128]
+  layer_sizes_cin: [128, 32]
+  distributed_embedding: 0
diff --git a/models/rank/xdeepfm/config_bigdata.yaml b/models/rank/xdeepfm/config_bigdata.yaml
@@ -0,0 +1,49 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# global settings 
+
+# Run configuration for the full Criteo dataset: data locations, batch sizes
+# and checkpoint paths used for training and inference.
+runner:
+  train_data_dir: "../../../datasets/criteo/slot_train_data_full"
+  train_reader_path: "criteo_reader" # importlib format
+  use_gpu: True
+  use_auc: False
+  train_batch_size: 4096
+  epochs: 1
+  print_interval: 10
+  #model_init_path: "output_model/0" # init model
+  # NOTE(review): the directory name says "deepfm" although this config lives
+  # under xdeepfm -- confirm the name is intentional.
+  model_save_path: "output_model_bigdata_deepfm_dy"
+  test_data_dir: "../../../datasets/criteo/slot_test_data_full"
+  infer_reader_path: "criteo_reader" # importlib format
+  infer_batch_size: 4096
+  # Same directory as model_save_path, so inference reads what training saved.
+  infer_load_path: "output_model_bigdata_deepfm_dy"
+  infer_start_epoch: 0
+  # Kept equal to epochs (1): only one epoch of checkpoints is produced.
+  infer_end_epoch: 1
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  # criteo_reader.py emits 27 sparse slots: "click" plus slots 1..26.
+  sparse_inputs_slots: 27
+  sparse_feature_number: 1000001
+  sparse_feature_dim: 9
+  # Matches dense_slots_shape ([13]) in criteo_reader.py.
+  dense_input_dim: 13
+  fc_sizes: [400, 400, 400]
+  distributed_embedding: 0
+  layer_sizes_dnn: [400, 400]
+  layer_sizes_cin: [200, 200, 200]
diff --git a/models/rank/xdeepfm/criteo_reader.py b/models/rank/xdeepfm/criteo_reader.py
@@ -0,0 +1,81 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+
+from paddle.io import IterableDataset
+
+
+class RecDataset(IterableDataset):
+    def __init__(self, file_list, config):
+        super(RecDataset, self).__init__()
+        self.file_list = file_list
+        self.init()
+
+    def init(self):
+        from operator import mul
+        padding = 0
+        sparse_slots = "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
+        self.sparse_slots = sparse_slots.strip().split(" ")
+        self.dense_slots = ["dense_feature"]
+        self.dense_slots_shape = [13]
+        self.slots = self.sparse_slots + self.dense_slots
+        self.slot2index = {}
+        self.visit = {}
+        for i in range(len(self.slots)):
+            self.slot2index[self.slots[i]] = i
+            self.visit[self.slots[i]] = False
+        self.padding = padding
+
+    def __iter__(self):
+        full_lines = []
+        self.data = []
+        for file in self.file_list:
+            with open(file, "r") as rf:
+                for l in rf:
+                    line = l.strip().split(" ")
+                    output = [(i, []) for i in self.slots]
+                    for i in line:
+                        slot_feasign = i.split(":")
+                        slot = slot_feasign[0]
+                        if slot not in self.slots:
+                            continue
+                        if slot in self.sparse_slots:
+                            feasign = int(slot_feasign[1])
+                        else:
+                            feasign = float(slot_feasign[1])
+                        output[self.slot2index[slot]][1].append(feasign)
+                        self.visit[slot] = True
+                    for i in self.visit:
+                        slot = i
+                        if not self.visit[slot]:
+                            if i in self.dense_slots:
+                                output[self.slot2index[i]][1].extend(
+                                    [self.padding] *
+                                    self.dense_slots_shape[self.slot2index[i]])
+                            else:
+                                output[self.slot2index[i]][1].extend(
+                                    [self.padding])
+                        else:
+                            self.visit[slot] = False
+                    # sparse
+                    output_list = []
+                    for key, value in output[:-1]:
+                        output_list.append(np.array(value).astype('int64'))
+                    # dense
+                    output_list.append(
+                        np.array(output[-1][1]).astype("float32"))
+                    # list
+                    yield output_list