PaddlePaddle · frankwhzhang · May 12, 2022 · Apr 26, 2022 · Apr 26, 2022 · Apr 26, 2022
diff --git a/README_CN.md b/README_CN.md
@@ -171,6 +171,7 @@ python -u tools/static_trainer.py -m models/rank/dnn/config.yaml #  静态图训
   |   排序   |                                                                       [AutoFIS](models/rank/autofis/)                                                                       |  -  |       ✓     |     ✓     | >=2.1.0 | [KDD 2020][AutoFIS: Automatic Feature Interaction Selection in Factorization Models for Click-Through Rate Prediction](https://arxiv.org/pdf/2003.11235v3.pdf) |
   |   排序   |                                                                        [DCN_V2](models/rank/dcn_v2/)                                                                        |  -  |       ✓     |     ✓     | >=2.1.0 | [WWW 2021][DCN V2: Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems](https://arxiv.org/pdf/2008.13535v2.pdf)|
   |   排序   |                                                                          [AITM](models/rank/aitm/)                                                                          |  -  |       ✓     |     ✓     | >=2.1.0 | [KDD 2021][Modeling the Sequential Dependence among Audience Multi-step Conversions withMulti-task Learning in Targeted Display Advertising](https://arxiv.org/pdf/2105.08489v2.pdf)  |
+  |   排序   |                  [DSIN](models/rank/dsin/)                                                                          |  -  |       ✓     |     ✓     | >=2.1.0 | [IJCAI 2019][Deep Session Interest Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.06482v1.pdf)  |
   |  多任务  |                                  [PLE](models/multitask/ple/)([文档](https://paddlerec.readthedocs.io/en/latest/models/multitask/ple.html))                                   |  [Python CPU/GPU](https://aistudio.baidu.com/aistudio/projectdetail/3238938)  |       ✓     |     ✓     |  >=2.1.0 | [RecSys 2020][Progressive Layered Extraction (PLE): A Novel Multi-Task Learning (MTL) Model for Personalized Recommendations](https://dl.acm.org/doi/abs/10.1145/3383313.3412236)                                                              |
   |  多任务  |                                 [ESMM](models/multitask/esmm/)([文档](https://paddlerec.readthedocs.io/en/latest/models/multitask/esmm.html))                                 |  [Python CPU/GPU](https://aistudio.baidu.com/aistudio/projectdetail/3238583)  |       ✓     |     ✓     | >=2.1.0 | [SIGIR 2018][Entire Space Multi-Task Model: An Effective Approach for Estimating Post-Click Conversion Rate](https://arxiv.org/abs/1804.07931)                                                              |
   |  多任务  |                                 [MMOE](models/multitask/mmoe/)([文档](https://paddlerec.readthedocs.io/en/latest/models/multitask/mmoe.html))                                 |  [Python CPU/GPU](https://aistudio.baidu.com/aistudio/projectdetail/3238934)  |       ✓     |     ✓     | >=2.1.0 | [KDD 2018][Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts](https://dl.acm.org/doi/abs/10.1145/3219819.3220007)                                                       |

diff --git a/README_EN.md b/README_EN.md
@@ -161,6 +161,7 @@ python -u tools/static_trainer.py -m models/rank/dnn/config.yaml #  Training wit
   |   Rank   |                     [AutoFIS](models/rank/autofis/)                     |  -  |       ✓     |     ✓     | >=2.1.0 | [KDD 2020][AutoFIS: Automatic Feature Interaction Selection in Factorization Models for Click-Through Rate Prediction](https://arxiv.org/pdf/2003.11235v3.pdf)                                                                                                          |
   |   Rank   |                     [DCN_V2](models/rank/dcn_v2/)                     |  -  |       ✓     |     ✓     | >=2.1.0 | [WWW 2021][DCN V2: Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems](https://arxiv.org/pdf/2008.13535v2.pdf)|
   |   Rank   |                                                                          [AITM](models/rank/aitm/)                                                                          |  -  |       ✓     |     ✓     | >=2.1.0 | [KDD 2021][Modeling the Sequential Dependence among Audience Multi-step Conversions withMulti-task Learning in Targeted Display Advertising](https://arxiv.org/pdf/2105.08489v2.pdf)  |
+  |   Rank   |                  [DSIN](models/rank/dsin/)                                                                          |  -  |       ✓     |     ✓     | >=2.1.0 | [IJCAI 2019][Deep Session Interest Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.06482v1.pdf)  |
   |      Multi-Task       |                  [PLE](models/multitask/ple/)<br>([doc](https://paddlerec.readthedocs.io/en/latest/models/multitask/ple.html))                   |  [Python CPU/GPU](https://aistudio.baidu.com/aistudio/projectdetail/3238938)  |     ✓     |     ✓     |  >=2.1.0 | [RecSys 2020][Progressive Layered Extraction (PLE): A Novel Multi-Task Learning (MTL) Model for Personalized Recommendations](https://dl.acm.org/doi/abs/10.1145/3383313.3412236)                                                              |
   |      Multi-Task       |                  [ESMM](models/multitask/esmm/)<br>([doc](https://paddlerec.readthedocs.io/en/latest/models/multitask/esmm.html))                   |  [Python CPU/GPU](https://aistudio.baidu.com/aistudio/projectdetail/3238583)  |         ✓         |     ✓     |      >=2.1.0     | [SIGIR 2018][Entire Space Multi-Task Model: An Effective Approach for Estimating Post-Click Conversion Rate](https://arxiv.org/abs/1804.07931)                                                              |
   |      Multi-Task       |                  [MMOE](models/multitask/mmoe/)<br>([doc](https://paddlerec.readthedocs.io/en/latest/models/multitask/mmoe.html))                   |  [Python CPU/GPU](https://aistudio.baidu.com/aistudio/projectdetail/3238934)  |         ✓         |     ✓     |      >=2.1.0     | [KDD 2018][Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts](https://dl.acm.org/doi/abs/10.1145/3219819.3220007)                                                       |

diff --git a/datasets/Ali_Display_Ad_Click_DSIN/get_data.sh b/datasets/Ali_Display_Ad_Click_DSIN/get_data.sh
@@ -0,0 +1,10 @@
+# Download the four raw Ali_Display_Ad_Click archives into ./raw_data and unpack them there.
+mkdir -p raw_data && cd raw_data || exit 1
+wget https://paddlerec.bj.bcebos.com/datasets/dmr/user_profile.csv.tar.gz
+tar -zxvf user_profile.csv.tar.gz
+wget https://paddlerec.bj.bcebos.com/datasets/dmr/raw_sample.csv.tar.gz
+tar -zxvf raw_sample.csv.tar.gz
+wget https://paddlerec.bj.bcebos.com/datasets/dmr/behavior_log.csv.tar.gz
+tar -zxvf behavior_log.csv.tar.gz
+wget https://paddlerec.bj.bcebos.com/datasets/dmr/ad_feature.csv.tar.gz
+tar -zxvf ad_feature.csv.tar.gz
diff --git a/datasets/Ali_Display_Ad_Click_DSIN/readme.md b/datasets/Ali_Display_Ad_Click_DSIN/readme.md
@@ -0,0 +1,58 @@
+# Ali_Display_Ad_Click数据集
+[Ali_Display_Ad_Click](https://tianchi.aliyun.com/dataset/dataDetail?dataId=56)是阿里巴巴提供的一个淘宝展示广告点击率预估数据集
+
+## 原始数据集介绍
+- 原始样本骨架raw_sample：淘宝网站中随机抽样了114万用户8天内的广告展示/点击日志（2600万条记录），构成原始的样本骨架
+1. user：脱敏过的用户ID；
+2. adgroup_id：脱敏过的广告单元ID；
+3. time_stamp：时间戳；
+4. pid：资源位；
+5. nonclk：为1代表没有点击；为0代表点击；
+6. clk：为0代表没有点击；为1代表点击；
+
+```
+user,time_stamp,adgroup_id,pid,nonclk,clk
+581738,1494137644,1,430548_1007,1,0
+```
+
+- 广告基本信息表ad_feature：本数据集涵盖了raw_sample中全部广告的基本信息
+1. adgroup_id：脱敏过的广告ID；
+2. cate_id：脱敏过的商品类目ID；
+3. campaign_id：脱敏过的广告计划ID；
+4. customer: 脱敏过的广告主ID；
+5. brand：脱敏过的品牌ID；
+6. price: 宝贝的价格
+```
+adgroup_id,cate_id,campaign_id,customer,brand,price
+63133,6406,83237,1,95471,170.0
+```
+
+- 用户基本信息表user_profile：本数据集涵盖了raw_sample中全部用户的基本信息
+1. userid：脱敏过的用户ID；
+2. cms_segid：微群ID；
+3. cms_group_id：cms_group_id；
+4. final_gender_code：性别 1:男,2:女；
+5. age_level：年龄层次； 1234
+6. pvalue_level：消费档次，1:低档，2:中档，3:高档；
+7. shopping_level：购物深度，1:浅层用户,2:中度用户,3:深度用户
+8. occupation：是否大学生 ，1:是,0:否
+9. new_user_class_level：城市层级
+```
+userid,cms_segid,cms_group_id,final_gender_code,age_level,pvalue_level,shopping_level,occupation,new_user_class_level 
+234,0,5,2,5,,3,0,3
+```
+
+- 用户的行为日志behavior_log：本数据集涵盖了raw_sample中全部用户22天内的购物行为
+1. user：脱敏过的用户ID；
+2. time_stamp：时间戳；
+3. btag：行为类型, 包括以下四种：(pv:浏览),(cart:加入购物车),(fav:喜欢),(buy:购买)
+4. cate：脱敏过的商品类目id；
+5. brand: 脱敏过的品牌id；
+```
+user,time_stamp,btag,cate,brand
+558157,1493741625,pv,6250,91286
+```
+
+## 预处理数据集介绍
+对原始数据集中的四个文件，参考[原论文的数据预处理过程](https://github.com/shenweichen/DSIN/tree/master/code)对数据进行处理，形成满足DSIN论文条件且可以被reader直接读取的数据集。
+数据集共有八个pkl文件，训练集和测试集各自拥有四个，以训练集为例，这四个文件为train_feat_input.pkl、train_sess_input.pkl、train_session_length.pkl和train_label.pkl，分别存储了按0.25的采样比进行采样后的user及item特征输入、用户会话特征输入、用户会话长度和标签数据。
diff --git a/datasets/Ali_Display_Ad_Click_DSIN/run.sh b/datasets/Ali_Display_Ad_Click_DSIN/run.sh
@@ -0,0 +1,12 @@
+# Download the preprocessed data; the URL is quoted because it contains '&' and '?', which the shell would otherwise interpret.
+mkdir -p big_train big_test
+wget -O model_input.tar.gz "https://bj.bcebos.com/v1/ai-studio-online/53e61a9bcfc54e0581044883d0f876d9841cb4d0a68848f1a1d568a84591da6f?responseContentDisposition=attachment%3B%20filename%3Dmodel_input.tar.gz&authorization=bce-auth-v1%2F0ef6765c1e494918bc0d4c3ca3e5c6d1%2F2022-04-21T01%3A43%3A00Z%2F-1%2F%2F665a728726f0569e1ef9dd423adfa40a2a5e798f86a8d5d68804a2f21cc03624"
+tar -zxvf model_input.tar.gz
+mv model_input/test_feat_input.pkl big_test/
+mv model_input/test_label.pkl big_test/
+mv model_input/test_sess_input.pkl big_test/
+mv model_input/test_session_length.pkl big_test/
+mv model_input/train_feat_input.pkl big_train/
+mv model_input/train_label.pkl big_train/
+mv model_input/train_sess_input.pkl big_train/
+mv model_input/train_session_length.pkl big_train/
diff --git a/doc/imgs/dsin.png b/doc/imgs/dsin.png
diff --git a/doc/source/models/rank/dsin.md b/doc/source/models/rank/dsin.md
@@ -0,0 +1,82 @@
+# dsin (Deep Session Interest Network for Click-Through Rate Prediction)
+
+代码请参考：[dsin](https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/dsin)  
+如果我们的代码对您有用，还请点个star啊~  
+
+## 内容
+
+- [模型简介](#模型简介)
+- [数据准备](#数据准备)
+- [运行环境](#运行环境)
+- [快速开始](#快速开始)
+- [模型组网](#模型组网)
+- [效果复现](#效果复现)
+- [进阶使用](#进阶使用)
+- [FAQ](#FAQ)
+
+## 模型简介
+本模型主要聚焦于用户的历史会话行为，通过Self-Attention和BiLSTM对历史会话行为进行学习，最后通过Activation Unit得到最终的session表征向量，再结合其他特征送入MLP计算最后的ctr score。[Deep Session Interest Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.06482v1.pdf)文章通过 Transformer 和 BiLSTM 来学习用户的 Session Interest Interacting，提升模型的表达能力。 
+
+## 数据准备
+本模型使用论文中的数据集Alimama Dataset，参考[原文作者的数据预处理过程](https://github.com/shenweichen/DSIN/tree/master/code)对数据进行处理。在模型目录的data目录下为您准备了快速运行的示例数据，若需要使用全量数据可以参考下方[效果复现](#效果复现)部分。
+
+## 运行环境
+PaddlePaddle>=2.0
+
+python 3.5/3.6/3.7
+
+os : windows/linux/macos 
+
+## 快速开始
+本文提供了样例数据可以供您快速体验，在任意目录下均可执行。在DSIN模型目录的快速执行命令如下： 
+```bash
+# 进入模型目录
+# cd models/rank/dsin # 在任意目录均可运行
+# 动态图训练
+python -u ../../../tools/trainer.py -m config.yaml # 全量数据运行config_bigdata.yaml 
+# 动态图预测
+python -u ../../../tools/infer.py -m config.yaml 
+
+# 静态图训练
+python -u ../../../tools/static_trainer.py -m config.yaml # 全量数据运行config_bigdata.yaml 
+# 静态图预测
+python -u ../../../tools/static_infer.py -m config.yaml 
+``` 
+
+## 模型组网
+论文[Deep Session Interest Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1905.06482v1.pdf)中的网络结构如图所示:  
+<p align="center">
+<img align="center" src="../../../imgs/dsin.png">
+</p>
+
+## 效果复现
+为了方便使用者能够快速的跑通每一个模型，我们在每个模型下都提供了样例数据。如果需要复现readme中的效果,请按如下步骤依次操作即可。  
+在全量数据下模型的指标如下：
+
+| 模型 | auc | batch_size | epoch_num | Time of each epoch |
+| :------| :------ | :------ | :------| :------ | 
+| DSIN | 0.6356 | 4096 | 1 | 约10分钟 |
+
+1. 确认您当前所在目录为PaddleRec/models/rank/dsin  
+2. 进入paddlerec/datasets/Ali_Display_Ad_Click_DSIN目录下，执行该脚本，会从国内源的服务器上下载我们预处理完成的Alimama全量数据集，并解压到指定文件夹。若您希望从原始数据集自行处理，请详见该目录下的readme。
+
+``` bash
+cd ../../../datasets/Ali_Display_Ad_Click_DSIN
+sh run.sh
+```
+3. 切回模型目录,执行命令运行全量数据
+
+```bash
+cd - # 切回模型目录
+# 动态图训练
+python -u ../../../tools/trainer.py -m config_bigdata.yaml # 全量数据运行config_bigdata.yaml 
+python -u ../../../tools/infer.py -m config_bigdata.yaml # 全量数据运行config_bigdata.yaml
+```
+
+效果复现过程可参考[AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/3850087)。
+
+Note:运行环境为至尊GPU。
+
+## 进阶使用
+
+## FAQ
diff --git a/doc/source/readme.md b/doc/source/readme.md
@@ -49,3 +49,4 @@
 [deeprec](https://paddlerec.readthedocs.io/en/latest/models/rank/deeprec.html)  
 [autofis](https://paddlerec.readthedocs.io/en/latest/models/rank/autofis.html)  
 [aitm](https://paddlerec.readthedocs.io/en/latest/models/rank/aitm.html)  
+[dsin](https://paddlerec.readthedocs.io/en/latest/models/rank/dsin.html)  
diff --git a/models/rank/dsin/__init__.py b/models/rank/dsin/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/models/rank/dsin/config.yaml b/models/rank/dsin/config.yaml
@@ -0,0 +1,60 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "data/sample_data"
+  train_reader_path: "dsin_reader" # module name in importlib format
+  use_gpu: False
+  use_auc: True
+  train_batch_size: 64
+  epochs: 1
+  print_interval: 10
+  # model_init_path: "output_model_dsin/0" # optional: initialize from a saved model
+  model_save_path: "output_model_dsin"
+  test_data_dir: "data/sample_data"
+  infer_reader_path: "dsin_reader" # module name in importlib format
+  infer_batch_size: 64
+  infer_load_path: "output_model_dsin"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+# Hyper-parameters of the user-defined network.
+hyper_parameters:
+  # Optimizer configuration.
+  optimizer:
+    class: Adam
+    learning_rate: 0.002
+  # User feature sizes (presumably the vocabulary size of each feature — confirm against the model code).
+  user_size: 265442
+  cms_segid_size: 97
+  cms_group_size: 13
+  final_gender_size: 2
+  age_level_size: 7
+  pvalue_level_size: 4
+  shopping_level_size: 3
+  occupation_size: 2
+  new_user_class_level_size: 5
+
+  # Item feature sizes.
+  adgroup_size: 512431
+  cate_size: 12974   # max value + 1; NOTE(review): config_bigdata.yaml uses 11859 — confirm which is intended
+  campaign_size: 309448
+  customer_size: 195841
+  brand_size: 461499  # max value + 1; NOTE(review): config_bigdata.yaml uses 362855 — confirm which is intended
+
+  # Context feature sizes.
+  pid_size: 2
+
+  # Embedding size shared by the features above (assumed from the name — TODO confirm).
+  feat_embed_size: 4
diff --git a/models/rank/dsin/config_bigdata.yaml b/models/rank/dsin/config_bigdata.yaml
@@ -0,0 +1,60 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "../../../datasets/Ali_Display_Ad_Click_DSIN/big_train"
+  train_reader_path: "dsin_reader" # module name in importlib format
+  use_gpu: True
+  use_auc: True
+  train_batch_size: 4096
+  epochs: 1
+  print_interval: 50
+
+  model_save_path: "output_model_all_dsin"
+  test_data_dir: "../../../datasets/Ali_Display_Ad_Click_DSIN/big_test"
+  infer_reader_path: "dsin_reader" # module name in importlib format
+  infer_batch_size: 16384 # 2**14
+  infer_load_path: "output_model_all_dsin"
+  infer_start_epoch: 0
+  infer_end_epoch: 1
+
+# Hyper-parameters of the user-defined network.
+hyper_parameters:
+  # Optimizer configuration.
+  optimizer:
+    class: Adam
+    learning_rate: 0.00235
+  # User feature sizes (presumably the vocabulary size of each feature — confirm against the model code).
+  user_size: 265442
+  cms_segid_size: 97
+  cms_group_size: 13
+  final_gender_size: 2
+  age_level_size: 7
+  pvalue_level_size: 4
+  shopping_level_size: 3
+  occupation_size: 2
+  new_user_class_level_size: 5
+
+  # Item feature sizes.
+  adgroup_size: 512431
+  cate_size: 11859   # max value + 1; NOTE(review): config.yaml uses 12974 — confirm which is intended
+  campaign_size: 309448
+  customer_size: 195841
+  brand_size: 362855  # max value + 1; NOTE(review): config.yaml uses 461499 — confirm which is intended
+
+  # Context feature sizes.
+  pid_size: 2
+
+  # Embedding size shared by the features above (assumed from the name — TODO confirm).
+  feat_embed_size: 4
diff --git a/models/rank/dsin/data/sample_data/sample_feat_input.pkl b/models/rank/dsin/data/sample_data/sample_feat_input.pkl
diff --git a/models/rank/dsin/data/sample_data/sample_label.pkl b/models/rank/dsin/data/sample_data/sample_label.pkl
diff --git a/models/rank/dsin/data/sample_data/sample_sess_input.pkl b/models/rank/dsin/data/sample_data/sample_sess_input.pkl
diff --git a/models/rank/dsin/data/sample_data/sample_session_length.pkl b/models/rank/dsin/data/sample_data/sample_session_length.pkl