Merge branch 'main' of github.com:datawhalechina/torch-rechub into main

piajiguai · Jul 4, 2022 · 4e07525
2 parents 0cfda9c + c246a82
commit 4e07525
Show file tree

Hide file tree

Showing 7 changed files with 8 additions and 8 deletions.
diff --git a/examples/matching/run_ml_dssm.py b/examples/matching/run_ml_dssm.py
@@ -18,7 +18,7 @@ def get_movielens_data(data_path, load_cache=False):
     data = pd.read_csv(data_path)
     data["cate_id"] = data["genres"].apply(lambda x: x.split("|")[0])
     sparse_features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', "cate_id"]
-    user_col, item_col, label_col = "user_id", "movie_id", "label"
+    user_col, item_col = "user_id", "movie_id"
 
     feature_max_idx = {}
     for feature in sparse_features:

diff --git a/examples/matching/run_ml_youtube_dnn.py b/examples/matching/run_ml_youtube_dnn.py
@@ -50,7 +50,7 @@ def get_movielens_data(data_path, load_cache=False):
                                                        neg_ratio=3,
                                                        min_item=0)
         x_train = gen_model_input(df_train, user_profile, user_col, item_profile, item_col, seq_max_len=50)
-        y_train = np.array([0] * df_train.shape[0])  #label=0 means the first pred value is positiva sample
+        y_train = np.array([0] * df_train.shape[0])  #label=0 means the first pred value is positive sample
         x_test = gen_model_input(df_test, user_profile, user_col, item_profile, item_col, seq_max_len=50)
         np.save("./data/ml-1m/saved/data_cache.npy", np.array((x_train, y_train, x_test), dtype=object))
 

diff --git a/examples/matching/run_ml_youtube_sbc.py b/examples/matching/run_ml_youtube_sbc.py
@@ -91,7 +91,7 @@ def main(dataset_path, epoch, learning_rate, batch_size, weight_decay, device, s
                        sample_weight_feature,
                        user_params={"dims": [128, 64, 16]},
                        item_params={"dims": [128, 64, 16]},
-                       batch_size=batch_size,
+                       batch_size=batch_size,  # !! should be same as batch size of dataloader
                        n_neg=3,
                        temperature=0.02)
     #mode=2 means use list-wise loss: softmax

diff --git a/examples/ranking/README.md b/examples/ranking/README.md
@@ -154,7 +154,7 @@ test:用于测试模型预测的1天广告数据，约四百六十万。
 - 预处理之后的数据下载地址：https://cowtransfer.com/s/e8b67418ce044c
 - 使用方法
 
-```python
+```shell
 python run_census.py --model_name SharedBottom
 python run_census.py --model_name ESMM
 python run_census.py --model_name MMOE

diff --git a/examples/ranking/run_ali_ccp_multi_task.py b/examples/ranking/run_ali_ccp_multi_task.py
@@ -24,7 +24,7 @@ def get_ali_ccp_data_dict(model_name, data_path='./data/ali-ccp'):
 
     col_names = data.columns.values.tolist()
     dense_cols = ['D109_14', 'D110_14', 'D127_14', 'D150_14', 'D508', 'D509', 'D702', 'D853']
-    sparse_cols = [col for col in col_names if col not in dense_cols and col not in ['cvr_label', 'ctr_label']]
+    sparse_cols = [col for col in col_names if col not in dense_cols and col not in ['cvr_label', 'ctr_label', 'ctcvr_label']]
     print("sparse cols:%d dense cols:%d" % (len(sparse_cols), len(dense_cols)))
     #define dense and sparse features
     if model_name == "ESMM":

diff --git a/examples/ranking/run_census.py b/examples/ranking/run_census.py
@@ -25,7 +25,7 @@ def get_census_data_dict(model_name, data_path='./data/census-income'):
 
     col_names = data.columns.values.tolist()
     dense_cols = ['age', 'wage per hour', 'capital gains', 'capital losses', 'divdends from stocks', 'num persons worked for employer', 'weeks worked in year']
-    sparse_cols = [col for col in col_names if col not in dense_cols and col not in ['cvr_label', 'ctr_label']]
+    sparse_cols = [col for col in col_names if col not in dense_cols and col not in ['cvr_label', 'ctr_label', 'ctcvr_label']]
     print("sparse cols:%d dense cols:%d" % (len(sparse_cols), len(dense_cols)))
     #define dense and sparse features
     if model_name == "ESMM":

diff --git a/examples/ranking/run_criteo.py b/examples/ranking/run_criteo.py
@@ -30,8 +30,8 @@ def get_criteo_data_dict(data_path):
     dense_features = [f for f in data.columns.tolist() if f[0] == "I"]
     sparse_features = [f for f in data.columns.tolist() if f[0] == "C"]
 
-    data[sparse_features] = data[sparse_features].fillna('-996',)
-    data[dense_features] = data[dense_features].fillna(0,)
+    data[sparse_features] = data[sparse_features].fillna('0')
+    data[dense_features] = data[dense_features].fillna(0)
 
     for feat in tqdm(dense_features):  #discretize dense feature and as new sparse feature
         sparse_features.append(feat + "_cat")