Extreme-classification
diff --git a/‎README.md‎
Lines changed: 40 additions & 5 deletions b/‎README.md‎
Lines changed: 40 additions & 5 deletions
diff --git a/‎deepxml/configs/DeepXML-ANNS/EURLex-4K.json‎
Lines changed: 4 additions & 3 deletions b/‎deepxml/configs/DeepXML-ANNS/EURLex-4K.json‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎deepxml/configs/DeepXML-OVA/EURLex-4K.json‎
Lines changed: 1 addition & 1 deletion b/‎deepxml/configs/DeepXML-OVA/EURLex-4K.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎deepxml/configs/DeepXML/Amazon-670K.json‎
Lines changed: 85 additions & 0 deletions b/‎deepxml/configs/DeepXML/Amazon-670K.json‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎deepxml/configs/DeepXML/AmazonTitles-3M.json‎
Lines changed: 85 additions & 0 deletions b/‎deepxml/configs/DeepXML/AmazonTitles-3M.json‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎deepxml/configs/DeepXML/LF-Amazon-131K.json‎
Lines changed: 85 additions & 0 deletions b/‎deepxml/configs/DeepXML/LF-Amazon-131K.json‎
Lines changed: 85 additions & 0 deletions
@@ -56,8 +56,9 @@ An ensemble can be trained as follows. A json file is used to specify architectu
 
 ---
 
+### Expected directory structure
+
 ```txt
-Expected directory structure
 +-- work_dir
 |  +-- programs
 |  |  +-- deepxml
@@ -69,6 +70,17 @@ Expected directory structure
 
 ```
 
+### Convert the data to new format
+
+```perl
+# A Perl script is provided (in deepxml/tools) to convert the data into the new format expected by DeepXML
+perl convert_format.pl <data_dir>/train.txt <data_dir>/trn_X_Xf.txt <data_dir>/trn_X_Y.txt
+
+perl convert_format.pl <data_dir>/test.txt <data_dir>/tst_X_Xf.txt <data_dir>/tst_X_Y.txt
+```
+
+### Run details
+
 ```txt
 ./run_main.sh <gpu_id> <framework> <dataset> <version> <seed>
 
@@ -82,8 +94,11 @@ Expected directory structure
 * dataset
   - Name of the dataset.
   - Expected files in work_dir/data/<dataset>
-    - train.txt
-    - test.txt
+    - trn_X_Xf.txt
+    - trn_X_Y.txt
+    - tst_X_Xf.txt
+    - tst_X_Y.txt
+    - fasttextB_embeddings_300d.npy or fasttextB_embeddings_512d.npy 
 
 * version
   - different runs could be managed by version and seed.
@@ -94,10 +109,30 @@ Expected directory structure
   - an ensemble is learned if multiple comma separated values are passed.
 ```
 
+### Notes
+
+```txt
+* Other file formats such as npy, npz, pickle are also supported.
+* Initializing with token embeddings (computed from FastText) leads to a noticeable accuracy gain in Astec. Please ensure that the token embedding file is available in the data directory if 'init=token_embeddings' is set; otherwise, an error will be thrown.
+* Config files are made available in deepxml/configs/<framework>/<method> for datasets in the XC repository. You can use them when trying out Astec/DeepXML on new datasets.
+```
+
+## Cite as
+
+```bib
+@InProceedings{Dahiya21,
+    author = "Dahiya, K. and Saini, D. and Mittal, A. and Shaw, A. and Dave, K. and Soni, A. and Jain, H. and Agarwal, S. and Varma, M.",
+    title = "DeepXML: A Deep Extreme Multi-Label Learning Framework Applied to Short Text Documents",
+    booktitle = "Proceedings of the ACM International Conference on Web Search and Data Mining",
+    month = "March",
+    year = "2021"
+}
+```
+
 ## References
 
 ---
-[1] Dahiya, K., Saini, D., Mittal, A., Shaw, A., Dave, K., Soni,A., Jain, H., Agarwal, S., and Varma, M.  Deepxml:  Adeep extreme multi-label learning framework applied toshort text documents. In WSDM, 2021.
+[1] K. Dahiya, D. Saini, A. Mittal, A. Shaw, K. Dave, A. Soni, H. Jain, S. Agarwal, and M. Varma. DeepXML: A deep extreme multi-label learning framework applied to short text documents. In WSDM, 2021.
 
 [2] pyxclib: <https://github.com/kunaldahiya/pyxclib>
 
@@ -107,4 +142,4 @@ Expected directory structure
 
 [5] R. Babbar and B. Schölkopf. DiSMEC - Distributed Sparse Machines for Extreme Multi-label Classification. In WSDM, 2017.
 
-[6] Bojanowski, P., Grave, E., Joulin, A., & Mikolov, T. Enriching word vectors with subword information. In TACL, 2017.
+[6] P. Bojanowski, E. Grave, A. Joulin, and T. Mikolov. Enriching word vectors with subword information. In TACL, 2017.
@@ -7,6 +7,7 @@
         "A": 0.55,
         "B": 1.5,
         "use_reranker": true,
+        "beta": 0.5,
         "surrogate_threshold": 0,
         "surrogate_method": 0,
         "embedding_dims": 300,
@@ -35,16 +36,16 @@
         "beta": 0.5,
         "retrain_hnsw_after": 5,
         "use_intermediate_for_shorty": true,
-        "update_shortlist": false,
+        "update_shortlist": true,
         "surrogate_mapping": null,
         "num_clf_partitions": 1,
         "optim": "Adam",
-        "freeze_intermediate": true,
+        "freeze_intermediate": false,
         "validate": true,
         "model_method": "shortlist",
         "normalize": true,
         "shortlist_method": "hybrid",
-        "load_intermediate": false,
+        "init": "token_embeddings",
         "use_shortlist": true,
         "embeddings": "fasttextB_embeddings_300d.npy"
     }
 
@@ -32,7 +32,7 @@
         "validate": true,
         "model_method": "full",
         "normalize": true,
-        "load_intermediate": false,
+        "init": "token_embeddings",
         "use_shortlist": false,
         "embeddings": "fasttextB_embeddings_300d.npy"
     }
 
@@ -0,0 +1,85 @@
+{
+    "global": {
+        "dataset": "Amazon-670K",
+        "feature_type": "sparse",
+        "num_labels": 670091,
+        "arch": "Astec",
+        "A": 0.6,
+        "B": 2.6,
+        "use_reranker": true,
+        "surrogate_threshold": 65536,
+        "surrogate_method": 1,
+        "embedding_dims": 512,
+        "top_k": 250,
+	    "beta": 0.10,
+        "save_predictions": true, 
+        "trn_label_fname": "trn_X_Y.txt",
+        "val_label_fname": "tst_X_Y.txt",
+        "tst_label_fname": "tst_X_Y.txt",
+        "trn_feat_fname": "trn_X_Xf.txt",
+        "val_feat_fname": "tst_X_Xf.txt",
+        "tst_feat_fname": "tst_X_Xf.txt"
+    },
+    "surrogate": {
+        "num_epochs": 20,
+        "dlr_factor": 0.5,
+        "learning_rate": 0.02,
+        "batch_size": 255,
+        "dlr_step": 14,
+        "normalize": true,
+        "optim": "Adam",
+        "init": "token_embeddings",
+        "embeddings": "fasttextB_embeddings_512d.npy",
+        "validate": true,
+        "save_intermediate": true
+    },
+    "extreme": {
+        "num_epochs": 20,
+        "dlr_factor": 0.5,
+        "learning_rate": 0.0005,
+        "batch_size": 255,
+        "dlr_step": 14,
+        "ns_method": "ensemble",
+        "num_centroids": 1,
+        "efC": 300,
+        "efS": 400,
+        "M": 100,
+        "num_nbrs": 500,
+        "ann_threads": 18,
+        "beta": 0.5,
+        "surrogate_mapping": null,
+        "num_clf_partitions": 1,
+        "optim": "Adam",
+        "freeze_intermediate": true,
+        "validate": true,
+        "model_method": "shortlist",
+        "normalize": true,
+        "shortlist_method": "hybrid",
+        "init": "intermediate",
+        "use_shortlist": true,
+        "use_intermediate_for_shorty": true
+    },
+    "reranker": {
+        "num_epochs": 15,
+        "dlr_factor": 0.5,
+        "learning_rate": 0.001,
+        "batch_size": 255,
+        "dlr_step": 8,
+        "beta": 0.5,
+        "num_clf_partitions": 1,
+        "optim": "Adam",
+        "validate": true,
+        "model_method": "reranker",
+        "shortlist_method": "static",
+        "surrogate_mapping": null,
+        "normalize": true,
+        "use_shortlist": true,
+        "init": "token_embeddings",
+        "save_intermediate": false,
+        "keep_invalid": true,
+        "freeze_intermediate": false,
+        "update_shortlist": false,
+        "use_pretrained_shortlist": true,
+        "embeddings": "fasttextB_embeddings_512d.npy"
+    }
+}
@@ -0,0 +1,85 @@
+{
+    "global": {
+        "dataset": "AmazonTitles-3M",
+        "feature_type": "sparse",
+        "num_labels": 2812281,
+        "arch": "Astec",
+        "A": 0.6,
+        "B": 2.6,
+        "use_reranker": true,
+        "surrogate_threshold": 65536,
+        "surrogate_method": 1,
+        "embedding_dims": 300,
+        "beta": 0.10,
+        "top_k": 300,
+        "save_predictions": true, 
+        "trn_label_fname": "trn_X_Y.txt",
+        "val_label_fname": "tst_X_Y.txt",
+        "tst_label_fname": "tst_X_Y.txt",
+        "trn_feat_fname": "trn_X_Xf.txt",
+        "val_feat_fname": "tst_X_Xf.txt",
+        "tst_feat_fname": "tst_X_Xf.txt"
+    },
+    "surrogate": {
+        "num_epochs": 20,
+        "dlr_factor": 0.5,
+        "learning_rate": 0.003,
+        "batch_size": 255,
+        "dlr_step": 14,
+        "normalize": true,
+        "optim": "Adam",
+        "init": "token_embeddings",
+        "embeddings": "fasttextB_embeddings_300d.npy",
+        "validate": true,
+        "save_intermediate": true
+    },
+    "extreme": {
+        "num_epochs": 15,
+        "dlr_factor": 0.5,
+        "learning_rate": 0.002,
+        "batch_size": 255,
+        "dlr_step": 14,
+        "ns_method": "ensemble",
+        "num_centroids": 300,
+        "efC": 300,
+        "efS": 400,
+        "M": 100,
+        "num_nbrs": 500,
+        "ann_threads": 18,
+        "beta": 0.5,
+        "surrogate_mapping": null,
+        "num_clf_partitions": 1,
+        "optim": "Adam",
+        "freeze_intermediate": true,
+        "validate": true,
+        "model_method": "shortlist",
+        "normalize": true,
+        "shortlist_method": "hybrid",
+        "init": "intermediate",
+        "use_shortlist": true,
+        "use_intermediate_for_shorty": true
+    },
+    "reranker": {
+        "num_epochs": 10,
+        "dlr_factor": 0.5,
+        "learning_rate": 0.002,
+        "batch_size": 255,
+        "dlr_step": 7,
+        "beta": 0.6,
+        "num_clf_partitions": 1,
+        "optim": "Adam",
+        "validate": true,
+        "model_method": "reranker",
+        "shortlist_method": "static",
+        "surrogate_mapping": null,
+        "normalize": true,
+        "use_shortlist": true,
+        "init": "token_embeddings",
+        "save_intermediate": false,
+        "keep_invalid": true,
+        "freeze_intermediate": false,
+        "update_shortlist": false,
+        "use_pretrained_shortlist": true,
+        "embeddings": "fasttextB_embeddings_300d.npy"
+    }
+}
@@ -0,0 +1,85 @@
+{
+    "global": {
+        "dataset": "LF-Amazon-131K",
+        "feature_type": "sparse",
+        "num_labels": 131073,
+        "arch": "Astec",
+        "A": 0.6,
+        "B": 2.6,
+        "use_reranker": true,
+        "surrogate_threshold": 65536,
+        "surrogate_method": 1,
+        "embedding_dims": 512,
+        "top_k": 250,
+	    "beta": 0.10,
+        "save_predictions": true, 
+        "trn_label_fname": "trn_X_Y.txt",
+        "val_label_fname": "tst_X_Y.txt",
+        "tst_label_fname": "tst_X_Y.txt",
+        "trn_feat_fname": "trn_X_Xf.txt",
+        "val_feat_fname": "tst_X_Xf.txt",
+        "tst_feat_fname": "tst_X_Xf.txt"
+    },
+    "surrogate": {
+        "num_epochs": 20,
+        "dlr_factor": 0.5,
+        "learning_rate": 0.02,
+        "batch_size": 255,
+        "dlr_step": 14,
+        "normalize": true,
+        "optim": "Adam",
+        "init": "token_embeddings",
+        "embeddings": "fasttextB_embeddings_512d.npy",
+        "validate": true,
+        "save_intermediate": true
+    },
+    "extreme": {
+        "num_epochs": 20,
+        "dlr_factor": 0.5,
+        "learning_rate": 0.0005,
+        "batch_size": 255,
+        "dlr_step": 14,
+        "ns_method": "ensemble",
+        "num_centroids": 1,
+        "efC": 300,
+        "efS": 400,
+        "M": 100,
+        "num_nbrs": 500,
+        "ann_threads": 18,
+        "beta": 0.5,
+        "surrogate_mapping": null,
+        "num_clf_partitions": 1,
+        "optim": "Adam",
+        "freeze_intermediate": true,
+        "validate": true,
+        "model_method": "shortlist",
+        "normalize": true,
+        "shortlist_method": "hybrid",
+        "init": "intermediate",
+        "use_shortlist": true,
+        "use_intermediate_for_shorty": true
+    },
+    "reranker": {
+        "num_epochs": 15,
+        "dlr_factor": 0.5,
+        "learning_rate": 0.001,
+        "batch_size": 255,
+        "dlr_step": 8,
+        "beta": 0.5,
+        "num_clf_partitions": 1,
+        "optim": "Adam",
+        "validate": true,
+        "model_method": "reranker",
+        "shortlist_method": "static",
+        "surrogate_mapping": null,
+        "normalize": true,
+        "use_shortlist": true,
+        "init": "token_embeddings",
+        "save_intermediate": false,
+        "keep_invalid": true,
+        "freeze_intermediate": false,
+        "update_shortlist": false,
+        "use_pretrained_shortlist": true,
+        "embeddings": "fasttextB_embeddings_512d.npy"
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@`
`32`	`32`	`"validate": true,`
`33`	`33`	`"model_method": "full",`
`34`	`34`	`"normalize": true,`
`35`		`- "load_intermediate": false,`
	`35`	`+ "init": "token_embeddings",`
`36`	`36`	`"use_shortlist": false,`
`37`	`37`	`"embeddings": "fasttextB_embeddings_300d.npy"`
`38`	`38`	`}`