Skip to content

Commit 89dee2e

Browse files
committed
A: configs; I: repr_dims from json file and init
1 parent 73c9c6e commit 89dee2e

File tree

15 files changed

+738
-22
lines changed

15 files changed

+738
-22
lines changed

README.md

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,9 @@ An ensemble can be trained as follows. A json file is used to specify architectu
5656

5757
---
5858

59+
### Expected directory structure
60+
5961
```txt
60-
Expected directory structure
6162
+-- work_dir
6263
| +-- programs
6364
| | +-- deepxml
@@ -69,6 +70,17 @@ Expected directory structure
6970
7071
```
7172

73+
### Convert the data to new format
74+
75+
```perl
76+
# A Perl script is provided (in deepxml/tools) to convert the data into the new format expected by DeepXML
77+
perl convert_format.pl <data_dir>/train.txt <data_dir>/trn_X_Xf.txt <data_dir>/trn_X_Y.txt
78+
79+
perl convert_format.pl <data_dir>/test.txt <data_dir>/tst_X_Xf.txt <data_dir>/tst_X_Y.txt
80+
```
81+
82+
### Run details
83+
7284
```txt
7385
./run_main.sh <gpu_id> <framework> <dataset> <version> <seed>
7486
@@ -82,8 +94,11 @@ Expected directory structure
8294
* dataset
8395
- Name of the dataset.
8496
- Expected files in work_dir/data/<dataset>
85-
- train.txt
86-
- test.txt
97+
- trn_X_Xf.txt
98+
- trn_X_Y.txt
99+
- tst_X_Xf.txt
100+
- tst_X_Y.txt
101+
- fasttextB_embeddings_300d.npy or fasttextB_embeddings_512d.npy
87102
88103
* version
89104
- different runs could be managed by version and seed.
@@ -94,10 +109,30 @@ Expected directory structure
94109
- an ensemble is learned if multiple comma separated values are passed.
95110
```
96111

112+
### Notes
113+
114+
```txt
115+
* Other file formats such as npy, npz, pickle are also supported.
116+
* Initializing with token embeddings (computed from FastText) leads to a noticeable accuracy gain in Astec. If 'init=token_embeddings' is set, please ensure that the token embedding file is available in the data directory; otherwise, an error will be thrown.
117+
* Config files are made available in deepxml/configs/<framework>/<method> for datasets in XC repository. You can use them when trying out Astec/DeepXML on new datasets.
118+
```
119+
120+
## Cite as
121+
122+
```bib
123+
@InProceedings{Dahiya21,
124+
author = "Dahiya, K. and Saini, D. and Mittal, A. and Shaw, A. and Dave, K. and Soni, A. and Jain, H. and Agarwal, S. and Varma, M.",
125+
title = "DeepXML: A Deep Extreme Multi-Label Learning Framework Applied to Short Text Documents",
126+
booktitle = "Proceedings of the ACM International Conference on Web Search and Data Mining",
127+
month = "March",
128+
year = "2021"
129+
}
130+
```
131+
97132
## References
98133

99134
---
100-
[1] Dahiya, K., Saini, D., Mittal, A., Shaw, A., Dave, K., Soni,A., Jain, H., Agarwal, S., and Varma, M. Deepxml: Adeep extreme multi-label learning framework applied toshort text documents. In WSDM, 2021.
135+
[1] K. Dahiya, D. Saini, A. Mittal, A. Shaw, K. Dave, A. Soni, H. Jain, S. Agarwal, and M. Varma. DeepXML: A deep extreme multi-label learning framework applied to short text documents. In WSDM, 2021.
101136

102137
[2] pyxclib: <https://github.com/kunaldahiya/pyxclib>
103138

@@ -107,4 +142,4 @@ Expected directory structure
107142

108143
[5] R. Babbar and B. Schölkopf. DiSMEC - Distributed Sparse Machines for Extreme Multi-label Classification. In WSDM, 2017.
109144

110-
[6] Bojanowski, P., Grave, E., Joulin, A., & Mikolov, T. Enriching word vectors with subword information. In TACL, 2017.
145+
[6] P. Bojanowski, E. Grave, A. Joulin, and T. Mikolov. Enriching word vectors with subword information. In TACL, 2017.

deepxml/configs/DeepXML-ANNS/EURLex-4K.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"A": 0.55,
88
"B": 1.5,
99
"use_reranker": true,
10+
"beta": 0.5,
1011
"surrogate_threshold": 0,
1112
"surrogate_method": 0,
1213
"embedding_dims": 300,
@@ -35,16 +36,16 @@
3536
"beta": 0.5,
3637
"retrain_hnsw_after": 5,
3738
"use_intermediate_for_shorty": true,
38-
"update_shortlist": false,
39+
"update_shortlist": true,
3940
"surrogate_mapping": null,
4041
"num_clf_partitions": 1,
4142
"optim": "Adam",
42-
"freeze_intermediate": true,
43+
"freeze_intermediate": false,
4344
"validate": true,
4445
"model_method": "shortlist",
4546
"normalize": true,
4647
"shortlist_method": "hybrid",
47-
"load_intermediate": false,
48+
"init": "token_embeddings",
4849
"use_shortlist": true,
4950
"embeddings": "fasttextB_embeddings_300d.npy"
5051
}

deepxml/configs/DeepXML-OVA/EURLex-4K.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
"validate": true,
3333
"model_method": "full",
3434
"normalize": true,
35-
"load_intermediate": false,
35+
"init": "token_embeddings",
3636
"use_shortlist": false,
3737
"embeddings": "fasttextB_embeddings_300d.npy"
3838
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
{
2+
"global": {
3+
"dataset": "Amazon-670K",
4+
"feature_type": "sparse",
5+
"num_labels": 670091,
6+
"arch": "Astec",
7+
"A": 0.6,
8+
"B": 2.6,
9+
"use_reranker": true,
10+
"surrogate_threshold": 65536,
11+
"surrogate_method": 1,
12+
"embedding_dims": 512,
13+
"top_k": 250,
14+
"beta": 0.10,
15+
"save_predictions": true,
16+
"trn_label_fname": "trn_X_Y.txt",
17+
"val_label_fname": "tst_X_Y.txt",
18+
"tst_label_fname": "tst_X_Y.txt",
19+
"trn_feat_fname": "trn_X_Xf.txt",
20+
"val_feat_fname": "tst_X_Xf.txt",
21+
"tst_feat_fname": "tst_X_Xf.txt"
22+
},
23+
"surrogate": {
24+
"num_epochs": 20,
25+
"dlr_factor": 0.5,
26+
"learning_rate": 0.02,
27+
"batch_size": 255,
28+
"dlr_step": 14,
29+
"normalize": true,
30+
"optim": "Adam",
31+
"init": "token_embeddings",
32+
"embeddings": "fasttextB_embeddings_512d.npy",
33+
"validate": true,
34+
"save_intermediate": true
35+
},
36+
"extreme": {
37+
"num_epochs": 20,
38+
"dlr_factor": 0.5,
39+
"learning_rate": 0.0005,
40+
"batch_size": 255,
41+
"dlr_step": 14,
42+
"ns_method": "ensemble",
43+
"num_centroids": 1,
44+
"efC": 300,
45+
"efS": 400,
46+
"M": 100,
47+
"num_nbrs": 500,
48+
"ann_threads": 18,
49+
"beta": 0.5,
50+
"surrogate_mapping": null,
51+
"num_clf_partitions": 1,
52+
"optim": "Adam",
53+
"freeze_intermediate": true,
54+
"validate": true,
55+
"model_method": "shortlist",
56+
"normalize": true,
57+
"shortlist_method": "hybrid",
58+
"init": "intermediate",
59+
"use_shortlist": true,
60+
"use_intermediate_for_shorty": true
61+
},
62+
"reranker": {
63+
"num_epochs": 15,
64+
"dlr_factor": 0.5,
65+
"learning_rate": 0.001,
66+
"batch_size": 255,
67+
"dlr_step": 8,
68+
"beta": 0.5,
69+
"num_clf_partitions": 1,
70+
"optim": "Adam",
71+
"validate": true,
72+
"model_method": "reranker",
73+
"shortlist_method": "static",
74+
"surrogate_mapping": null,
75+
"normalize": true,
76+
"use_shortlist": true,
77+
"init": "token_embeddings",
78+
"save_intermediate": false,
79+
"keep_invalid": true,
80+
"freeze_intermediate": false,
81+
"update_shortlist": false,
82+
"use_pretrained_shortlist": true,
83+
"embeddings": "fasttextB_embeddings_512d.npy"
84+
}
85+
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
{
2+
"global": {
3+
"dataset": "AmazonTitles-3M",
4+
"feature_type": "sparse",
5+
"num_labels": 2812281,
6+
"arch": "Astec",
7+
"A": 0.6,
8+
"B": 2.6,
9+
"use_reranker": true,
10+
"surrogate_threshold": 65536,
11+
"surrogate_method": 1,
12+
"embedding_dims": 300,
13+
"beta": 0.10,
14+
"top_k": 300,
15+
"save_predictions": true,
16+
"trn_label_fname": "trn_X_Y.txt",
17+
"val_label_fname": "tst_X_Y.txt",
18+
"tst_label_fname": "tst_X_Y.txt",
19+
"trn_feat_fname": "trn_X_Xf.txt",
20+
"val_feat_fname": "tst_X_Xf.txt",
21+
"tst_feat_fname": "tst_X_Xf.txt"
22+
},
23+
"surrogate": {
24+
"num_epochs": 20,
25+
"dlr_factor": 0.5,
26+
"learning_rate": 0.003,
27+
"batch_size": 255,
28+
"dlr_step": 14,
29+
"normalize": true,
30+
"optim": "Adam",
31+
"init": "token_embeddings",
32+
"embeddings": "fasttextB_embeddings_300d.npy",
33+
"validate": true,
34+
"save_intermediate": true
35+
},
36+
"extreme": {
37+
"num_epochs": 15,
38+
"dlr_factor": 0.5,
39+
"learning_rate": 0.002,
40+
"batch_size": 255,
41+
"dlr_step": 14,
42+
"ns_method": "ensemble",
43+
"num_centroids": 300,
44+
"efC": 300,
45+
"efS": 400,
46+
"M": 100,
47+
"num_nbrs": 500,
48+
"ann_threads": 18,
49+
"beta": 0.5,
50+
"surrogate_mapping": null,
51+
"num_clf_partitions": 1,
52+
"optim": "Adam",
53+
"freeze_intermediate": true,
54+
"validate": true,
55+
"model_method": "shortlist",
56+
"normalize": true,
57+
"shortlist_method": "hybrid",
58+
"init": "intermediate",
59+
"use_shortlist": true,
60+
"use_intermediate_for_shorty": true
61+
},
62+
"reranker": {
63+
"num_epochs": 10,
64+
"dlr_factor": 0.5,
65+
"learning_rate": 0.002,
66+
"batch_size": 255,
67+
"dlr_step": 7,
68+
"beta": 0.6,
69+
"num_clf_partitions": 1,
70+
"optim": "Adam",
71+
"validate": true,
72+
"model_method": "reranker",
73+
"shortlist_method": "static",
74+
"surrogate_mapping": null,
75+
"normalize": true,
76+
"use_shortlist": true,
77+
"init": "token_embeddings",
78+
"save_intermediate": false,
79+
"keep_invalid": true,
80+
"freeze_intermediate": false,
81+
"update_shortlist": false,
82+
"use_pretrained_shortlist": true,
83+
"embeddings": "fasttextB_embeddings_300d.npy"
84+
}
85+
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
{
2+
"global": {
3+
"dataset": "LF-Amazon-131K",
4+
"feature_type": "sparse",
5+
"num_labels": 131073,
6+
"arch": "Astec",
7+
"A": 0.6,
8+
"B": 2.6,
9+
"use_reranker": true,
10+
"surrogate_threshold": 65536,
11+
"surrogate_method": 1,
12+
"embedding_dims": 512,
13+
"top_k": 250,
14+
"beta": 0.10,
15+
"save_predictions": true,
16+
"trn_label_fname": "trn_X_Y.txt",
17+
"val_label_fname": "tst_X_Y.txt",
18+
"tst_label_fname": "tst_X_Y.txt",
19+
"trn_feat_fname": "trn_X_Xf.txt",
20+
"val_feat_fname": "tst_X_Xf.txt",
21+
"tst_feat_fname": "tst_X_Xf.txt"
22+
},
23+
"surrogate": {
24+
"num_epochs": 20,
25+
"dlr_factor": 0.5,
26+
"learning_rate": 0.02,
27+
"batch_size": 255,
28+
"dlr_step": 14,
29+
"normalize": true,
30+
"optim": "Adam",
31+
"init": "token_embeddings",
32+
"embeddings": "fasttextB_embeddings_512d.npy",
33+
"validate": true,
34+
"save_intermediate": true
35+
},
36+
"extreme": {
37+
"num_epochs": 20,
38+
"dlr_factor": 0.5,
39+
"learning_rate": 0.0005,
40+
"batch_size": 255,
41+
"dlr_step": 14,
42+
"ns_method": "ensemble",
43+
"num_centroids": 1,
44+
"efC": 300,
45+
"efS": 400,
46+
"M": 100,
47+
"num_nbrs": 500,
48+
"ann_threads": 18,
49+
"beta": 0.5,
50+
"surrogate_mapping": null,
51+
"num_clf_partitions": 1,
52+
"optim": "Adam",
53+
"freeze_intermediate": true,
54+
"validate": true,
55+
"model_method": "shortlist",
56+
"normalize": true,
57+
"shortlist_method": "hybrid",
58+
"init": "intermediate",
59+
"use_shortlist": true,
60+
"use_intermediate_for_shorty": true
61+
},
62+
"reranker": {
63+
"num_epochs": 15,
64+
"dlr_factor": 0.5,
65+
"learning_rate": 0.001,
66+
"batch_size": 255,
67+
"dlr_step": 8,
68+
"beta": 0.5,
69+
"num_clf_partitions": 1,
70+
"optim": "Adam",
71+
"validate": true,
72+
"model_method": "reranker",
73+
"shortlist_method": "static",
74+
"surrogate_mapping": null,
75+
"normalize": true,
76+
"use_shortlist": true,
77+
"init": "token_embeddings",
78+
"save_intermediate": false,
79+
"keep_invalid": true,
80+
"freeze_intermediate": false,
81+
"update_shortlist": false,
82+
"use_pretrained_shortlist": true,
83+
"embeddings": "fasttextB_embeddings_512d.npy"
84+
}
85+
}

0 commit comments

Comments
 (0)