Skip to content

Commit

Permalink
Added mini-ImageNet-LT instructions
Browse files Browse the repository at this point in the history
  • Loading branch information
rahulvigneswaran committed Nov 10, 2021
1 parent e5f6e4b commit 00a15bc
Show file tree
Hide file tree
Showing 3 changed files with 222 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ datasets
extra
garbage
datasets
Notebooks
# Notebooks
.vscode
*.csv
*.pt
Expand Down
217 changes: 217 additions & 0 deletions Notebooks/Create_mini-ImageNet-LT.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from os import walk\n",
"\n",
"# inits\n",
"output = f\"libs/data/mini-imagenet/\" # Should be the place you want to output the .txt file which the dataloader will read later.\n",
"imb_ratio = 0.01 # Choose Imbalance ratio here\n",
"root = \"/home/rahul_intern/Imagenet/mini_imagenet\" # Should point to the dir in which your ImagNet is.\n",
"\n",
"# Split ratios \n",
"train_split_ratio = 0.8\n",
"test_split_ratio = 1 - train_split_ratio\n",
"val_split_ratio = 0.2\n",
"\n",
"final = []\n",
"labels = []\n",
"final_1 = []\n",
"labels_1 = []\n",
"final_2 = []\n",
"labels_2 = []\n",
"\n",
"all_classes = []\n",
"equal_classes = []\n",
"unequal_classes = []\n",
"\n",
"mini_keys = ['n02110341', 'n01930112', 'n04509417', 'n04067472', 'n04515003', 'n02120079', 'n03924679', 'n02687172', 'n03075370', 'n07747607', 'n09246464', 'n02457408', 'n04418357', 'n03535780', 'n04435653', 'n03207743', 'n04251144', 'n03062245', 'n02174001', 'n07613480', 'n03998194', 'n02074367', 'n04146614', 'n04243546', 'n03854065', 'n03838899', 'n02871525', 'n03544143', 'n02108089', 'n13133613', 'n03676483', 'n03337140', 'n03272010', 'n01770081', 'n09256479', 'n02091244', 'n02116738', 'n04275548', 'n03773504', 'n02606052', 'n03146219', 'n04149813', 'n07697537', 'n02823428', 'n02089867', 'n03017168', 'n01704323', 'n01532829', 'n03047690', 'n03775546', 'n01843383', 'n02971356', 'n13054560', 'n02108551', 'n02101006', 'n03417042', 'n04612504', 'n01558993', 'n04522168', 'n02795169', 'n06794110', 'n01855672', 'n04258138', 'n02110063', 'n07584110', 'n02091831', 'n03584254', 'n03888605', 'n02113712', 'n03980874', 'n02219486', 'n02138441', 'n02165456', 'n02108915', 'n03770439', 'n01981276', 'n03220513', 'n02099601', 'n02747177', 'n01749939', 'n03476684', 'n02105505', 'n02950826', 'n04389033', 'n03347037', 'n02966193', 'n03127925', 'n03400231', 'n04296562', 'n03527444', 'n04443257', 'n02443484', 'n02114548', 'n04604644', 'n01910747', 'n04596742', 'n02111277', 'n03908618', 'n02129165', 'n02981792']\n",
"\n",
"with open(\"/home/rahul_intern/fb_dl_fresh/long_tail/libs/data/mini-imagenet/all_classes.txt\", \"r\") as f:\n",
" for line in f:\n",
" all_classes.append(str(line.strip()))\n",
"\n",
"with open(\"/home/rahul_intern/fb_dl_fresh/long_tail/libs/data/mini-imagenet/equal_classes.txt\", \"r\") as f:\n",
" for line in f:\n",
" equal_classes.append(str(line.strip()))\n",
"\n",
"with open(\"/home/rahul_intern/fb_dl_fresh/long_tail/libs/data/mini-imagenet/unequal_classes.txt\", \"r\") as f:\n",
" for line in f:\n",
" unequal_classes.append(str(line.strip()))\n",
"\n",
"filenames = next(walk(root), (None, None, []))[2] \n",
"for name in filenames:\n",
" label_temp = name.split(\"_\")[0]\n",
" if label_temp in all_classes:\n",
" final.append(name)\n",
" labels.append(label_temp)\n",
" if label_temp in equal_classes:\n",
" final_1.append(name)\n",
" labels_1.append(label_temp)\n",
" else:\n",
" final_2.append(name)\n",
" labels_2.append(label_temp)\n",
"\n",
"actual_label = np.unique(labels)\n",
"pseudo_label = np.arange(len(np.unique(labels)))\n",
"\n",
"# Converts the labels to range of 0 to max ints\n",
"label_dict = {}\n",
"inverse_label_dict = {}\n",
"\n",
"for i,j in zip(actual_label, pseudo_label):\n",
" label_dict[i] = j\n",
" inverse_label_dict[j] = i\n",
" \n",
"# Re-splitting the mini-imagenet which was made for few-shot into proper train, val, test sets.\n",
"train_x, test_x, train_y, test_y = train_test_split(final_1, labels_1, train_size=train_split_ratio, test_size=test_split_ratio, stratify=labels_1)\n",
"train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, train_size=1-val_split_ratio, test_size=val_split_ratio, stratify=train_y)\n",
"\n",
"# Limiting train, val, test datapoints to 500, 100, 100 per class (Not a very clean code but gets the job done)\n",
"def select_random_data(train_x, train_y, count=500):\n",
" train_classwise_dict = {}\n",
" for i, j in zip(train_y, train_x):\n",
" if i in train_classwise_dict.keys():\n",
" train_classwise_dict[i].extend([j])\n",
" else:\n",
" train_classwise_dict[i] = []\n",
" train_classwise_dict[i].extend([j])\n",
"\n",
" new_train_x = []\n",
" new_train_y = []\n",
" for i in train_classwise_dict.keys():\n",
" ind1 = np.random.permutation(len(train_classwise_dict[i]))[:count]\n",
" new_train_x.append(list(np.array(train_classwise_dict[i])[ind1]))\n",
" new_train_y.append([i]*count)\n",
" return sum(new_train_x, []), sum(new_train_y, [])\n",
"\n",
"train_x, train_y = select_random_data(train_x, train_y, 500)\n",
"val_x, val_y = select_random_data(val_x, val_y, 100)\n",
"test_x, test_y = select_random_data(test_x, test_y, 100)\n",
"\n",
"# Randomly select and limit datapoints per class from unequal_classes, divide them into train, val, test and append it to the already limited and divided train, val, test of equal_classes\n",
"train_classwise_dict = {}\n",
"for i, j in zip(labels_2, final_2):\n",
" if i in train_classwise_dict.keys():\n",
" train_classwise_dict[i].extend([j])\n",
" else:\n",
" train_classwise_dict[i] = []\n",
" train_classwise_dict[i].extend([j])\n",
"\n",
"new_train_x = []\n",
"new_train_y = []\n",
"new_val_x = []\n",
"new_val_y = []\n",
"new_test_x = []\n",
"new_test_y = []\n",
"for i in train_classwise_dict.keys():\n",
" ind1 = np.random.permutation(len(train_classwise_dict[i]))[:500]\n",
" ind2 = np.random.permutation(len(train_classwise_dict[i]))[500:600]\n",
" ind3 = np.random.permutation(len(train_classwise_dict[i]))[600:700]\n",
" new_train_x.append(list(np.array(train_classwise_dict[i])[ind1]))\n",
" new_val_x.append(list(np.array(train_classwise_dict[i])[ind2]))\n",
" new_test_x.append(list(np.array(train_classwise_dict[i])[ind3]))\n",
" new_train_y.append([i]*500)\n",
" new_val_y.append([i]*100)\n",
" new_test_y.append([i]*100)\n",
"\n",
"train_x.extend(sum(new_train_x, []))\n",
"train_y.extend(sum(new_train_y, []))\n",
"val_x.extend(sum(new_val_x, []))\n",
"val_y.extend(sum(new_val_y, []))\n",
"test_x.extend(sum(new_test_x, []))\n",
"test_y.extend(sum(new_test_y, []))\n",
"\n",
"\n",
"# print(np.unique(train_y, return_counts=True)[1], len(np.unique(train_y, return_counts=True)[1]))\n",
"# print(np.unique(val_y, return_counts=True)[1], len(np.unique(val_y, return_counts=True)[1]))\n",
"# print(np.unique(test_y, return_counts=True)[1], len(np.unique(test_y, return_counts=True)[1]))\n",
"\n",
"# Making Imbalanced train data\n",
"def get_img_num_per_cls(cls_num, imb_type, imb_factor, data_length):\n",
" img_max = data_length / cls_num\n",
" img_num_per_cls = []\n",
" if imb_type == \"exp\":\n",
" for cls_idx in range(cls_num):\n",
" num = img_max * (imb_factor ** (cls_idx / (cls_num - 1.0)))\n",
" img_num_per_cls.append(int(num))\n",
" elif imb_type == \"step\":\n",
" for cls_idx in range(cls_num // 2):\n",
" img_num_per_cls.append(int(img_max))\n",
" for cls_idx in range(cls_num // 2):\n",
" img_num_per_cls.append(int(img_max * imb_factor))\n",
" else:\n",
" img_num_per_cls.extend([int(img_max)] * cls_num)\n",
" return img_num_per_cls\n",
"\n",
"def gen_imbalanced_data(img_num_per_cls, data, targets):\n",
" new_data = []\n",
" new_targets = []\n",
" targets_np = np.array(targets, dtype=np.int64)\n",
" data = np.array(data)\n",
" classes = np.unique(targets_np)\n",
"\n",
" num_per_cls_dict = dict()\n",
" for the_class, the_img_num in zip(classes, img_num_per_cls):\n",
" num_per_cls_dict[the_class] = the_img_num\n",
" idx = np.where(targets_np == the_class)[0]\n",
" np.random.shuffle(idx)\n",
" selec_idx = idx[:the_img_num]\n",
" new_data.extend(data[selec_idx, ...])\n",
" new_targets.extend([the_class,]* the_img_num)\n",
" \n",
" # print(len(new_data[-1]))\n",
" # new_data = np.stack(new_data)\n",
" return new_data, new_targets\n",
"\n",
"# Convert WordNetID labels to a range from 0 to 100\n",
"train_y = [label_dict[i] for i in train_y]\n",
"val_y = [label_dict[i] for i in val_y]\n",
"test_y = [label_dict[i] for i in test_y]\n",
"\n",
"img_num_per_cls = get_img_num_per_cls(100, \"exp\", imb_ratio, len(train_x))\n",
"train_x, train_y = gen_imbalanced_data(img_num_per_cls, train_x, train_y)\n",
"\n",
"# Writing as txt into \"output\" dir in inits\n",
"dataxy = [(train_x, train_y), (val_x, val_y), (test_x, test_y)]\n",
"for i, j in enumerate([\"train\", \"val\", \"test\"]):\n",
" with open(f'{output}{imb_ratio}_{j}.txt', 'w') as f:\n",
" for line, lab in zip(dataxy[i][0], dataxy[i][1]):\n",
" f.write(line + \" \" + str(lab))\n",
" f.write('\\n')\n",
"\n",
"for phase in [\"train\",\"val\", \"test\"]:\n",
" finals = []\n",
" labels = []\n",
" input = f\"{output}{imb_ratio}_{phase}.txt\"\n",
" with open(input) as f:\n",
" for line in f:\n",
" finals.append(line.split()[0])\n",
" labels.append(line.split()[-1])\n",
"\n",
"\n",
" print(np.unique(labels, return_counts=True)[1])\n",
" max_val = max(np.unique(labels, return_counts=True)[1])\n",
" min_val = min(np.unique(labels, return_counts=True)[1])\n",
" sum_val = sum(np.unique(labels, return_counts=True)[1])\n",
" cls_count = len(np.unique(labels, return_counts=True)[1])\n",
" print(f\"{phase} -> Max: {max_val} | Min: {min_val} | Sum: {sum_val} | Imb: {max_val/min_val} | Class count: {cls_count}\")"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ by [Rahul Vigneswaran](https://rahulvigneswaran.github.io/), [Marc T. Law](http:
- [🧪 Advanced Usage](#-advanced-usage)
- [✔ Things to do before you run the code from this repo](#-things-to-do-before-you-run-the-code-from-this-repo)
- [📀 How to use?](#-how-to-use)
- [📚 How to create the mini-ImageNet-LT dataset?](#-how-to-create-the-mini-imagenet-lt-dataset)
- [⚙ Arguments](#-arguments)
- [🏋️‍♂️ Trained weights](#%EF%B8%8F%EF%B8%8F-trained-weights)
- [🪀 Results on a Toy Dataset](#-results-on-a-toy-dataset)
Expand Down Expand Up @@ -71,6 +72,8 @@ print(f"After: {np.unique(lab, return_counts=True)}")
- For CIFAR100-LT: `run_all_CIFAR100-LT.sh`
- For mini-ImageNet-LT : `run_all_mini-ImageNet-LT.sh`

### 📚 How to create the mini-ImageNet-LT dataset?
Check `Notebooks/Create_mini-ImageNet-LT.ipynb` for the script that generates the mini-ImageNet-LT dataset with varying imbalance ratios and train-test-val splits.
### ⚙ Arguments
- `--seed` : Select seed for fixing it.
- Default : `1`
Expand Down Expand Up @@ -182,7 +185,7 @@ Ignored `tailcalib_pip` as it is for the `tailcalib` pip package.
```
@inproceedings{rahul2021tailcalibX,
title = {{Feature Generation for Long-tail Classification}},
author = {Rahul Vigneswaran, Marc T. Law, Vineeth N. Balasubramanian, Makarand Tapaswi},
author = {Rahul Vigneswaran and Marc T. Law and Vineeth N. Balasubramanian and Makarand Tapaswi},
booktitle = {ICVGIP},
year = {2021}
}
Expand Down

0 comments on commit 00a15bc

Please sign in to comment.