-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
e5f6e4b
commit 00a15bc
Showing
3 changed files
with
222 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,7 @@ datasets | |
extra | ||
garbage | ||
datasets | ||
Notebooks | ||
# Notebooks | ||
.vscode | ||
*.csv | ||
*.pt | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Imports\n", | ||
"import numpy as np\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"from os import walk\n", | ||
"\n", | ||
"# inits\n", | ||
"output = f\"libs/data/mini-imagenet/\" # Should be the place you want to output the .txt file which the dataloader will read later.\n", | ||
"imb_ratio = 0.01 # Choose Imbalance ratio here\n", | ||
"root = \"/home/rahul_intern/Imagenet/mini_imagenet\" # Should point to the dir in which your ImagNet is.\n", | ||
"\n", | ||
"# Split ratios \n", | ||
"train_split_ratio = 0.8\n", | ||
"test_split_ratio = 1 - train_split_ratio\n", | ||
"val_split_ratio = 0.2\n", | ||
"\n", | ||
"final = []\n", | ||
"labels = []\n", | ||
"final_1 = []\n", | ||
"labels_1 = []\n", | ||
"final_2 = []\n", | ||
"labels_2 = []\n", | ||
"\n", | ||
"all_classes = []\n", | ||
"equal_classes = []\n", | ||
"unequal_classes = []\n", | ||
"\n", | ||
"mini_keys = ['n02110341', 'n01930112', 'n04509417', 'n04067472', 'n04515003', 'n02120079', 'n03924679', 'n02687172', 'n03075370', 'n07747607', 'n09246464', 'n02457408', 'n04418357', 'n03535780', 'n04435653', 'n03207743', 'n04251144', 'n03062245', 'n02174001', 'n07613480', 'n03998194', 'n02074367', 'n04146614', 'n04243546', 'n03854065', 'n03838899', 'n02871525', 'n03544143', 'n02108089', 'n13133613', 'n03676483', 'n03337140', 'n03272010', 'n01770081', 'n09256479', 'n02091244', 'n02116738', 'n04275548', 'n03773504', 'n02606052', 'n03146219', 'n04149813', 'n07697537', 'n02823428', 'n02089867', 'n03017168', 'n01704323', 'n01532829', 'n03047690', 'n03775546', 'n01843383', 'n02971356', 'n13054560', 'n02108551', 'n02101006', 'n03417042', 'n04612504', 'n01558993', 'n04522168', 'n02795169', 'n06794110', 'n01855672', 'n04258138', 'n02110063', 'n07584110', 'n02091831', 'n03584254', 'n03888605', 'n02113712', 'n03980874', 'n02219486', 'n02138441', 'n02165456', 'n02108915', 'n03770439', 'n01981276', 'n03220513', 'n02099601', 'n02747177', 'n01749939', 'n03476684', 'n02105505', 'n02950826', 'n04389033', 'n03347037', 'n02966193', 'n03127925', 'n03400231', 'n04296562', 'n03527444', 'n04443257', 'n02443484', 'n02114548', 'n04604644', 'n01910747', 'n04596742', 'n02111277', 'n03908618', 'n02129165', 'n02981792']\n", | ||
"\n", | ||
"with open(\"/home/rahul_intern/fb_dl_fresh/long_tail/libs/data/mini-imagenet/all_classes.txt\", \"r\") as f:\n", | ||
" for line in f:\n", | ||
" all_classes.append(str(line.strip()))\n", | ||
"\n", | ||
"with open(\"/home/rahul_intern/fb_dl_fresh/long_tail/libs/data/mini-imagenet/equal_classes.txt\", \"r\") as f:\n", | ||
" for line in f:\n", | ||
" equal_classes.append(str(line.strip()))\n", | ||
"\n", | ||
"with open(\"/home/rahul_intern/fb_dl_fresh/long_tail/libs/data/mini-imagenet/unequal_classes.txt\", \"r\") as f:\n", | ||
" for line in f:\n", | ||
" unequal_classes.append(str(line.strip()))\n", | ||
"\n", | ||
"filenames = next(walk(root), (None, None, []))[2] \n", | ||
"for name in filenames:\n", | ||
" label_temp = name.split(\"_\")[0]\n", | ||
" if label_temp in all_classes:\n", | ||
" final.append(name)\n", | ||
" labels.append(label_temp)\n", | ||
" if label_temp in equal_classes:\n", | ||
" final_1.append(name)\n", | ||
" labels_1.append(label_temp)\n", | ||
" else:\n", | ||
" final_2.append(name)\n", | ||
" labels_2.append(label_temp)\n", | ||
"\n", | ||
"actual_label = np.unique(labels)\n", | ||
"pseudo_label = np.arange(len(np.unique(labels)))\n", | ||
"\n", | ||
"# Converts the labels to range of 0 to max ints\n", | ||
"label_dict = {}\n", | ||
"inverse_label_dict = {}\n", | ||
"\n", | ||
"for i,j in zip(actual_label, pseudo_label):\n", | ||
" label_dict[i] = j\n", | ||
" inverse_label_dict[j] = i\n", | ||
" \n", | ||
"# Re-splitting the mini-imagenet which was made for few-shot into proper train, val, test sets.\n", | ||
"train_x, test_x, train_y, test_y = train_test_split(final_1, labels_1, train_size=train_split_ratio, test_size=test_split_ratio, stratify=labels_1)\n", | ||
"train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, train_size=1-val_split_ratio, test_size=val_split_ratio, stratify=train_y)\n", | ||
"\n", | ||
"# Limiting train, val, test datapoints to 500, 100, 100 per class (Not a very clean code but gets the job done)\n", | ||
"def select_random_data(train_x, train_y, count=500):\n", | ||
" train_classwise_dict = {}\n", | ||
" for i, j in zip(train_y, train_x):\n", | ||
" if i in train_classwise_dict.keys():\n", | ||
" train_classwise_dict[i].extend([j])\n", | ||
" else:\n", | ||
" train_classwise_dict[i] = []\n", | ||
" train_classwise_dict[i].extend([j])\n", | ||
"\n", | ||
" new_train_x = []\n", | ||
" new_train_y = []\n", | ||
" for i in train_classwise_dict.keys():\n", | ||
" ind1 = np.random.permutation(len(train_classwise_dict[i]))[:count]\n", | ||
" new_train_x.append(list(np.array(train_classwise_dict[i])[ind1]))\n", | ||
" new_train_y.append([i]*count)\n", | ||
" return sum(new_train_x, []), sum(new_train_y, [])\n", | ||
"\n", | ||
"train_x, train_y = select_random_data(train_x, train_y, 500)\n", | ||
"val_x, val_y = select_random_data(val_x, val_y, 100)\n", | ||
"test_x, test_y = select_random_data(test_x, test_y, 100)\n", | ||
"\n", | ||
"# Randomly select and limit datapoints per class from unequal_classes, divide them into train, val, test and append it to the already limited and divided train, val, test of equal_classes\n", | ||
"train_classwise_dict = {}\n", | ||
"for i, j in zip(labels_2, final_2):\n", | ||
" if i in train_classwise_dict.keys():\n", | ||
" train_classwise_dict[i].extend([j])\n", | ||
" else:\n", | ||
" train_classwise_dict[i] = []\n", | ||
" train_classwise_dict[i].extend([j])\n", | ||
"\n", | ||
"new_train_x = []\n", | ||
"new_train_y = []\n", | ||
"new_val_x = []\n", | ||
"new_val_y = []\n", | ||
"new_test_x = []\n", | ||
"new_test_y = []\n", | ||
"for i in train_classwise_dict.keys():\n", | ||
" ind1 = np.random.permutation(len(train_classwise_dict[i]))[:500]\n", | ||
" ind2 = np.random.permutation(len(train_classwise_dict[i]))[500:600]\n", | ||
" ind3 = np.random.permutation(len(train_classwise_dict[i]))[600:700]\n", | ||
" new_train_x.append(list(np.array(train_classwise_dict[i])[ind1]))\n", | ||
" new_val_x.append(list(np.array(train_classwise_dict[i])[ind2]))\n", | ||
" new_test_x.append(list(np.array(train_classwise_dict[i])[ind3]))\n", | ||
" new_train_y.append([i]*500)\n", | ||
" new_val_y.append([i]*100)\n", | ||
" new_test_y.append([i]*100)\n", | ||
"\n", | ||
"train_x.extend(sum(new_train_x, []))\n", | ||
"train_y.extend(sum(new_train_y, []))\n", | ||
"val_x.extend(sum(new_val_x, []))\n", | ||
"val_y.extend(sum(new_val_y, []))\n", | ||
"test_x.extend(sum(new_test_x, []))\n", | ||
"test_y.extend(sum(new_test_y, []))\n", | ||
"\n", | ||
"\n", | ||
"# print(np.unique(train_y, return_counts=True)[1], len(np.unique(train_y, return_counts=True)[1]))\n", | ||
"# print(np.unique(val_y, return_counts=True)[1], len(np.unique(val_y, return_counts=True)[1]))\n", | ||
"# print(np.unique(test_y, return_counts=True)[1], len(np.unique(test_y, return_counts=True)[1]))\n", | ||
"\n", | ||
"# Making Imbalanced train data\n", | ||
"def get_img_num_per_cls(cls_num, imb_type, imb_factor, data_length):\n", | ||
" img_max = data_length / cls_num\n", | ||
" img_num_per_cls = []\n", | ||
" if imb_type == \"exp\":\n", | ||
" for cls_idx in range(cls_num):\n", | ||
" num = img_max * (imb_factor ** (cls_idx / (cls_num - 1.0)))\n", | ||
" img_num_per_cls.append(int(num))\n", | ||
" elif imb_type == \"step\":\n", | ||
" for cls_idx in range(cls_num // 2):\n", | ||
" img_num_per_cls.append(int(img_max))\n", | ||
" for cls_idx in range(cls_num // 2):\n", | ||
" img_num_per_cls.append(int(img_max * imb_factor))\n", | ||
" else:\n", | ||
" img_num_per_cls.extend([int(img_max)] * cls_num)\n", | ||
" return img_num_per_cls\n", | ||
"\n", | ||
"def gen_imbalanced_data(img_num_per_cls, data, targets):\n", | ||
" new_data = []\n", | ||
" new_targets = []\n", | ||
" targets_np = np.array(targets, dtype=np.int64)\n", | ||
" data = np.array(data)\n", | ||
" classes = np.unique(targets_np)\n", | ||
"\n", | ||
" num_per_cls_dict = dict()\n", | ||
" for the_class, the_img_num in zip(classes, img_num_per_cls):\n", | ||
" num_per_cls_dict[the_class] = the_img_num\n", | ||
" idx = np.where(targets_np == the_class)[0]\n", | ||
" np.random.shuffle(idx)\n", | ||
" selec_idx = idx[:the_img_num]\n", | ||
" new_data.extend(data[selec_idx, ...])\n", | ||
" new_targets.extend([the_class,]* the_img_num)\n", | ||
" \n", | ||
" # print(len(new_data[-1]))\n", | ||
" # new_data = np.stack(new_data)\n", | ||
" return new_data, new_targets\n", | ||
"\n", | ||
"# Convert WordNetID labels to a range from 0 to 100\n", | ||
"train_y = [label_dict[i] for i in train_y]\n", | ||
"val_y = [label_dict[i] for i in val_y]\n", | ||
"test_y = [label_dict[i] for i in test_y]\n", | ||
"\n", | ||
"img_num_per_cls = get_img_num_per_cls(100, \"exp\", imb_ratio, len(train_x))\n", | ||
"train_x, train_y = gen_imbalanced_data(img_num_per_cls, train_x, train_y)\n", | ||
"\n", | ||
"# Writing as txt into \"output\" dir in inits\n", | ||
"dataxy = [(train_x, train_y), (val_x, val_y), (test_x, test_y)]\n", | ||
"for i, j in enumerate([\"train\", \"val\", \"test\"]):\n", | ||
" with open(f'{output}{imb_ratio}_{j}.txt', 'w') as f:\n", | ||
" for line, lab in zip(dataxy[i][0], dataxy[i][1]):\n", | ||
" f.write(line + \" \" + str(lab))\n", | ||
" f.write('\\n')\n", | ||
"\n", | ||
"for phase in [\"train\",\"val\", \"test\"]:\n", | ||
" finals = []\n", | ||
" labels = []\n", | ||
" input = f\"{output}{imb_ratio}_{phase}.txt\"\n", | ||
" with open(input) as f:\n", | ||
" for line in f:\n", | ||
" finals.append(line.split()[0])\n", | ||
" labels.append(line.split()[-1])\n", | ||
"\n", | ||
"\n", | ||
" print(np.unique(labels, return_counts=True)[1])\n", | ||
" max_val = max(np.unique(labels, return_counts=True)[1])\n", | ||
" min_val = min(np.unique(labels, return_counts=True)[1])\n", | ||
" sum_val = sum(np.unique(labels, return_counts=True)[1])\n", | ||
" cls_count = len(np.unique(labels, return_counts=True)[1])\n", | ||
" print(f\"{phase} -> Max: {max_val} | Min: {min_val} | Sum: {sum_val} | Imb: {max_val/min_val} | Class count: {cls_count}\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"language_info": { | ||
"name": "python" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters