Added mini-ImageNet-LT instructions

rahulvigneswaran · Nov 10, 2021 · 00a15bc · 00a15bc
1 parent e5f6e4b
commit 00a15bc
Show file tree

Hide file tree

Showing 3 changed files with 222 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,7 +9,7 @@ datasets
 extra
 garbage
 datasets
-Notebooks
+# Notebooks
 .vscode
 *.csv
 *.pt

diff --git a/Notebooks/Create_mini-ImageNet-LT.ipynb b/Notebooks/Create_mini-ImageNet-LT.ipynb
@@ -0,0 +1,217 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from os import walk\n",
+    "\n",
+    "# inits\n",
+    "output = f\"libs/data/mini-imagenet/\"    # Directory where the generated .txt files are written; the dataloader reads them later.\n",
+    "imb_ratio = 0.01    # Choose Imbalance ratio here\n",
+    "root = \"/home/rahul_intern/Imagenet/mini_imagenet\"  # Should point to the dir in which your ImageNet is.\n",
+    "\n",
+    "# Split ratios \n",
+    "train_split_ratio = 0.8\n",
+    "test_split_ratio = 1 - train_split_ratio\n",
+    "val_split_ratio = 0.2\n",
+    "\n",
+    "final = []\n",
+    "labels = []\n",
+    "final_1 = []\n",
+    "labels_1 = []\n",
+    "final_2 = []\n",
+    "labels_2 = []\n",
+    "\n",
+    "all_classes = []\n",
+    "equal_classes = []\n",
+    "unequal_classes = []\n",
+    "\n",
+    "mini_keys = ['n02110341', 'n01930112', 'n04509417', 'n04067472', 'n04515003', 'n02120079', 'n03924679', 'n02687172', 'n03075370', 'n07747607', 'n09246464', 'n02457408', 'n04418357', 'n03535780', 'n04435653', 'n03207743', 'n04251144', 'n03062245', 'n02174001', 'n07613480', 'n03998194', 'n02074367', 'n04146614', 'n04243546', 'n03854065', 'n03838899', 'n02871525', 'n03544143', 'n02108089', 'n13133613', 'n03676483', 'n03337140', 'n03272010', 'n01770081', 'n09256479', 'n02091244', 'n02116738', 'n04275548', 'n03773504', 'n02606052', 'n03146219', 'n04149813', 'n07697537', 'n02823428', 'n02089867', 'n03017168', 'n01704323', 'n01532829', 'n03047690', 'n03775546', 'n01843383', 'n02971356', 'n13054560', 'n02108551', 'n02101006', 'n03417042', 'n04612504', 'n01558993', 'n04522168', 'n02795169', 'n06794110', 'n01855672', 'n04258138', 'n02110063', 'n07584110', 'n02091831', 'n03584254', 'n03888605', 'n02113712', 'n03980874', 'n02219486', 'n02138441', 'n02165456', 'n02108915', 'n03770439', 'n01981276', 'n03220513', 'n02099601', 'n02747177', 'n01749939', 'n03476684', 'n02105505', 'n02950826', 'n04389033', 'n03347037', 'n02966193', 'n03127925', 'n03400231', 'n04296562', 'n03527444', 'n04443257', 'n02443484', 'n02114548', 'n04604644', 'n01910747', 'n04596742', 'n02111277', 'n03908618', 'n02129165', 'n02981792']\n",
+    "\n",
+    "with open(\"/home/rahul_intern/fb_dl_fresh/long_tail/libs/data/mini-imagenet/all_classes.txt\", \"r\") as f:\n",
+    "  for line in f:\n",
+    "    all_classes.append(str(line.strip()))\n",
+    "\n",
+    "with open(\"/home/rahul_intern/fb_dl_fresh/long_tail/libs/data/mini-imagenet/equal_classes.txt\", \"r\") as f:\n",
+    "  for line in f:\n",
+    "    equal_classes.append(str(line.strip()))\n",
+    "\n",
+    "with open(\"/home/rahul_intern/fb_dl_fresh/long_tail/libs/data/mini-imagenet/unequal_classes.txt\", \"r\") as f:\n",
+    "  for line in f:\n",
+    "    unequal_classes.append(str(line.strip()))\n",
+    "\n",
+    "filenames = next(walk(root), (None, None, []))[2]  \n",
+    "for name in filenames:\n",
+    "    label_temp = name.split(\"_\")[0]\n",
+    "    if label_temp in all_classes:\n",
+    "        final.append(name)\n",
+    "        labels.append(label_temp)\n",
+    "        if label_temp in equal_classes:\n",
+    "            final_1.append(name)\n",
+    "            labels_1.append(label_temp)\n",
+    "        else:\n",
+    "            final_2.append(name)\n",
+    "            labels_2.append(label_temp)\n",
+    "\n",
+    "actual_label = np.unique(labels)\n",
+    "pseudo_label = np.arange(len(np.unique(labels)))\n",
+    "\n",
+    "# Maps each unique label to an integer index from 0 to (number of classes - 1), and back\n",
+    "label_dict = {}\n",
+    "inverse_label_dict = {}\n",
+    "\n",
+    "for i,j in zip(actual_label, pseudo_label):\n",
+    "    label_dict[i] = j\n",
+    "    inverse_label_dict[j] = i\n",
+    "    \n",
+    "# Re-splitting the mini-imagenet which was made for few-shot into proper train, val, test sets.\n",
+    "train_x, test_x, train_y, test_y = train_test_split(final_1, labels_1, train_size=train_split_ratio, test_size=test_split_ratio, stratify=labels_1)\n",
+    "train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, train_size=1-val_split_ratio, test_size=val_split_ratio, stratify=train_y)\n",
+    "\n",
+    "# Limiting train, val, test datapoints to 500, 100, 100 per class (not very clean code, but it gets the job done)\n",
+    "def select_random_data(train_x, train_y, count=500):\n",
+    "    train_classwise_dict = {}\n",
+    "    for i, j in zip(train_y, train_x):\n",
+    "        if i in train_classwise_dict.keys():\n",
+    "            train_classwise_dict[i].extend([j])\n",
+    "        else:\n",
+    "            train_classwise_dict[i] = []\n",
+    "            train_classwise_dict[i].extend([j])\n",
+    "\n",
+    "    new_train_x = []\n",
+    "    new_train_y = []\n",
+    "    for i in train_classwise_dict.keys():\n",
+    "        ind1 = np.random.permutation(len(train_classwise_dict[i]))[:count]\n",
+    "        new_train_x.append(list(np.array(train_classwise_dict[i])[ind1]))\n",
+    "        new_train_y.append([i]*count)\n",
+    "    return sum(new_train_x, []), sum(new_train_y, [])\n",
+    "\n",
+    "train_x, train_y = select_random_data(train_x, train_y, 500)\n",
+    "val_x, val_y = select_random_data(val_x, val_y, 100)\n",
+    "test_x, test_y = select_random_data(test_x, test_y, 100)\n",
+    "\n",
+    "# Randomly select and limit datapoints per class from unequal_classes, divide them into train, val, test and append it to the already limited and divided train, val, test of equal_classes\n",
+    "train_classwise_dict = {}\n",
+    "for i, j in zip(labels_2, final_2):\n",
+    "    if i in train_classwise_dict.keys():\n",
+    "        train_classwise_dict[i].extend([j])\n",
+    "    else:\n",
+    "        train_classwise_dict[i] = []\n",
+    "        train_classwise_dict[i].extend([j])\n",
+    "\n",
+    "new_train_x = []\n",
+    "new_train_y = []\n",
+    "new_val_x = []\n",
+    "new_val_y = []\n",
+    "new_test_x = []\n",
+    "new_test_y = []\n",
+    "for i in train_classwise_dict.keys():\n",
+    "    ind1 = np.random.permutation(len(train_classwise_dict[i]))[:500]\n",
+    "    ind2 = np.random.permutation(len(train_classwise_dict[i]))[500:600]\n",
+    "    ind3 = np.random.permutation(len(train_classwise_dict[i]))[600:700]\n",
+    "    new_train_x.append(list(np.array(train_classwise_dict[i])[ind1]))\n",
+    "    new_val_x.append(list(np.array(train_classwise_dict[i])[ind2]))\n",
+    "    new_test_x.append(list(np.array(train_classwise_dict[i])[ind3]))\n",
+    "    new_train_y.append([i]*500)\n",
+    "    new_val_y.append([i]*100)\n",
+    "    new_test_y.append([i]*100)\n",
+    "\n",
+    "train_x.extend(sum(new_train_x, []))\n",
+    "train_y.extend(sum(new_train_y, []))\n",
+    "val_x.extend(sum(new_val_x, []))\n",
+    "val_y.extend(sum(new_val_y, []))\n",
+    "test_x.extend(sum(new_test_x, []))\n",
+    "test_y.extend(sum(new_test_y, []))\n",
+    "\n",
+    "\n",
+    "# print(np.unique(train_y, return_counts=True)[1], len(np.unique(train_y, return_counts=True)[1]))\n",
+    "# print(np.unique(val_y, return_counts=True)[1], len(np.unique(val_y, return_counts=True)[1]))\n",
+    "# print(np.unique(test_y, return_counts=True)[1], len(np.unique(test_y, return_counts=True)[1]))\n",
+    "\n",
+    "# Making Imbalanced train data\n",
+    "def get_img_num_per_cls(cls_num, imb_type, imb_factor, data_length):\n",
+    "    img_max = data_length / cls_num\n",
+    "    img_num_per_cls = []\n",
+    "    if imb_type == \"exp\":\n",
+    "        for cls_idx in range(cls_num):\n",
+    "            num = img_max * (imb_factor ** (cls_idx / (cls_num - 1.0)))\n",
+    "            img_num_per_cls.append(int(num))\n",
+    "    elif imb_type == \"step\":\n",
+    "        for cls_idx in range(cls_num // 2):\n",
+    "            img_num_per_cls.append(int(img_max))\n",
+    "        for cls_idx in range(cls_num // 2):\n",
+    "            img_num_per_cls.append(int(img_max * imb_factor))\n",
+    "    else:\n",
+    "        img_num_per_cls.extend([int(img_max)] * cls_num)\n",
+    "    return img_num_per_cls\n",
+    "\n",
+    "def gen_imbalanced_data(img_num_per_cls, data, targets):\n",
+    "        new_data = []\n",
+    "        new_targets = []\n",
+    "        targets_np = np.array(targets, dtype=np.int64)\n",
+    "        data = np.array(data)\n",
+    "        classes = np.unique(targets_np)\n",
+    "\n",
+    "        num_per_cls_dict = dict()\n",
+    "        for the_class, the_img_num in zip(classes, img_num_per_cls):\n",
+    "            num_per_cls_dict[the_class] = the_img_num\n",
+    "            idx = np.where(targets_np == the_class)[0]\n",
+    "            np.random.shuffle(idx)\n",
+    "            selec_idx = idx[:the_img_num]\n",
+    "            new_data.extend(data[selec_idx, ...])\n",
+    "            new_targets.extend([the_class,]* the_img_num)\n",
+    "        \n",
+    "        # print(len(new_data[-1]))\n",
+    "        # new_data = np.stack(new_data)\n",
+    "        return new_data, new_targets\n",
+    "\n",
+    "# Convert WordNetID labels to integer indices (0 to number of classes - 1)\n",
+    "train_y = [label_dict[i] for i in train_y]\n",
+    "val_y = [label_dict[i] for i in val_y]\n",
+    "test_y = [label_dict[i] for i in test_y]\n",
+    "\n",
+    "img_num_per_cls = get_img_num_per_cls(100, \"exp\", imb_ratio, len(train_x))\n",
+    "train_x, train_y = gen_imbalanced_data(img_num_per_cls, train_x, train_y)\n",
+    "\n",
+    "# Writing as txt into \"output\" dir in inits\n",
+    "dataxy = [(train_x, train_y), (val_x, val_y), (test_x, test_y)]\n",
+    "for i, j in enumerate([\"train\", \"val\", \"test\"]):\n",
+    "    with open(f'{output}{imb_ratio}_{j}.txt', 'w') as f:\n",
+    "        for line, lab in zip(dataxy[i][0], dataxy[i][1]):\n",
+    "            f.write(line + \" \" + str(lab))\n",
+    "            f.write('\\n')\n",
+    "\n",
+    "for phase in [\"train\",\"val\", \"test\"]:\n",
+    "    finals = []\n",
+    "    labels = []\n",
+    "    input = f\"{output}{imb_ratio}_{phase}.txt\"\n",
+    "    with open(input) as f:\n",
+    "        for line in f:\n",
+    "            finals.append(line.split()[0])\n",
+    "            labels.append(line.split()[-1])\n",
+    "\n",
+    "\n",
+    "    print(np.unique(labels, return_counts=True)[1])\n",
+    "    max_val = max(np.unique(labels, return_counts=True)[1])\n",
+    "    min_val = min(np.unique(labels, return_counts=True)[1])\n",
+    "    sum_val = sum(np.unique(labels, return_counts=True)[1])\n",
+    "    cls_count = len(np.unique(labels, return_counts=True)[1])\n",
+    "    print(f\"{phase} -> Max: {max_val} | Min: {min_val} | Sum: {sum_val} | Imb: {max_val/min_val} | Class count: {cls_count}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@ by [Rahul Vigneswaran](https://rahulvigneswaran.github.io/), [Marc T. Law](http:
   - [🧪 Advanced Usage](#-advanced-usage)
     - [✔ Things to do before you run the code from this repo](#-things-to-do-before-you-run-the-code-from-this-repo)
     - [📀 How to use?](#-how-to-use)
+    - [📚 How to create the mini-ImageNet-LT dataset?](#-how-to-create-the-mini-imagenet-lt-dataset)
     - [⚙ Arguments](#-arguments)
   - [🏋️‍♂️ Trained weights](#%EF%B8%8F%EF%B8%8F-trained-weights)
   - [🪀 Results on a Toy Dataset](#-results-on-a-toy-dataset)
@@ -71,6 +72,8 @@ print(f"After: {np.unique(lab, return_counts=True)}")
     - For CIFAR100-LT: `run_all_CIFAR100-LT.sh`
     - For mini-ImageNet-LT : `run_all_mini-ImageNet-LT.sh`
 
+### 📚 How to create the mini-ImageNet-LT dataset?
+Check `Notebooks/Create_mini-ImageNet-LT.ipynb` for the script that generates the mini-ImageNet-LT dataset with varying imbalance ratios and train-test-val splits.
 ### ⚙ Arguments
 - `--seed` : Select seed for fixing it. 
     - Default : `1`
@@ -182,7 +185,7 @@ Ignored `tailcalib_pip` as it is for the `tailcalib` pip package.
 ```
 @inproceedings{rahul2021tailcalibX,
     title   = {{Feature Generation for Long-tail Classification}},
-    author  = {Rahul Vigneswaran, Marc T. Law, Vineeth N. Balasubramanian, Makarand Tapaswi},
+    author  = {Rahul Vigneswaran and Marc T. Law and Vineeth N. Balasubramanian and Makarand Tapaswi},
     booktitle = {ICVGIP},
     year = {2021}
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,7 +9,7 @@ datasets @@
     extra
     garbage
     datasets
-    Notebooks
+    # Notebooks
     .vscode
     *.csv
     *.pt
@@ Expand Down @@