
Commit 3d92a09

Initial commit
0 parents  commit 3d92a09

File tree

11 files changed: +1177 −0 lines


README.md

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
# Text to Image Synthesis using Skip-thought Vectors

## Description

This is a PyTorch implementation of the paper Generative Adversarial Text-to-Image Synthesis (http://arxiv.org/abs/1605.05396), using skip-thought vectors for the caption embedding. The implementation is based on DCGAN. Below is the model architecture, where the blue bars represent the skip-thought vectors for the captions.

[Figure: model architecture]
Image Source: Paper
## Setup and Installation

* Python==3.6.6
* PyTorch==0.4.0
* TorchVision==0.2.1
* Theano
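One possible way to set up the environment (a sketch assuming pip; availability of these exact versions may vary by platform):

```
pip install torch==0.4.0 torchvision==0.2.1 Theano
```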
## Dataset

* This model has been trained on the flowers dataset. Download the flower dataset from here[] and save the images in the Data folder as Data/flowers.

* Now download the corresponding captions from here[]. After extracting, copy the text_c10 folder and paste it into the Data folder as Data/text_c10.
## Skip-Thought Model

* Download the pretrained models and vocabulary for skip-thought vectors as per the instructions given below. Save the downloaded files in Data/skipthoughts.

* Some of the files are quite large (>2 GB), so make sure there is enough disk space available.

* Run the command below to download the skip-thought model and all other required files:

```
python download_skipthought.py
```
## Usage

* Data pre-processing:

```
python data_loader.py
```

* Training arguments:
  * dataset : Dataset used. Default = flowers
  * batch_size : Batch size. Default = 1
  * num_epochs : Number of epochs to train. Default = 200
  * img_size : Size of the image. Default = 64
  * z_dim : Latent variable dimension. Default = 100
  * text_embedding_dim : Embedding dimension of the caption. Default = 4800
  * reduced_text_dim : Reduced embedding dimension of the caption. Default = 1024
  * learning_rate : Learning rate. Default = 0.0002
  * beta1 : Hyperparameter of the Adam optimizer. Default = 0.5
  * beta2 : Hyperparameter of the Adam optimizer. Default = 0.999
  * l1_coeff : Coefficient for the L1 loss. Default = 50
  * resume_epoch : Epoch from which to resume training. Default = 1

* Train the model by running:

```
python main.py
```
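All arguments are optional. For example, to train with a larger batch for fewer epochs (the flag values here are illustrative; the flags themselves are defined in main.py):

```
python main.py --batch_size 64 --num_epochs 100
```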
* Test the model by giving a custom input caption:

```
python predict.py --text="Input caption to be used to generate the image"
```

The generated image will be saved to the Testing directory inside the Data folder, as Data/Testing.
## Model key-points

* Skip-Thought is an efficient model used for sentence embedding, based on the concept of word embeddings (word2vec or GloVe). It returns a NumPy array of dimension 4800, in which the first 2400 dimensions come from the uni-skip model and the last 2400 dimensions from the bi-skip model. We use the combine-skip vectors because, experimentally, they perform the best; a minimal encoding sketch follows.
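A minimal sketch of producing combine-skip embeddings, assuming the skipthoughts module from https://github.com/ryankiros/skip-thoughts (the same calls data_loader.py relies on) is importable:

```python
import skipthoughts

# Load the pretrained uni-skip and bi-skip models (downloaded by download_skipthought.py)
model = skipthoughts.load_model()

# Encode a caption into a combine-skip vector
vectors = skipthoughts.encode(model, ['a flower with long yellow petals'])
print(vectors.shape)  # (1, 4800): first 2400 dims are uni-skip, last 2400 bi-skip
```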
* The Text2Image model is a Generative Adversarial Network based model built on top of DCGAN. It consists of a Discriminator network and a Generator network.

* The Discriminator network classifies as fake not only the images produced by the Generator, but also real images that do not correspond to the correct caption. In short, fake examples fall into the following categories (see the loss sketch after this list):
  * Fake image + correct caption
  * Real image + incorrect caption
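A hedged sketch of the matching-aware discriminator objective implied above. The discriminator is assumed to return (probability, logit) as in nets/discriminator.py; the 0.5 weighting of the two fake terms follows the GAN-CLS paper and is an assumption, not taken from this repo:

```python
import torch
import torch.nn.functional as F

def discriminator_loss(disc, real_imgs, fake_imgs, wrong_imgs, caption_embed):
    real_p, _ = disc(real_imgs, caption_embed)    # real image + correct caption
    fake_p, _ = disc(fake_imgs, caption_embed)    # generated image + correct caption
    wrong_p, _ = disc(wrong_imgs, caption_embed)  # real image + incorrect caption

    ones = torch.ones_like(real_p)
    zeros = torch.zeros_like(real_p)

    # Real pairs are pushed toward 1; both kinds of fake pairs toward 0.
    return (F.binary_cross_entropy(real_p, ones)
            + 0.5 * (F.binary_cross_entropy(fake_p, zeros)
                     + F.binary_cross_entropy(wrong_p, zeros)))
```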
* Images are 64 x 64 in dimension.
## Generated Images

Following are some of the images generated by this model:

[A table of 5-6 images along with their captions]

## TODO

Implement the same model using an autoencoder for sentence embedding.

## References

* Generative Adversarial Text-to-Image Synthesis - http://arxiv.org/abs/1605.05396
* TensorFlow implementation - https://github.com/paarthneekhara/text-to-image
* Skip-Thought Model - https://github.com/ryankiros/skip-thoughts

## License

MIT

data_loader.py

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
import os
import torch
import skipthoughts
import numpy as np
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset

# Each batch will have 3 things: the true image, its captions (5), and a false image
# (a real image, but one corresponding to an incorrect caption).
# The discriminator is trained so that true_img + caption counts as a real example and
# false_img + caption counts as a fake example.


class Text2ImageDataset(Dataset):

    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.load_flower_dataset()

    def load_flower_dataset(self):
        # Builds two things: a list of image file names, and a dictionary of 5 captions
        # per image, with the image file name as the key and 5 values (captions) per key.

        print("------------------ Loading images ------------------")
        self.img_files = []
        for f in os.listdir(os.path.join(self.data_dir, 'flowers')):
            self.img_files.append(f)

        print('Total number of images : {}'.format(len(self.img_files)))

        print("------------------ Loading captions ----------------")
        self.img_captions = {}
        text_dir = os.path.join(self.data_dir, 'text_c10')
        for class_dir in tqdm(os.listdir(text_dir)):
            if 't7' not in class_dir:
                class_path = os.path.join(text_dir, class_dir)
                for cap_file in os.listdir(class_path):
                    if 'txt' in cap_file:
                        with open(os.path.join(class_path, cap_file)) as f:
                            captions = f.read().split('\n')
                        img_file = cap_file[:11] + '.jpg'
                        # 5 captions per image
                        self.img_captions[img_file] = captions[:5]

        print("--------------- Loading Skip-thought Model ---------------")
        model = skipthoughts.load_model()
        self.encoded_captions = {}

        print("------------ Encoding of image captions STARTED ------------")
        for img_file in self.img_captions:
            # skipthoughts.encode returns a numpy array of shape (5, 4800)
            self.encoded_captions[img_file] = skipthoughts.encode(model, self.img_captions[img_file])

        print("------------- Encoding of image captions DONE -------------")

    def read_image(self, image_file_name):
        image = Image.open(os.path.join(self.data_dir, 'flowers', image_file_name))
        # Resize so every image comes out with shape (64, 64, 3)
        image = np.array(image.resize((64, 64)))
        return image

    def get_false_img(self, index):
        # Pick a random image different from the one at `index`
        false_img_id = np.random.randint(len(self.img_files))
        if false_img_id != index:
            return self.img_files[false_img_id]

        return self.get_false_img(index)

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, index):
        sample = {}
        sample['true_imgs'] = torch.FloatTensor(self.read_image(self.img_files[index]))
        sample['false_imgs'] = torch.FloatTensor(self.read_image(self.get_false_img(index)))
        sample['true_embed'] = torch.FloatTensor(self.encoded_captions[self.img_files[index]])

        return sample
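A minimal usage sketch for this dataset class (assuming the Data directory layout described in the README and that the skip-thought files are already downloaded):

```python
from torch.utils.data import DataLoader
from data_loader import Text2ImageDataset

dataset = Text2ImageDataset('Data')
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Each sample bundles a matching image, a mismatched image, and the caption embeddings
batch = next(iter(loader))
print(batch['true_imgs'].shape)   # torch.Size([1, 64, 64, 3])
print(batch['true_embed'].shape)  # torch.Size([1, 5, 4800])
```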

download_skipthought.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
import os

print('Downloading Skip-Thought Model ...........')
# Pretrained skip-thought model weights and vocabulary tables
os.system('wget http://www.cs.toronto.edu/~rkiros/models/dictionary.txt')
os.system('wget http://www.cs.toronto.edu/~rkiros/models/utable.npy')
os.system('wget http://www.cs.toronto.edu/~rkiros/models/btable.npy')
os.system('wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz')
os.system('wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl')
os.system('wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz')
os.system('wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl')

print('Download Completed ............')

imgs/net.jpeg

71.6 KB

main.py

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
import os
import argparse

from train import GAN_CLS
from torch.utils.data import DataLoader
from data_loader import Text2ImageDataset


def check_dir(dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        print('{} created'.format(dir_name))


def check_args(args):
    # Make all directories if they don't exist

    # --checkpoint_dir
    check_dir(args.checkpoint_dir)

    # --sample_dir
    check_dir(args.sample_dir)

    # --log_dir
    check_dir(args.log_dir)

    # --final_model dir
    check_dir(args.final_model)

    # --num_epochs
    assert args.num_epochs > 0, 'Number of epochs must be greater than 0'

    # --batch_size
    assert args.batch_size > 0, 'Batch size must be greater than zero'

    # --z_dim
    assert args.z_dim > 0, 'Size of the noise vector must be greater than zero'

    return args


def main():

    parser = argparse.ArgumentParser()

    parser.add_argument_group('Dataset related arguments')
    parser.add_argument('--data_dir', type=str, default="Data",
                        help='Data directory')

    parser.add_argument('--dataset', type=str, default="flowers",
                        help='Dataset to train on')

    parser.add_argument_group('Model saving path and steps related arguments')
    parser.add_argument('--log_step', type=int, default=100,
                        help='Save INFO into logger after every x iterations')

    parser.add_argument('--sample_step', type=int, default=100,
                        help='Save generated image after every x iterations')

    parser.add_argument('--checkpoint_dir', type=str, default='checkpoints',
                        help='Directory in which to save model checkpoints')

    parser.add_argument('--sample_dir', type=str, default='sample',
                        help='Directory in which to save generated samples')

    parser.add_argument('--log_dir', type=str, default='logs',
                        help='Directory in which to save logs')

    parser.add_argument('--final_model', type=str, default='final_model',
                        help='Directory in which to save the final trained model')

    parser.add_argument_group('Model training related arguments')
    parser.add_argument('--num_epochs', type=int, default=200,
                        help='Total number of epochs to train')

    parser.add_argument('--batch_size', type=int, default=1,
                        help='Batch size')

    parser.add_argument('--img_size', type=int, default=64,
                        help='Size of the image')

    parser.add_argument('--z_dim', type=int, default=100,
                        help='Size of the latent variable (noise vector)')

    parser.add_argument('--text_embed_dim', type=int, default=4800,
                        help='Size of the embedding for the captions')

    parser.add_argument('--text_reduced_dim', type=int, default=1024,
                        help='Reduced dimension of the caption encoding')

    parser.add_argument('--learning_rate', type=float, default=0.0002,
                        help='Learning rate')

    parser.add_argument('--beta1', type=float, default=0.5,
                        help='Hyperparameter of the Adam optimizer')

    parser.add_argument('--beta2', type=float, default=0.999,
                        help='Hyperparameter of the Adam optimizer')

    parser.add_argument('--l1_coeff', type=float, default=50,
                        help='Coefficient for the L1 loss')

    parser.add_argument('--resume_epoch', type=int, default=1,
                        help='Epoch from which to resume training')

    args = parser.parse_args()

    check_args(args)

    dataset = Text2ImageDataset(args.data_dir)
    data_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    gan = GAN_CLS(args, data_loader)

    gan.build_model()
    gan.train_model()


if __name__ == '__main__':
    main()

nets/__init__.py

Whitespace-only changes.

nets/discriminator.py

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
import torch
import torch.nn as nn


class Discriminator(nn.Module):
    def __init__(self, batch_size, img_size, text_embed_dim, text_reduced_dim):
        super(Discriminator, self).__init__()

        self.batch_size = batch_size
        self.img_size = img_size
        self.in_channels = 3  # RGB images
        self.text_embed_dim = text_embed_dim
        self.text_reduced_dim = text_reduced_dim

        # Defining the discriminator network architecture
        self.d_net = nn.Sequential(
            nn.Conv2d(self.in_channels, 64, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, 128, 4, 2, 1, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(128, 256, 4, 2, 1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(256, 512, 4, 2, 1, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.2, inplace=True))

        # d_net output size = (batch_size, 512, 4, 4)
        # text.size() = (batch_size, text_embed_dim)

        # Linear layer to reduce the dimensionality of the caption embedding
        # from text_embed_dim to text_reduced_dim
        self.reduce_text = nn.Linear(self.text_embed_dim, self.text_reduced_dim)

        self.cat_net = nn.Sequential(
            nn.Conv2d(512 + self.text_reduced_dim, 512, 4, 2, 1, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.2, inplace=True))

        self.linear = nn.Linear(2 * 2 * 512, 1)

    def forward(self, image, text):
        """ Given an image and its caption embedding, predict whether the image
        is real or fake.

        Arguments
        ---------
        image : torch.FloatTensor
            image.size() = (batch_size, 3, 64, 64)

        text : torch.FloatTensor
            Output of the skip-thought embedding model for the caption.
            text.size() = (batch_size, text_embed_dim)

        Returns
        -------
        output : Probability of the image being real
        logit : Final score of the discriminator
        """

        d_net_out = self.d_net(image)               # (batch_size, 512, 4, 4)
        text_reduced = self.reduce_text(text)       # (batch_size, text_reduced_dim)
        text_reduced = text_reduced.unsqueeze(2)    # (batch_size, text_reduced_dim, 1)
        text_reduced = text_reduced.unsqueeze(3)    # (batch_size, text_reduced_dim, 1, 1)
        # Tile the caption encoding over the 4 x 4 spatial grid of d_net's output
        text_reduced = text_reduced.expand(-1, self.text_reduced_dim, 4, 4)

        # Concatenate along the channel dimension
        concat_out = torch.cat((d_net_out, text_reduced), 1)  # (batch_size, 512 + text_reduced_dim, 4, 4)

        cat_out = self.cat_net(concat_out)                    # (batch_size, 512, 2, 2)
        logit = self.linear(cat_out.view(-1, 2 * 2 * 512))    # (batch_size, 1)
        output = torch.sigmoid(logit)

        return output, logit
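A quick shape check for this module (a hedged sketch; the constructor values mirror the defaults in main.py and the inputs are random stand-ins):

```python
import torch
from nets.discriminator import Discriminator

disc = Discriminator(batch_size=1, img_size=64, text_embed_dim=4800, text_reduced_dim=1024)
imgs = torch.randn(1, 3, 64, 64)    # a random RGB image batch
embeds = torch.randn(1, 4800)       # a random caption embedding
prob, logit = disc(imgs, embeds)
print(prob.shape, logit.shape)      # torch.Size([1, 1]) torch.Size([1, 1])
```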
