
Commit

Merge pull request #4 from chirag26495/basic_vqa_1
model files
adhiraj2001 authored Apr 27, 2023
2 parents 5ef2630 + ea7c550 commit 9ba4efd
Showing 6 changed files with 929 additions and 0 deletions.
88 changes: 88 additions & 0 deletions model_files/models_basic.py
@@ -0,0 +1,88 @@
import torch
import torch.nn as nn
import torchvision.models as models


class ImgEncoder(nn.Module):

def __init__(self, embed_size):
"""(1) Load the pretrained model as you want.
cf) one needs to check structure of model using 'print(model)'
to remove last fc layer from the model.
(2) Replace final fc layer (score values from the ImageNet)
with new fc layer (image feature).
(3) Normalize feature vector.
"""
super(ImgEncoder, self).__init__()
# model = models.vgg19(pretrained=True)
model = models.vgg19(weights=models.VGG19_Weights.DEFAULT)
in_features = model.classifier[-1].in_features # input size of feature vector
model.classifier = nn.Sequential(
*list(model.classifier.children())[:-1]) # remove last fc layer

self.model = model # loaded model without last fc layer
self.fc = nn.Linear(in_features, embed_size) # feature vector of image

def forward(self, image):
"""Extract feature vector from image vector.
"""
        with torch.no_grad():                             # keep the pretrained VGG19 frozen
            img_feature = self.model(image)               # [batch_size, vgg19_fc=4096]
        img_feature = self.fc(img_feature)                # [batch_size, embed_size]; the projection layer stays trainable

l2_norm = img_feature.norm(p=2, dim=1, keepdim=True).detach()
img_feature = img_feature.div(l2_norm) # l2-normalized feature vector

return img_feature


class QstEncoder(nn.Module):

def __init__(self, qst_vocab_size, word_embed_size, embed_size, num_layers, hidden_size):

super(QstEncoder, self).__init__()
self.word2vec = nn.Embedding(qst_vocab_size, word_embed_size)
self.tanh = nn.Tanh()
self.lstm = nn.LSTM(word_embed_size, hidden_size, num_layers)
self.fc = nn.Linear(2*num_layers*hidden_size, embed_size) # 2 for hidden and cell states

def forward(self, question):

qst_vec = self.word2vec(question) # [batch_size, max_qst_length=30, word_embed_size=300]
qst_vec = self.tanh(qst_vec)
qst_vec = qst_vec.transpose(0, 1) # [max_qst_length=30, batch_size, word_embed_size=300]
_, (hidden, cell) = self.lstm(qst_vec) # [num_layers=2, batch_size, hidden_size=512]
qst_feature = torch.cat((hidden, cell), 2) # [num_layers=2, batch_size, 2*hidden_size=1024]
qst_feature = qst_feature.transpose(0, 1) # [batch_size, num_layers=2, 2*hidden_size=1024]
qst_feature = qst_feature.reshape(qst_feature.size()[0], -1) # [batch_size, 2*num_layers*hidden_size=2048]
qst_feature = self.tanh(qst_feature)
qst_feature = self.fc(qst_feature) # [batch_size, embed_size]

return qst_feature


class VqaModel(nn.Module):

def __init__(self, embed_size, qst_vocab_size, ans_vocab_size, word_embed_size, num_layers, hidden_size):

super(VqaModel, self).__init__()
self.img_encoder = ImgEncoder(embed_size)
self.qst_encoder = QstEncoder(qst_vocab_size, word_embed_size, embed_size, num_layers, hidden_size)
self.tanh = nn.Tanh()
self.dropout = nn.Dropout(0.5)
self.fc1 = nn.Linear(embed_size, ans_vocab_size)
self.fc2 = nn.Linear(ans_vocab_size, ans_vocab_size)

def forward(self, img, qst):

img_feature = self.img_encoder(img) # [batch_size, embed_size]
qst_feature = self.qst_encoder(qst) # [batch_size, embed_size]
combined_feature = torch.mul(img_feature, qst_feature) # [batch_size, embed_size]
combined_feature = self.tanh(combined_feature)
combined_feature = self.dropout(combined_feature)
combined_feature = self.fc1(combined_feature) # [batch_size, ans_vocab_size=1000]
combined_feature = self.tanh(combined_feature)
combined_feature = self.dropout(combined_feature)
combined_feature = self.fc2(combined_feature) # [batch_size, ans_vocab_size=1000]

return combined_feature
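
A minimal usage sketch for models_basic.py (not part of the commit; the hyperparameters below are illustrative assumptions rather than the repository's training configuration, and it assumes model_files is importable from the repo root):

import torch
from model_files.models_basic import VqaModel

# Hypothetical hyperparameters chosen for illustration only.
model = VqaModel(embed_size=1024, qst_vocab_size=10000, ans_vocab_size=1000,
                 word_embed_size=300, num_layers=2, hidden_size=512)

img = torch.randn(8, 3, 224, 224)          # batch of 224x224 RGB images for VGG19
qst = torch.randint(0, 10000, (8, 30))     # [batch_size, max_qst_length] word indices
out = model(img, qst)                      # [8, 1000] unnormalized answer scores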
120 changes: 120 additions & 0 deletions model_files/models_basic_variant.py
@@ -0,0 +1,120 @@
import torch
import torch.nn as nn
import torchvision.models as models


class ImgEncoder(nn.Module):

def __init__(self, embed_size):
"""(1) Load the pretrained model as you want.
cf) one needs to check structure of model using 'print(model)'
to remove last fc layer from the model.
(2) Replace final fc layer (score values from the ImageNet)
with new fc layer (image feature).
(3) Normalize feature vector.
"""
super(ImgEncoder, self).__init__()

# vggnet_feat = models.vgg19(pretrained=True).features
vggnet_feat = models.vgg19(weights=models.VGG19_Weights.DEFAULT).features

        modules = list(vggnet_feat.children())[:-2]       # drop the final ReLU and max-pool; forward() expects 14x14 (=196) maps
        self.cnn = nn.Sequential(*modules)
        self.fc = nn.Sequential(nn.Linear(self.cnn[-3].out_channels, embed_size),   # 512 -> embed_size per region
                                nn.Tanh())                # feature vector of each image region

def forward(self, image):
"""Extract feature vector from image vector.
# """
with torch.no_grad():
img_feature = self.cnn(image) # [batch_size, vgg16(19)_fc=4096]
img_feature = img_feature.view(-1, 512, 196).transpose(1,2) # [batch_size, 196, 512]
img_feature = self.fc(img_feature) # [batch_size, 196, embed_size]

return img_feature


class QstEncoder(nn.Module):

def __init__(self, qst_vocab_size, word_embed_size, embed_size, num_layers, hidden_size):
super(QstEncoder, self).__init__()

self.word2vec = nn.Embedding(qst_vocab_size, word_embed_size)
self.tanh = nn.Tanh()
self.lstm = nn.LSTM(word_embed_size, hidden_size, num_layers)
self.fc = nn.Linear(2*num_layers*hidden_size, embed_size) # 2 for hidden and cell states

def forward(self, question):

qst_vec = self.word2vec(question) # [batch_size, max_qst_length=30, word_embed_size=300]
qst_vec = self.tanh(qst_vec)
qst_vec = qst_vec.transpose(0, 1) # [max_qst_length=30, batch_size, word_embed_size=300]
_, (hidden, cell) = self.lstm(qst_vec) # [num_layers=2, batch_size, hidden_size=512]
qst_feature = torch.cat((hidden, cell), 2) # [num_layers=2, batch_size, 2*hidden_size=1024]
qst_feature = qst_feature.transpose(0, 1) # [batch_size, num_layers=2, 2*hidden_size=1024]
qst_feature = qst_feature.reshape(qst_feature.size()[0], -1) # [batch_size, 2*num_layers*hidden_size=2048]
qst_feature = self.tanh(qst_feature)
qst_feature = self.fc(qst_feature) # [batch_size, embed_size]

return qst_feature


class NotAttention(nn.Module):
def __init__(self, num_channels, embed_size, dropout=True):
"""Stacked attention Module
"""
super(NotAttention, self).__init__()

self.ff_image = nn.Linear(embed_size, num_channels)
self.ff_questions = nn.Linear(embed_size, num_channels)

        # Always define the attribute so forward() can test it safely.
        self.dropout = nn.Dropout(p=0.5) if dropout else None

self.ff_attention = nn.Linear(num_channels, 1)

def forward(self, vi, vq):
"""Extract feature vector from image vector.
"""
hi = vi
hq = vq.unsqueeze(dim=1)

ha = torch.tanh(hi+hq)

        if self.dropout is not None:
            ha = self.dropout(ha)

u = ha.sum(dim=1)
return u


class VqaModel(nn.Module):
# num_attention_layer and num_mlp_layer not implemented yet
def __init__(self, embed_size, qst_vocab_size, ans_vocab_size, word_embed_size, num_layers, hidden_size, stack_size=1):
super(VqaModel, self).__init__()
self.num_attention_layer = stack_size

self.img_encoder = ImgEncoder(embed_size)
self.qst_encoder = QstEncoder(qst_vocab_size, word_embed_size, embed_size, num_layers, hidden_size)

        # Build independent layers; multiplying a one-element list would reuse the same module (shared weights).
        self.san = nn.ModuleList([NotAttention(512, embed_size) for _ in range(self.num_attention_layer)])

self.mlp = nn.Sequential(nn.Dropout(p=0.5),
nn.Linear(embed_size, ans_vocab_size))

self.tanh = nn.Tanh()

def forward(self, img, qst):

        img_feature = self.img_encoder(img)               # [batch_size, 196, embed_size]
        qst_feature = self.qst_encoder(qst)               # [batch_size, embed_size]

vi = img_feature
u = qst_feature

for not_attn_layer in self.san:
u = not_attn_layer(vi, u)

combined_feature = self.mlp(u)
return combined_feature
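
A minimal usage sketch for models_basic_variant.py (not part of the commit; hyperparameters are illustrative assumptions). The hard-coded view(-1, 512, 196) in ImgEncoder expects a 14x14 feature map, i.e. 224x224 input images:

import torch
from model_files.models_basic_variant import VqaModel

# Hypothetical hyperparameters chosen for illustration only.
model = VqaModel(embed_size=1024, qst_vocab_size=10000, ans_vocab_size=1000,
                 word_embed_size=300, num_layers=2, hidden_size=512, stack_size=1)

img = torch.randn(8, 3, 224, 224)          # 224x224 -> [8, 512, 14, 14] conv maps -> 196 regions
qst = torch.randint(0, 10000, (8, 30))     # [batch_size, max_qst_length] word indices
out = model(img, qst)                      # [8, 1000] unnormalized answer scores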