Skip to content

Commit

Permalink
merge with main
Browse files Browse the repository at this point in the history
  • Loading branch information
zarzouram committed Feb 28, 2022
2 parents e11eafa + 325f5df commit 49418e1
Show file tree
Hide file tree
Showing 188 changed files with 2,288 additions and 150 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,11 @@ dmypy.json
.pyre/

# code/test/
*test*
test/
runs/
.data/
.vector_cache
logs/exp_3*
Empty file removed code/__init__.py
Empty file.
21 changes: 13 additions & 8 deletions code/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,31 @@
}
},
"pathes": {
"embedding_path": "/srv/data/guszarzmo/embeddings/Glove/glove.6B.300d.txt",
"checkpoints": "/srv/data/guszarzmo/mlproject/checkpoints/"
"embedding_path": "/srv/data/guszarzmo/embeddings/Glove",
"checkpoint": "/srv/data/guszarzmo/mlproject/checkpoints/"
},
"dataloader_parms": {
"batch_size": 32,
"shuffle": true,
"num_workers": 4
},
"train_parms": {
"grad_clip": 5.0,
"lambda_c": 1.0,
"epochs": 100,
"val_interval": 5,
"stop_criteria": 25
"val_interval": 2,
"early_stop": 10,
"lr_patience": 5,
"embedings_finetune": 10,
"grad_clip": 5.0,
"lambda_c": 1.0
},
"optim_params": {
"encoder_lr": 1e-4,
"transformer_lr": 1e-4
"transformer_lr": 1e-4,
"lr_factors": [
0.75,
0.75
]
},

"max_len": 52,
"min_freq": 3,
"seed": 9001
Expand Down
28 changes: 21 additions & 7 deletions code/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from pathlib import Path
from itertools import chain

import torch

from utils.train_utils import seed_everything
from dataset.utils import parse_arguments, load_json, write_h5_dataset
from dataset.utils import write_json
Expand All @@ -13,14 +15,16 @@
from dataset.dataset_helper import split_dataset, build_vocab


def get_data(json_path: str, imgs_dir: str) -> ImagesAndCaptions:
def get_data(json_path: str,
imgs_dir: str,
max_len: int = 52) -> ImagesAndCaptions:
"""Load the annotations json file and return the image ids with their
captions in the following format:
image_name: {image_id: list of captions tokens}
"""

annotations, images_id = load_json(json_path)
captions = get_captions(annotations)
captions = get_captions(annotations, max_len)
images_w_captions = combine_image_captions(images_id, captions, imgs_dir)

return images_w_captions
Expand All @@ -42,10 +46,18 @@ def get_data(json_path: str, imgs_dir: str) -> ImagesAndCaptions:
train_imgs_dir = str(ds_dir / args.image_train) # train images path
val_imgs_dir = str(ds_dir / args.image_val) # val images path
output_dir.mkdir(parents=True, exist_ok=True)
# torchtext.vocab
# Vector must be one of the vectors supported by
# torchtext.vocab.Vectors classes
# https://github.com/pytorch/text/blob/0169cde2f1d446ae886ef0be07e9a673585ed256/torchtext/vocab.py#L151

vector_dir = Path(os.path.expanduser(args.vector_dir))
vector_name = list(vector_dir.glob("*.zip")) # dir must have one zip file
vector_name = f"{vector_name[0].name.strip('.zip')}.{args.vector_dim}d"

# process annotation files
print("Process annotation files...")
images_captions = get_data(train_ann_path, train_imgs_dir)
images_captions = get_data(train_ann_path, train_imgs_dir, args.max_len)
images_captions_test = get_data(val_ann_path, val_imgs_dir)

# split data
Expand All @@ -54,12 +66,13 @@ def get_data(json_path: str, imgs_dir: str) -> ImagesAndCaptions:

# Create vocab from train dataset set OOV to <UNK>, then encode captions
captions = [chain.from_iterable(d["captions"]) for d in train_ds.values()]
vocab = build_vocab(captions)
vocab = build_vocab(captions, str(vector_dir), vector_name, args.min_freq)
print("Processing finished.\n")

# Create numpy arrays for images, list of list of list of str for captions
# after encoding them and list of list for captions lengthes then save them
for ds, split in zip([train_ds, val_ds, test_ds], ["train", "val", "test"]):
for ds, split in zip([train_ds, val_ds, test_ds],
["train", "val", "test"]):
# create arrays
arrs = run_create_arrays(dataset=ds, vocab=vocab, split=split)
images, captions_encoded, lengthes = arrs
Expand All @@ -72,10 +85,11 @@ def get_data(json_path: str, imgs_dir: str) -> ImagesAndCaptions:
data=images,
type="uint8")

write_json(str(output_dir / f"{split}_captions.json"), captions_encoded)
write_json(str(output_dir / f"{split}_captions.json"),
captions_encoded)
write_json(str(output_dir / f"{split}_lengthes.json"), lengthes)
print(f"Saving {split} dataset finished.\n")

vocab.save_vocab(str(output_dir / "vocab.json"))
torch.save(vocab, str(output_dir / "vocab.pth"))

print("\nCreating dataset files finished.\n")
7 changes: 6 additions & 1 deletion code/dataset/custom_types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from typing import DefaultDict, List, Dict
from typing import DefaultDict, List, Dict, Mapping
from collections import Counter

Captions = DefaultDict[str, List[List[str]]]
ImagesAndCaptions = Dict[str, Captions]


class BOW(Counter, Mapping[str, int]):
    """Bag of words: a ``Counter`` typed as a mapping from token to count."""
73 changes: 35 additions & 38 deletions code/dataset/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@ def __getitem__(self, i: int) -> Tuple[Tensor, Tensor, Tensor]:
# [seq_len_max, captns_num=5]
y = [torch.as_tensor(c, dtype=torch.long) for c in self.captions[i]]
y = pad_sequence(y, padding_value=self.pad_id) # type: Tensor
# select random
idx = np.random.randint(0, y.size(-1))
y_selected = y[:, idx].view(-1, 1)
y = torch.hstack([y_selected, y[:, :idx], y[:, idx + 1:]])
# # select random
# idx = np.random.randint(0, y.size(-1))
# y_selected = y[:, idx].view(-1, 1)
# y = torch.hstack([y_selected, y[:, :idx], y[:, idx + 1:]])

# Lengthes: select the random length and rearrange to have it in idx=0
ls = torch.as_tensor(self.lengthes[i], dtype=torch.long)
ls_selected = ls[idx]
ls = torch.hstack([ls_selected, ls[:idx], ls[idx + 1:]])
# ls_selected = ls[idx]
# ls = torch.hstack([ls_selected, ls[:idx], ls[idx + 1:]])

return X, y, ls

Expand All @@ -85,28 +85,24 @@ def __call__(self, batch) -> Tuple[Tensor, Tensor, Tensor]:
# [B, max_seq_len, captns_num=5]
ls = torch.stack(ls) # (B, num_captions)
y = pad_sequence(y, batch_first=True, padding_value=self.pad)
y = y.permute(0, 2, 1) # type: Tensor # [B, captns_num, max_seq_len]

# Either pad or teruncate to the max len
pad_right = self.max_len - y.size(-1)
# truncate if len > max_len; keep the last token=<EOS>
if pad_right < 0:
t_len = self.max_len - 1
y = torch.dstack((y[:, :, :t_len], y[:, :, -1].unsqueeze(2)))
ls[ls > self.max_len] = self.max_len
# pad to the max_len
elif pad_right > 0:

# pad to the max len
pad_right = self.max_len - y.size(1)
if pad_right > 0:
# [B, captns_num, max_seq_len]
y = y.permute(0, 2, 1) # type: Tensor
y = ConstantPad1d((0, pad_right), value=self.pad)(y)
y = y.permute(0, 2, 1) # [B, max_len, captns_num]

X = torch.stack(X) # (B, 3, 256, 256)
y = y.permute(0, 2, 1) # [B, max_len, captns_num]
X = torch.stack(X) # (B, 3, 256, 256)

return X, y, ls


if __name__ == "__main__":
from utils import seed_worker
from tqdm import tqdm
from pathlib import Path

SEED = 9001
random.seed(SEED)
Expand All @@ -120,24 +116,25 @@ def __call__(self, batch) -> Tuple[Tensor, Tensor, Tensor]:
g = torch.Generator()
g.manual_seed(SEED)

img_p = "/srv/data/guszarzmo/mlproject/data/mscoco_h5/test_images.hdf5"
cap_p = "/srv/data/guszarzmo/mlproject/data/mscoco_h5/test_captions.json"
ls_p = "/srv/data/guszarzmo/mlproject/data/mscoco_h5/test_lengthes.json"
train = HDF5Dataset(img_p, cap_p, ls_p, 0)

num_epochs = 2
loader_params = {
"batch_size": 100,
"shuffle": True,
"num_workers": 4,
"worker_init_fn": seed_worker,
"generator": g
}
data_loader = data.DataLoader(train,
collate_fn=collate_padd(30),
**loader_params)

for X, y, ls in tqdm(data_loader, total=len(data_loader)):
pass
apath = Path("/srv/data/guszarzmo/mlproject/data/mscoco_h5/")
for p in ["train", "val", "test"]:
img_p = str(apath / f"{p}_images.hdf5")
cap_p = str(apath / f"{p}_captions.json")
ls_p = str(apath / f"{p}_lengthes.json")
train = HDF5Dataset(img_p, cap_p, ls_p, 0)

loader_params = {
"batch_size": 100,
"shuffle": True,
"num_workers": 4,
"worker_init_fn": seed_worker,
"generator": g
}
data_loader = data.DataLoader(train,
collate_fn=collate_padd(30),
**loader_params)

for X, y, ls in tqdm(data_loader, total=len(data_loader)):
pass

print("done")
37 changes: 27 additions & 10 deletions code/dataset/dataset_helper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Tuple
from numpy.typing import NDArray
from .custom_types import Captions, ImagesAndCaptions
from .custom_types import Captions, ImagesAndCaptions, BOW

from collections import defaultdict, Counter
from itertools import chain
Expand All @@ -10,14 +10,16 @@
import re

import numpy as np
from torchtext.vocab import Vocab

from sklearn.model_selection import train_test_split

import cv2

from .vocab import Vocabulary
from .utils import init_unk


def get_captions(annotations: list) -> Captions:
def get_captions(annotations: list, max_len: int) -> Captions:
""" Images and their captions are separated into two lists of dicts.
json_path: a string of the mscoco annotation file
Expand All @@ -29,7 +31,11 @@ def get_captions(annotations: list) -> Captions:
captions = [
s for s in re.split(r"(\W)", annton["caption"]) if s.strip()
]
captions = ["<SOS>"] + captions + ["<EOS>"]
# Truncate if len > max_len - 2 (<sos> and <eos>)
if len(captions) > (max_len - 2):
captions = captions[:max_len - 2]

captions = ["<sos>"] + captions + ["<eos>"]
captions_dict[annton["image_id"]].append(captions)

return captions_dict
Expand Down Expand Up @@ -70,7 +76,7 @@ def load_images(image_path: str,


def encode_captions(captions: List[List[str]],
vocab: Vocabulary) -> Tuple[List[List[int]], List[int]]:
vocab: Vocab) -> Tuple[List[List[int]], List[int]]:
"""Encode captions text to the respective indices"""
encoded = []
lengthes = []
Expand Down Expand Up @@ -121,14 +127,25 @@ def split_dataset(
return dict(train_split), dict(val_split), test_split


def build_vocab(captions: List[chain]) -> Vocabulary:
all_words = list(chain.from_iterable(captions))
return Vocabulary(dict(Counter(all_words)))
def build_vocab(captions: List[chain],
                vector_dir: str,
                vector_name: str,
                min_freq: int = 2) -> Vocab:
    """Count the caption tokens and wrap the counts in a ``Vocab``.

    Args:
        captions: iterables of tokens; they are flattened into one token
            stream before counting.
        vector_dir: forwarded to ``Vocab`` as ``vectors_cache``.
        vector_name: forwarded to ``Vocab`` as ``vectors``.
        min_freq: forwarded to ``Vocab`` as ``min_freq``.

    Returns:
        The ``Vocab`` built from the token counts, with the special tokens
        ``<unk>``, ``<pad>``, ``<sos>`` and ``<eos>``.
    """
    # Counter consumes the flattened token stream directly.
    token_counts: BOW = Counter(chain.from_iterable(captions))

    # NOTE(review): these keyword arguments presumably target the legacy
    # torchtext ``Vocab`` constructor -- confirm against the pinned version.
    return Vocab(token_counts,
                 min_freq=min_freq,
                 specials=("<unk>", "<pad>", "<sos>", "<eos>"),
                 vectors_cache=vector_dir,
                 vectors=vector_name,
                 unk_init=init_unk)


def create_input_arrays(
dataset: Tuple[str, Captions],
vocab: Vocabulary) -> Tuple[NDArray, List[List[int]], List[int]]:
vocab: Vocab) -> Tuple[NDArray, List[List[int]], List[int]]:
"""load images and encode captions text"""

image = load_images(dataset[0], 256, 256)
Expand All @@ -139,7 +156,7 @@ def create_input_arrays(

def run_create_arrays(
dataset: ImagesAndCaptions,
vocab: Vocabulary,
vocab: Vocab,
split: str,
num_proc: int = 4
) -> Tuple[NDArray, List[List[List[int]]], List[List[int]]]:
Expand Down
31 changes: 31 additions & 0 deletions code/dataset/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

import numpy as np
import torch
from torch import Tensor
from torch.nn.init import xavier_uniform_


def parse_arguments() -> Namespace:
Expand Down Expand Up @@ -51,6 +53,28 @@ def parse_arguments() -> Namespace:
default="/srv/data/guszarzmo/mlproject/data/mscoco_h5/",
help="Directory have MS COCO image files for the val split.")

parser.add_argument("--vector_dir",
type=str,
default="/srv/data/guszarzmo/embeddings/Glove",
help="Directory to embedding vector.")

parser.add_argument("--vector_dim",
type=str,
default="300",
help="Vector dimention")

parser.add_argument(
"--min_freq",
type=int,
default=2,
help="minimum frequency needed to include a token in the vocabulary")

parser.add_argument(
"--max_len",
type=int,
default=52,
help="minimum length for captions")

args = parser.parse_args()

return args
Expand Down Expand Up @@ -86,3 +110,10 @@ def seed_worker(worker_id):
worker_seed = torch.initial_seed() % 2**32
np.random.seed(worker_seed)
random.seed(worker_seed)


def init_unk(tensor: Tensor) -> Tensor:
    """Initialize the vector of an unknown word.

    Draws Xavier-uniform values into a newly allocated tensor; the input
    tensor itself is not modified and its values are ignored.

    Args:
        tensor: Tensor whose size determines the size of the result.

    Returns:
        A weight tensor of the same size as ``tensor``.
    """
    size = tensor.size()
    # xavier_uniform_ needs at least two dimensions, so the weights are
    # initialised as a single row and reshaped back to the input's size
    # afterwards (for a 1-D input this matches the former flat result).
    weight_unk = torch.empty(size).view(1, -1)
    return xavier_uniform_(weight_unk).view(size)
Loading

0 comments on commit 49418e1

Please sign in to comment.