-
Notifications
You must be signed in to change notification settings - Fork 26
/
preprocess_caption.py
25 lines (20 loc) · 1020 Bytes
/
preprocess_caption.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os
import argparse
import fastText
from data import ConvertCapVec
# Command-line interface of the caption preprocessing script.
parser = argparse.ArgumentParser()

# Directory holding the raw caption files (mandatory).
parser.add_argument(
    '--caption_root',
    type=str,
    required=True,
    help='root directory that contains captions',
)

# Path to the pretrained fastText binary model (mandatory).
parser.add_argument(
    '--fasttext_model',
    type=str,
    required=True,
    help='pretrained fastText model (binary file)',
)

# Word limit forwarded to the caption converter; optional.
parser.add_argument(
    '--max_nwords',
    type=int,
    default=50,
    help='maximum number of words (default: 50)',
)

# Parsed at module level, so the options are read as soon as the file runs.
args = parser.parse_args()
def make_vec_dir(caption_root):
    """Create the ``<caption_root>_vec`` output directory if it is missing.

    Args:
        caption_root: Path of the caption directory, exactly as supplied on
            the command line.

    Returns:
        The path of the output directory, i.e. ``caption_root + '_vec'``.

    Raises:
        FileExistsError: If the target path exists but is not a directory.
    """
    vec_root = caption_root + '_vec'
    # exist_ok=True replaces the former "list the parent directory" check.
    # That check derived the parent with str.replace, which strips *every*
    # occurrence of the base name from the path, and it called os.listdir('')
    # (FileNotFoundError) when caption_root was a bare relative name.
    os.makedirs(vec_root, exist_ok=True)
    return vec_root


if __name__ == '__main__':
    # Ensure the sibling "<caption_root>_vec" directory exists before converting.
    make_vec_dir(args.caption_root)

    print('Loading a pretrained fastText model...')
    word_embedding = fastText.load_model(args.fasttext_model)

    print('Making vectorized caption data files...')
    # NOTE(review): presumably convert_and_save writes its results under the
    # "_vec" directory created above -- confirm against data.ConvertCapVec.
    ConvertCapVec().convert_and_save(args.caption_root, word_embedding, args.max_nwords)