
Commit 46a60e7

Add files via upload
1 parent 389574c commit 46a60e7

File tree

1 file changed: +92 -0 lines changed

build_vocab.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
# encoding: UTF-8

#-----------------------------------------------------------------------
# We preprocess the text data by lowercasing, and by replacing words that
# occur fewer than 5 times in the 82K training set with <unk>;
# this results in a vocabulary of 10,622 words (down from 32,807).
#-----------------------------------------------------------------------

import os
import numpy as np
import cPickle as pickle
import time


train_images_captions_path = './data/train_images_captions.pkl'
with open(train_images_captions_path, 'rb') as train_fr:
    train_images_captions = pickle.load(train_fr)

val_images_captions_path = './data/val_images_captions.pkl'
with open(val_images_captions_path, 'rb') as val_fr:
    val_images_captions = pickle.load(val_fr)


#-----------------------------------------------------------------------
# Borrowed this function from NeuralTalk:
# https://github.com/karpathy/neuraltalk/blob/master/driver.py#L16
#-----------------------------------------------------------------------
def preProBuildWordVocab(sentence_iterator, word_count_threshold=5):
    print 'Preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, )

    t0 = time.time()
    word_counts = {}
    nsents = 0

    for sent in sentence_iterator:
        nsents += 1
        tmp_sent = sent.split(' ')
        # remove the empty strings '' left by consecutive spaces
        tmp_sent = filter(None, tmp_sent)
        for w in tmp_sent:
            word_counts[w] = word_counts.get(w, 0) + 1
    # keep only words that occur at least word_count_threshold times
    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print 'Filtered words from %d to %d in %0.2fs' % (len(word_counts), len(vocab), time.time()-t0)

    # reserve indices 0-3 for the special tokens
    ixtoword = {}
    ixtoword[0] = '<pad>'
    ixtoword[1] = '<bos>'
    ixtoword[2] = '<eos>'
    ixtoword[3] = '<unk>'

    wordtoix = {}
    wordtoix['<pad>'] = 0
    wordtoix['<bos>'] = 1
    wordtoix['<eos>'] = 2
    wordtoix['<unk>'] = 3

    for idx, w in enumerate(vocab):
        wordtoix[w] = idx + 4
        ixtoword[idx+4] = w

    # give the special tokens a count of one occurrence per sentence
    word_counts['<eos>'] = nsents
    word_counts['<bos>'] = nsents
    word_counts['<pad>'] = nsents
    word_counts['<unk>'] = nsents

    # iterate over the indices in sorted order so bias_init_vector[i]
    # lines up with word index i (plain dict iteration order is not
    # guaranteed in Python 2)
    bias_init_vector = np.array([1.0 * word_counts[ixtoword[i]] for i in sorted(ixtoword)])
    bias_init_vector /= np.sum(bias_init_vector)  # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector)  # shift to a nice numeric range

    return wordtoix, ixtoword, bias_init_vector


# extract all sentences from the training captions
all_sents = []
for image, sents in train_images_captions.iteritems():
    for each_sent in sents:
        all_sents.append(each_sent)
#for image, sents in val_images_captions.iteritems():
#    for each_sent in sents:
#        all_sents.append(each_sent)

word_to_idx, idx_to_word, bias_init_vector = preProBuildWordVocab(all_sents, word_count_threshold=5)

with open('./data/idx_to_word.pkl', 'wb') as fw_1:
    pickle.dump(idx_to_word, fw_1)

with open('./data/word_to_idx.pkl', 'wb') as fw_2:
    pickle.dump(word_to_idx, fw_2)

np.save('./data/bias_init_vector.npy', bias_init_vector)
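For reference, a minimal sketch of how a downstream script might consume the three files this commit writes out. This is illustrative only and not part of the commit: the caption string and variable names are hypothetical, and it assumes the same Python 2 environment and ./data paths used above. The bias_init_vector is a log-frequency vector of the kind typically used to initialize a decoder's output-layer bias.

    import numpy as np
    import cPickle as pickle

    # load the vocabulary mappings produced by build_vocab.py
    with open('./data/word_to_idx.pkl', 'rb') as f:
        word_to_idx = pickle.load(f)
    with open('./data/idx_to_word.pkl', 'rb') as f:
        idx_to_word = pickle.load(f)
    bias_init_vector = np.load('./data/bias_init_vector.npy')

    # encode a caption, mapping out-of-vocabulary words to <unk>
    caption = 'a man riding a horse'  # hypothetical example caption
    indices = [word_to_idx.get(w, word_to_idx['<unk>']) for w in caption.split()]
    print indices                            # word indices
    print [idx_to_word[i] for i in indices]  # round-trip back to words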

0 commit comments