-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlistener.py
51 lines (39 loc) · 1.71 KB
/
listener.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import torch
import torch.nn.functional as F
from scipy import spatial
# Shared compute device for this module: the GPU when CUDA is usable, else the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
def _strip_special_tokens(sequences, special_ids):
    """Return each token-id sequence with the ids in ``special_ids`` removed."""
    return [[tok for tok in seq if tok not in special_ids] for seq in sequences]


def _sentence_embedding(decoder, token_ids):
    """Sum ``decoder.embedding`` over ``token_ids`` and return a NumPy vector."""
    ids = torch.tensor(token_ids, dtype=torch.long, device=device)
    # With an nn.Embedding-style lookup an empty id list gives a (0, dim)
    # result, so the sum below is the all-zero vector.
    return decoder.embedding(ids).sum(dim=0).detach().cpu().numpy()


def _cosine_similarity(u, v):
    """Cosine similarity of two vectors; 0.0 when either one is all zeros."""
    # scipy's cosine distance divides by the norms and yields NaN for a zero
    # vector, which would silently poison the comparison in ``pickone``.
    if not u.any() or not v.any():
        return 0.0
    return 1 - spatial.distance.cosine(u, v)


def pickone(decoder, word_map, img1, img2, caps, caplens):
    """Pick, per sample, which of two images better matches a caption.

    Both images are decoded with the caption as input, the greedy (argmax)
    word sequence of each is turned into a bag-of-embeddings vector, and the
    image whose vector is closer (cosine similarity) to the caption's own
    vector wins.

    Args:
        decoder: Callable as ``decoder(img, caps, caplens)`` returning scores
            that are indexed as (batch, len, vocab_size); must also expose an
            ``embedding`` lookup. NOTE(review): the exact decoder class is not
            visible here - confirm it returns only the score tensor.
        word_map: Mapping containing at least the keys ``'<start>'``,
            ``'<end>'`` and ``'<pad>'`` with their token ids.
        img1: Tensor of first candidates; dimension 0 is the batch.
        img2: Tensor of second candidates, same batch size as ``img1``.
        caps: Tensor of caption token ids, one row per sample.
        caplens: Tensor of caption lengths; it is squeezed on dimension 1,
            so presumably shaped (batch, 1) - verify against the caller.

    Returns:
        A tuple ``(idxx, diff)`` of two lists with one entry per sample:
        ``idxx[i]`` is ``1.0`` when ``img2`` is strictly more similar to the
        caption than ``img1`` and ``0.0`` otherwise (ties go to ``img1``);
        ``diff[i]`` is the absolute gap between the two similarities.
    """
    img1 = img1.to(device).float()
    img2 = img2.to(device).float()
    caps = caps.to(device).long()
    caplens = caplens.to(device).squeeze(1).long()

    # Built once instead of once per word.
    special_ids = {word_map['<end>'], word_map['<start>'], word_map['<pad>']}

    idxx = []
    diff = []
    # Only plain numbers leave this function, so no autograd graph is needed.
    with torch.no_grad():
        # Forward prop.
        score1 = decoder(img1, caps, caplens)  # (batch, len, vocab_size)
        score2 = decoder(img2, caps, caplens)
        sc1 = score1.argmax(dim=2)  # (batch, len)
        sc2 = score2.argmax(dim=2)

        # Clean up sentences: drop <start>/<end>/<pad> from every row.
        hp1 = _strip_special_tokens(sc1.cpu().tolist(), special_ids)
        hp2 = _strip_special_tokens(sc2.cpu().tolist(), special_ids)
        cp = _strip_special_tokens(caps.cpu().tolist(), special_ids)

        for cap_ids, ids1, ids2 in zip(cp, hp1, hp2):  # batch
            # Encode every clean word and sum the embeddings.
            cpi = _sentence_embedding(decoder, cap_ids)
            sc1i = _sentence_embedding(decoder, ids1)
            sc2i = _sentence_embedding(decoder, ids2)

            # Similarity, higher is better.
            res1 = _cosine_similarity(sc1i, cpi)
            res2 = _cosine_similarity(sc2i, cpi)

            idxx.append((res1 < res2) * 1.0)
            # Winner's similarity minus loser's similarity.
            diff.append(abs(res1 - res2))
    return idxx, diff