pretrained.py
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python
# Author: http://lazyprogrammer.me
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

# WHERE TO GET THE VECTORS:
# GloVe: https://nlp.stanford.edu/projects/glove/

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
def dist1(a, b):
    # Euclidean distance
    return np.linalg.norm(a - b)

def dist2(a, b):
    # cosine distance: 1 - cosine similarity
    return 1 - a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

# pick a distance type
dist, metric = dist2, 'cosine'
# dist, metric = dist1, 'euclidean'
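
# Quick sanity check (added illustration, not in the original script):
# dist2 should agree with sklearn's 'cosine' metric, since both compute
# 1 - cos(angle between the vectors).
_a = np.array([1.0, 0.0])
_b = np.array([1.0, 1.0])
assert np.isclose(
    dist2(_a, _b),
    pairwise_distances(_a.reshape(1, -1), _b.reshape(1, -1), metric='cosine')[0, 0]
)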

## more intuitive, but slow: loops over the whole vocabulary in Python
# def find_analogies(w1, w2, w3):
#     for w in (w1, w2, w3):
#         if w not in word2vec:
#             print("%s not in dictionary" % w)
#             return
#     king = word2vec[w1]
#     man = word2vec[w2]
#     woman = word2vec[w3]
#     v0 = king - man + woman
#
#     min_dist = float('inf')
#     best_word = ''
#     for word, v1 in iteritems(word2vec):
#         if word not in (w1, w2, w3):
#             d = dist(v0, v1)
#             if d < min_dist:
#                 min_dist = d
#                 best_word = word
#     print(w1, "-", w2, "=", best_word, "-", w3)

## faster: one vectorized distance computation over the whole embedding matrix
def find_analogies(w1, w2, w3):
    for w in (w1, w2, w3):
        if w not in word2vec:
            print("%s not in dictionary" % w)
            return
    king = word2vec[w1]
    man = word2vec[w2]
    woman = word2vec[w3]
    # analogy arithmetic: e.g. king - man + woman should land near "queen"
    v0 = king - man + woman

    # distance from v0 to every word vector at once
    distances = pairwise_distances(v0.reshape(1, D), embedding, metric=metric).reshape(V)
    idx = distances.argmin()
    best_word = idx2word[idx]
    print(w1, "-", w2, "=", best_word, "-", w3)
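
# Added variant (a sketch, not part of the original script): argmin can return
# one of the input words themselves, since nothing excludes them. This version
# scans the closest candidates in order and skips w1, w2, w3.
def find_analogies_strict(w1, w2, w3):
    for w in (w1, w2, w3):
        if w not in word2vec:
            print("%s not in dictionary" % w)
            return
    v0 = word2vec[w1] - word2vec[w2] + word2vec[w3]
    distances = pairwise_distances(v0.reshape(1, D), embedding, metric=metric).reshape(V)
    # at most the 3 input words can be skipped, so 4 candidates always suffice
    for idx in distances.argsort()[:4]:
        word = idx2word[idx]
        if word not in (w1, w2, w3):
            print(w1, "-", w2, "=", word, "-", w3)
            return
# usage is the same as find_analogies, e.g. find_analogies_strict('king', 'man', 'woman')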

# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
embedding = []
idx2word = []
# the GloVe file contains non-ASCII tokens, so read it explicitly as UTF-8
with open('../large_files/glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
    # the file is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
print('Found %d word vectors.' % len(word2vec))
embedding = np.array(embedding)
V, D = embedding.shape
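
# Added sketch (an assumption, not in the original script): the same embedding
# matrix also supports nearest-neighbor lookups, a useful sanity check on the
# vectors before running analogies.
def nearest_neighbors(w, n=5):
    if w not in word2vec:
        print("%s not in dictionary" % w)
        return
    v = word2vec[w]
    distances = pairwise_distances(v.reshape(1, D), embedding, metric=metric).reshape(V)
    # index 0 of the sorted result is w itself (distance 0), so skip it
    idxs = distances.argsort()[1:n+1]
    print("neighbors of: %s" % w)
    for idx in idxs:
        print("\t%s" % idx2word[idx])
# e.g. nearest_neighbors('king')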
find_analogies('king', 'man', 'woman')
find_analogies('france', 'paris', 'london')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')
find_analogies('france', 'french', 'english')
find_analogies('japan', 'japanese', 'chinese')
find_analogies('japan', 'japanese', 'italian')
find_analogies('japan', 'japanese', 'australian')
find_analogies('december', 'november', 'june')
find_analogies('miami', 'florida', 'texas')
find_analogies('einstein', 'scientist', 'painter')
find_analogies('china', 'rice', 'bread')
find_analogies('man', 'woman', 'she')
find_analogies('man', 'woman', 'aunt')
find_analogies('man', 'woman', 'sister')
find_analogies('man', 'woman', 'wife')
find_analogies('man', 'woman', 'actress')
find_analogies('man', 'woman', 'mother')
find_analogies('heir', 'heiress', 'princess')
find_analogies('nephew', 'niece', 'aunt')
find_analogies('france', 'paris', 'tokyo')
find_analogies('france', 'paris', 'beijing')
find_analogies('february', 'january', 'november')