-
Notifications
You must be signed in to change notification settings - Fork 1
/
lsaWordSim.py
66 lines (58 loc) · 2.06 KB
/
lsaWordSim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import time
import numpy as np
class lsaWordSim:
    """Cosine similarity between two words using pickled U / sigma / V factors.

    The factors are presumably the result of an SVD (LSA) decomposition --
    confirm against the code that writes the pickle files. Five pickles are
    read from the current working directory when an instance is created:
    ``dict.pickle``, ``u.pickle``, ``sigma.pickle``, ``v.pickle`` and
    ``wordCount.pickle``.

    Attributes:
        dictionary: mapping from a stemmed word to the index used to address
            ``U`` (rows) and ``V`` (columns).
        U, sigma, V: the loaded factors.
        wordCount: loaded from ``wordCount.pickle``; not used in this class.
        w1, w2: the most recently used (stemmed) word pair, or None.
    """

    def __init__(self, word1=None, word2=None):
        """Load the model files and optionally remember a default word pair.

        Args:
            word1: optional first word; stemmed and stored as ``self.w1``.
            word2: optional second word; stemmed and stored as ``self.w2``.

        Both arguments default to None so the zero-argument constructor
        keeps working. When both are given, ``calculateSimilarity()`` may be
        called without arguments.
        """
        self.w1 = None if word1 is None else self.preProcessing(word1)
        self.w2 = None if word2 is None else self.preProcessing(word2)

        print("..........Starting to upload files .......")
        t1 = time.time()
        self.dictionary = self.uploadFiles("dict.pickle")
        self.U = self.uploadFiles("u.pickle")
        self.sigma = self.uploadFiles("sigma.pickle")
        self.V = self.uploadFiles("v.pickle")
        self.wordCount = self.uploadFiles("wordCount.pickle")
        # Single pre-formatted string so the output is the same whether
        # print is a statement (Python 2) or a function (Python 3).
        print("All files successfully uploaded.......... %s"
              % (time.time() - t1))

    def uploadFiles(self, fileName):
        """Return the object unpickled from ``fileName``.

        SECURITY: ``pickle.load`` can execute arbitrary code while loading.
        Only use this with model files that come from a trusted source.
        """
        with open(fileName, 'rb') as handle:
            return pickle.load(handle)

    def preProcessing(self, word):
        """Return the lower-cased Porter stem of ``word``."""
        return PorterStemmer().stem(word.lower())

    def checkWordPresence(self, w1=None, w2=None):
        """Look up the indices of two words in the dictionary.

        The words are looked up exactly as given (no stemming happens here).
        If either argument is None, the stored pair ``self.w1`` / ``self.w2``
        is used for both.

        Returns:
            ``(index_w1, index_w2)`` when both words are known, otherwise
            ``(-1, -1)``.
        """
        if w1 is None or w2 is None:
            w1, w2 = self.w1, self.w2
        if w1 in self.dictionary and w2 in self.dictionary:
            return self.dictionary[w1], self.dictionary[w2]
        return -1, -1

    def uSigma(self, index):
        """Return row ``index`` of U scaled element-wise by sigma (float32).

        Assumes ``sigma`` is a 1-D sequence of scale factors at least as long
        as a row of U -- TODO confirm against the pickled data.
        """
        row = np.asarray(self.U[index])
        # Only the first len(row) entries of sigma take part in the product.
        sigma = np.asarray(self.sigma)[:len(row)]
        return (row * sigma).astype('float32')

    def sigmaV(self, index):
        """Return column ``index`` of V scaled element-wise by sigma (float32).

        Assumes ``V`` is 2-D with at least ``len(sigma)`` rows and ``sigma``
        is 1-D -- TODO confirm against the pickled data.
        """
        sigma = np.asarray(self.sigma)
        column = np.asarray(self.V)[:len(sigma), index]
        return (sigma * column).astype('float32')

    def calculateSimilarity(self, w1=None, w2=None):
        """Return the cosine similarity between two words.

        Args:
            w1: first word; when None, the stored ``self.w1`` is used.
            w2: second word; when None, the stored ``self.w2`` is used.

        Returns:
            The result of ``cosine_similarity`` for the two vectors (an
            array holding the single score) when both stemmed words are in
            the dictionary, otherwise the integer ``0``.

        Raises:
            ValueError: if a word is neither passed in nor stored on the
                instance.
        """
        # Stem only words passed in now; stored words are already stemmed,
        # so repeated calls never stem the same word twice.
        if w1 is not None:
            self.w1 = self.preProcessing(w1)
        if w2 is not None:
            self.w2 = self.preProcessing(w2)
        if self.w1 is None or self.w2 is None:
            raise ValueError(
                "calculateSimilarity() needs two words: pass them as "
                "arguments or give them to the constructor")

        index_w1, index_w2 = self.checkWordPresence(self.w1, self.w2)
        if index_w1 == -1 or index_w2 == -1:
            # At least one word is unknown to the model.
            return 0

        vector1 = self.uSigma(index_w1)
        # NOTE(review): the second word is represented by a column of V
        # while the first uses a row of U -- confirm this asymmetry is
        # intended.
        vector2 = self.sigmaV(index_w2)
        return cosine_similarity([vector1], [vector2])
if __name__ == '__main__':
    # Demo run: load the model files from the current directory and print
    # the similarity of one word pair. The words are passed explicitly to
    # calculateSimilarity() so it never has to fall back on missing state.
    lsa = lsaWordSim()
    print(lsa.calculateSimilarity("person", "car"))