-
Notifications
You must be signed in to change notification settings - Fork 1
/
SentenceSimilarity.py
executable file
·100 lines (69 loc) · 2.34 KB
/
SentenceSimilarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
##########################################
#
# SentenceSimilarity.py: A Simple python script to calculate similarities between vectors of sentences on a given dataset.
# This will also be an attempt to implement a continuous training on the Doc2Vec model from gensim.
#
# Author: Cosimo Iaia <cosimo.iaia@gmail.com>
# Date: 04/05/2018
#
# This file is distributed under the terms of the GNU General Public License
#
#########################################
from __future__ import print_function
import numpy as np
from gensim import models
import argparse
FLAGS = None
def getSim(model, corpus, sentence):
    """Return the FLAGS.maxres corpus entries most similar to *sentence*.

    Parameters:
        model:    a trained gensim Doc2Vec model.
        corpus:   the labeled corpus the model was trained on; the integer
                  tags returned by most_similar() index into it.
        sentence: raw text to compare against the corpus.

    Returns:
        A list of [corpus_entry, similarity] pairs, best match first.
    """
    # BUG FIX: infer_vector() expects a list of word tokens.  Passing the
    # raw string made gensim iterate it character-by-character, producing a
    # near-meaningless vector.  Tokenize on whitespace instead.
    words = sentence.rstrip('\n').split()
    vector = model.infer_vector(words)
    # most_similar() yields (tag, similarity); tags are the integer labels
    # assigned in labelCorpus(), so they index straight into *corpus*.
    sims = model.docvecs.most_similar(positive=[vector], topn=FLAGS.maxres)
    return [[corpus[idx], sim] for idx, sim in sims]
def scrubData(path):
    """Read the dataset at *path* and return a flat list of sentences.

    Each line may hold several tab-separated fields; every field becomes
    one entry in the returned list, in file order.

    Parameters:
        path: filesystem path to a plain-text, tab-separated dataset.

    Returns:
        list of str: one element per tab-separated field.
    """
    ds = []
    with open(path, 'r') as f:
        # Iterate the file lazily instead of readlines(): avoids holding
        # the whole raw file in memory alongside the result list.
        for line in f:
            ds.extend(line.rstrip('\n').split('\t'))
    return ds
def labelCorpus(corpus):
    """Wrap each sentence in a TaggedDocument tagged with its list index.

    Parameters:
        corpus: iterable of sentence strings (output of scrubData()).

    Returns:
        list of gensim TaggedDocument objects, one per sentence, whose
        integer tag is the sentence's index in *corpus*.
    """
    tagged = []
    for i, sentence in enumerate(corpus):
        # BUG FIX: Doc2Vec expects `words` to be a list of tokens; passing
        # the raw string made gensim treat every character as a "word".
        # Also use TaggedDocument directly: LabeledSentence is a deprecated
        # alias that was removed in gensim 4.
        tagged.append(models.doc2vec.TaggedDocument(sentence.split(), [i]))
    return tagged
def main():
    """Train a Doc2Vec model on FLAGS.dataset and query it interactively."""
    path = FLAGS.dataset

    # Prepare the data: one flat list of sentences, each tagged with its
    # index so most_similar() hits can be mapped back to their text.
    ds = scrubData(path)
    corpus = labelCorpus(ds)

    # Build and train the model (gensim < 4 parameter names: size/iter,
    # consistent with the docvecs API used in getSim()).
    m = models.Doc2Vec(min_count=1, window=10, size=10, sample=1e-2,
                       negative=3, workers=4, iter=FLAGS.epochs)
    m.build_vocab(corpus)
    print("------ Training the model --------")
    m.train(corpus, epochs=m.iter, total_examples=m.corpus_count)

    # Smoke-test the freshly trained model.
    result = getSim(m, corpus, "What about the Spanish Inquisition?")
    print(result)

    # Interactive loop: keep asking until EOF (Ctrl-D) or Ctrl-C.  The
    # original prompted only once, which made the EOF handler pointless.
    try:
        import readline  # noqa: F401 -- enables line editing at the prompt
        while True:
            # BUG FIX: raw_input() does not exist in Python 3; input() is
            # the portable spelling.
            sentence = input('Insert sentence to compare> ')
            result = getSim(m, corpus, sentence)
            for txt, sim in result:
                print(txt[0])
                print('Similarity: ', sim)
    except (EOFError, KeyboardInterrupt):
        print("Ok, bye")
        return
if __name__ == "__main__":
    # Command-line interface.  FLAGS is read as a module-level global by
    # getSim() and main(), so it must be assigned before main() runs.
    parser = argparse.ArgumentParser(description='Python Service to calculate sentence similarities')
    # Dropped the contradictory default='' -- argparse ignores defaults on
    # required arguments.
    parser.add_argument('--dataset', type=str, required=True, help='Path to the dataset file')
    # Use real int defaults: the original string defaults ('5', '10') only
    # worked because argparse happens to coerce string defaults via type=.
    parser.add_argument('--maxres', type=int, default=5, help='How many similar sentences we show')
    parser.add_argument('--epochs', type=int, default=10, help='How many epochs to train')
    FLAGS = parser.parse_args()
    main()