Skip to content

Commit

Permalink
Line lengths pep8 fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
shubhvachher committed Apr 11, 2017
1 parent 2b69e3b commit 59feafd
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 40 deletions.
58 changes: 36 additions & 22 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,56 +345,66 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, i
result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
return result[:topn]

def most_similar_among(self, positive=[], negative=[], topn=10, words_list=None, indexer=None,
def most_similar_among(self, positive=[], negative=[],
topn=10, words_list=None, indexer=None,
suppress_warnings=False):
"""
Find the top-N most similar words among words_list to given words. Positive words
contribute positively towards the similarity, negative words negatively.
Find the top-N most similar words among words_list to given words.
Positive words contribute positively towards the similarity,
negative words negatively.
Please refer to docs of most_similar function.
If topn is False, most_similar returns the vector of similarity scores for all words
in vocabulary of model, restricted by the supplied words_list.
If topn is False, most_similar returns the vector of similarity scores
for all words in vocabulary of model, restricted by the supplied words_list.
'words_list' should be a list/set of words. The returned word similarities will only
contain similarity scores for those words that are in words_list (and in trained vocabulary).
'words_list' should be a list/set of words. The returned word similarities
will only contain similarity scores for those words that are in words_list
(and in trained vocabulary).
If some words in words_list are not in vocabulary then a warning is issued to the user.
If some words in words_list are not in vocabulary then a warning is
issued to the user.
Warnings can be suppressed by setting the suppress_warnings flag.
Example::
>>> trained_model.most_similar_among(positive=['man'], topn=1, words_list=['woman','random_word'])
>>> trained_model.most_similar_among(positive=['man'], topn=1,
words_list=['woman','random_word'])
[('woman', 0.75882536)]
"""

if isinstance(words_list, int):
raise ValueError("words_list must be a set/list of words. Maybe you wanted the \
most_similar function.")
raise ValueError("words_list must be a set/list of words. \
Maybe you wanted the most_similar function.")
elif isinstance(words_list, list) or isinstance(words_list, set):
pass
else: # This is triggered for empty words_list parameter
raise ValueError("words_list must be set/list of words. Maybe you wanted the \
most_similar function. Please read doc string")
raise ValueError("words_list must be set/list of words. \
Maybe you wanted the most_similar function. \
Please read doc string")

if type(topn) is not int:
if topn is False:
pass
else:
if suppress_warnings is False:
logger.warning("topn needs to either be a number or False. \
Please read docstring. Displaying all similarities!")
Please read docstring. \
Displaying all similarities!")
topn = len(self.index2word)

self.init_sims()

if isinstance(positive, string_types) and negative is False:
# allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
# allow calls like most_similar('dog'),
# as a shorthand for most_similar(['dog'])
positive = [positive]

# add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
# add weights for each word, if not already present;
# default to 1.0 for positive and -1.0 for negative words
positive = [
(word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
for word in positive
Expand Down Expand Up @@ -426,20 +436,24 @@ def most_similar_among(self, positive=[], negative=[], topn=10, words_list=None,
words_to_use = vocabulary_words.intersection(words_list)

if not words_to_use:
raise ValueError("None of the words in words_list exist in current vocabulary")
raise ValueError("None of the words in words_list \
exist in current vocabulary")

if suppress_warnings is False:
missing_words = words_list.difference(vocabulary_words)
if not missing_words: # missing_words is empty
pass
else:
logger.warning("The following words are not in trained vocabulary : %s", str(missing_words))
logger.info("This warning is expensive to calculate, especially for large words_list. \
If you would rather not remove the missing_words from words_list \
please set the suppress_warnings flag.")
logger.warning("The following words are not in \
trained vocabulary : %s", str(missing_words))
logger.info("This warning is expensive to calculate, \
especially for large words_list. \
If you would rather not remove the missing_words \
from words_list please set the \
suppress_warnings flag.")

words_list_indices = [self.vocab[word].index for word in words_to_use]
# limited = self.syn0norm[words_list_indices] #syn0norm is an ndarray so this indexing works
# limited = self.syn0norm[words_list_indices]
# Storing 'limited' might add a huge memory overhead so we avoid doing that

dists = dot(self.syn0norm[words_list_indices], mean)
Expand Down
43 changes: 25 additions & 18 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import itertools
import bz2
import sys
import warnings

import numpy as np

Expand Down Expand Up @@ -470,49 +469,57 @@ def test_most_similar_among(self, l):
CBOW model is used here.
"""

model = word2vec.Word2Vec(sentences, size=2, sg=0, min_count=1, hs=1, negative=0)
model = word2vec.Word2Vec(sentences, size=2, sg=0, min_count=1,
hs=1, negative=0)

# Testing Error in case of absent words_list
self.assertRaises(ValueError, model.wv.most_similar_among, positive=['graph'])
self.assertRaises(ValueError, model.wv.most_similar_among,
positive=['graph'])

words_in_voc = model.wv.index2word[:5]

# Testing logs for warnings
model.wv.most_similar_among('graph', \
words_list=words_in_voc+['random_word'], \
topn="some_gibberish_not_number_or_False")
model.wv.most_similar_among('graph',
words_list=words_in_voc+['random_word'],
topn="some_gibberish_not_number_or_False")

self.assertIn("topn needs to either be a number or False", str(l))
self.assertIn("The following words are not in trained vocabulary", str(l))
self.assertIn("This warning is expensive to calculate", str(l))

l.clear()

# Check if warnings are suppressed upon setting suppress_warnings flag
model.wv.most_similar_among('graph', \
words_list=words_in_voc+['random_word'], \
topn="some_gibberish_not_number_or_False", \
suppress_warnings=True)
model.wv.most_similar_among('graph',
words_list=words_in_voc+['random_word'],
topn="some_gibberish_not_number_or_False",
suppress_warnings=True)
self.assertIn("No logging captured", str(l))

# Check functionality
sims = model.wv.most_similar_among('graph', words_list=words_in_voc)
sims2 = model.wv.most_similar_among('graph', words_list=words_in_voc+['random_word'], \
suppress_warnings=True)
sims2 = model.wv.most_similar_among('graph',
words_list=words_in_voc+['random_word'],
suppress_warnings=True)
self.assertEqual(sims, sims2)

# Results by vector
graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
sims3 = model.wv.most_similar_among(positive = [graph_vector], words_list=words_in_voc)
sims3 = model.wv.most_similar_among(positive = [graph_vector],
words_list=words_in_voc)
sims3 = [(w, sim) for w, sim in sims3 if w != 'graph'] # ignore 'graph' itself
self.assertEqual(sims, sims3)

sims4 = model.wv.most_similar_among('graph', words_list=model.wv.index2word, \
topn=False) # Returns all possible similarities
sims5 = model.wv.most_similar_among('graph', words_list=model.wv.index2word, \
topn=len(model.wv.vocab))
sims4 = model.wv.most_similar_among('graph',
words_list=model.wv.index2word,
topn=False) # Returns all possible similarities
sims5 = model.wv.most_similar_among('graph',
words_list=model.wv.index2word,
topn=len(model.wv.vocab))
self.assertEqual(sims4, sims5)
self.assertEqual(len(sims4), len(model.wv.vocab)-1)
# Subtracting one as the word itself is not returned in most_similar calculation
# Subtracting one as the word itself is not returned
# in most_similar calculation

def test_cosmul(self):
model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
Expand Down

0 comments on commit 59feafd

Please sign in to comment.