diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index 06eea74b62..676267a919 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -345,39 +345,46 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, i
         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
         return result[:topn]
 
-    def most_similar_among(self, positive=[], negative=[], topn=10, words_list=None, indexer=None,
+    def most_similar_among(self, positive=[], negative=[],
+                           topn=10, words_list=None, indexer=None,
                            suppress_warnings=False):
         """
-        Find the top-N most similar words among words_list to given words. Positive words
-        contribute positively towards the similarity, negative words negatively.
+        Find the top-N most similar words among words_list to the given words.
+
+        Positive words contribute positively towards the similarity,
+        negative words negatively.
 
         Please refer to docs of most_similar function.
 
-        If topn is False, most_similar returns the vector of similarity scores for all words
-        in vocabulary of model, restriced by the supplied words_list.
+        If topn is False, most_similar_among returns the vector of similarity scores
+        for all words in the model's vocabulary, restricted by the supplied words_list.
 
-        'words_list' should be a list/set of words. The returned word similarities will only
-        contain similarity scores for those words that are in words_list (and in trained vocabulary).
+        'words_list' should be a list/set of words. The returned word similarities
+        will only contain similarity scores for those words that are in words_list
+        (and in the trained vocabulary).
 
-        If some words in words_list are not in vocabulary then a warning is issued to the user.
+        If some words in words_list are not in the vocabulary, a warning is
+        issued to the user.
         Warnings can be suppressed by setting the suppress_warnings flag.
 
         Example::
 
-          >>> trained_model.most_similar_among(positive=['man'], topn=1, words_list=['woman','random_word'])
+          >>> trained_model.most_similar_among(positive=['man'], topn=1,
+          ...                                  words_list=['woman', 'random_word'])
           [('woman', 0.75882536)]
 
         """
         if isinstance(words_list, int):
-            raise ValueError("words_list must be a set/list of words. Maybe you wanted the \
-                most_similar function.")
+            raise ValueError("words_list must be a set/list of words. "
+                "Maybe you wanted the most_similar function.")
         elif isinstance(words_list, list) or isinstance(words_list, set):
             pass
         else:
             # This is triggered for empty words_list parameter
-            raise ValueError("words_list must be set/list of words. Maybe you wanted the \
-                most_similar function. Please read doc string")
+            raise ValueError("words_list must be a set/list of words. "
+                "Maybe you wanted the most_similar function. "
+                "Please read the docstring.")
 
         if type(topn) is not int:
             if topn is False:
@@ -385,16 +392,19 @@ def most_similar_among(self, positive=[], negative=[], topn=10, words_list=None,
         else:
             if suppress_warnings is False:
                 logger.warning("topn needs to either be a number or False. \
-                    Please read docstring. Displaying all similarities!")
+                    Please read docstring. \
+                    Displaying all similarities!")
                 topn = len(self.index2word)
 
         self.init_sims()
 
         if isinstance(positive, string_types) and negative is False:
-            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
+            # allow calls like most_similar('dog'),
+            # as a shorthand for most_similar(['dog'])
            positive = [positive]
 
-        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
+        # add weights for each word, if not already present;
+        # default to 1.0 for positive and -1.0 for negative words
         positive = [
             (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
             for word in positive
@@ -426,20 +436,24 @@ def most_similar_among(self, positive=[], negative=[], topn=10, words_list=None,
         words_to_use = vocabulary_words.intersection(words_list)
 
         if not words_to_use:
-            raise ValueError("None of the words in words_list exist in current vocabulary")
+            raise ValueError("None of the words in words_list "
+                "exist in the current vocabulary")
 
         if suppress_warnings is False:
             missing_words = words_list.difference(vocabulary_words)
             if not missing_words:
                 # missing_words is empty
                 pass
             else:
-                logger.warning("The following words are not in trained vocabulary : %s", str(missing_words))
-                logger.info("This warning is expensive to calculate, especially for largs words_list. \
-                    If you would rather not remove the missing_words from words_list \
-                    please set the suppress_warnings flag.")
+                logger.warning("The following words are not in trained vocabulary: %s",
+                    str(missing_words))
+                logger.info("This warning is expensive to calculate, "
+                    "especially for large words_list. "
+                    "If you would rather not remove the missing_words "
+                    "from words_list, please set the "
+                    "suppress_warnings flag.")
 
         words_list_indices = [self.vocab[word].index for word in words_to_use]
-        # limited = self.syn0norm[words_list_indices] #syn0norm is an ndarray so this indexing works
+        # limited = self.syn0norm[words_list_indices]
         # Storing 'limited' might add a huge memory overhead so we avoid doing that
         dists = dot(self.syn0norm[words_list_indices], mean)
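Usage sketch (not part of the patch): a minimal end-to-end call of the new
most_similar_among API, assuming a gensim build with this change applied. The
toy corpus and hyperparameters below are illustrative only; the constructor
arguments mirror the ones used in the tests.

    from gensim.models import word2vec

    sentences = [
        ['human', 'interface', 'computer'],
        ['graph', 'trees', 'minors'],
        ['graph', 'minors', 'survey'],
    ]
    model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)

    # Only candidates that are also in the trained vocabulary are scored;
    # 'random_word' is out of vocabulary, so it is reported in a warning
    # unless suppress_warnings=True is passed.
    candidates = ['trees', 'survey', 'random_word']
    sims = model.wv.most_similar_among('graph', words_list=candidates,
                                       suppress_warnings=True)
    print(sims)  # e.g. [('trees', 0.96), ('survey', 0.40)] -- scores vary per run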
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 5632466f2c..f2a67194ef 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -16,7 +16,6 @@
 import itertools
 import bz2
 import sys
-import warnings
 
 import numpy as np
 
@@ -470,17 +469,20 @@ def test_most_similar_among(self, l):
         CBOW model is used here.
         """
-        model = word2vec.Word2Vec(sentences, size=2, sg=0, min_count=1, hs=1, negative=0)
+        model = word2vec.Word2Vec(sentences, size=2, sg=0, min_count=1,
+                                  hs=1, negative=0)
 
         # Testing Error in case of absent words_list
-        self.assertRaises(ValueError, model.wv.most_similar_among, positive=['graph'])
+        self.assertRaises(ValueError, model.wv.most_similar_among,
+                          positive=['graph'])
 
         words_in_voc = model.wv.index2word[:5]
 
         # Testing logs for warnings
-        model.wv.most_similar_among('graph', \
-            words_list=words_in_voc+['random_word'], \
-            topn="some_gibberish_not_number_or_False")
+        model.wv.most_similar_among('graph',
+                                    words_list=words_in_voc + ['random_word'],
+                                    topn="some_gibberish_not_number_or_False")
+
         self.assertIn("topn needs to either be a number or False", str(l))
         self.assertIn("The following words are not in trained vocabulary", str(l))
         self.assertIn("This warning is expensive to calculate", str(l))
@@ -488,31 +490,36 @@ def test_most_similar_among(self, l):
         l.clear()
 
         # Check if warnings are suppressed upon setting suppress_warnings flag
-        model.wv.most_similar_among('graph', \
-            words_list=words_in_voc+['random_word'], \
-            topn="some_gibberish_not_number_or_False", \
-            suppress_warnings=True)
+        model.wv.most_similar_among('graph',
+                                    words_list=words_in_voc + ['random_word'],
+                                    topn="some_gibberish_not_number_or_False",
+                                    suppress_warnings=True)
 
         self.assertIn("No logging captured", str(l))
 
         # Check functionality
         sims = model.wv.most_similar_among('graph', words_list=words_in_voc)
-        sims2 = model.wv.most_similar_among('graph', words_list=words_in_voc+['random_word'], \
-            suppress_warnings=True)
+        sims2 = model.wv.most_similar_among('graph',
+                                            words_list=words_in_voc + ['random_word'],
+                                            suppress_warnings=True)
         self.assertEqual(sims, sims2)
 
         # Results by vector
         graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
-        sims3 = model.wv.most_similar_among(positive = [graph_vector], words_list=words_in_voc)
+        sims3 = model.wv.most_similar_among(positive=[graph_vector],
+                                            words_list=words_in_voc)
         sims3 = [(w, sim) for w, sim in sims3 if w != 'graph']  # ignore 'graph' itself
         self.assertEqual(sims, sims3)
 
-        sims4 = model.wv.most_similar_among('graph', words_list=model.wv.index2word, \
-            topn=False)  # Returns all possible similarities
-        sims5 = model.wv.most_similar_among('graph', words_list=model.wv.index2word, \
-            topn=len(model.wv.vocab))
+        sims4 = model.wv.most_similar_among('graph',
+                                            words_list=model.wv.index2word,
+                                            topn=False)  # Returns all possible similarities
+        sims5 = model.wv.most_similar_among('graph',
+                                            words_list=model.wv.index2word,
+                                            topn=len(model.wv.vocab))
         self.assertEqual(sims4, sims5)
         self.assertEqual(len(sims4), len(model.wv.vocab)-1)
-        # Subtracting one as the word itself is not returned in most_similar calculation
+        # Subtracting one as the word itself is not returned
+        # in most_similar calculation
 
     def test_cosmul(self):
         model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)