[WIP] Add new restrict_vocab functionality, most_similar_among #1229

Closed · wants to merge 4 commits
gensim/models/keyedvectors.py: 130 additions, 0 deletions
@@ -274,6 +274,13 @@ def word_vec(self, word, use_norm=False):
else:
raise KeyError("word '%s' not in vocabulary" % word)

def get_ordered_keys(self):
"""
Returns the keys (words) in the current KeyedVectors instance as a list.
If the model is not trained yet, an empty list is returned.
"""
return self.index2word

def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None):
"""
Find the top-N most similar words. Positive words contribute positively towards the
@@ -338,6 +345,129 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None):
result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
return result[:topn]

def most_similar_among(self, positive=[], negative=[],
topn=10, words_list=None, indexer=None,
suppress_warnings=False):
"""
Find the top-N most similar words to the given words, considering only the words in words_list.

Positive words contribute positively towards the similarity,
negative words negatively.

Please refer to the documentation of the most_similar function.

If topn is False, most_similar_among returns the vector of similarity scores
for all words in the model's vocabulary, restricted to the supplied words_list.

'words_list' should be a list/set of words. The returned word similarities
will only contain similarity scores for those words that are in words_list
(and in the trained vocabulary).

If some words in words_list are not in the vocabulary, a warning is
issued to the user.

Warnings can be suppressed by setting the suppress_warnings flag.

Example::

>>> trained_model.most_similar_among(positive=['man'], topn=1,
words_list=['woman','random_word'])
[('woman', 0.75882536)]

"""

if isinstance(words_list, int):
raise ValueError("words_list must be a set/list of words. " \
"Maybe you wanted the most_similar function.")
elif isinstance(words_list, (list, set)):
pass
else: # Triggered when words_list is omitted (None) or of an unsupported type
raise ValueError("words_list must be a set/list of words. " \
"Maybe you wanted the most_similar function. " \
"Please read the docstring.")

Reviewer comment (Contributor): Please use a single check and a single raise ValueError.
The most_similar function doesn't take a list of ints so it should not be mentioned here.

if type(topn) is not int:
if topn is False:
pass
else:
if suppress_warnings is False:
logger.warning("topn needs to either be a number or False. " \
"Please read the docstring. " \
"Displaying all similarities!")
topn = len(self.index2word)

Reviewer comment (Contributor): This must be an exception, not a warning. Incorrect input can't be suppressed.

self.init_sims()

if isinstance(positive, string_types) and not negative:
# allow calls like most_similar('dog'),
# as a shorthand for most_similar(['dog'])
positive = [positive]

# add weights for each word, if not already present;
# default to 1.0 for positive and -1.0 for negative words
positive = [
(word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
for word in positive
]
negative = [
(word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
for word in negative
]

# compute the weighted average of all words
all_words, mean = set(), []
for word, weight in positive + negative:
if isinstance(word, ndarray):
mean.append(weight * word)
else:
mean.append(weight * self.word_vec(word, use_norm=True))
if word in self.vocab:
all_words.add(self.vocab[word].index)
if not mean:
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

if indexer is not None:
return indexer.most_similar(mean, topn)

words_list = set(words_list)
vocabulary_words = set(self.index2word)

words_to_use = vocabulary_words.intersection(words_list)

if not words_to_use:
raise ValueError("None of the words in words_list " \
"exist in current vocabulary")

if suppress_warnings is False:
missing_words = words_list.difference(vocabulary_words)
if not missing_words: # missing_words is empty
pass
else:
logger.warning("The following words are not in " \
"trained vocabulary : %s", str(missing_words))
logger.info("This warning is expensive to calculate, " \
"especially for large words_list. " \
"If you would rather not remove the missing_words " \
"from words_list please set the " \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better message is "Please intersect with vocabulary words_to_use = vocabulary_words.intersection(words_list) prior to calling the most_similar_among".
Please remove the suppress_warnings flag.

"suppress_warnings flag.")

words_list_indices = [self.vocab[word].index for word in words_to_use]
# limited = self.syn0norm[words_list_indices]
# Storing 'limited' might add a huge memory overhead so we avoid doing that
Reviewer comment (Contributor): Please memory profile this code to provide foundation for this statement.
Please remove commented out code.


dists = dot(self.syn0norm[words_list_indices], mean)
result = []

best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
# ignore (don't return) words from the input
for sim in best:
index_to_return = words_list_indices[sim]
if index_to_return not in all_words:
result.append((self.index2word[index_to_return], float(dists[sim])))

return result[:topn]
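In response to the memory-profiling request above, one way to gather evidence would be something like the following (a sketch only; it assumes the external memory_profiler package, a trained model in a variable `model`, and a hypothetical large candidate list `big_words_list`):

    from memory_profiler import memory_usage

    def query_restricted():
        # Current approach: index into syn0norm directly inside dot(), no 'limited' copy kept.
        return model.wv.most_similar_among('graph', words_list=big_words_list,
                                           suppress_warnings=True)

    # memory_usage samples the process RSS (in MiB) while the callable runs.
    samples = memory_usage((query_restricted, (), {}), interval=0.1)
    print("peak RSS during query: %.1f MiB" % max(samples))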

def wmdistance(self, document1, document2):
"""
Compute the Word Mover's Distance between two documents. When using this
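For orientation, a minimal usage sketch of the new most_similar_among method (an illustration only: it assumes this PR branch is installed, since most_similar_among is not part of released gensim, and `sentences` is any iterable of tokenized sentences):

    from gensim.models import Word2Vec

    # Tiny toy corpus; a real model needs far more data and a larger vector size.
    sentences = [['human', 'interface', 'computer'],
                 ['graph', 'minors', 'trees'],
                 ['graph', 'trees', 'computer']]
    model = Word2Vec(sentences, size=10, min_count=1, seed=1)

    # Look for neighbours of 'graph', but only among an explicit candidate list.
    candidates = ['trees', 'minors', 'not_in_corpus']
    print(model.wv.most_similar_among('graph', words_list=candidates, topn=2))
    # 'not_in_corpus' is missing from the vocabulary, so a warning is logged
    # unless suppress_warnings=True is passed.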
gensim/models/word2vec.py: 7 additions, 0 deletions
@@ -1193,6 +1193,13 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut
self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes
logger.info("merged %d vectors into %s matrix from %s" % (overlap_count, self.wv.syn0.shape, fname))

def get_words_from_vocab(self):
"""
Returns the words in the currently trained vocabulary as a list.
If the model is not trained yet, an empty list is returned.
"""
return self.wv.get_ordered_keys()

def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None):
"""
Please refer to the documentation for
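Echoing the reviewer's suggestion earlier in the diff, a caller can also drop out-of-vocabulary candidates up front instead of relying on the warning path; a sketch using the helper added in this file (assuming `model` is a trained Word2Vec instance as in the previous sketch):

    # Intersect the candidate list with the trained vocabulary before the call.
    vocab_words = set(model.get_words_from_vocab())   # delegates to model.wv.get_ordered_keys()
    candidates = {'graph', 'trees', 'not_in_corpus'}
    words_to_use = vocab_words.intersection(candidates)

    # No out-of-vocabulary words remain, so the missing-words warning never fires.
    sims = model.wv.most_similar_among('graph', words_list=words_to_use, topn=2)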
gensim/test/test_word2vec.py: 62 additions, 0 deletions
@@ -370,6 +370,7 @@ def testTraining(self):
# test querying for "most similar" by vector
graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
sims2 = model.most_similar(positive=[graph_vector], topn=11)
# topn is 11 because the first answer will be 'graph' itself
sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself
self.assertEqual(sims, sims2)

@@ -461,6 +462,67 @@ def test_cbow_neg(self):
min_count=5, iter=10, workers=2, sample=0)
self.model_sanity(model)

@log_capture()
def test_most_similar_among(self, l):
"""
Testing the most_similar_among method of the KeyedVectors class.
A CBOW model is used here.
"""

model = word2vec.Word2Vec(sentences, size=2, sg=0, min_count=1,
hs=1, negative=0)

# Test that an error is raised when words_list is absent
self.assertRaises(ValueError, model.wv.most_similar_among,
positive=['graph'])

words_in_voc = model.wv.index2word[:5]

# Testing logs for warnings
l.clear()

model.wv.most_similar_among('graph',
words_list=words_in_voc+['random_word'],
topn="some_gibberish_not_number_or_False")

self.assertIn("topn needs to either be a number or False", str(l))
self.assertIn("The following words are not in trained vocabulary", str(l))
self.assertIn("This warning is expensive to calculate", str(l))

l.clear()

# Check if warnings are suppressed upon setting suppress_warnings flag
model.wv.most_similar_among('graph',
words_list=words_in_voc+['random_word'],
topn="some_gibberish_not_number_or_False",
suppress_warnings=True)
self.assertIn("No logging captured", str(l))

# Check functionality
sims = model.wv.most_similar_among('graph', words_list=words_in_voc)
sims2 = model.wv.most_similar_among('graph',
words_list=words_in_voc+['random_word'],
suppress_warnings=True)
self.assertEqual(sims, sims2)

# Results by vector
graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
sims3 = model.wv.most_similar_among(positive=[graph_vector],
words_list=words_in_voc)
sims3 = [(w, sim) for w, sim in sims3 if w != 'graph'] # ignore 'graph' itself
self.assertEqual(sims, sims3)

sims4 = model.wv.most_similar_among('graph',
words_list=model.wv.index2word,
topn=False) # Returns all possible similarities
sims5 = model.wv.most_similar_among('graph',
words_list=model.wv.index2word,
topn=len(model.wv.vocab))
self.assertEqual(sims4, sims5)
self.assertEqual(len(sims4), len(model.wv.vocab)-1)
# Subtracting one as the word itself is not returned
# in most_similar calculation

def test_cosmul(self):
model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
sims = model.most_similar_cosmul('graph', topn=10)
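To exercise just the new test, something along these lines should work (a sketch; the class name TestWord2VecModel is taken from gensim's existing test_word2vec.py and is assumed unchanged by this PR, and the testfixtures package used by @log_capture must be installed):

    import unittest
    from gensim.test import test_word2vec

    # Build a suite containing only the new test and run it with verbose output.
    suite = unittest.TestSuite()
    suite.addTest(test_word2vec.TestWord2VecModel('test_most_similar_among'))
    unittest.TextTestRunner(verbosity=2).run(suite)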