Commit 62c9237: Merge branch 'release-0.10.1'

piskvorky committed Jul 22, 2014
2 parents: 65d4656 + 07212ed
Showing 29 changed files with 337 additions and 160 deletions.
16 changes: 15 additions & 1 deletion CHANGELOG.txt
@@ -1,7 +1,21 @@
Changes
=======

0.10.0rc1
0.10.1

* make LDA print/show topics parameters consistent with LSI (Bram Vandekerckhove, #201)
* add option for efficient word2vec subsampling (Gordon Mohr, #206)
* fix length calculation for corpora on empty files (Christopher Corley, #209)
* improve file cleanup of unit tests (Christopher Corley)
* more unit tests
* unicode now stored everywhere in gensim internally; accepted input stays either utf8 or unicode
* various fixes to the py3k ported code
* allow any dict-like input in Dictionary.from_corpus (Andreas Madsen)
* error checking improvements to the MALLET wrapper
* ignore non-articles during wiki parsing
* utils.lemmatize now (optionally) ignores stopwords

0.10.0 (aka "PY3K port"), 04/06/2014

* full Python 3 support (targeting 3.3+, #196)
* all internal methods now expect & store unicode, instead of utf8
11 changes: 9 additions & 2 deletions README.rst
@@ -57,7 +57,14 @@ you'll need to run::
For alternative modes of installation (without root privileges, development
installation, optional install features), see the `documentation <http://radimrehurek.com/gensim/install.html>`_.

This version has been tested under Python 2.6, 2.7 and 3.3.
This version has been tested under Python 2.6, 2.7 and 3.3. Gensim's github repo is hooked to `Travis CI for automated testing <https://travis-ci.org/piskvorky/gensim>`_ on every commit push and pull request.

How come gensim is so fast and memory efficient? Isn't it pure Python, and isn't Python slow and greedy?
--------------------------------------------------------------------------------------------------------

Many scientific algorithms can be expressed in terms of large matrix operations (see the BLAS note above). Gensim taps into these low-level BLAS libraries by means of its dependency on NumPy. So while gensim-the-top-level-code is pure Python, it actually executes highly optimized Fortran/C under the hood, including multithreading (if your BLAS is so configured).
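As a minimal sketch of what this means in practice (matrix sizes invented for illustration), a single NumPy call on large 2-D arrays is handed off to the BLAS ``dgemm`` routine when NumPy is built against an optimized BLAS::

    import numpy as np

    # with an optimized BLAS (OpenBLAS, MKL, ATLAS), this one call runs
    # multithreaded Fortran/C code rather than interpreted Python loops
    a = np.random.rand(2000, 1000)
    b = np.random.rand(1000, 3000)
    c = np.dot(a, b)   # dispatched to BLAS dgemm under the hood
    print(c.shape)     # (2000, 3000)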

Memory-wise, gensim makes heavy use of Python's built-in generators and iterators for streamed data processing. Memory efficiency was one of gensim's `design goals <http://radimrehurek.com/gensim/about.html>`_, and is a central feature of gensim, rather than something bolted on as an afterthought.
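A minimal sketch of that streaming idiom (the file layout and tokenization are assumptions: one plain-text document per line)::

    class StreamedCorpus(object):
        """Yield one bag-of-words vector at a time; the corpus never sits in RAM."""
        def __init__(self, fname, dictionary):
            self.fname = fname            # path to a file with one document per line
            self.dictionary = dictionary  # a gensim.corpora.Dictionary

        def __iter__(self):
            with open(self.fname) as fin:
                for line in fin:
                    yield self.dictionary.doc2bow(line.lower().split())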

Documentation
-------------
@@ -69,4 +76,4 @@ It is also included in the source distribution package.
----------------

Gensim is open source software released under the `GNU LGPL license <http://www.gnu.org/licenses/lgpl.html>`_.
Copyright (c) 2009-2014 Radim Rehurek
Copyright (c) 2009-now Radim Rehurek
11 changes: 11 additions & 0 deletions docs/src/_static/css/style.css
@@ -381,6 +381,7 @@ ul.reference li {
background: url('../images/references/logo_roistr.png') top center no-repeat;
padding-top: 100px;
padding-bottom: 24px;
background-position-y: 15px;
}

.reference3 {
@@ -441,6 +442,16 @@ ul.reference li {
padding-bottom: 24px;
}

.reference9 {
display: inline-block;
width: 247px;
background: url('../images/references/logo_tailwind.png') top center no-repeat;
background-size: 95%;
padding-top: 100px;
background-position: 0 -20px;
padding-bottom: 24px;
}

.getstarted{
text-align: center;
padding: 30px 0 20px 0;
[binary file not shown]
7 changes: 5 additions & 2 deletions docs/src/_templates/indexcontent.html
@@ -36,7 +36,7 @@
buildArrows: false,
buildNavigation: false,
buildStartStop: false,
startPanel: Math.floor(Math.random() * 7),
startPanel: Math.floor(Math.random() * 8),
delay: 4000,
autoPlay: true
});
@@ -126,11 +126,14 @@ <h3 class="h3gensim">Features<div class="more-info">Hover your mouse over each f
</div>

<div class="podklad">
<div class="podkladwrapper">
<div class="podkladwrapper" id="testimonials">
<h3 class="h3gensim">Who is using Gensim?<div class="more-info">Doing something interesting with gensim? Ask to be featured here.</div></h3>

<div class="center">
<ul id="slider_testimonials" class="reference">
<li><div class="reference9 testimonial">
“Here at Tailwind, we use Gensim to help our customers post interesting and relevant content to Pinterest. No fuss, no muss. Just fast, scalable language processing.” <span class="testimonial-author">Waylon Flinn, <a href="http://tailwindapp.com">Tailwind</a></span>
</div></li>
<li><div class="reference8 testimonial">
“We are using gensim every day. Over 15 thousand times per day to be precise. Gensim’s LDA module lies at the very core of the analysis we perform on each uploaded publication to figure out what it’s all about. It simply works.” <span class="testimonial-author">Andrius Butkus, <a href="http://issuu.com/">Issuu</a></span>
</div></li>
4 changes: 2 additions & 2 deletions docs/src/conf.py
@@ -52,9 +52,9 @@
# built documents.
#
# The short X.Y version.
version = '0.10.0'
version = '0.10.1'
# The full version, including alpha/beta/rc tags.
release = '0.10.0'
release = '0.10.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
2 changes: 1 addition & 1 deletion docs/src/distributed.rst
@@ -30,7 +30,7 @@ much communication going on), so the network is allowed to be of relatively high

To see what BLAS and LAPACK you are using, type into your shell::

python -c 'import numpy; numpy.show_config()'
python -c 'import scipy; scipy.show_config()'

Prerequisites
-----------------
13 changes: 4 additions & 9 deletions gensim/corpora/bleicorpus.py
@@ -61,25 +61,21 @@ def __init__(self, fname, fname_vocab=None):
else:
raise IOError('BleiCorpus: could not find vocabulary file')


self.fname = fname
with utils.smart_open(fname_vocab) as fin:
words = [utils.to_unicode(word).rstrip() for word in fin]
self.id2word = dict(enumerate(words))
self.length = None

self.length = 0

def __iter__(self):
"""
Iterate over the corpus, returning one sparse vector at a time.
"""
length = 0
lineno = -1
with utils.smart_open(self.fname) as fin:
for lineno, line in enumerate(fin):
length += 1
yield self.line2doc(line)
self.length = length

self.length = lineno + 1

def line2doc(self, line):
parts = utils.to_unicode(line).split()
@@ -89,7 +85,6 @@ def line2doc(self, line):
doc = [(int(p1), float(p2)) for p1, p2 in doc]
return doc


@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
"""
@@ -133,5 +128,5 @@ def docbyoffset(self, offset):
with utils.smart_open(self.fname) as f:
f.seek(offset)
return self.line2doc(f.readline())
#endclass BleiCorpus

# endclass BleiCorpus
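A hedged usage sketch of the class above (the path is invented; `BleiCorpus` also expects a matching vocabulary file, e.g. a `.vocab` file next to the data)::

    from gensim.corpora.bleicorpus import BleiCorpus

    corpus = BleiCorpus('/tmp/corpus.lda-c')  # hypothetical path
    for doc in corpus:                        # streams one sparse vector at a time
        pass
    print(corpus.length)  # set to the document count after one full pass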
3 changes: 2 additions & 1 deletion gensim/corpora/csvcorpus.py
@@ -47,7 +47,6 @@ def __init__(self, fname, labels):
self.dialect = csv.Sniffer().sniff(head)
logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers))


def __iter__(self):
"""
Iterate over the corpus, returning one sparse vector at a time.
@@ -64,3 +63,5 @@ def __iter__(self):
yield list(enumerate(map(float, line)))

self.length = line_no + 1 # store the total number of CSV rows = documents

# endclass CsvCorpus
41 changes: 27 additions & 14 deletions gensim/corpora/dictionary.py
@@ -18,10 +18,15 @@
from __future__ import with_statement

from collections import Mapping
import sys
import logging
import itertools

from gensim import utils

if sys.version_info[0] >= 3:
unicode = str

from six import PY3, iteritems, iterkeys, itervalues, string_types
from six.moves import xrange
from six.moves import zip as izip
@@ -118,7 +123,7 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
"""
Convert `document` (a list of words) into the bag-of-words format = list
of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
**tokenized and normalized** utf-8 encoded string. No further preprocessing
**tokenized and normalized** string (either unicode or utf8-encoded). No further preprocessing
is done on the words in `document`; apply tokenization, stemming etc. before
calling this method.
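A quick sketch of the behaviour described in the docstring (toy tokens; the actual ids depend on insertion order)::

    from gensim.corpora import Dictionary

    dictionary = Dictionary([['human', 'computer', 'interface']])
    # tokens may be unicode or utf8-encoded str; both resolve to the same id
    print(dictionary.doc2bow(['human', 'human', 'interface']))
    # -> e.g. [(0, 2), (2, 1)]: 'human' twice, 'interface' once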
@@ -186,7 +191,7 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):

# determine which tokens to keep
good_ids = (v for v in itervalues(self.token2id)
if no_below <= self.dfs[v] <= no_above_abs)
if no_below <= self.dfs.get(v, 0) <= no_above_abs)
good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
if keep_n is not None:
good_ids = good_ids[:keep_n]
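For context, a minimal sketch of calling the method patched above, with its default thresholds spelled out explicitly::

    # keep tokens present in at least 5 documents and in at most 50% of them,
    # then cap the vocabulary at the 100,000 most frequent survivors
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)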
@@ -235,15 +240,12 @@ def compactify(self):
logger.debug("rebuilding dictionary, shrinking gaps")

# build mapping from old id -> new id
idmap = dict(izip(itervalues(self.token2id),
xrange(len(self.token2id))))
idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id))))

# reassign mappings to new ids
self.token2id = dict((token, idmap[tokenid])
for token, tokenid in iteritems(self.token2id))
self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id))
self.id2token = {}
self.dfs = dict((idmap[tokenid], freq)
for tokenid, freq in iteritems(self.dfs))
self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))


def save_as_text(self, fname, sort_by_word=True):
@@ -337,16 +339,21 @@ def load_from_text(fname):


@staticmethod
def from_corpus(corpus):
def from_corpus(corpus, id2word=None):
"""
Create Dictionary from an existing corpus. This can be useful if you only
have a term-document BOW matrix (represented by `corpus`), but not the
original text corpus.
This will scan the term-document count matrix for all word ids that
appear in it, then construct and return Dictionary which maps each
`word_id -> str(word_id)`.
`word_id -> id2word[word_id]`.
`id2word` is an optional dictionary that maps the `word_id` to a token. In
case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)`
will be used.
"""

result = Dictionary()
max_id = -1
for docno, document in enumerate(corpus):
@@ -358,10 +365,16 @@ def from_corpus(corpus):
max_id = max(wordid, max_id)
result.num_pos += word_freq
result.dfs[wordid] = result.dfs.get(wordid, 0) + 1
# now make sure length(result) == get_max_id(corpus) + 1
for i in xrange(max_id + 1):
result.token2id[str(i)] = i
result.dfs[i] = result.dfs.get(i, 0)

if id2word is None:
# make sure length(result) == get_max_id(corpus) + 1
result.token2id = dict((unicode(i), i) for i in xrange(max_id + 1))
else:
# id=>word mapping given: simply copy it
result.token2id = dict((utils.to_unicode(token), id) for id, token in iteritems(id2word))
for id in itervalues(result.token2id):
# make sure all token ids have a valid `dfs` entry
result.dfs[id] = result.dfs.get(id, 0)

logger.info("built %s from %i documents (total %i corpus positions)" %
(result, result.num_docs, result.num_pos))
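A sketch of the new optional argument (toy data invented for the example)::

    from gensim.corpora import Dictionary

    bow_corpus = [[(0, 1), (1, 2)], [(1, 1), (2, 3)]]  # term-document BOW matrix
    id2word = {0: 'human', 1: 'computer', 2: 'interface'}

    d1 = Dictionary.from_corpus(bow_corpus)           # tokens named '0', '1', '2'
    d2 = Dictionary.from_corpus(bow_corpus, id2word)  # tokens keep their real names
    print(d2.token2id)  # {'human': 0, 'computer': 1, 'interface': 2} (order may vary)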
6 changes: 2 additions & 4 deletions gensim/corpora/indexedcorpus.py
@@ -51,7 +51,6 @@ def __init__(self, fname, index_fname=None):
self.index = None
self.length = None


@classmethod
def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False):
"""
@@ -98,7 +97,6 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
logger.info("saving %s index to %s" % (serializer.__name__, index_fname))
utils.pickle(offsets, index_fname)


def __len__(self):
"""
Return the index length if the corpus is indexed. Otherwise, make a pass
@@ -111,9 +109,9 @@ def __len__(self):
self.length = sum(1 for doc in self)
return self.length


def __getitem__(self, docno):
if self.index is None:
raise RuntimeError("cannot call corpus[docid] without an index")
return self.docbyoffset(self.index[docno])
#endclass IndexedCorpus

# endclass IndexedCorpus
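A hedged sketch of the round-trip that `serialize` and `__getitem__` implement, via the `MmCorpus` subclass (path invented)::

    from gensim.corpora import MmCorpus

    bow_corpus = [[(0, 1.0)], [(0, 2.0), (1, 1.0)]]
    MmCorpus.serialize('/tmp/corpus.mm', bow_corpus)  # also writes an offset index file
    mm = MmCorpus('/tmp/corpus.mm')
    print(mm[1])  # random access by document number, resolved via the saved offsets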
15 changes: 8 additions & 7 deletions gensim/corpora/lowcorpus.py
@@ -91,13 +91,16 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
def _calculate_num_docs(self):
# the first line in input data is the number of documents (integer). throws exception on bad input.
with utils.smart_open(self.fname) as fin:
result = int(fin.readline())
try:
result = int(next(fin))
except StopIteration:
result = 0

return result

def __len__(self):
return self.num_docs


def line2doc(self, line):
words = self.line2words(line)

@@ -127,7 +130,6 @@ def line2doc(self, line):
# note that this way, only one doc is stored in memory at a time, not the whole corpus
return doc


def __iter__(self):
"""
Iterate over the corpus, returning one bag-of-words vector at a time.
@@ -137,7 +139,6 @@ def __iter__(self):
if lineno > 0: # ignore the first line = number of documents
yield self.line2doc(line)


@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
"""
@@ -160,7 +161,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
for wordid, value in doc:
if abs(int(value) - value) > 1e-6:
truncated += 1
words.extend([str(id2word[wordid])] * int(value))
words.extend([utils.to_unicode(id2word[wordid])] * int(value))
offsets.append(fout.tell())
fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

Expand All @@ -170,12 +171,12 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
truncated)
return offsets


def docbyoffset(self, offset):
"""
Return the document stored at file position `offset`.
"""
with utils.smart_open(self.fname) as f:
f.seek(offset)
return self.line2doc(f.readline())
#endclass LowCorpus

# endclass LowCorpus
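For reference (sample contents invented), the List-of-Words format this class parses: the first line holds the document count, and every following line is one document as plain space-separated words::

    3
    human interface computer
    survey user computer system response time
    eps user interface system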
7 changes: 4 additions & 3 deletions gensim/corpora/malletcorpus.py
@@ -17,6 +17,7 @@

logger = logging.getLogger('gensim.corpora.malletcorpus')


class MalletCorpus(LowCorpus):
"""
Quoting http://mallet.cs.umass.edu/import.php:
@@ -65,7 +66,6 @@ def line2doc(self, line):
else:
return doc


@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
"""
@@ -101,7 +101,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
for wordid, value in doc:
if abs(int(value) - value) > 1e-6:
truncated += 1
words.extend([str(id2word[wordid])] * int(value))
words.extend([utils.to_unicode(id2word[wordid])] * int(value))
offsets.append(fout.tell())
fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))

@@ -112,11 +112,12 @@

return offsets


def docbyoffset(self, offset):
"""
Return the document stored at file position `offset`.
"""
with utils.smart_open(self.fname) as f:
f.seek(offset)
return self.line2doc(f.readline())

# endclass MalletCorpus
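For reference (sample contents invented), the MALLET one-document-per-line format this class reads and writes, matching the `'%s %s %s\n' % (doc_id, doc_lang, ...)` output above: `[document-id] [language] [tokens...]`::

    doc1 en human interface computer
    doc2 en survey user computer system response time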