Merge branch 'release-0.8.8'
piskvorky committed Nov 3, 2013
2 parents 578a174 + 08a4e3d commit a096d78
Showing 21 changed files with 769 additions and 123 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -56,4 +56,8 @@ data
*.bak
/build/
/dist/

*.prof
*.lprof
*.bin
*.old
*.model
12 changes: 11 additions & 1 deletion CHANGELOG.txt
@@ -1,6 +1,16 @@
Changes
=======

0.8.8 (aka "word2vec release")

* python3 port by Parikshit Samant: https://github.com/samantp/gensimPy3
* massive optimizations to word2vec (cython, BLAS, multithreading): ~20x-300x speedup
* new word2vec functionality (thx to Ghassen Hamrouni, PR #124)
* new CSV corpus class (thx to Zygmunt Zając)
* corpus serialization checks to prevent overwriting (by Ian Langmore, PR #125)
* add context manager support for older Python<=2.6 for gzip and bz2
* added unittests for word2vec

0.8.7

* initial version of word2vec, a neural network deep learning algo
@@ -13,7 +23,7 @@ Changes
* save/load directly from bzip2 files (by Luis Pedro Coelho, PR #101)
* Blei corpus now tries harder to find its vocabulary file (by Luis Pedro Coelho, PR #100)
* sparse vector elements can now be a list (was: only a 2-tuple)
* simple_preprocess now optionally de-accents letters (ř/š/ú etc.)
* simple_preprocess now optionally deaccents letters (ř/š/ú=>r/s/u etc.)
* better serialization of numpy corpora
* print_topics() returns the topics, in addition to printing/logging
* fixes for more robust Windows multiprocessing
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -7,3 +7,4 @@ include CHANGELOG.txt
include COPYING
include COPYING.LESSER
include ez_setup.py
include gensim/models/word2vec_inner.pyx
4 changes: 2 additions & 2 deletions README.rst
@@ -6,6 +6,7 @@ gensim -- Python Framework for Topic Modelling
Gensim is a Python library for *topic modelling*, *document indexing* and *similarity retrieval* with large corpora.
Target audience is the *natural language processing* (NLP) and *information retrieval* (IR) community.

For a Python3 port of gensim by Parikshit Samant, visit `this fork <https://github.com/samantp/gensimPy3>`_.

Features
---------
@@ -59,6 +60,5 @@ It is also included in the source distribution package.

----------------

Gensim is open source software, and has been released under the
`GNU LGPL license <http://www.gnu.org/licenses/lgpl.html>`_.
Gensim is open source software released under the `GNU LGPL license <http://www.gnu.org/licenses/lgpl.html>`_.
Copyright (c) 2009-2013 Radim Rehurek
2 changes: 1 addition & 1 deletion docs/src/Makefile
@@ -33,7 +33,7 @@ clean:
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
rm -r $(BUILDDIR)/html/_sources
# cp -r $(BUILDDIR)/html/* ../
cp -r $(BUILDDIR)/html/* ../
@echo
@echo "Build finished. The HTML pages are in ../"

6 changes: 3 additions & 3 deletions docs/src/_templates/indexcontent.html
@@ -124,15 +124,15 @@ <h3 class="h3gensim">Who is using Gensim?<div class="more-info">Doing something

<div class="center">
<ul id="slider_testimonials" class="reference">
<li><div class="reference2 testimonial">
“Gensim hits the sweetest spot of being a simple yet powerful way to access some incredibly complex NLP goodness.” <span class="testimonial-author">Alan J. Salmoni, <a href="http://roistr.com/">Roistr.com</a></span>
</div></li>
<li><div class="reference1 testimonial">
“I used gensim at Ghent university. I found it easy to build prototypes with various models, extend it with additional features and gain empirical insights quickly. It's a reliable library that can be used beyond prototyping too.” <span class="testimonial-author"><a href="http://dieter.plaetinck.be/">Dieter Plaetinck</a>, <a href="http://www.ibcn.intec.ugent.be/">IBCN group</a></span>
</div></li>
<li><div class="reference3 testimonial">
“We used gensim in several text mining projects at Sports Authority. The data were from free-form text fields in customer surveys, as well as social media sources. Having gensim significantly sped our time to development, and it is still my go-to package for topic modeling with large retail data sets.” <span class="testimonial-author">Josh Hemann, <a href="http://www.sportsauthority.com/home/index.jsp">Sports Authority</a></span>
</div></li>
<li><div class="reference2 testimonial">
“Gensim hits the sweetest spot of being a simple yet powerful way to access some incredibly complex NLP goodness.” <span class="testimonial-author">Alan J. Salmoni, <a href="http://roistr.com/">Roistr.com</a></span>
</div></li>
<li><div class="reference4 testimonial">
“Semantic analysis is a hot topic in online marketing, but there are few products on the market that are truly powerful.
Gensim is undoubtedly one of the best frameworks that efficiently implement algorithms for statistical analysis.
4 changes: 2 additions & 2 deletions docs/src/conf.py
@@ -52,9 +52,9 @@
# built documents.
#
# The short X.Y version.
version = '0.8.7'
version = '0.8.8'
# The full version, including alpha/beta/rc tags.
release = '0.8.7'
release = '0.8.8'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
6 changes: 3 additions & 3 deletions docs/src/gensim_theme/layout.html
@@ -128,7 +128,7 @@ <h1 class="h1gensim">
<img src="{{ pathto('_static/images/logo-gensim.png', 1) }}" class="smallerlogo" alt="smaller gensim logo" />
<a href="{{ pathto('index') }}"><img src="{{ pathto('_static/images/gensim-footer.png', 1) }}" alt="gensim footer image" title="Gensim home" /></a>
<div class="copyright">
&copy; Copyright 2009-2013, <a href="mailto:radimrehurek@seznam.cz" style="color:white"> Radim Řehůřek</a>.
&copy; Copyright 2009-2013, <a href="mailto:radimrehurek@seznam.cz" style="color:white"> Radim Řehůřek</a>
<br />
{%- if last_updated %}
{% trans last_updated=last_updated|e %}Last updated on {{ last_updated }}.{% endtrans %}
@@ -166,7 +166,7 @@ <h1 class="h1gensim">

<div class="tweetodsazeni">
<div class="tweet">
Feel free to tweet !
<a href="https://twitter.com/radimrehurek" target="_blank" style="color: white">Tweet @RadimRehurek</a>
</div>
</div>

@@ -178,7 +178,7 @@ <h1 class="h1gensim">
</div>
<div class="googlegroupsodsazeni">
<a href="https://groups.google.com/group/gensim" class="googlegroups">
Join the Google discussion group
Join the gensim discussion group
</a>

<form action="http://groups.google.com/group/gensim/boxsubscribe">
8 changes: 8 additions & 0 deletions docs/src/models/word2vec.rst
@@ -0,0 +1,8 @@
:mod:`models.word2vec` -- Deep learning with word2vec
======================================================

.. automodule:: gensim.models.word2vec
:synopsis: Deep learning with word2vec
:members:
:inherited-members:

12 changes: 6 additions & 6 deletions docs/src/support.rst
@@ -18,13 +18,13 @@ When posting on the mailing list, please try to include all relevant information

You can also try asking on StackOverflow, using the `gensim tag <http://stackoverflow.com/questions/tagged/gensim>`_.

Developer issues
----------------
Business support
------------------

Developers who `tweak gensim internals <https://github.com/piskvorky/gensim/wiki/Developer-page>`_ are encouraged to report issues at the `GitHub issue tracker <https://github.com/piskvorky/gensim/issues>`_.
Note that this is not a medium for discussions or asking open-ended questions; please use the mailing list for that.
In case you need SLA-based support, design validation, training or custom development, `contact me <http://radimrehurek.com/contact/>`_ for a commercial quote.

Commercial support
Developer support
------------------

In case you need deterministic response times or have extra support/development needs, `contact me <mailto:me@radimrehurek.com>`_ for a quote on commercial support and contracting.
Developers who `tweak gensim internals <https://github.com/piskvorky/gensim/wiki/Developer-page>`_ are encouraged to report issues at the `GitHub issue tracker <https://github.com/piskvorky/gensim/issues>`_.
Note that this is not a medium for discussions or asking open-ended questions; please use the mailing list for that.
2 changes: 1 addition & 1 deletion docs/src/wiki.rst
@@ -33,7 +33,7 @@ Preparing the corpus
I recommend compressing these files immediately, e.g. with bzip2 (down to ~13GB). Gensim
can work with compressed files directly, so this lets you save disk space.

Latent Sematic Analysis
Latent Semantic Analysis
--------------------------

First let's load the corpus iterator and dictionary, created in the second step above::
66 changes: 66 additions & 0 deletions gensim/corpora/csvcorpus.py
@@ -0,0 +1,66 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Zygmunt Zając <zygmunt@fastml.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Corpus in CSV format.
"""


from __future__ import with_statement

import logging
import csv
import itertools

from gensim import interfaces

logger = logging.getLogger('gensim.corpora.csvcorpus')


class CsvCorpus(interfaces.CorpusABC):
"""
Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically
based on the file content.
All row values are expected to be ints/floats.
"""

def __init__(self, fname, labels):
"""
Initialize the corpus from a file.
`labels` = are class labels present in the input file? => skip the first column
"""
logger.info("loading corpus from %s" % fname)
self.fname = fname
self.length = None
self.labels = labels

# load the first few lines, to guess the CSV dialect
head = ''.join(itertools.islice(open(self.fname), 5))
self.headers = csv.Sniffer().has_header(head)
self.dialect = csv.Sniffer().sniff(head)
logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers))


def __iter__(self):
"""
Iterate over the corpus, returning one sparse vector at a time.
"""
reader = csv.reader(open(self.fname), self.dialect)
if self.headers:
reader.next() # skip the headers

line_no = -1
for line_no, line in enumerate(reader):
if self.labels:
line.pop(0) # ignore the first column = class label
yield list(enumerate(map(float, line)))

self.length = line_no + 1 # store the total number of CSV rows = documents
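The dialect-sniffing approach of `CsvCorpus` can be exercised with just the standard library. The following standalone Python 3 sketch (the function name `sniff_and_parse` is illustrative, not part of gensim) mirrors what the class does: peek at the first few lines to guess the delimiter and header presence, then yield each row as a sparse list of (index, value) pairs.

```python
import csv
import io
import itertools

def sniff_and_parse(text, labels=False):
    """Mirror CsvCorpus: guess the CSV dialect/headers, then yield each
    row as a sparse list of (field_index, float_value) pairs."""
    # peek at the first few lines to guess delimiter and header presence
    head = ''.join(itertools.islice(io.StringIO(text), 5))
    sniffer = csv.Sniffer()
    has_headers = sniffer.has_header(head)
    dialect = sniffer.sniff(head)

    reader = csv.reader(io.StringIO(text), dialect)
    if has_headers:
        next(reader)  # skip the header row
    for line in reader:
        if labels:
            line.pop(0)  # drop the leading class-label column
        yield list(enumerate(map(float, line)))

docs = list(sniff_and_parse("a;b;c\n1;2;3\n4;5;6\n"))
```

Here the `;` delimiter and the header row are detected automatically, so `docs` ends up holding two documents of three (index, value) pairs each.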
3 changes: 3 additions & 0 deletions gensim/corpora/indexedcorpus.py
@@ -73,6 +73,9 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres
>>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
>>> print mm[42] # retrieve document no. 42, etc.
"""
if getattr(corpus, 'fname', None) == fname:
raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

if index_fname is None:
index_fname = fname + '.index'

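The new guard can be shown in isolation. This standalone sketch (names `FakeCorpus` and `check_output_fname` are hypothetical, for illustration only) demonstrates the same `getattr` check raising before any file would be touched:

```python
class FakeCorpus(object):
    # stand-in for a corpus object that remembers which file it reads from
    def __init__(self, fname):
        self.fname = fname

def check_output_fname(corpus, fname):
    # same guard as the one added to IndexedCorpus.serialize(): refuse to
    # write the serialized output over the corpus's own input file
    if getattr(corpus, 'fname', None) == fname:
        raise ValueError("identical input vs. output corpus filename, "
                         "refusing to serialize: %s" % fname)

corpus = FakeCorpus('corpus.mm')
check_output_fname(corpus, 'other.mm')  # different file: no error
try:
    check_output_fname(corpus, 'corpus.mm')
    raised = False
except ValueError:
    raised = True
```

Using `getattr` with a default keeps the check safe for corpora that have no `fname` attribute at all.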
2 changes: 1 addition & 1 deletion gensim/corpora/wikicorpus.py
@@ -289,7 +289,7 @@ def get_texts(self):
for tokens in pool.imap(process_article, group): # chunksize=10):
articles_all += 1
positions_all += len(tokens)
if len(tokens) > ARTICLE_MIN_WORDS: # article redirects are pruned here
if len(tokens) > ARTICLE_MIN_WORDS: # article redirects and short stubs are pruned here
articles += 1
positions += len(tokens)
yield tokens
8 changes: 8 additions & 0 deletions gensim/matutils.py
@@ -135,6 +135,14 @@ def pad(mat, padrow, padcol):
[numpy.matrix(numpy.zeros((padrow, cols + padcol)))]])


def zeros_aligned(shape, dtype, order='C', align=128):
"""Like `numpy.zeros()`, but the array will be aligned at `align` byte boundary."""
nbytes = numpy.prod(shape) * numpy.dtype(dtype).itemsize
buffer = numpy.zeros(nbytes + align, dtype=numpy.uint8)
start_index = -buffer.ctypes.data % align
return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order)
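The key to `zeros_aligned()` is the `-addr % align` expression: it computes how many bytes to skip so that the buffer start lands on a multiple of `align`. A minimal sketch of just that arithmetic (the helper name `aligned_offset` is illustrative):

```python
def aligned_offset(addr, align=128):
    # bytes to skip so that (addr + offset) is a multiple of `align`;
    # the same arithmetic as `-buffer.ctypes.data % align` above
    return -addr % align

# the offset always lands the start address on an `align` boundary
for addr in (0, 1, 127, 128, 1000, 12345):
    assert (addr + aligned_offset(addr)) % 128 == 0
```

Because Python's `%` always returns a non-negative result for a positive modulus, `-addr % align` is already the wrap-around distance to the next boundary, with no conditional needed.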


def ismatrix(m):
return isinstance(m, numpy.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m)

15 changes: 13 additions & 2 deletions gensim/models/ldamodel.py
@@ -175,6 +175,9 @@ class LdaModel(interfaces.TransformationABC):
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
chunksize=2000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
`num_topics` is the number of requested latent topics to be extracted from
the training corpus.
@@ -378,7 +381,7 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N
"""
Train the model with new documents, by EM-iterating over `corpus` until
the topics converge (or until the maximum number of allowed iterations
is reached).
is reached). `corpus` must be an iterable (repeatable stream of documents),
In distributed mode, the E step is distributed over a cluster of machines.
@@ -412,6 +415,7 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N
if lencorpus == 0:
logger.warning("LdaModel.update() called with an empty corpus")
return

self.state.numdocs += lencorpus

if update_every > 0:
@@ -438,7 +442,9 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N
other = LdaState(self.eta, self.state.sstats.shape)
dirty = False

reallen = 0
for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=True)):
reallen += len(chunk) # keep track of how many documents we've processed so far
if self.dispatcher:
# add the chunk to dispatcher's job queue, so workers can munch on it
logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' %
@@ -468,6 +474,8 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N
other = LdaState(self.eta, self.state.sstats.shape)
dirty = False
#endfor single corpus iteration
if reallen != lencorpus:
raise RuntimeError("input corpus size changed during training (don't use generators as input)")

if dirty:
# finish any remaining updates
@@ -541,7 +549,7 @@ def bound(self, corpus, gamma=None):


def print_topics(self, topics=10, topn=10):
return self.show_topics(topics, topn, True)
return self.show_topics(topics, topn, log=True)

def show_topics(self, topics=10, topn=10, log=False, formatted=True):
"""
@@ -551,6 +559,9 @@ def show_topics(self, topics=10, topn=10, log=False, formatted=True):
Unlike LSA, there is no ordering between the topics in LDA.
The printed `topics <= self.num_topics` subset of all topics is therefore
arbitrary and may change between two runs.
Set `formatted=True` to return the topics as a list of formatted strings, or `formatted=False` to return them as lists of (weight, word) pairs.
"""
if topics < 0:
# print all topics if `topics` is negative
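The `reallen != lencorpus` check exists because a generator passed as `corpus` is exhausted after the first pass over it, so any later pass silently sees zero documents. A standalone sketch of the failure mode the check catches (the simplified `grouper` here only illustrates the real `utils.grouper`):

```python
from itertools import islice

def grouper(iterable, chunksize):
    # simplified stand-in for utils.grouper: yield successive chunks as lists
    it = iter(iterable)
    while True:
        chunk = list(islice(it, chunksize))
        if not chunk:
            return
        yield chunk

corpus = (doc for doc in [[(0, 1.0)], [(1, 2.0)], [(2, 1.0)]])  # a generator
lencorpus = sum(1 for _ in corpus)       # counting documents exhausts it...
reallen = sum(len(chunk) for chunk in grouper(corpus, 2))  # ...so this pass sees 0
```

With a list or any repeatable iterable, both passes would agree; with a generator `reallen` stays at zero, which is exactly the mismatch that now raises `RuntimeError` instead of silently training on nothing.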