Merge branch 'release-0.10.0'

hsingjun0 · Jun 4, 2014 · 65d4656 · 65d4656
2 parents d60b96f + 9b67a4a
commit 65d4656
Show file tree

Hide file tree

Showing 9 changed files with 69 additions and 29 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -5,10 +5,16 @@ python:
   - "3.3"
   - "3.4"
 before_install:
-    - sudo apt-get update -qq
-    - sudo apt-get install -qq libatlas-dev liblapack-dev gfortran
-    - travis_wait pip install --quiet numpy
-    - travis_wait pip install --quiet scipy
+  - wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh
+  - chmod +x miniconda.sh
+  - ./miniconda.sh -b
+  - export PATH=/home/travis/miniconda/bin:$PATH
+  - conda update --yes conda
+  # The next couple of lines fix a crash with multiprocessing on Travis and are not specific to using Miniconda
+  - sudo rm -rf /dev/shm
+  - sudo ln -s /run/shm /dev/shm
 install:
-    - python setup.py install
+  - conda create --yes -n gensim-test python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy
+  - source activate gensim-test
+  - python setup.py install
 script: python setup.py test
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -7,4 +7,6 @@ include CHANGELOG.txt
 include COPYING
 include COPYING.LESSER
 include ez_setup.py
+include gensim/models/voidptr.h
 include gensim/models/word2vec_inner.pyx
+include gensim_addons/models/word2vec_inner.pyx
diff --git a/docs/src/conf.py b/docs/src/conf.py
@@ -52,9 +52,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.10.0rc1'
+version = '0.10.0'
 # The full version, including alpha/beta/rc tags.
-release = '0.10.0rc1'
+release = '0.10.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/src/gensim_theme/layout.html b/docs/src/gensim_theme/layout.html
@@ -80,7 +80,7 @@ <h1 class="h1gensim">
             <div class="consulting-banner">
               <h3><a href="http://radimrehurek.com/">Get Expert Help</a></h3>
               <p>• machine learning, NLP, data mining</p>
-              <p>• custom system design, development, optimizations</p>
+              <p>• custom SW design, development, optimizations</p>
               <p>• tech trainings &amp; IT consulting</p>
             </div>
           </div>

diff --git a/gensim/matutils.py b/gensim/matutils.py
@@ -70,11 +70,16 @@ def argsort(x, topn=None):
 
 def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_nnz=None, printprogress=0):
     """
-    Convert corpus into a sparse matrix, in scipy.sparse.csc_matrix format,
+    Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format,
     with documents as columns.
 
     If the number of terms, documents and non-zero elements is known, you can pass
     them here as parameters and a more memory efficient code path will be taken.
+
+    The input corpus may be a non-repeatable stream (generator).
+
+    This is the mirror function to `Sparse2Corpus`.
+
     """
     try:
         # if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes
@@ -151,7 +156,7 @@ def ismatrix(m):
 
 
 def any2sparse(vec, eps=1e-9):
-    """Convert a numpy/scipy vector into gensim format (list of 2-tuples)."""
+    """Convert a numpy/scipy vector into gensim document format (=list of 2-tuples)."""
     if isinstance(vec, numpy.ndarray):
         return dense2vec(vec, eps)
     if scipy.sparse.issparse(vec):
@@ -160,15 +165,25 @@ def any2sparse(vec, eps=1e-9):
 
 
 def scipy2sparse(vec, eps=1e-9):
-    """Convert a scipy.sparse vector to gensim format (list of 2-tuples)."""
+    """Convert a scipy.sparse vector into gensim document format (=list of 2-tuples)."""
     vec = vec.tocsr()
     assert vec.shape[0] == 1
     return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if numpy.abs(val) > eps]
 
 
 class Scipy2Corpus(object):
+    """
+    Convert a sequence of dense/sparse vectors into a streamed gensim corpus object.
+
+    This is the mirror function to `corpus2csc`.
+
+    """
     def __init__(self, vecs):
-        """Convert a sequence of dense/sparse vector to a gensim corpus object."""
+        """
+        `vecs` is a sequence of dense and/or sparse vectors, such as a 2d numpy array,
+        or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d numpy/scipy vectors.
+
+        """
         self.vecs = vecs
 
     def __iter__(self):
@@ -184,8 +199,11 @@ def __len__(self):
 
 def sparse2full(doc, length):
     """
-    Convert a document in sparse corpus format (sequence of 2-tuples) into a dense
+    Convert a document in sparse document format (=sequence of 2-tuples) into a dense
     numpy array (of size `length`).
+
+    This is the mirror function to `full2sparse`.
+
     """
     result = numpy.zeros(length, dtype=numpy.float32) # fill with zeroes (default value)
     doc = dict(doc)
@@ -196,9 +214,12 @@ def sparse2full(doc, length):
 
 def full2sparse(vec, eps=1e-9):
     """
-    Convert a dense numpy array into the sparse corpus format (sequence of 2-tuples).
+    Convert a dense numpy array into the sparse document format (=sequence of 2-tuples).
 
     Values of magnitude < `eps` are treated as zero (ignored).
+
+    This is the mirror function to `sparse2full`.
+
     """
     vec = numpy.asarray(vec, dtype=float)
     nnz = numpy.nonzero(abs(vec) > eps)[0]
@@ -209,7 +230,8 @@ def full2sparse(vec, eps=1e-9):
 
 def full2sparse_clipped(vec, topn, eps=1e-9):
     """
-    Like `full2sparse`, but only return the `topn` greatest elements (not all).
+    Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs).
+
     """
     # use numpy.argsort and only form tuples that are actually returned.
     # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on.
@@ -225,10 +247,12 @@ def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32):
     """
     Convert corpus into a dense numpy array (documents will be columns). You
     must supply the number of features `num_terms`, because dimensionality
-    cannot be deduced from sparse vectors alone.
+    cannot be deduced from the sparse vectors alone.
 
     You can optionally supply `num_docs` (=the corpus length) as well, so that
-    a more memory efficient code path is taken.
+    a more memory-efficient code path is taken.
+
+    This is the mirror function to `Dense2Corpus`.
 
     """
     if num_docs is not None:
@@ -249,6 +273,9 @@ class Dense2Corpus(object):
 
     No data copy is made (changes to the underlying matrix imply changes in the
     corpus).
+
+    This is the mirror function to `corpus2dense`.
+
     """
     def __init__(self, dense, documents_columns=True):
         if documents_columns:
@@ -268,6 +295,9 @@ def __len__(self):
 class Sparse2Corpus(object):
     """
     Convert a matrix in scipy.sparse format into a streaming gensim corpus.
+
+    This is the mirror function to `corpus2csc`.
+
     """
     def __init__(self, sparse, documents_columns=True):
         if documents_columns:

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
@@ -391,8 +391,8 @@ def inference(self, chunk, collect_sstats=False):
                 sstats[:, ids] += numpy.outer(expElogthetad.T, cts / phinorm)
 
         if len(chunk) > 1:
-            logger.info("%i/%i documents converged within %i iterations" %
-                         (converged, len(chunk), self.iterations))
+            logger.debug("%i/%i documents converged within %i iterations" %
+                (converged, len(chunk), self.iterations))
 
         if collect_sstats:
             # This step finishes computing the sufficient statistics for the
@@ -518,7 +518,7 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N
         logger.info("running %s LDA training, %s topics, %i passes over "
                     "the supplied corpus of %i documents, updating model once "
                     "every %i documents, evaluating perplexity every %i documents, "
-                    "iterating %i with a convergence threshold of %i" %
+                    "iterating %ix with a convergence threshold of %f" %
                     (updatetype, self.num_topics, passes, lencorpus,
                         updateafter, evalafter, iterations,
                         gamma_threshold))

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
@@ -805,18 +805,20 @@ def __iter__(self):
 
 
 class LineSentence(object):
+    """Simple format: one sentence = one line; words already preprocessed and separated by whitespace."""
     def __init__(self, source):
-        """Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
-
-        source can be either a string or a file object
+        """
+        `source` can be either a string or a file object.
 
-        Thus, one can use this for just plain files:
+        Example::
 
             sentences = LineSentence('myfile.txt')
 
-        Or for compressed files:
+        Or for compressed files::
+
+            sentences = LineSentence('compressed_text.txt.bz2')
+            sentences = LineSentence('compressed_text.txt.gz')
 
-            sentences = LineSentence(bz2.BZ2File('compressed_text.bz2'))
         """
         self.source = source
 

diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx
@@ -22,7 +22,7 @@ from scipy.linalg.blas import fblas
 REAL = np.float32
 ctypedef np.float32_t REAL_t
 
-DEF MAX_SENTENCE_LEN = 1000
+DEF MAX_SENTENCE_LEN = 10000
 
 ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil
 ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil

diff --git a/setup.py b/setup.py
@@ -23,11 +23,11 @@
 
 # Commonly used information
 pkg_name = 'gensim'
-pkg_ver = '0.10.0rc1'
+pkg_ver = '0.10.0'
 pkg_desc = 'Python framework for fast Vector Space Modelling'
 
 # there is a bug in python2.5, preventing distutils from using any non-ascii characters :( http://bugs.python.org/issue2562
-pkg_author = 'Radim Rehurek', # u'Radim Řehůřek', # <- should really be this...
+pkg_author = 'Radim Rehurek' # u'Radim Řehůřek', # <- should really be this...
 pkg_author_email = 'radimrehurek@seznam.cz'
 pkg_url = 'http://radimrehurek.com/gensim'
 pkg_download_url = 'http://pypi.python.org/pypi/gensim'