From ff0c66e9012295fac785a2289ab318e057e72542 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 18 May 2014 22:06:22 +0200 Subject: [PATCH 1/8] add missing cython files to MANIFEST.in --- MANIFEST.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index d514fa4993..5f682d46b4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -7,4 +7,6 @@ include CHANGELOG.txt include COPYING include COPYING.LESSER include ez_setup.py +include gensim/models/voidptr.h include gensim/models/word2vec_inner.pyx +include gensim_addons/models/word2vec_inner.pyx From 1f297e20214f08ff651863110874e776356da880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 18 May 2014 22:10:20 +0200 Subject: [PATCH 2/8] re #197: fix typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 36acee42d8..7945b6038d 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ pkg_desc = 'Python framework for fast Vector Space Modelling' # there is a bug in python2.5, preventing distutils from using any non-ascii characters :( http://bugs.python.org/issue2562 -pkg_author = 'Radim Rehurek', # u'Radim Řehůřek', # <- should really be this... +pkg_author = 'Radim Rehurek' # u'Radim Řehůřek', # <- should really be this... pkg_author_email = 'radimrehurek@seznam.cz' pkg_url = 'http://radimrehurek.com/gensim' pkg_download_url = 'http://pypi.python.org/pypi/gensim' From be148f298fd43d973bb620ec65d90fcdb5680f0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Wed, 21 May 2014 10:26:35 +0200 Subject: [PATCH 3/8] use miniconda in travis CI --- .travis.yml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index f70a39267e..6c205e8e74 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,10 +5,15 @@ python: - "3.3" - "3.4" before_install: - - sudo apt-get update -qq - - sudo apt-get install -qq libatlas-dev liblapack-dev gfortran - - travis_wait pip install --quiet numpy - - travis_wait pip install --quiet scipy + - wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh + - chmod +x miniconda.sh + - ./miniconda.sh -b + - export PATH=/home/travis/miniconda/bin:$PATH + - conda update --yes conda + # The next couple lines fix a crash with multiprocessing on Travis and are not specific to using Miniconda + - sudo rm -rf /dev/shm + - sudo ln -s /run/shm /dev/shm install: - - python setup.py install + - conda install --yes python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy + - python setup.py install script: python setup.py test From 1d2d19b0bea40d5b68506c40520307014c99f152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Wed, 21 May 2014 10:47:45 +0200 Subject: [PATCH 4/8] create conda env in travis CI --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6c205e8e74..4490d19536 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ before_install: - sudo rm -rf /dev/shm - sudo ln -s /run/shm /dev/shm install: - - conda install --yes python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy + - conda create --yes -n gensim-test python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy + - source activate gensim-test - python setup.py install script: python setup.py test From 04365588788d29518928df4568e75642086ccad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Mon, 26 May 2014 16:56:56 +0200 Subject: [PATCH 5/8] improve matutils docs --- gensim/matutils.py | 48 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/gensim/matutils.py b/gensim/matutils.py index 584f59adc9..961c9b9269 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -70,11 +70,16 @@ def argsort(x, topn=None): def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_nnz=None, printprogress=0): """ - Convert corpus into a sparse matrix, in scipy.sparse.csc_matrix format, + Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format, with documents as columns. If the number of terms, documents and non-zero elements is known, you can pass them here as parameters and a more memory efficient code path will be taken. + + The input corpus may be a non-repeatable stream (generator). + + This is the mirror function to `Sparse2Corpus`. + """ try: # if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes @@ -151,7 +156,7 @@ def ismatrix(m): def any2sparse(vec, eps=1e-9): - """Convert a numpy/scipy vector into gensim format (list of 2-tuples).""" + """Convert a numpy/scipy vector into gensim document format (=list of 2-tuples).""" if isinstance(vec, numpy.ndarray): return dense2vec(vec, eps) if scipy.sparse.issparse(vec): @@ -160,15 +165,25 @@ def any2sparse(vec, eps=1e-9): def scipy2sparse(vec, eps=1e-9): - """Convert a scipy.sparse vector to gensim format (list of 2-tuples).""" + """Convert a scipy.sparse vector into gensim document format (=list of 2-tuples).""" vec = vec.tocsr() assert vec.shape[0] == 1 return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if numpy.abs(val) > eps] class Scipy2Corpus(object): + """ + Convert a sequence of dense/sparse vectors into a streamed gensim corpus object. + + This is the mirror function to `corpus2csc`. + + """ def __init__(self, vecs): - """Convert a sequence of dense/sparse vector to a gensim corpus object.""" + """ + `vecs` is a sequence of dense and/or sparse vectors, such as a 2d numpy array, + or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d numpy/scipy vectors. + + """ self.vecs = vecs def __iter__(self): @@ -184,8 +199,11 @@ def __len__(self): def sparse2full(doc, length): """ - Convert a document in sparse corpus format (sequence of 2-tuples) into a dense + Convert a document in sparse document format (=sequence of 2-tuples) into a dense numpy array (of size `length`). + + This is the mirror function to `full2sparse`. + """ result = numpy.zeros(length, dtype=numpy.float32) # fill with zeroes (default value) doc = dict(doc) @@ -196,9 +214,12 @@ def sparse2full(doc, length): def full2sparse(vec, eps=1e-9): """ - Convert a dense numpy array into the sparse corpus format (sequence of 2-tuples). + Convert a dense numpy array into the sparse document format (sequence of 2-tuples). Values of magnitude < `eps` are treated as zero (ignored). + + This is the mirror function to `sparse2full`. + """ vec = numpy.asarray(vec, dtype=float) nnz = numpy.nonzero(abs(vec) > eps)[0] @@ -209,7 +230,8 @@ def full2sparse(vec, eps=1e-9): def full2sparse_clipped(vec, topn, eps=1e-9): """ - Like `full2sparse`, but only return the `topn` greatest elements (not all). + Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs). + """ # use numpy.argsort and only form tuples that are actually returned. # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on. @@ -225,10 +247,12 @@ def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32): """ Convert corpus into a dense numpy array (documents will be columns). You must supply the number of features `num_terms`, because dimensionality - cannot be deduced from sparse vectors alone. + cannot be deduced from the sparse vectors alone. You can optionally supply `num_docs` (=the corpus length) as well, so that - a more memory efficient code path is taken. + a more memory-efficient code path is taken. + + This is the mirror function to `Dense2Corpus`. """ if num_docs is not None: @@ -249,6 +273,9 @@ class Dense2Corpus(object): No data copy is made (changes to the underlying matrix imply changes in the corpus). + + This is the mirror function to `corpus2dense`. + """ def __init__(self, dense, documents_columns=True): if documents_columns: @@ -268,6 +295,9 @@ def __len__(self): class Sparse2Corpus(object): """ Convert a matrix in scipy.sparse format into a streaming gensim corpus. + + This is the mirror function to `corpus2csc`. + """ def __init__(self, sparse, documents_columns=True): if documents_columns: From 6c8f478811cad9ea4138de3594bf355b610eb5f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 30 May 2014 19:00:21 +0200 Subject: [PATCH 6/8] increase max word2vec sentence length to 10k * was: max 1k tokens per sentence (rest is ignored) --- gensim/models/word2vec_inner.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index b0f8c86e23..a7266a577e 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -22,7 +22,7 @@ from scipy.linalg.blas import fblas REAL = np.float32 ctypedef np.float32_t REAL_t -DEF MAX_SENTENCE_LEN = 1000 +DEF MAX_SENTENCE_LEN = 10000 ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil From 668d701e465f7a1f617172be6cea9a16d760a715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 30 May 2014 19:01:16 +0200 Subject: [PATCH 7/8] improve docs --- docs/src/gensim_theme/layout.html | 2 +- gensim/models/ldamodel.py | 6 +++--- gensim/models/word2vec.py | 14 ++++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/src/gensim_theme/layout.html b/docs/src/gensim_theme/layout.html index cedf9d1905..ac9900ad21 100644 --- a/docs/src/gensim_theme/layout.html +++ b/docs/src/gensim_theme/layout.html @@ -80,7 +80,7 @@

Get Expert Help

• machine learning, NLP, data mining

-

• custom system design, development, optimizations

+

• custom SW design, development, optimizations

• tech trainings & IT consulting

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index f8f9fe662c..04c3ac3adc 100644 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -391,8 +391,8 @@ def inference(self, chunk, collect_sstats=False): sstats[:, ids] += numpy.outer(expElogthetad.T, cts / phinorm) if len(chunk) > 1: - logger.info("%i/%i documents converged within %i iterations" % - (converged, len(chunk), self.iterations)) + logger.debug("%i/%i documents converged within %i iterations" % + (converged, len(chunk), self.iterations)) if collect_sstats: # This step finishes computing the sufficient statistics for the @@ -518,7 +518,7 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N logger.info("running %s LDA training, %s topics, %i passes over " "the supplied corpus of %i documents, updating model once " "every %i documents, evaluating perplexity every %i documents, " - "iterating %i with a convergence threshold of %i" % + "iterating %ix with a convergence threshold of %f" % (updatetype, self.num_topics, passes, lencorpus, updateafter, evalafter, iterations, gamma_threshold)) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 35b1270799..f35c7957be 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -805,18 +805,20 @@ def __iter__(self): class LineSentence(object): + """Simple format: one sentence = one line; words already preprocessed and separated by whitespace.""" def __init__(self, source): - """Simple format: one sentence = one line; words already preprocessed and separated by whitespace. - - source can be either a string or a file object + """ + `source` can be either a string or a file object. - Thus, one can use this for just plain files: + Example:: sentences = LineSentence('myfile.txt') - Or for compressed files: + Or for compressed files:: + + sentences = LineSentence('compressed_text.txt.bz2') + sentences = LineSentence('compressed_text.txt.gz') - sentences = LineSentence(bz2.BZ2File('compressed_text.bz2')) """ self.source = source From 9b67a4a0117fd3220f32be116646c1160a02790e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Wed, 4 Jun 2014 15:25:16 +0200 Subject: [PATCH 8/8] bump version: 0.10.0 --- docs/src/conf.py | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index 87207929b9..3d5400367e 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -52,9 +52,9 @@ # built documents. # # The short X.Y version. -version = '0.10.0rc1' +version = '0.10.0' # The full version, including alpha/beta/rc tags. -release = '0.10.0rc1' +release = '0.10.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index 7945b6038d..3d5ced683b 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ # Commonly used information pkg_name = 'gensim' -pkg_ver = '0.10.0rc1' +pkg_ver = '0.10.0' pkg_desc = 'Python framework for fast Vector Space Modelling' # there is a bug in python2.5, preventing distutils from using any non-ascii characters :( http://bugs.python.org/issue2562