Skip to content

Commit

Permalink
Merge branch 'release-0.10.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
piskvorky committed Jun 4, 2014
2 parents d60b96f + 9b67a4a commit 65d4656
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 29 deletions.
16 changes: 11 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,16 @@ python:
- "3.3"
- "3.4"
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libatlas-dev liblapack-dev gfortran
- travis_wait pip install --quiet numpy
- travis_wait pip install --quiet scipy
- wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh
- chmod +x miniconda.sh
- ./miniconda.sh -b
- export PATH=/home/travis/miniconda/bin:$PATH
- conda update --yes conda
# The next couple of lines fix a crash with multiprocessing on Travis and are not specific to using Miniconda
- sudo rm -rf /dev/shm
- sudo ln -s /run/shm /dev/shm
install:
- python setup.py install
- conda create --yes -n gensim-test python=$TRAVIS_PYTHON_VERSION pip atlas numpy scipy
- source activate gensim-test
- python setup.py install
script: python setup.py test
2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ include CHANGELOG.txt
include COPYING
include COPYING.LESSER
include ez_setup.py
include gensim/models/voidptr.h
include gensim/models/word2vec_inner.pyx
include gensim_addons/models/word2vec_inner.pyx
4 changes: 2 additions & 2 deletions docs/src/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@
# built documents.
#
# The short X.Y version.
version = '0.10.0rc1'
version = '0.10.0'
# The full version, including alpha/beta/rc tags.
release = '0.10.0rc1'
release = '0.10.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion docs/src/gensim_theme/layout.html
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ <h1 class="h1gensim">
<div class="consulting-banner">
<h3><a href="http://radimrehurek.com/">Get Expert Help</a></h3>
<p>• machine learning, NLP, data mining</p>
<p>• custom system design, development, optimizations</p>
<p>• custom SW design, development, optimizations</p>
<p>• tech trainings &amp; IT consulting</p>
</div>
</div>
Expand Down
48 changes: 39 additions & 9 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,16 @@ def argsort(x, topn=None):

def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_nnz=None, printprogress=0):
"""
Convert corpus into a sparse matrix, in scipy.sparse.csc_matrix format,
Convert a streamed corpus into a sparse matrix, in scipy.sparse.csc_matrix format,
with documents as columns.
If the number of terms, documents and non-zero elements is known, you can pass
them here as parameters and a more memory efficient code path will be taken.
The input corpus may be a non-repeatable stream (generator).
This is the mirror function to `Sparse2Corpus`.
"""
try:
# if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes
Expand Down Expand Up @@ -151,7 +156,7 @@ def ismatrix(m):


def any2sparse(vec, eps=1e-9):
"""Convert a numpy/scipy vector into gensim format (list of 2-tuples)."""
"""Convert a numpy/scipy vector into gensim document format (=list of 2-tuples)."""
if isinstance(vec, numpy.ndarray):
return dense2vec(vec, eps)
if scipy.sparse.issparse(vec):
Expand All @@ -160,15 +165,25 @@ def any2sparse(vec, eps=1e-9):


def scipy2sparse(vec, eps=1e-9):
"""Convert a scipy.sparse vector to gensim format (list of 2-tuples)."""
"""Convert a scipy.sparse vector into gensim document format (=list of 2-tuples)."""
vec = vec.tocsr()
assert vec.shape[0] == 1
return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if numpy.abs(val) > eps]


class Scipy2Corpus(object):
"""
Convert a sequence of dense/sparse vectors into a streamed gensim corpus object.
This is the mirror function to `corpus2csc`.
"""
def __init__(self, vecs):
"""Convert a sequence of dense/sparse vector to a gensim corpus object."""
"""
`vecs` is a sequence of dense and/or sparse vectors, such as a 2d numpy array,
or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d numpy/scipy vectors.
"""
self.vecs = vecs

def __iter__(self):
Expand All @@ -184,8 +199,11 @@ def __len__(self):

def sparse2full(doc, length):
"""
Convert a document in sparse corpus format (sequence of 2-tuples) into a dense
Convert a document in sparse document format (=sequence of 2-tuples) into a dense
numpy array (of size `length`).
This is the mirror function to `full2sparse`.
"""
result = numpy.zeros(length, dtype=numpy.float32) # fill with zeroes (default value)
doc = dict(doc)
Expand All @@ -196,9 +214,12 @@ def sparse2full(doc, length):

def full2sparse(vec, eps=1e-9):
"""
Convert a dense numpy array into the sparse corpus format (sequence of 2-tuples).
Convert a dense numpy array into the sparse document format (sequence of 2-tuples).
Values of magnitude <= `eps` are treated as zero (ignored).
This is the mirror function to `sparse2full`.
"""
vec = numpy.asarray(vec, dtype=float)
nnz = numpy.nonzero(abs(vec) > eps)[0]
Expand All @@ -209,7 +230,8 @@ def full2sparse(vec, eps=1e-9):

def full2sparse_clipped(vec, topn, eps=1e-9):
"""
Like `full2sparse`, but only return the `topn` greatest elements (not all).
Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs).
"""
# use numpy.argsort and only form tuples that are actually returned.
# this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on.
Expand All @@ -225,10 +247,12 @@ def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32):
"""
Convert corpus into a dense numpy array (documents will be columns). You
must supply the number of features `num_terms`, because dimensionality
cannot be deduced from sparse vectors alone.
cannot be deduced from the sparse vectors alone.
You can optionally supply `num_docs` (=the corpus length) as well, so that
a more memory efficient code path is taken.
a more memory-efficient code path is taken.
This is the mirror function to `Dense2Corpus`.
"""
if num_docs is not None:
Expand All @@ -249,6 +273,9 @@ class Dense2Corpus(object):
No data copy is made (changes to the underlying matrix imply changes in the
corpus).
This is the mirror function to `corpus2dense`.
"""
def __init__(self, dense, documents_columns=True):
if documents_columns:
Expand All @@ -268,6 +295,9 @@ def __len__(self):
class Sparse2Corpus(object):
"""
Convert a matrix in scipy.sparse format into a streaming gensim corpus.
This is the mirror function to `corpus2csc`.
"""
def __init__(self, sparse, documents_columns=True):
if documents_columns:
Expand Down
6 changes: 3 additions & 3 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,8 +391,8 @@ def inference(self, chunk, collect_sstats=False):
sstats[:, ids] += numpy.outer(expElogthetad.T, cts / phinorm)

if len(chunk) > 1:
logger.info("%i/%i documents converged within %i iterations" %
(converged, len(chunk), self.iterations))
logger.debug("%i/%i documents converged within %i iterations" %
(converged, len(chunk), self.iterations))

if collect_sstats:
# This step finishes computing the sufficient statistics for the
Expand Down Expand Up @@ -518,7 +518,7 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N
logger.info("running %s LDA training, %s topics, %i passes over "
"the supplied corpus of %i documents, updating model once "
"every %i documents, evaluating perplexity every %i documents, "
"iterating %i with a convergence threshold of %i" %
"iterating %ix with a convergence threshold of %f" %
(updatetype, self.num_topics, passes, lencorpus,
updateafter, evalafter, iterations,
gamma_threshold))
Expand Down
14 changes: 8 additions & 6 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,18 +805,20 @@ def __iter__(self):


class LineSentence(object):
"""Simple format: one sentence = one line; words already preprocessed and separated by whitespace."""
def __init__(self, source):
"""Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
source can be either a string or a file object
"""
`source` can be either a string or a file object.
Thus, one can use this for just plain files:
Example::
sentences = LineSentence('myfile.txt')
Or for compressed files:
Or for compressed files::
sentences = LineSentence('compressed_text.txt.bz2')
sentences = LineSentence('compressed_text.txt.gz')
sentences = LineSentence(bz2.BZ2File('compressed_text.bz2'))
"""
self.source = source

Expand Down
2 changes: 1 addition & 1 deletion gensim/models/word2vec_inner.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ from scipy.linalg.blas import fblas
REAL = np.float32
ctypedef np.float32_t REAL_t

DEF MAX_SENTENCE_LEN = 1000
DEF MAX_SENTENCE_LEN = 10000

ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil
ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@

# Commonly used information
pkg_name = 'gensim'
pkg_ver = '0.10.0rc1'
pkg_ver = '0.10.0'
pkg_desc = 'Python framework for fast Vector Space Modelling'

# there is a bug in python2.5, preventing distutils from using any non-ascii characters :( http://bugs.python.org/issue2562
pkg_author = 'Radim Rehurek', # u'Radim Řehůřek', # <- should really be this...
pkg_author = 'Radim Rehurek' # u'Radim Řehůřek', # <- should really be this...
pkg_author_email = 'radimrehurek@seznam.cz'
pkg_url = 'http://radimrehurek.com/gensim'
pkg_download_url = 'http://pypi.python.org/pypi/gensim'
Expand Down

0 comments on commit 65d4656

Please sign in to comment.