diff --git a/.gitignore b/.gitignore index 3bca94ef00..c26502811d 100644 --- a/.gitignore +++ b/.gitignore @@ -56,4 +56,8 @@ data *.bak /build/ /dist/ - +*.prof +*.lprof +*.bin +*.old +*.model diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 047551c6d4..0474a210a9 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,6 +1,16 @@ Changes ======= +0.8.8 (aka "word2vec release") + +* python3 port by Parikshit Samant: https://github.com/samantp/gensimPy3 +* massive optimizations to word2vec (cython, BLAS, multithreading): ~20x-300x speedup +* new word2vec functionality (thx to Ghassen Hamrouni, PR #124) +* new CSV corpus class (thx to Zygmunt Zając) +* corpus serialization checks to prevent overwriting (by Ian Langmore, PR #125) +* add context manager support for older Python<=2.6 for gzip and bz2 +* added unittests for word2vec + 0.8.7 * initial version of word2vec, a neural network deep learning algo @@ -13,7 +23,7 @@ Changes * save/load directly from bzip2 files (by Luis Pedro Coelho, PR #101) * Blei corpus now tries harder to find its vocabulary file (by Luis Pedro Coelho, PR #100) * sparse vector elements can now be a list (was: only a 2-tuple) -* simple_preprocess now optionally de-accents letters (ř/š/ú etc.) +* simple_preprocess now optionally deaccents letters (ř/š/ú=>r/s/u etc.) * better serialization of numpy corpora * print_topics() returns the topics, in addition to printing/logging * fixes for more robust Windows multiprocessing diff --git a/MANIFEST.in b/MANIFEST.in index 459601806f..d514fa4993 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -7,3 +7,4 @@ include CHANGELOG.txt include COPYING include COPYING.LESSER include ez_setup.py +include gensim/models/word2vec_inner.pyx diff --git a/README.rst b/README.rst index 18d3be1f27..4bbbe7cc1f 100644 --- a/README.rst +++ b/README.rst @@ -6,6 +6,7 @@ gensim -- Python Framework for Topic Modelling Gensim is a Python library for *topic modelling*, *document indexing* and *similarity retrieval* with large corpora. Target audience is the *natural language processing* (NLP) and *information retrieval* (IR) community. +For a Python3 port of gensim by Parikshit Samant, visit `this fork `_. Features --------- @@ -59,6 +60,5 @@ It is also included in the source distribution package. ---------------- -Gensim is open source software, and has been released under the -`GNU LGPL license `_. +Gensim is open source software released under the `GNU LGPL license `_. Copyright (c) 2009-2013 Radim Rehurek diff --git a/docs/src/Makefile b/docs/src/Makefile index 29bbc3559b..6edac0c761 100644 --- a/docs/src/Makefile +++ b/docs/src/Makefile @@ -33,7 +33,7 @@ clean: html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html rm -r $(BUILDDIR)/html/_sources -# cp -r $(BUILDDIR)/html/* ../ + cp -r $(BUILDDIR)/html/* ../ @echo @echo "Build finished. The HTML pages are in ../" diff --git a/docs/src/_templates/indexcontent.html b/docs/src/_templates/indexcontent.html index a967020f2e..d3d7d1c160 100644 --- a/docs/src/_templates/indexcontent.html +++ b/docs/src/_templates/indexcontent.html @@ -124,15 +124,15 @@

Who is using Gensim?
Doing something
+
+  “Gensim hits the sweetest spot of being a simple yet powerful way to access some incredibly complex NLP goodness.” Alan J. Salmoni, Roistr.com
+
   “I used gensim at Ghent university. I found it easy to build prototypes with various models, extend it with additional features and gain empirical insights quickly. It's a reliable library that can be used beyond prototyping too.” Dieter Plaetinck, IBCN group
   “We used gensim in several text mining projects at Sports Authority. The data were from free-form text fields in customer surveys, as well as social media sources. Having gensim significantly sped our time to development, and it is still my go-to package for topic modeling with large retail data sets.” Josh Hemann, Sports Authority
-
-  “Gensim hits the sweetest spot of being a simple yet powerful way to access some incredibly complex NLP goodness.” Alan J. Salmoni, Roistr.com
-
  • “Semantic analysis is a hot topic in online marketing, but there are few products on the market that are truly powerful. Gensim is undoubtedly one of the best frameworks that efficiently implement algorithms for statistical analysis. diff --git a/docs/src/conf.py b/docs/src/conf.py index 0bce0ac24c..bc9aaedfdf 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -52,9 +52,9 @@ # built documents. # # The short X.Y version. -version = '0.8.7' +version = '0.8.8' # The full version, including alpha/beta/rc tags. -release = '0.8.7' +release = '0.8.8' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/src/gensim_theme/layout.html b/docs/src/gensim_theme/layout.html index 6ffdb20613..364c5a787d 100644 --- a/docs/src/gensim_theme/layout.html +++ b/docs/src/gensim_theme/layout.html @@ -128,7 +128,7 @@

    gensim footer image
-    Join the Google discussion group
+    Join the gensim discussion group
    diff --git a/docs/src/models/word2vec.rst b/docs/src/models/word2vec.rst new file mode 100644 index 0000000000..f0bdaa59d2 --- /dev/null +++ b/docs/src/models/word2vec.rst @@ -0,0 +1,8 @@ +:mod:`models.word2vec` -- Deep learning with word2vec +====================================================== + +.. automodule:: gensim.models.word2vec + :synopsis: Deep learning with word2vec + :members: + :inherited-members: + diff --git a/docs/src/support.rst b/docs/src/support.rst index 640c639537..21dee93837 100644 --- a/docs/src/support.rst +++ b/docs/src/support.rst @@ -18,13 +18,13 @@ When posting on the mailing list, please try to include all relevant information You can also try asking on StackOverflow, using the `gensim tag `_. -Developer issues ----------------- +Business support +------------------ -Developers who `tweak gensim internals `_ are encouraged to report issues at the `GitHub issue tracker `_. -Note that this is not a medium for discussions or asking open-ended questions; please use the mailing list for that. +In case you need SLA-based support, design validation, training or custom development, `contact me `_ for a commercial quote. -Commercial support +Developer support ------------------ -In case you need deterministic response times or have extra support/development needs, `contact me `_ for a quote on commercial support and contracting. +Developers who `tweak gensim internals `_ are encouraged to report issues at the `GitHub issue tracker `_. +Note that this is not a medium for discussions or asking open-ended questions; please use the mailing list for that. diff --git a/docs/src/wiki.rst b/docs/src/wiki.rst index 3bb6aa097f..ddc4b399ef 100644 --- a/docs/src/wiki.rst +++ b/docs/src/wiki.rst @@ -33,7 +33,7 @@ Preparing the corpus I recommend compressing these files immediately, e.g. with bzip2 (down to ~13GB). Gensim can work with compressed files directly, so this lets you save disk space. -Latent Sematic Analysis +Latent Semantic Analysis -------------------------- First let's load the corpus iterator and dictionary, created in the second step above:: diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py new file mode 100644 index 0000000000..52b25abf5f --- /dev/null +++ b/gensim/corpora/csvcorpus.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2013 Zygmunt Zając +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Corpus in CSV format. + +""" + + +from __future__ import with_statement + +import logging +import csv +import itertools + +from gensim import interfaces + +logger = logging.getLogger('gensim.corpora.csvcorpus') + + +class CsvCorpus(interfaces.CorpusABC): + """ + Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically + based on the file content. + + All row values are expected to be ints/floats. + + """ + + def __init__(self, fname, labels): + """ + Initialize the corpus from a file. + `labels` = are class labels present in the input file? 
=> skip the first column + + """ + logger.info("loading corpus from %s" % fname) + self.fname = fname + self.length = None + self.labels = labels + + # load the first few lines, to guess the CSV dialect + head = ''.join(itertools.islice(open(self.fname), 5)) + self.headers = csv.Sniffer().has_header(head) + self.dialect = csv.Sniffer().sniff(head) + logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers)) + + + def __iter__(self): + """ + Iterate over the corpus, returning one sparse vector at a time. + + """ + reader = csv.reader(open(self.fname), self.dialect) + if self.headers: + reader.next() # skip the headers + + line_no = -1 + for line_no, line in enumerate(reader): + if self.labels: + line.pop(0) # ignore the first column = class label + yield list(enumerate(map(float, line))) + + self.length = line_no + 1 # store the total number of CSV rows = documents diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index a9aaac71ee..51e57975fa 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -73,6 +73,9 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progres >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access >>> print mm[42] # retrieve document no. 42, etc. """ + if getattr(corpus, 'fname', None) == fname: + raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname) + if index_fname is None: index_fname = fname + '.index' diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 4ea81d9bfd..ab4fc47b4e 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -289,7 +289,7 @@ def get_texts(self): for tokens in pool.imap(process_article, group): # chunksize=10): articles_all += 1 positions_all += len(tokens) - if len(tokens) > ARTICLE_MIN_WORDS: # article redirects are pruned here + if len(tokens) > ARTICLE_MIN_WORDS: # article redirects and short stubs are pruned here articles += 1 positions += len(tokens) yield tokens diff --git a/gensim/matutils.py b/gensim/matutils.py index 03947b4c18..937505709e 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -135,6 +135,14 @@ def pad(mat, padrow, padcol): [numpy.matrix(numpy.zeros((padrow, cols + padcol)))]]) +def zeros_aligned(shape, dtype, order='C', align=128): + """Like `numpy.zeros()`, but the array will be aligned at `align` byte boundary.""" + nbytes = numpy.prod(shape) * numpy.dtype(dtype).itemsize + buffer = numpy.zeros(nbytes + align, dtype=numpy.uint8) + start_index = -buffer.ctypes.data % align + return buffer[start_index : start_index + nbytes].view(dtype).reshape(shape, order=order) + + def ismatrix(m): return isinstance(m, numpy.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index d5da6d449d..90ca39c81f 100644 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -175,6 +175,9 @@ class LdaModel(interfaces.TransformationABC): def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5): """ + If given, start training from the iterable `corpus` straight away. If not given, + the model is left untrained (presumably because you want to call `update()` manually). + `num_topics` is the number of requested latent topics to be extracted from the training corpus. 
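The deferred-training behaviour documented in the LdaModel docstring above can be exercised roughly like this (a minimal sketch in the docstring's doctest style; `dictionary`, `my_corpus` and `more_documents` are hypothetical stand-ins for your own id2word mapping and bag-of-words document streams):

    >>> from gensim.models.ldamodel import LdaModel
    >>> lda = LdaModel(num_topics=10, id2word=dictionary)  # no corpus given => model is left untrained
    >>> lda.update(my_corpus)        # train later; must be a repeatable iterable, not a generator
    >>> lda.update(more_documents)   # update() may be called again as new documents arrive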
@@ -378,7 +381,7 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations - is reached). + is reached). `corpus` must be an iterable (repeatable stream of documents), In distributed mode, the E step is distributed over a cluster of machines. @@ -412,6 +415,7 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N if lencorpus == 0: logger.warning("LdaModel.update() called with an empty corpus") return + self.state.numdocs += lencorpus if update_every > 0: @@ -438,7 +442,9 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N other = LdaState(self.eta, self.state.sstats.shape) dirty = False + reallen = 0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=True)): + reallen += len(chunk) # keep track of how many documents we've processed so far if self.dispatcher: # add the chunk to dispatcher's job queue, so workers can munch on it logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' % @@ -468,6 +474,8 @@ def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=N other = LdaState(self.eta, self.state.sstats.shape) dirty = False #endfor single corpus iteration + if reallen != lencorpus: + raise RuntimeError("input corpus size changed during training (don't use generators as input)") if dirty: # finish any remaining updates @@ -541,7 +549,7 @@ def bound(self, corpus, gamma=None): def print_topics(self, topics=10, topn=10): - return self.show_topics(topics, topn, True) + return self.show_topics(topics, topn, log=True) def show_topics(self, topics=10, topn=10, log=False, formatted=True): """ @@ -551,6 +559,9 @@ def show_topics(self, topics=10, topn=10, log=False, formatted=True): Unlike LSA, there is no ordering between the topics in LDA. The printed `topics <= self.num_topics` subset of all topics is therefore arbitrary and may change between two runs. + + Set `formatted=True` to return the topics as a list of strings, or `False` as lists of (weight, word) pairs. + """ if topics < 0: # print all topics if `topics` is negative diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index d011cf78d9..10ae47a425 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -7,24 +7,44 @@ """ Module for deep learning via *hierarchical softmax skip-gram* from [1]_. -The algorithm is ported from the C package https://code.google.com/p/word2vec/ . +The training algorithm was originally ported from the C package https://code.google.com/p/word2vec/ +and extended with additional functionality. + +**Install Cython with `pip install cython` before to use optimized word2vec training** (70x speedup [2]_). Initialize a model with e.g.:: ->>> model = Word2Vec(sentences, size=100, window=5, min_count=5) +>>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) -Store/load a model with:: +Persist a model to disk with:: >>> model.save(fname) ->>> model = Word2Vec.load(fname) +>>> model = Word2Vec.load(fname) # you can continue training with the loaded model! 
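As the comment above notes, `save()`/`load()` preserve the full training state, so training can be resumed on a loaded model (a sketch; `more_sentences` is a hypothetical iterable of tokenized sentences):

    >>> from gensim.models.word2vec import Word2Vec, FAST_VERSION
    >>> FAST_VERSION                 # -1 means the Cython routine did not compile and the slow NumPy path is used
    >>> model = Word2Vec.load(fname)
    >>> model.train(more_sentences)  # resumes training, updating the loaded weights in place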
-The model can also be instantiated from an existing, trained file on disk in word2vec format:: +The model can also be instantiated from an existing file on disk in the word2vec C format:: ->>> model = Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False) # text format ->>> model = Word2Vec.load_word2vec_format('/tmp/vectors.bin', binary=True) # binary format + >>> model = Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format + >>> model = Word2Vec.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format -.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. +You can perform various syntactic/semantic NLP word tasks with the model. Some of them +are already built-in:: + + >>> model.most_similar(positive=['woman', 'king'], negative=['man']) + [('queen', 0.50882536), ...] + + >>> model.doesnt_match("breakfast cereal dinner lunch".split()) + 'cereal' + + >>> model.similarity('woman', 'man') + 0.73723527 + + >>> model['computer'] # raw numpy vector of a word + array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) + +and so on. +.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. +.. [2] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ """ import logging @@ -32,15 +52,61 @@ import os import heapq import time +import itertools +import threading +from multiprocessing.pool import ThreadPool +from Queue import Queue + +from numpy import zeros_like, empty, exp, dot, outer, random, dtype, get_include,\ + float32 as REAL, uint32, seterr, array, uint8, vstack, argsort, fromstring + +logger = logging.getLogger("gensim.models.word2vec") -from numpy import zeros_like, empty, exp, dot, outer, random, dtype,\ - float32 as REAL, seterr, array, uint8, vstack, argsort, fromstring from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -logger = logging.getLogger(__name__) -MIN_ALPHA = 0.0001 # don't allow learning rate to drop below this threshold +try: + # try to compile and use the faster cython version + import pyximport + pyximport.install(setup_args={"include_dirs": get_include()}) + from word2vec_inner import train_sentence, FAST_VERSION +except: + # failed... fall back to plain numpy (20-80x slower training than the above) + FAST_VERSION = -1 + + def train_sentence(model, sentence, alpha, work=None): + """ + Update skip-gram hierarchical softmax model by training on a single sentence. + + The sentence is a list of Vocab objects (or None, where the corresponding + word is not in the vocabulary. Called internally from `Word2Vec.train()`. 
+ + """ + for pos, word in enumerate(sentence): + if word is None: + continue # OOV word in the input sentence => skip + reduced_window = random.randint(model.window) # `b` in the original word2vec code + + # now go over all words from the (reduced) window, predicting each one in turn + start = max(0, pos - model.window + reduced_window) + for pos2, word2 in enumerate(sentence[start : pos + model.window + 1 - reduced_window], start): + if pos2 == pos or word2 is None: + # don't train on OOV words and on the `word` itself + continue + + l1 = model.syn0[word2.index] + # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance) + l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size + fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T))) # propagate hidden -> output + ga = (1 - word.code - fa) * alpha # vector of error gradients multiplied by the learning rate + model.syn1[word.point] += outer(ga, l1) # learn hidden -> output + + # TODO add negative sampling? + + l1 += dot(ga, l2a) # learn input -> hidden + + return len([word for word in sentence if word is not None]) class Vocab(object): @@ -61,14 +127,15 @@ class Word2Vec(utils.SaveLoad): """ Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/ - The model can be stored/loaded via its `save()` and `load()` methods, or stored in a format - compatible with the original word2vec implementation via `save_word2vec_format()`. + The model can be stored/loaded via its `save()` and `load()` methods, or stored/loaded in a format + compatible with the original word2vec implementation via `save_word2vec_format()` and `load_word2vec_format()`. """ - def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, seed=1): + def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, seed=1, workers=1, min_alpha=0.0001): """ - Initialize a model from `sentences`. Each sentence is a list of words - (utf8 strings) that will be used for training. + Initialize the model from an iterable of `sentences`. Each sentence is a + list of words (utf8 strings) that will be used for training. + See :class:`BrownCorpus` in this module for an example. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. @@ -78,6 +145,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, `alpha` is the initial learning rate (will linearly drop to zero as training progresses). `seed` = for the random number generator. `min_count` = ignore all words with total frequency lower than this. 
+ `workers` = use this many worker threads to train the model (=faster training with multicore machines) """ self.vocab = {} # mapping from a word (string) to a Vocab object @@ -87,10 +155,11 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, self.window = int(window) self.seed = seed self.min_count = min_count + self.workers = workers + self.min_alpha = min_alpha if sentences is not None: self.build_vocab(sentences) - self.reset_weights() - self.train_model(sentences) + self.train(sentences) def create_binary_tree(self): @@ -119,15 +188,18 @@ def create_binary_tree(self): max_depth = max(len(codes), max_depth) else: # inner node => continue recursion - points = array(list(points) + [node.index - len(self.vocab)], dtype=int) + points = array(list(points) + [node.index - len(self.vocab)], dtype=uint32) stack.append((node.left, array(list(codes) + [0], dtype=uint8), points)) stack.append((node.right, array(list(codes) + [1], dtype=uint8), points)) logger.info("built huffman tree with maximum node depth %i" % max_depth) - def build_vocab(self, sentences): - """Build vocabulary from a sequence of sentences.""" + """ + Build vocabulary from a sequence of sentences (can be a once-only generator stream). + Each sentence must be a list of utf8 strings. + + """ logger.info("collecting all words and their counts") sentence_no, vocab = -1, {} total_words = lambda: sum(v.count for v in vocab.itervalues()) @@ -136,8 +208,10 @@ def build_vocab(self, sentences): logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" % (sentence_no, total_words(), len(vocab))) for word in sentence: - v = vocab.setdefault(word, Vocab()) - v.count += 1 + if word in vocab: + vocab[word].count += 1 + else: + vocab[word] = Vocab(count=1) logger.info("collected %i word types from a corpus of %i words and %i sentences" % (len(vocab), total_words(), sentence_no + 1)) @@ -152,76 +226,85 @@ def build_vocab(self, sentences): # add info about each word's Huffman encoding self.create_binary_tree() + self.reset_weights() - def train_sentence(self, words, alpha): + def train(self, sentences, total_words=None, word_count=0, chunksize=100): """ - Update skip-gram hierarchical softmax model by training on a single sentence, - where `sentence` is a list of Vocab objects (or None, where the corresponding - word is not in the vocabulary). Called internally from `train_model())`. + Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). + Each sentence must be a list of utf8 strings. 
""" - for pos, word in enumerate(words): - if word is None: - continue # OOV word in the input sentence => skip - reduced_window = random.randint(self.window) # `b` in the original word2vec code - - # now go over all words from the (reduced) window, predicting each one in turn - start = max(0, pos - self.window + reduced_window) - for pos2, word2 in enumerate(words[start : pos + self.window + 1 - reduced_window], start): - if pos2 == pos or word2 is None: - # don't train on OOV words and on the `word` itself - continue - l1 = self.syn0[word2.index] - # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance) - l2a = self.syn1[word.point] # 2d matrix, codelen x layer1_size - fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T))) # propagate hidden -> output - ga = (1 - word.code - fa) * alpha # vector of error gradients multiplied by the learning rate - self.syn1[word.point] += outer(ga, l1) # learn hidden -> output + if FAST_VERSION < 0: + import warnings + warnings.warn("Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`") + logger.info("training model with %i workers on %i vocabulary and %i features" % (self.workers, len(self.vocab), self.layer1_size)) - # TODO add negative sampling? + if not self.vocab: + raise RuntimeError("you must first build vocabulary before training the model") - l1 += dot(ga, l2a) # learn input -> hidden + start, next_report = time.time(), [1.0] + word_count, total_words = [word_count], total_words or sum(v.count for v in self.vocab.itervalues()) + jobs = Queue(maxsize=2 * self.workers) # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( + lock = threading.Lock() # for shared state (=number of words trained so far, log reports...) + def worker_train(): + """Train the model, lifting lists of sentences from the jobs queue.""" + work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # each thread must have its own work memory - def train_model(self, sentences, total_words=None): - """ - Train the model on a sequence of sentences, updating its existing neural weights. - Each sentence is a list of utf8 strings. - - """ - logger.info("training model with %i words and %i features" % (len(self.vocab), self.layer1_size)) - - # iterate over documents, training the model one sentence at a time - total_words = total_words or sum(v.count for v in self.vocab.itervalues()) - alpha = self.alpha - word_count, sentence_no, start = 0, -1, time.clock() - for sentence_no, sentence in enumerate(sentences): - if sentence_no % 100 == 0: - # decrease learning rate as the training progresses - alpha = max(MIN_ALPHA, self.alpha * (1 - 1.0 * word_count / total_words)) - - # print progress and training stats - elapsed = time.clock() - start - logger.info("PROGRESS: at sentence #%i, %.2f%% words, alpha %f, %.0f words per second" % - (sentence_no, 100.0 * word_count / total_words, alpha, word_count / elapsed if elapsed else 0.0)) - words = [self.vocab.get(word, None) for word in sentence] # replace OOV words with None - self.train_sentence(words, alpha=alpha) - word_count += len(filter(None, words)) # don't consider OOV words for the statistics - logger.info("training took %.1fs" % (time.clock() - start)) + while True: + job = jobs.get() + if job is None: # data finished, exit + break + # update the learning rate before every job + alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) + # how many words did we train on? 
out-of-vocabulary (unknown) words do not count + job_words = sum(train_sentence(self, sentence, alpha, work) for sentence in job) + with lock: + word_count[0] += job_words + elapsed = time.time() - start + if elapsed >= next_report[0]: + logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % + (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)) + next_report[0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports + + workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)] + for thread in workers: + thread.daemon = True # make interrupting the process with ctrl+c easier + thread.start() + + # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue + no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences) + for job_no, job in enumerate(utils.grouper(no_oov, chunksize)): + logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) + jobs.put(job) + logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) + for _ in xrange(self.workers): + jobs.put(None) # give the workers heads up that they can finish -- no more work! + + for thread in workers: + thread.join() + + elapsed = time.time() - start + logger.info("training on %i words took %.1fs, %.0f words/s" % + (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0)) + + return word_count[0] def reset_weights(self): - """Reset all projection weights, but keep the existing vocabulary.""" + """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" random.seed(self.seed) - self.syn0 = ((random.rand(len(self.vocab), self.layer1_size) - 0.5) / self.layer1_size).astype(dtype=REAL) - self.syn1 = zeros_like(self.syn0) + self.syn0 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL) + self.syn1 = matutils.zeros_aligned((len(self.vocab), self.layer1_size), dtype=REAL) + self.syn0 += (random.rand(len(self.vocab), self.layer1_size) - 0.5) / self.layer1_size + self.syn0norm = None def save_word2vec_format(self, fname, binary=False): """ Store the input-hidden weight matrix in the same format used by the original - word2vec-tool, for compatibility. + C word2vec-tool, for compatibility. """ logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.layer1_size, fname)) @@ -230,6 +313,7 @@ def save_word2vec_format(self, fname, binary=False): fout.write("%s %s\n" % self.syn0.shape) # store in sorted order: most frequent words at the top for word, vocab in sorted(self.vocab.iteritems(), key=lambda item: -item[1].count): + word = utils.to_utf8(word) # always store in utf8 row = self.syn0[vocab.index] if binary: fout.write("%s %s\n" % (word, row.tostring())) @@ -240,7 +324,7 @@ def save_word2vec_format(self, fname, binary=False): @classmethod def load_word2vec_format(cls, fname, binary=False): """ - Load the input-hidden weight matrix from the original word2vec-tool format. + Load the input-hidden weight matrix from the original C word2vec-tool format. Note that the information loaded is incomplete (the binary tree is missing), so while you can query for word similarity etc., you cannot continue training @@ -287,8 +371,8 @@ def most_similar(self, positive=[], negative=[], topn=10): similarity, negative words negatively. 
This method computes cosine similarity between a simple mean of the projection - weight vectors of the given words, and corresponds to the `word-analogy` - script in the original word2vec implementation. + weight vectors of the given words, and corresponds to the `word-analogy` and + `distance` scripts in the original word2vec implementation. Example:: @@ -298,19 +382,26 @@ def most_similar(self, positive=[], negative=[], topn=10): """ self.init_sims() + if isinstance(positive, basestring) and not negative: + # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) + positive = [positive] + # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words positive = [(word, 1.0) if isinstance(word, basestring) else word for word in positive] negative = [(word, -1.0) if isinstance(word, basestring) else word for word in negative] + + # compute the weighted average of all words all_words, mean = set(), [] for word, weight in positive + negative: if word in self.vocab: mean.append(weight * matutils.unitvec(self.syn0[self.vocab[word].index])) all_words.add(self.vocab[word].index) else: - logger.warning("word '%s' not in vocabulary; ignoring it" % word) + raise KeyError("word '%s' not in vocabulary" % word) if not mean: raise ValueError("cannot compute similarity with no input") mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) + dists = dot(self.syn0norm, mean) if not topn: return dists @@ -320,6 +411,59 @@ def most_similar(self, positive=[], negative=[], topn=10): return result[:topn] + def doesnt_match(self, words): + """ + Which word from the given list doesn't go with the others? + + Example:: + + >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split()) + 'cereal' + + """ + words = [word for word in words if word in self.vocab] # filter out OOV words + logger.debug("using words %s" % words) + if not words: + raise ValueError("cannot select a word from an empty list") + vectors = vstack(matutils.unitvec(self.syn0[self.vocab[word].index]) for word in words).astype(REAL) + mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) + dists = dot(vectors, mean) + return sorted(zip(dists, words))[0][1] + + + def __getitem__(self, word): + """ + Return a word's representations in vector space, as a 1D numpy array. + + Example:: + + >>> trained_model['woman'] + array([ -1.40128313e-02, ...] + + """ + return self.syn0[self.vocab[word].index] + + + def __contains__(self, word): + return word in self.vocab + + + def similarity(self, w1, w2): + """ + Compute cosine similarity between two words. + + Example:: + + >>> trained_model.similarity('woman', 'man') + 0.73723527 + + >>> trained_model.similarity('woman', 'woman') + 1.0 + + """ + return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2])) + + def init_sims(self): if getattr(self, 'syn0norm', None) is None: logger.info("precomputing L2-norms of word weight vectors") @@ -332,11 +476,11 @@ def accuracy(self, questions, restrict_vocab=30000): 4-tuples of words, split into sections by ": SECTION NAME" lines. See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example. - The accuracy is reported (=printed to log and returned as list) for each + The accuracy is reported (=printed to log and returned as a list) for each section separately, plus there's one aggregate summary at the end. 
Use `restrict_vocab` to ignore all questions containing a word whose frequency - is not in the top-N most frequent words (default top 30000). + is not in the top-N most frequent words (default top 30,000). This method corresponds to the `compute-accuracy` script of the original C word2vec. @@ -353,6 +497,7 @@ def log_accuracy(section): sections, section = [], None for line_no, line in enumerate(open(questions)): + # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed if line.startswith(': '): # a new section starts => store the old section if section: @@ -363,15 +508,16 @@ def log_accuracy(section): if not section: raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) try: - a, b, c, expected = [word.lower() for word in line.split()] # TODO assumes vocabulary uses lowercase, too... + a, b, c, expected = [word.lower() for word in line.split()] # TODO assumes vocabulary preprocessing uses lowercase, too... except: logger.info("skipping invalid line #%i in %s" % (line_no, questions)) if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: logger.debug("skipping line #%i with OOV words: %s" % (line_no, line)) continue - predicted, ignore = None, set(self.vocab[v].index for v in [a, b, c]) - # go over predicted words, starting from the most likely, but ignoring OOV words and input words + ignore = set(self.vocab[v].index for v in [a, b, c]) # indexes of words to ignore + predicted = None + # find the most likely prediction, ignoring OOV words and input words for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]: if index in ok_index and index not in ignore: predicted = self.index2word[index] @@ -390,9 +536,13 @@ def log_accuracy(section): return sections + def __str__(self): + return "Word2Vec(vocab=%s, size=%s, alpha=%s)" % (len(self.index2word), self.layer1_size, self.alpha) + + class BrownCorpus(object): - """Yield sentences from the Brown corpus (part of NLTK data).""" + """Iterate over sentences from the Brown corpus (part of NLTK data).""" def __init__(self, dirname): self.dirname = dirname @@ -413,7 +563,7 @@ def __iter__(self): class Text8Corpus(object): - """Yield sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip .""" + """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip .""" def __init__(self, fname): self.fname = fname @@ -437,23 +587,44 @@ def __iter__(self): sentence = sentence[max_sentence_length:] +class LineSentence(object): + def __init__(self, fname): + """Simple format: one sentence = one line; words already preprocessed and separated by whitespace.""" + self.fname = fname + + def __iter__(self): + for line in open(self.fname): + yield line.split() + + +# Example: ./word2vec.py ~/workspace/word2vec/text8 ~/workspace/word2vec/questions-words.txt ./text8 if __name__ == "__main__": - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) logging.info("running %s" % " ".join(sys.argv)) + logging.info("using optimization %s" % FAST_VERSION) # check and process cmdline input program = os.path.basename(sys.argv[0]) - if len(sys.argv) < 3: + if len(sys.argv) < 2: print globals()['__doc__'] % locals() sys.exit(1) - infile, outfile = sys.argv[1:3] + infile = sys.argv[1] + from gensim.models.word2vec import Word2Vec # avoid referencing 
__main__ in pickle seterr(all='raise') # don't ignore numpy errors - w = Word2Vec(BrownCorpus(infile), size=20, min_count=5) - w.save(outfile + '.model') - w.save_word2vec_format(outfile + '.model.bin', binary=True) - w.save_word2vec_format(outfile + '.model.txt', binary=False) + # model = Word2Vec(LineSentence(infile), size=200, min_count=5, workers=4) + model = Word2Vec(Text8Corpus(infile), size=200, min_count=5, workers=1) + + if len(sys.argv) > 3: + outfile = sys.argv[3] + model.save(outfile + '.model') + model.save_word2vec_format(outfile + '.model.bin', binary=True) + model.save_word2vec_format(outfile + '.model.txt', binary=False) + + if len(sys.argv) > 2: + questions_file = sys.argv[2] + model.accuracy(sys.argv[2]) logging.info("finished running %s" % program) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx new file mode 100755 index 0000000000..95c68dc9ef --- /dev/null +++ b/gensim/models/word2vec_inner.pyx @@ -0,0 +1,213 @@ +#!/usr/bin/env cython +# cython: boundscheck=False +# cython: wraparound=False +# cython: cdivision=True +# coding: utf-8 +# +# Copyright (C) 2013 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +import cython +import numpy as np +cimport numpy as np + +from libc.math cimport exp +from libc.string cimport memset + +from cpython cimport PyCObject_AsVoidPtr +from scipy.linalg.blas import fblas + +REAL = np.float32 +ctypedef np.float32_t REAL_t + + +ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil +ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil +ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil +ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil +ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil +ctypedef void (*fast_sentence_ptr) ( + const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, + REAL_t *syn0, REAL_t *syn1, const int size, + const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work) nogil + +cdef scopy_ptr scopy=PyCObject_AsVoidPtr(fblas.scopy._cpointer) # y = x +cdef saxpy_ptr saxpy=PyCObject_AsVoidPtr(fblas.saxpy._cpointer) # y += alpha * x +cdef sdot_ptr sdot=PyCObject_AsVoidPtr(fblas.sdot._cpointer) # float = dot(x, y) +cdef dsdot_ptr dsdot=PyCObject_AsVoidPtr(fblas.sdot._cpointer) # double = dot(x, y) +cdef snrm2_ptr snrm2=PyCObject_AsVoidPtr(fblas.snrm2._cpointer) # sqrt(x^2) +cdef fast_sentence_ptr fast_sentence + + +DEF EXP_TABLE_SIZE = 1000 +DEF MAX_EXP = 6 + +cdef REAL_t[EXP_TABLE_SIZE] EXP_TABLE + +cdef int ONE = 1 +cdef REAL_t ONEF = 1.0 + + +cdef void fast_sentence0( + const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, + REAL_t *syn0, REAL_t *syn1, const int size, + const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work) nogil: + + cdef long long a, b + cdef long long row1 = word2_index * size, row2 + cdef REAL_t f, g + + memset(work, 0, size * cython.sizeof(REAL_t)) + for b in range(codelen): + row2 = word_point[b] * size + f = dsdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE) + if f <= -MAX_EXP or f >= MAX_EXP: + continue + f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] + g = (1 - word_code[b] - f) * alpha + saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) + saxpy(&size, &g, 
&syn0[row1], &ONE, &syn1[row2], &ONE) + saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE) + + +cdef void fast_sentence1( + const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, + REAL_t *syn0, REAL_t *syn1, const int size, + const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work) nogil: + + cdef long long a, b + cdef long long row1 = word2_index * size, row2 + cdef REAL_t f, g + + memset(work, 0, size * cython.sizeof(REAL_t)) + for b in range(codelen): + row2 = word_point[b] * size + f = sdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE) + if f <= -MAX_EXP or f >= MAX_EXP: + continue + f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] + g = (1 - word_code[b] - f) * alpha + saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) + saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE) + saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE) + + +cdef void fast_sentence2( + const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen, + REAL_t *syn0, REAL_t *syn1, const int size, + const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work) nogil: + + cdef long long a, b + cdef long long row1 = word2_index * size, row2 + cdef REAL_t f, g + + for a in range(size): + work[a] = 0.0 + for b in range(codelen): + row2 = word_point[b] * size + f = 0.0 + for a in range(size): + f += syn0[row1 + a] * syn1[row2 + a] + if f <= -MAX_EXP or f >= MAX_EXP: + continue + f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] + g = (1 - word_code[b] - f) * alpha + for a in range(size): + work[a] += g * syn1[row2 + a] + for a in range(size): + syn1[row2 + a] += g * syn0[row1 + a] + for a in range(size): + syn0[row1 + a] += work[a] + + +DEF MAX_SENTENCE_LEN = 1000 + +def train_sentence(model, sentence, alpha, _work): + cdef REAL_t *syn0 = (np.PyArray_DATA(model.syn0)) + cdef REAL_t *syn1 = (np.PyArray_DATA(model.syn1)) + cdef REAL_t *work + cdef np.uint32_t word2_index + cdef REAL_t _alpha = alpha + cdef int size = model.layer1_size + + cdef np.uint32_t *points[MAX_SENTENCE_LEN] + cdef np.uint8_t *codes[MAX_SENTENCE_LEN] + cdef int codelens[MAX_SENTENCE_LEN] + cdef np.uint32_t indexes[MAX_SENTENCE_LEN] + cdef np.uint32_t reduced_windows[MAX_SENTENCE_LEN] + cdef int sentence_len + cdef int window = model.window + + cdef int i, j, k + cdef long result = 0 + + # convert Python structures to primitive types, so we can release the GIL + work = np.PyArray_DATA(_work) + sentence_len = min(MAX_SENTENCE_LEN, len(sentence)) + for i in range(sentence_len): + word = sentence[i] + if word is None: + codelens[i] = 0 + else: + indexes[i] = word.index + codelens[i] = len(word.code) + codes[i] = np.PyArray_DATA(word.code) + points[i] = np.PyArray_DATA(word.point) + reduced_windows[i] = np.random.randint(window) + result += 1 + + # release GIL & train on the sentence + with nogil: + for i in range(sentence_len): + if codelens[i] == 0: + continue + j = i - window + reduced_windows[i] + if j < 0: + j = 0 + k = i + window + 1 - reduced_windows[i] + if k > sentence_len: + k = sentence_len + for j in range(j, k): + if j == i or codelens[j] == 0: + continue + fast_sentence(points[i], codes[i], codelens[i], syn0, syn1, size, indexes[j], _alpha, work) + + return result + + +def init(): + """ + Precompute function `sigmoid(x) = 1 / (1 + exp(-x))`, for x values discretized + into table EXP_TABLE. 
+ + """ + global fast_sentence + cdef int i + cdef float *x = [10.0] + cdef float *y = [0.01] + cdef float expected = 0.1 + cdef int size = 1 + cdef double d_res + cdef float *p_res + + # build the sigmoid table + for i in range(EXP_TABLE_SIZE): + EXP_TABLE[i] = exp((i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP) + EXP_TABLE[i] = (EXP_TABLE[i] / (EXP_TABLE[i] + 1)) + + # check whether sdot returns double or float + d_res = dsdot(&size, x, &ONE, y, &ONE) + p_res = &d_res + if (abs(d_res - expected) < 0.0001): + fast_sentence = fast_sentence0 + return 0 # double + elif (abs(p_res[0] - expected) < 0.0001): + fast_sentence = fast_sentence1 + return 1 # float + else: + # neither => use cython loops, no BLAS + # actually, the BLAS is so messed up we'll probably have segfaulted above and never even reach here + fast_sentence = fast_sentence2 + return 2 + +FAST_VERSION = init() # initialize the module diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py new file mode 100644 index 0000000000..b493457734 --- /dev/null +++ b/gensim/test/test_word2vec.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2010 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests for checking transformation algorithms (the models package). +""" + + +import logging +import unittest +import os +import tempfile +import itertools + +import numpy + +from gensim import utils, matutils +from gensim.models import word2vec + +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +datapath = lambda fname: os.path.join(module_path, 'test_data', fname) + + +class LeeCorpus(object): + def __iter__(self): + for line in open(datapath('lee_background.cor')): + yield utils.simple_preprocess(line) + + +sentences = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] + + +def testfile(): + # temporary data will be stored to this file + return os.path.join(tempfile.gettempdir(), 'gensim_word2vec.tst') + + +class TestWord2VecModel(unittest.TestCase): + def testPersistence(self): + """Test storing/loading the entire model.""" + model = word2vec.Word2Vec(sentences, min_count=1) + model.save(testfile()) + self.models_equal(model, word2vec.Word2Vec.load(testfile())) + + def testVocab(self): + """Test word2vec vocabulary building.""" + corpus = LeeCorpus() + total_words = sum(len(sentence) for sentence in corpus) + + # try vocab building explicitly, using all words + model = word2vec.Word2Vec(min_count=1) + model.build_vocab(corpus) + self.assertTrue(len(model.vocab) == 6981) + # with min_count=1, we're not throwing away anything, so make sure the word counts add up to be the entire corpus + self.assertTrue(sum(v.count for v in model.vocab.itervalues()) == total_words) + # make sure the binary codes are correct + numpy.allclose(model.vocab['the'].code, [1, 1, 0, 0]) + + # test building vocab with default params + model = word2vec.Word2Vec() + model.build_vocab(corpus) + self.assertTrue(len(model.vocab) == 1750) + numpy.allclose(model.vocab['the'].code, [1, 1, 1, 0]) + + # no input => "RuntimeError: you must first build vocabulary before training the model" + self.assertRaises(RuntimeError, word2vec.Word2Vec, []) + + # input 
not empty, but rather completely filtered out + self.assertRaises(RuntimeError, word2vec.Word2Vec, corpus, min_count=total_words+1) + + + def testTraining(self): + """Test word2vec training.""" + # to test training, make the corpus larger by repeating its sentences over and over + # build vocabulary, don't train yet + model = word2vec.Word2Vec(size=2, min_count=1) + model.build_vocab(sentences) + self.assertTrue(model.syn0.shape == (len(model.vocab), 2)) + self.assertTrue(model.syn1.shape == (len(model.vocab), 2)) + + model.train(sentences) + sims = model.most_similar('graph') + self.assertTrue(sims[0][0] == 'trees', sims) # most similar + + # build vocab and train in one step; must be the same as above + model2 = word2vec.Word2Vec(sentences, size=2, min_count=1) + self.models_equal(model, model2) + + + def testParallel(self): + """Test word2vec parallel training.""" + if word2vec.FAST_VERSION < 0: # don't test the plain NumPy version for parallelism (too slow) + return + + corpus = utils.RepeatCorpus(LeeCorpus(), 10000) + + for workers in [2, 4]: + model = word2vec.Word2Vec(corpus, workers=workers) + sims = model.most_similar('israeli') + # the exact vectors and therefore similarities may differ, due to different thread collisions + # so let's test only for top3 + self.assertTrue('palestinian' in [sims[i][0] for i in xrange(3)]) + + + def testRNG(self): + """Test word2vec results identical with identical RNG seed.""" + model = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1) + model2 = word2vec.Word2Vec(sentences, min_count=2, seed=42, workers=1) + self.models_equal(model, model2) + + + def models_equal(self, model, model2): + self.assertEqual(len(model.vocab), len(model2.vocab)) + self.assertTrue(numpy.allclose(model.syn0, model2.syn0)) + self.assertTrue(numpy.allclose(model.syn1, model2.syn1)) + most_common_word = max(model.vocab.iteritems(), key=lambda item: item[1].count)[0] + self.assertTrue(numpy.allclose(model[most_common_word], model2[most_common_word])) +#endclass TestWord2VecModel + + +if __name__ == '__main__': + logging.root.setLevel(logging.DEBUG) + unittest.main() diff --git a/gensim/utils.py b/gensim/utils.py index b659122a76..c8f87c59ea 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -496,15 +496,31 @@ def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): yield chunk -def smart_open(fname, mode): +def make_closing(base, **attrs): + """ + Add support for `with Base(attrs) as fout:` to the base class if it's missing. + The base class' `close()` method will be called on context exit, to always close the file properly. + + This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise + raise "AttributeError: GzipFile instance has no attribute '__exit__'". 
+ + """ + if not hasattr(base, '__enter__'): + attrs['__enter__'] = lambda self: self + if not hasattr(base, '__exit__'): + attrs['__exit__'] = lambda self, type, value, traceback: self.close() + return type('Closing' + base.__name__, (base, object), attrs) + + +def smart_open(fname, mode='r'): from os import path _, ext = path.splitext(fname) if ext == '.bz2': from bz2 import BZ2File - return BZ2File(fname, mode) + return make_closing(BZ2File)(fname, mode) if ext == '.gz': from gzip import GzipFile - return GzipFile(fname, mode) + return make_closing(GzipFile)(fname, mode) return open(fname, mode) diff --git a/setup.py b/setup.py index eaf45815fc..45c2ff2866 100644 --- a/setup.py +++ b/setup.py @@ -25,10 +25,9 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() - setup( name = 'gensim', - version = '0.8.6', + version = '0.8.8', description = 'Python framework for fast Vector Space Modelling', long_description = read('README.rst'), @@ -42,7 +41,7 @@ def read(fname): download_url = 'http://pypi.python.org/pypi/gensim', keywords = 'Singular Value Decomposition, Latent Semantic Indexing, SVD, LSA, ' - 'LSI, LDA, Latent Dirichlet Allocation, VSM, Random Projections, TFIDF', + 'LSI, LDA, Latent Dirichlet Allocation, VSM, Random Projections, TFIDF, word2vec', license = 'LGPL', platforms = 'any',