From 39cb21abf3a76823cc72f324cb3f39ba28b007ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?=
Date: Mon, 30 Nov 2020 14:13:32 +0100
Subject: [PATCH] Fix initialization for position_dependent_vector_size <
 vector_size

---
 gensim/models/fasttext.py | 54 +++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 271665e324..7d3b09d40c 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -484,8 +484,8 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
             bucket = 0
 
         self.wv = FastTextKeyedVectors(
-            vector_size, position_dependent_weights, position_dependent_vector_size, min_n, max_n,
-            bucket)
+            vector_size, self.position_dependent_weights, position_dependent_vector_size,
+            min_n, max_n, bucket)
         self.wv.bucket = bucket
 
         super(FastText, self).__init__(
@@ -1366,10 +1366,6 @@ def init_ngrams_weights(self, seed, window):
         rand_obj = np.random.default_rng(seed=seed)  # use new instance of numpy's recommended generator/algorithm
 
-        vocab_shape = (len(self), self.vector_size)
-        ngrams_shape = (self.bucket, self.vector_size)
-        positions_shape = (2 * window, self.position_dependent_vector_size)
-
         #
         # We could have initialized vectors_ngrams at construction time, but we
         # do it here for two reasons:
         #
@@ -1379,13 +1375,32 @@ def init_ngrams_weights(self, seed, window):
         #      time because the vocab is not initialized at that stage.
         #
         if self.position_dependent_weights:
+            vocab_shape = (len(self), self.position_dependent_vector_size)
+            ngrams_shape = (self.bucket, self.position_dependent_vector_size)
+            positions_shape = (2 * window, self.position_dependent_vector_size)
             hi = sqrt(sqrt(3.0) / self.vector_size)
             lo = -hi
             self.vectors_positions = rand_obj.uniform(lo, hi, positions_shape).astype(REAL)
+            self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL)
+            self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL)
+            if self.vector_size > self.position_dependent_vector_size:
+                vocab_shape = (len(self), self.vector_size - self.position_dependent_vector_size)
+                ngrams_shape = (self.bucket, self.vector_size - self.position_dependent_vector_size)
+                lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size
+                self.vectors_vocab = np.concatenate((
+                    self.vectors_vocab,
+                    rand_obj.uniform(lo, hi, vocab_shape).astype(REAL),
+                ), axis=1)
+                self.vectors_ngrams = np.concatenate((
+                    self.vectors_ngrams,
+                    rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL),
+                ), axis=1)
         else:
+            vocab_shape = (len(self), self.vector_size)
+            ngrams_shape = (self.bucket, self.vector_size)
             lo, hi = -1.0 / self.vector_size, 1.0 / self.vector_size
-        self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL)
-        self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL)
+            self.vectors_vocab = rand_obj.uniform(lo, hi, vocab_shape).astype(REAL)
+            self.vectors_ngrams = rand_obj.uniform(lo, hi, ngrams_shape).astype(REAL)
 
     def update_ngrams_weights(self, seed, old_vocab_len):
         """Update the vocabulary weights for training continuation.
@@ -1408,8 +1423,13 @@ def update_ngrams_weights(self, seed, old_vocab_len):
         rand_obj.seed(seed)
 
         new_vocab = len(self) - old_vocab_len
-        self.vectors_vocab = _pad_random(self.vectors_vocab, new_vocab, rand_obj,
-                                         squared=self.position_dependent_weights)
+        self.vectors_vocab = _pad_random(
+            self.vectors_vocab,
+            new_vocab,
+            rand_obj,
+            position_dependent_weights=self.position_dependent_weights,
+            position_dependent_vector_size=self.position_dependent_vector_size,
+        )
 
     def init_post_load(self, fb_vectors):
         """Perform initialization after loading a native Facebook model.
@@ -1476,16 +1496,22 @@ def recalc_char_ngram_buckets(self):
         )
 
 
-def _pad_random(m, new_rows, rand, squared=False):
+def _pad_random(m, new_rows, rand, position_dependent_weights=False, position_dependent_vector_size=0):
     """Pad a matrix with additional rows filled with random values."""
     _, columns = m.shape
-    shape = (new_rows, columns)
-    if squared:
+    if position_dependent_weights:
+        shape = (new_rows, position_dependent_vector_size)
         high = sqrt(sqrt(3.0) / columns)
         low = -high
+        suffix = rand.uniform(low, high, shape).astype(REAL)
+        if columns > position_dependent_vector_size:
+            shape = (new_rows, columns - position_dependent_vector_size)
+            low, high = -1.0 / columns, 1.0 / columns
+            suffix = np.concatenate((suffix, rand.uniform(low, high, shape).astype(REAL)), axis=1)
     else:
+        shape = (new_rows, columns)
         low, high = -1.0 / columns, 1.0 / columns
-    suffix = rand.uniform(low, high, shape).astype(REAL)
+        suffix = rand.uniform(low, high, shape).astype(REAL)
     return vstack([m, suffix])
 
 
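Note (illustration, not part of the patch): with position_dependent_weights
enabled, each vector is now initialized in two blocks joined along the feature
axis. The first position_dependent_vector_size features are drawn from
uniform(-sqrt(sqrt(3.0) / vector_size), +sqrt(sqrt(3.0) / vector_size)), the
same range as the position vectors, and any remaining features are drawn from
the usual uniform(-1.0 / vector_size, +1.0 / vector_size). A minimal
standalone sketch of that scheme (the helper name is hypothetical, not part of
gensim):

    import numpy as np
    from math import sqrt

    def split_uniform_init(rows, vector_size, position_dependent_vector_size, seed=0):
        """Draw `rows` vectors: a position-dependent prefix from the wider
        uniform range, the remaining features from the 1/vector_size range."""
        rng = np.random.default_rng(seed)
        hi = sqrt(sqrt(3.0) / vector_size)  # wider range for the prefix
        prefix = rng.uniform(-hi, hi, (rows, position_dependent_vector_size)).astype(np.float32)
        if vector_size > position_dependent_vector_size:
            bound = 1.0 / vector_size  # standard word2vec-style range
            rest = rng.uniform(-bound, bound,
                               (rows, vector_size - position_dependent_vector_size)).astype(np.float32)
            # Join the two blocks along the feature axis, as in the patched code.
            return np.concatenate((prefix, rest), axis=1)
        return prefix

    vectors = split_uniform_init(rows=1000, vector_size=100, position_dependent_vector_size=50)
    assert vectors.shape == (1000, 100)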