Sequential lda optimization #3172

Open

wants to merge 8 commits into develop
2 changes: 2 additions & 0 deletions .gitignore
@@ -89,5 +89,7 @@ gensim/models/fasttext_inner.c
gensim/models/nmf_pgd.c
gensim/models/word2vec_corpusfile.cpp
gensim/models/word2vec_inner.c
gensim/models/ldaseq_sslm_inner.c
gensim/models/ldaseq_posterior_inner.c

.ipynb_checkpoints
331 changes: 331 additions & 0 deletions gensim/models/ldaseq_posterior_inner.pyx
@@ -0,0 +1,331 @@
#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True


import numpy as np
from scipy.special import gammaln, psi

from libc.math cimport exp, log, fabs
from libc.string cimport memcpy
from libc.stdlib cimport malloc, free

cimport numpy as np

# Required before using the NumPy C API (np.PyArray_DATA below).
np.import_array()


ctypedef np.float64_t REAL_t


cdef update_phi(
REAL_t * gamma, REAL_t *phi, REAL_t * log_phi,
int * word_ids, REAL_t * lda_topics, const int num_topics,
const int doc_length
):
    """Update variational multinomial parameters, based on a document and a time-slice.

    This is done based on the original Blei LDA paper, where:
    log_phi := Ψ(gamma) + log(beta), over every topic for every word, then log-normalized.

    TODO: incorporate the Lee-Seung trick used in
**Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**.

Parameters
----------
gamma : size of num_topics. in-parameter

    phi : size of (max_doc_len, num_topics). in/out-parameter

log_phi: size of (max_doc_len, num_topics). in/out-parameter

word_ids: size of doc_length. in-parameter

lda_topics: size of (vocab_len, num_topics). in-parameter

num_topics: number of topics in the model. in-parameter

doc_length: length of the document (number of words in the document). in-parameter

"""

cdef int k, i

# digamma values
cdef REAL_t * dig = <REAL_t *> malloc(num_topics * sizeof(REAL_t))
    if dig == NULL:
        raise MemoryError("failed to allocate the digamma buffer")

for k in range(num_topics):
dig[k] = psi(gamma[k])

cdef REAL_t *log_phi_row = NULL
cdef REAL_t *phi_row = NULL

    cdef REAL_t v

    for i in range(doc_length):
        for k in range(num_topics):
            log_phi[i * num_topics + k] = dig[k] + lda_topics[word_ids[i] * num_topics + k]

        log_phi_row = log_phi + i * num_topics
        phi_row = phi + i * num_topics

        # log normalize: v = log(sum_k exp(log_phi_row[k])), accumulated pairwise.
        # NOTE: the inner loops below must not reuse `i`; with a C-typed loop
        # variable, reusing it would clobber the outer document index.
        v = log_phi_row[0]
        for k in range(1, num_topics):
            v = log(exp(v) + exp(log_phi_row[k]))

        # subtract v from every element so the row is normalized in log space
        for k in range(num_topics):
            log_phi_row[k] = log_phi_row[k] - v

        for k in range(num_topics):
            phi_row[k] = exp(log_phi_row[k])

free(dig)
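
# For reference, a NumPy sketch of the update performed by `update_phi` above
# (illustrative only; `psi` is scipy.special.psi, `logsumexp` is scipy.special.logsumexp):
#
#     log_phi = psi(gamma) + lda_topics[word_ids, :]           # (doc_length, num_topics)
#     log_phi -= logsumexp(log_phi, axis=1, keepdims=True)     # log-normalize each row
#     phi = np.exp(log_phi)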


cdef update_phi_fixed():
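    # Presumably a placeholder for the fixed-phi update of the DIM model variant
    # (cf. LdaPost.update_phi_fixed in gensim's ldaseqmodel); a no-op here.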
return


cdef update_gamma(
REAL_t * gamma, const REAL_t * phi, const REAL_t *lda_alpha,
const int *word_counts, const int num_topics, const int doc_length
):
"""Update variational dirichlet parameters.

    This operation is described in the original Blei LDA paper:
gamma = alpha + sum(phi), over every topic for every word.

Parameters
----------
gamma: size of num_topics.

phi: size of (max_doc_len, num_topics).

lda_alpha: size of num_topics.

word_counts: size of doc_length.

num_topics: number of topics in the model

doc_length: length of the document

"""
memcpy(gamma, lda_alpha, num_topics * sizeof(REAL_t))

cdef int i, k
# TODO BLAS matrix*vector
for i in range(doc_length):
for k in range(num_topics):
gamma[k] += phi[i * num_topics + k] * word_counts[i]
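
# NumPy sketch of the same update (illustrative only), with phi of shape
# (doc_length, num_topics) and word_counts of shape (doc_length,):
#
#     gamma = lda_alpha + phi.T @ word_counts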


cdef REAL_t compute_lda_lhood(
REAL_t * lhood, const REAL_t *gamma, const REAL_t * phi, const REAL_t *log_phi,
const REAL_t * lda_alpha, const REAL_t *lda_topics,
int *word_counts, int *word_ids,
const int num_topics, const int doc_length
):
"""Compute the log likelihood bound.
Parameters
----------
gamma: size of num_topics

lhood: size of num_topics + 1

phi: size of (max_doc_len, num_topics).

log_phi: size of (max_doc_len, num_topics). in-parameter

lda_alpha: size of num_topics. in-parameter

lda_topics: size of (vocab_len, num_topics). in-parameter

word_counts: size of doc_len

word_ids: size of doc_len

num_topics: number of topics in the model

doc_length: length of the document

Returns
-------
float
The optimal lower bound for the true posterior using the approximate distribution.

"""
    cdef int i, k

cdef REAL_t gamma_sum = 0.0
for i in range(num_topics):
gamma_sum += gamma[i]

cdef REAL_t alpha_sum = 0.0
for i in range(num_topics):
alpha_sum += lda_alpha[i]

cdef REAL_t lhood_v = gammaln(alpha_sum) - gammaln(gamma_sum)
lhood[num_topics] = lhood_v

cdef REAL_t digsum = psi(gamma_sum)
cdef REAL_t lhood_term, e_log_theta_k

for k in range(num_topics):

e_log_theta_k = psi(gamma[k]) - digsum

lhood_term = (lda_alpha[k] - gamma[k]) * e_log_theta_k + gammaln(gamma[k]) - gammaln(lda_alpha[k])

# TODO: check why there's an IF
for i in range(doc_length):
if phi[i * num_topics + k] > 0:
lhood_term += \
word_counts[i] * phi[i * num_topics + k] \
* (e_log_theta_k + lda_topics[word_ids[i] * num_topics + k]
- log_phi[i * num_topics + k])

lhood[k] = lhood_term
lhood_v += lhood_term

return lhood_v
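
# For reference, the quantity accumulated above is the standard per-document LDA
# ELBO (Blei, Ng, Jordan 2003):
#
#     L = E_q[log p(theta | alpha)] + E_q[log p(z | theta)] + E_q[log p(w | z, beta)]
#         - E_q[log q(theta | gamma)] - E_q[log q(z | phi)]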

cdef init_lda_post(
REAL_t *gamma, REAL_t * phi, const int *word_counts,
const REAL_t *lda_alpha,
const int doc_length, const int num_topics
):
    """Initialize the variational posterior: gamma[k] = alpha + N / K, phi uniform at 1 / K."""

cdef int i, j

cdef int total = 0

# BLAS sum of absolute numbers
for i in range(doc_length):
total += word_counts[i]

cdef REAL_t init_value = lda_alpha[0] + float(total) / num_topics

for i in range(num_topics):
gamma[i] = init_value

init_value = 1.0 / num_topics

    cdef REAL_t *phi_doc = NULL

    for i in range(doc_length):
        phi_doc = phi + i * num_topics
        for j in range(num_topics):
            phi_doc[j] = init_value


def fit_lda_post(
self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8,
lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None
):
"""Posterior inference for lda.

Parameters
----------

Returns
-------
float
The optimal lower bound for the true posterior using the approximate distribution.
"""

cdef int i

###############
# Setup C structures
###############

cdef int num_topics = self.lda.num_topics
cdef int vocab_len = len(self.lda.id2word)

cdef int doc_length = len(self.doc)
cdef int max_doc_len = doc_length

# TODO adopt implementation to avoid memory allocation for every document.
# E.g. create numpy array field of Python class. Be careful with the array size, it should be at least
# the length of the longest document
    cdef int * word_ids = <int *> malloc(doc_length * sizeof(int))
    if word_ids == NULL:
        raise MemoryError("failed to allocate word_ids")
    cdef int * word_counts = <int *> malloc(doc_length * sizeof(int))
    if word_counts == NULL:
        free(word_ids)
        raise MemoryError("failed to allocate word_counts")
# TODO Would it be better to create numpy array first?
for i in range(doc_length):
word_ids[i] = self.doc[i][0]
word_counts[i] = self.doc[i][1]

cdef REAL_t * gamma = <REAL_t *> (np.PyArray_DATA(self.gamma))
cdef REAL_t * phi = <REAL_t *> (np.PyArray_DATA(self.phi))
cdef REAL_t * log_phi = <REAL_t *> (np.PyArray_DATA(self.log_phi))
cdef REAL_t * lhood = <REAL_t *> (np.PyArray_DATA(self.lhood))

cdef REAL_t * lda_topics = <REAL_t *> (np.PyArray_DATA(self.lda.topics))
cdef REAL_t * lda_alpha = <REAL_t *> (np.PyArray_DATA(self.lda.alpha))

###############
# Finished setup of c structures here
###############

init_lda_post(gamma, phi, word_counts, lda_alpha, doc_length, num_topics)

# sum of counts in a doc
cdef REAL_t total = sum(count for word_id, count in self.doc)

cdef REAL_t lhood_v = compute_lda_lhood(
lhood, gamma, phi, log_phi,
lda_alpha, lda_topics, word_counts, word_ids,
num_topics, doc_length
)

cdef REAL_t lhood_old = 0.0
cdef REAL_t converged = 0.0
cdef int iter_ = 0

    # TODO: Why is the first iteration done outside of the loop?
iter_ += 1
lhood_old = lhood_v
update_gamma(gamma, phi, lda_alpha, word_counts, num_topics, doc_length)

model = "DTM"

# if model == "DTM" or sslm is None:
update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length)

lhood_v = compute_lda_lhood(
lhood, gamma, phi, log_phi,
lda_alpha, lda_topics, word_counts, word_ids,
num_topics, doc_length
)

    converged = fabs((lhood_old - lhood_v) / (lhood_old * total))

while converged > LDA_INFERENCE_CONVERGED and iter_ <= lda_inference_max_iter:

iter_ += 1
lhood_old = lhood_v
update_gamma(gamma, phi, lda_alpha, word_counts, num_topics, doc_length)
model = "DTM"

update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length)

lhood_v = compute_lda_lhood(
lhood, gamma, phi, log_phi,
lda_alpha, lda_topics, word_counts, word_ids,
num_topics, doc_length
)

        converged = fabs((lhood_old - lhood_v) / (lhood_old * total))

free(word_ids)
free(word_counts)

return lhood_v
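
# Hypothetical usage from Python, assuming this compiled function is bound as the
# fit_lda_post method of gensim's LdaPost (call shown for illustration only):
#
#     bound = lda_post.fit_lda_post(doc_number=0, time=0, ldaseq=None)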
27 changes: 27 additions & 0 deletions gensim/models/ldaseq_sslm_inner.pxd
@@ -0,0 +1,27 @@
cimport numpy as np
ctypedef np.float64_t REAL_t



cdef struct StateSpaceLanguageModelConfig:
int num_time_slices, vocab_len
REAL_t chain_variance, obs_variance

REAL_t * obs
REAL_t * mean
REAL_t * variance

REAL_t * fwd_mean
REAL_t * fwd_variance
REAL_t * zeta
REAL_t * e_log_prob
    # length T (num_time_slices)
    REAL_t * word_counts
    # length T (num_time_slices)
    REAL_t * totals

    REAL_t * deriv
    # T * (T + 1) entries
    REAL_t * mean_deriv_mtx
int word
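
# Presumed array shapes, following gensim's sslm attributes (an assumption, not
# confirmed by this diff; T = num_time_slices):
#   obs, e_log_prob:                          (vocab_len, T)
#   mean, variance, fwd_mean, fwd_variance:   (vocab_len, T + 1)
#   zeta, totals, word_counts:                (T,)
#   mean_deriv_mtx:                           (T, T + 1), for the current `word`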
