
Commit f811aff

Try to use the same notation across the different models.
1 parent 489aaeb

21 files changed (+1242, -1143 lines)

ptm/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+from .lda_gibbs import GibbsLDA
+from .slda_gibbs import GibbsSupervisedLDA
+from .ctm import CorrelatedTopicModel
+from .rtm import RelationalTopicModel
+
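With these exports, the renamed model classes can be imported from the package root. A minimal usage sketch, mirroring the main() test case in ptm/ctm.py further down in this diff (arguments: n_topic, n_voca, n_user, n_item, doc_item, doc_cnt, ratings):

# Usage sketch based on the ctm.py test case below; not part of the commit itself.
from ptm import CorrelatedTopicModel

doc_word = [[0, 1, 2, 4, 5], [2, 3, 5, 6, 7, 8, 9]]  # word ids per document
doc_cnt = [[1, 2, 3, 2, 1], [3, 4, 5, 1, 2, 3, 4]]   # matching word counts
rate_user = [[0, 1, 2], [2, 3]]                      # users who rated each item
model = CorrelatedTopicModel(3, 10, 4, 2, doc_word, doc_cnt, rate_user)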

ptm/at_model.py

Lines changed: 21 additions & 20 deletions
@@ -1,8 +1,8 @@
 import numpy as n
 import os
 
-class at_model:
 
+class AuthorTopicModel:
     def __init__(self, vocab, K, A, docList, authorList, alpha=0.1, eta=0.01):
         """
         Initialize at_model
@@ -42,26 +42,26 @@ def __init__(self, vocab, K, A, docList, authorList, alpha=0.1, eta=0.01):
         self.topic_sum = n.zeros(self._K)
         self.author_sum = n.zeros(self._A)
 
-        #initialization
+        # initialization
         for di in xrange(0, self._D):
             self.author_assigned.append(list())
             self.topic_assigned.append(list())
             doc = self._docList[di]
             authors = self._authorList[di]
             for wi in xrange(0, len(doc)):
                 w = doc[wi]
-                #random sampling topic
+                # random sampling topic
                 z = n.random.choice(self._K, 1)[0]
-                #random sampling author
-                a = n.random.choice(len(authors),1)[0]
+                # random sampling author
+                a = n.random.choice(len(authors), 1)[0]
 
-                #assigning sampled value (sufficient statistics)
-                self.c_wt[w,z] += 1
-                self.c_at[authors[a],z] += 1
+                # assigning sampled value (sufficient statistics)
+                self.c_wt[w, z] += 1
+                self.c_at[authors[a], z] += 1
                 self.topic_sum[z] += 1
                 self.author_sum[authors[a]] += 1
 
-                #keep sampled value for future sampling
+                # keep sampled value for future sampling
                 self.topic_assigned[di].append(z)
                 self.author_assigned[di].append(authors[a])
 
@@ -81,21 +81,22 @@ def sampling_topics(self, max_iter):
                 self.topic_sum[old_z] -= 1
                 self.author_sum[old_a] -= 1
 
-                wt = (self.c_wt[w, :]+ self._eta)/(self.topic_sum+self._W*self._eta)
-                at = (self.c_at[authors,:] + self._alpha)/(self.author_sum[authors].repeat(self._K).reshape(len(authors),self._K)+self._K*self._alpha)
+                wt = (self.c_wt[w, :] + self._eta) / (self.topic_sum + self._W * self._eta)
+                at = (self.c_at[authors, :] + self._alpha) / (
+                    self.author_sum[authors].repeat(self._K).reshape(len(authors), self._K) + self._K * self._alpha)
 
-                pdf = at*wt
-                pdf = pdf.reshape(len(authors)*self._K)
-                pdf = pdf/pdf.sum()
+                pdf = at * wt
+                pdf = pdf.reshape(len(authors) * self._K)
+                pdf = pdf / pdf.sum()
 
-                #sampling author and topic
+                # sampling author and topic
                 idx = n.random.multinomial(1, pdf).argmax()
 
-                new_ai = idx/self._K
-                new_z = idx%self._K
+                new_ai = idx / self._K
+                new_z = idx % self._K
 
                 new_a = authors[new_ai]
-                self.c_wt[w,new_z] += 1
+                self.c_wt[w, new_z] += 1
                 self.c_at[new_a, new_z] += 1
                 self.topic_sum[new_z] += 1
                 self.author_sum[new_a] += 1
@@ -104,8 +105,8 @@ def sampling_topics(self, max_iter):
 
 
 if __name__ == '__main__':
-    #test case
-    atm = at_model([0,1,2,3,4], 2, 3, [[0,0,2,2,3],[1,3,3,4,4]], [[0,1],[1,2]])
+    # test case
+    atm = AuthorTopicModel([0, 1, 2, 3, 4], 2, 3, [[0, 0, 2, 2, 3], [1, 3, 3, 4, 4]], [[0, 1], [1, 2]])
     atm.sampling_topics(10)
 
     folder = 'at-result'
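In sampling_topics above, the author and topic for a token are drawn jointly: the |authors| x K table at * wt is flattened into one categorical distribution, a single index is sampled, and that index is decoded back into an (author, topic) pair. A standalone sketch of that flatten-and-decode step, with illustrative values only (it uses // for the decode, matching the Python 2 integer division the class relies on):

# Illustrative sketch of the joint (author, topic) draw; not part of the repository.
import numpy as np

K = 4                                      # number of topics
authors = np.array([0, 2])                 # candidate authors for this document
at = np.random.random((len(authors), K))   # stand-in for the author-topic term
wt = np.random.random(K)                   # stand-in for the word-topic term

pdf = (at * wt).reshape(len(authors) * K)  # flatten the |authors| x K table
pdf = pdf / pdf.sum()

idx = np.random.multinomial(1, pdf).argmax()
new_ai, new_z = idx // K, idx % K          # decode flat index to (author, topic)
print(authors[new_ai], new_z)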

ptm/base.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+import numpy as np
+
+class BaseTopicModel():
+    """
+    Attributes
+    ----------
+    n_doc: int
+        the number of total documents in the corpus
+    n_voca: int
+        the vocabulary size of the corpus
+    """
+    def __init__(self, n_doc, n_voca):
+        self.n_doc = n_doc
+        self.n_voca = n_voca
+
+
+class BaseGibbsParamTopicModel(BaseTopicModel):
+    """ Base class of parametric topic models with Gibbs sampling inference
+
+    Attributes
+    ----------
+    n_topic: int
+        a number of topics to be inferred through the Gibbs sampling
+    WT: ndarray, shape (n_voca, n_topic)
+        word-topic matrix, keeps the number of assigned word tokens for each word-topic pair
+    DT: ndarray, shape (n_doc, n_topic)
+        document-topic matrix, keeps the number of assigned word tokens for each document-topic pair
+    sum_T: ndarray, shape (n_topic)
+        number of word tokens assigned for each topic
+    alpha: float
+        symmetric parameter of Dirichlet prior for document-topic distribution
+    beta: float
+        symmetric parameter of Dirichlet prior for topic-word distribution
+    """
+
+    def __init__(self, n_doc, n_voca, n_topic, alpha, beta):
+        super(BaseGibbsParamTopicModel, self).__init__(n_doc=n_doc, n_voca=n_voca)
+        self.n_topic = n_topic
+        self.WT = np.zeros([self.n_voca, self.n_topic])
+        self.DT = np.zeros([self.n_doc, self.n_topic])
+        self.sum_T = np.zeros(self.n_topic)
+
+        self.alpha = alpha
+        self.beta = beta
+
+        self.topic_assignment = list()
+
+        self.WT += self.beta
+        self.sum_T += self.beta * self.n_voca
+        self.DT += self.alpha
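Because the base class pre-adds the Dirichlet pseudo-counts (beta to WT and sum_T, alpha to DT), a collapsed Gibbs sampler built on top of it can form the conditional p(z = k | rest) directly from the smoothed counts as DT[d, k] * WT[w, k] / sum_T[k], assuming the current token has already been decremented from those counts. A hypothetical sketch of such a subclass; the real sampler lives in ptm/lda_gibbs.py and is not shown in this commit:

# Hypothetical sketch only -- illustrates how WT, DT and sum_T are meant to be used.
import numpy as np
from ptm.base import BaseGibbsParamTopicModel

class ToyGibbsLDA(BaseGibbsParamTopicModel):
    def sample_topic(self, d, w):
        # conditional p(z = k | rest) from the smoothed sufficient statistics,
        # assuming the current token's counts were decremented beforehand
        prob = self.DT[d, :] * self.WT[w, :] / self.sum_T
        prob = prob / prob.sum()
        return np.random.multinomial(1, prob).argmax()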

ptm/ctm.py

Lines changed: 56 additions & 49 deletions
@@ -1,74 +1,77 @@
 import time
 import numpy as np
 import numpy.linalg
-import simplex_projection
 import scipy.optimize
+from .simplex_projection import euclidean_proj_simplex
 
 e = 1e-100
 error_diff = 10
 
-class CTM:
+
+class CorrelatedTopicModel():
     """
     Correlated topic models,
     Blei, David and Lafferty, John,
     2006
     """
-
-    def __init__(self, topic_size, voca_size, user_size, item_size, doc_item, doc_cnt, ratings=None):
+
+    def __init__(self, n_topic, n_voca, n_user, n_item, doc_item, doc_cnt, ratings=None):
         self.lambda_u = 0.01
         self.lambda_v = 0.01
         self.alpha = 1
         self.eta = 0.01
         self.a = 1
         self.b = 0.01
 
-        self.topic_size = topic_size
-        self.voca_size = voca_size
-        self.user_size = user_size
-        self.item_size = item_size
-
-        #U = user_topic matrix, U x K
-        self.U = np.random.multivariate_normal(np.zeros(topic_size), np.identity(topic_size)*(1./self.lambda_u), size=self.user_size)
-        #V = item(doc)_topic matrix, V x K
-        self.V = np.random.multivariate_normal(np.zeros(topic_size), np.identity(topic_size)*(1./self.lambda_u), size=self.item_size)
-        self.theta = np.random.random([item_size,topic_size])
-        self.theta = self.theta/self.theta.sum(1)[:,np.newaxis] #normalize
-        self.beta = np.random.random([voca_size,topic_size])
-        self.beta = self.beta/self.beta.sum(0) #normalize
+        self.n_topic = n_topic
+        self.n_voca = n_voca
+        self.n_user = n_user
+        self.n_item = n_item
+
+        # U = user_topic matrix, U x K
+        self.U = np.random.multivariate_normal(np.zeros(n_topic), np.identity(n_topic) * (1. / self.lambda_u),
+                                               size=self.n_user)
+        # V = item(doc)_topic matrix, V x K
+        self.V = np.random.multivariate_normal(np.zeros(n_topic), np.identity(n_topic) * (1. / self.lambda_u),
+                                               size=self.n_item)
+        self.theta = np.random.random([n_item, n_topic])
+        self.theta = self.theta / self.theta.sum(1)[:, np.newaxis] # normalize
+        self.beta = np.random.random([n_voca, n_topic])
+        self.beta = self.beta / self.beta.sum(0) # normalize
 
         self.doc_item = doc_item
         self.doc_cnt = doc_cnt
 
-        self.C = np.zeros([user_size, item_size]) + self.b
-        self.R = np.zeros([user_size, item_size]) #user_size x item_size
+        self.C = np.zeros([n_user, n_item]) + self.b
+        self.R = np.zeros([n_user, n_item]) # user_size x item_size
 
         if ratings:
             for di in xrange(len(ratings)):
                 rate = ratings[di]
                 for user in rate:
-                    self.C[user,di] += self.a - self.b
-                    self.R[user,di] = 1
+                    self.C[user, di] += self.a - self.b
+                    self.R[user, di] = 1
+
+        self.phi_sum = np.zeros([n_voca, n_topic]) + self.eta
 
-        self.phi_sum = np.zeros([voca_size, topic_size]) + self.eta
-
     def learning_fixed_theta(self, max_iter):
         old_err = 0
         for iteration in xrange(max_iter):
             prev = time.clock()
             self.update_u()
             self.update_v()
             err = self.sqr_error()
-            print 'Iteration-', iteration, time.clock() - prev, err
+            print('Iteration-', iteration, time.clock() - prev, err)
             if abs(old_err - err) < error_diff:
                 break
 
-    #reconstructing matrix for prediction
+    # reconstructing matrix for prediction
     def predict_item(self):
         return np.dot(self.U, self.V.T)
-
-    #reconstruction error
-    def sqr_error(self):
-        err = (self.R - self.predict_item())**2
+
+    # reconstruction error
+    def sqr_error(self):
+        err = (self.R - self.predict_item()) ** 2
         err = err.sum()
 
         return err
@@ -80,39 +83,43 @@ def do_e_step(self):
 
     def update_theta(self):
         def func(x, v, phi, beta, lambda_v):
-            return 0.5 * lambda_v * np.dot((v-x).T, v-x) - np.sum(np.sum(phi * ( np.log(x*beta) - np.log(phi) )))
-
-        for vi in xrange(self.item_size):
+            return 0.5 * lambda_v * np.dot((v - x).T, v - x) - np.sum(np.sum(phi * (np.log(x * beta) - np.log(phi))))
+
+        for vi in xrange(self.n_item):
             W = np.array(self.doc_item[vi])
-            word_beta = self.beta[W,:]
-            phi = self.theta[vi,:] * word_beta + e # W x K
-            phi = phi/phi.sum(1)[:,np.newaxis]
-            result = scipy.optimize.minimize(func, self.theta[vi,:], method='nelder-mead', args=(self.V[vi,:], phi, word_beta, self.lambda_v))
-            self.theta[vi,:] = simplex_projection.euclidean_proj_simplex(result.x)
-            self.phi_sum[W,:] += np.array(self.doc_cnt[vi])[:,np.newaxis] * phi
+            word_beta = self.beta[W, :]
+            phi = self.theta[vi, :] * word_beta + e # W x K
+            phi = phi / phi.sum(1)[:, np.newaxis]
+            result = scipy.optimize.minimize(func, self.theta[vi, :], method='nelder-mead',
+                                             args=(self.V[vi, :], phi, word_beta, self.lambda_v))
+            self.theta[vi, :] = euclidean_proj_simplex(result.x)
+            self.phi_sum[W, :] += np.array(self.doc_cnt[vi])[:, np.newaxis] * phi
 
     def update_u(self):
-        for ui in xrange(self.user_size):
-            left = np.dot(self.V.T * self.C[ui,:], self.V) + self.lambda_u * np.identity(self.topic_size)
+        for ui in xrange(self.n_user):
+            left = np.dot(self.V.T * self.C[ui, :], self.V) + self.lambda_u * np.identity(self.n_topic)
 
-            self.U[ui,:] = numpy.linalg.solve(left, np.dot(self.V.T * self.C[ui,:],self.R[ui,:]))
+            self.U[ui, :] = numpy.linalg.solve(left, np.dot(self.V.T * self.C[ui, :], self.R[ui, :]))
 
     def update_v(self):
-        for vi in xrange(self.item_size):
-            left = np.dot(self.U.T * self.C[:,vi], self.U) + self.lambda_v * np.identity(self.topic_size)
+        for vi in xrange(self.n_item):
+            left = np.dot(self.U.T * self.C[:, vi], self.U) + self.lambda_v * np.identity(self.n_topic)
 
-            self.V[vi,:] = numpy.linalg.solve(left, np.dot(self.U.T * self.C[:,vi],self.R[:,vi] ) + self.lambda_v * self.theta[vi,:])
+            self.V[vi, :] = numpy.linalg.solve(left, np.dot(self.U.T * self.C[:, vi],
+                                                            self.R[:, vi]) + self.lambda_v * self.theta[vi, :])
 
     def do_m_step(self):
         self.beta = self.phi_sum / self.phi_sum.sum(0)
-        self.phi_sum = np.zeros([self.voca_size, self.topic_size]) + self.eta
+        self.phi_sum = np.zeros([self.n_voca, self.n_topic]) + self.eta
+
 
 def main():
-    doc_word = [[0,1,2,4,5], [2,3,5,6,7,8,9]]
-    doc_cnt = [[1,2,3,2,1], [3,4,5,1,2,3,4]]
-    rate_user = [[0,1,2],[2,3]]
-    model = CTM(3, 10, 4, 2, doc_word, doc_cnt, rate_user)
+    doc_word = [[0, 1, 2, 4, 5], [2, 3, 5, 6, 7, 8, 9]]
+    doc_cnt = [[1, 2, 3, 2, 1], [3, 4, 5, 1, 2, 3, 4]]
+    rate_user = [[0, 1, 2], [2, 3]]
+    model = CorrelatedTopicModel(3, 10, 4, 2, doc_word, doc_cnt, rate_user)
     model.learning(10)
 
+
 if __name__ == '__main__':
     main()
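The update_u and update_v steps above solve a small regularized least-squares system per user and per item: left = V^T C_u V + lambda_u I, then the user factor is left^{-1} V^T C_u r_u. A standalone sketch of the per-user closed-form step on toy data, just to make the shapes concrete (values and names here are illustrative, not taken from the repository):

# Standalone sketch of the per-user update used in update_u; illustrative data only.
import numpy as np

n_item, n_topic, lambda_u = 5, 3, 0.01
V = np.random.random((n_item, n_topic))    # item-topic factors
C_u = np.full(n_item, 0.01)                # confidence weights for one user (b = 0.01)
C_u[[1, 3]] = 1.0                          # rated items get weight a = 1
r_u = np.zeros(n_item)
r_u[[1, 3]] = 1.0                          # binary implicit ratings

# left = V^T C_u V + lambda_u * I ; right = V^T C_u r_u
left = np.dot(V.T * C_u, V) + lambda_u * np.identity(n_topic)
u = np.linalg.solve(left, np.dot(V.T * C_u, r_u))
print(u.shape)                             # (n_topic,)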
