Skip to content

Commit 15233d8

Browse files
committed
implement log likelihood of author topic model
1 parent 7adf203 commit 15233d8

File tree

3 files changed

+130
-95
lines changed

3 files changed

+130
-95
lines changed

ptm/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
from .rtm import RelationalTopicModel
66
from .diln import DILN
77
from .hmm_lda import HMM_LDA
8+
from .at_model import AuthorTopicModel

ptm/at_model.py

Lines changed: 128 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,116 +1,152 @@
1-
import numpy as n
2-
import os
3-
4-
5-
class AuthorTopicModel:
6-
def __init__(self, vocab, K, A, docList, authorList, alpha=0.1, eta=0.01):
7-
"""
8-
Initialize at_model
9-
10-
vocab = vocabulary list
11-
K = number of topics
12-
A = number of authors
13-
alpha = author-topic distribution dirichlet parameter
14-
eta = word-topic distribution dirichlet parameter
15-
16-
docList
17-
list of documents, constructed based on the vocab
18-
format = list(list(words))
19-
ex) [[0,2,2,3],[1,3,3,4]]
20-
tokens of 1st document = 0, 2, 2, 3 (note that 2 appears twice because word 2 is used twice in the first document)
21-
authorList
22-
format = list(list(authors))
23-
at least one author must exist for each document
24-
ex) [[0,1],[1,2]]
25-
authors of 1st doc = 0, 1
26-
"""
27-
28-
self._vocab = vocab
29-
self._W = len(vocab)
30-
self._K = K
31-
self._A = A
32-
self._D = len(docList)
33-
self._docList = docList
34-
self._authorList = authorList
35-
self._alpha = alpha
36-
self._eta = eta
37-
38-
self.c_wt = n.zeros([self._W, self._K])
39-
self.c_at = n.zeros([self._A, self._K])
1+
from __future__ import print_function
2+
3+
import numpy as np
4+
from scipy.special import gammaln
5+
import time
6+
7+
from six.moves import xrange
8+
9+
from .base import BaseGibbsParamTopicModel
10+
from .formatted_logger import formatted_logger
11+
12+
logger = formatted_logger('AuthorTopicModel', 'info')
13+
14+
15+
class AuthorTopicModel(BaseGibbsParamTopicModel):
    """Author-Topic model.

    Implementation of "The Author-Topic Model for Authors and Documents"
    by Rosen-Zvi, et al. (UAI 2004), trained with collapsed Gibbs sampling.

    Attributes
    ----------
    vocab:
        vocabulary list
    n_topic:
        number of topics
    n_author:
        number of authors
    alpha:
        author-topic distribution Dirichlet parameter
    beta:
        word-topic distribution Dirichlet parameter
    docList:
        list of documents, constructed based on the vocab
        format = list(list(words))
        ex) [[0, 2, 2, 3], [1, 3, 3, 4]]
        tokens of 1st document = 0, 2, 2, 3 (note that 2 appears twice
        because word 2 is used twice in the first document)
    authorList:
        format = list(list(authors))
        at least one author must exist for each document
        ex) [[0, 1], [1, 2]]
        authors of 1st doc = 0, 1
    """
44+
45+
def __init__(self, n_doc, n_voca, n_topic, n_author, alpha=0.1, beta=0.01, **kwargs):
    """Initialize the author-topic sufficient statistics.

    Parameters
    ----------
    n_doc : number of documents
    n_voca : vocabulary size
    n_topic : number of topics
    n_author : number of authors
    alpha : author-topic Dirichlet hyperparameter
    beta : word-topic Dirichlet hyperparameter
    **kwargs : forwarded to BaseGibbsParamTopicModel (assumed to set up
        TW, sum_T and the doc-level counters -- TODO confirm against base)
    """
    super(AuthorTopicModel, self).__init__(n_doc, n_voca, n_topic, alpha, beta, **kwargs)
    self.n_author = n_author

    # Smoothed author-topic counts: AT[a, t] = (count of words assigned
    # to topic t for author a) + alpha.
    self.AT = np.zeros([self.n_author, self.n_topic]) + self.alpha

    # Per-token assignments kept so Gibbs sampling can decrement the
    # old (author, topic) pair before resampling.
    self.topic_assigned = list()
    self.author_assigned = list()

    # sum_A[a] must equal AT[a, :].sum() at all times; each of the
    # n_topic entries in a row starts at alpha, so the prior mass is
    # alpha * n_topic (the original used alpha * n_author, which breaks
    # the invariant whenever n_author != n_topic).
    self.sum_A = np.zeros(self.n_author) + self.alpha * self.n_topic
54+
def fit(self, docs, doc_authors, max_iter=100):
    """Initialize assignments randomly, then run Gibbs sampling.

    Parameters
    ----------
    docs : list(list(int)) -- token ids per document
    doc_authors : list(list(int)) -- author ids per document
    max_iter : number of Gibbs sweeps
    """
    # Coerce token ids to int if needed. The original loop appended to
    # `doc` while iterating it (so the list grew forever), built a `_doc`
    # it never filled, and stored the unconverted `doc` -- fixed here by
    # building fresh converted lists.
    if not isinstance(docs[0][0], int):
        docs = [[int(word) for word in doc] for doc in docs]

    if not isinstance(doc_authors[0][0], int):
        doc_authors = [[int(author) for author in doc] for doc in doc_authors]

    self.random_init(docs, doc_authors)
    self.gibbs_sampling(docs, doc_authors, max_iter)
75+
76+
def random_init(self, docs, doc_authors):
    """Assign a random topic and a random document author to every token,
    accumulating the word-topic (TW) and author-topic (AT) statistics.

    Parameters
    ----------
    docs : list(list(int)) -- token ids per document
    doc_authors : list(list(int)) -- candidate author ids per document
        (each document must have at least one author)
    """
    for di in xrange(self.n_doc):
        self.author_assigned.append(list())
        self.topic_assigned.append(list())
        doc = docs[di]
        authors = doc_authors[di]
        for w in doc:
            # random sampling topic
            z = np.random.choice(self.n_topic, 1)[0]
            # random sampling author (an index into this doc's author list)
            a = np.random.choice(len(authors), 1)[0]

            # assigning sampled value (sufficient statistics);
            # TW/sum_T are presumably created by the base class -- TODO confirm
            self.TW[z, w] += 1
            self.AT[authors[a], z] += 1
            self.sum_T[z] += 1
            self.sum_A[authors[a]] += 1

            # keep sampled value for future sampling
            self.topic_assigned[di].append(z)
            self.author_assigned[di].append(authors[a])
6797

68-
def sampling_topics(self, max_iter):
69-
for iter in xrange(0, max_iter):
70-
for di in xrange(0, len(self._docList)):
71-
doc = self._docList[di]
72-
authors = self._authorList[di]
98+
def gibbs_sampling(self, docs, doc_authors, max_iter):
    """Collapsed Gibbs sampling: jointly resample (author, topic) for
    every token, then log the model log-likelihood per sweep.

    Parameters
    ----------
    docs : list(list(int)) -- token ids per document
    doc_authors : list(list(int)) -- candidate author ids per document
    max_iter : number of full sweeps over the corpus
    """
    # NOTE: `iter` renamed to `iteration` (shadowed the builtin), and the
    # per-sweep log tag corrected from '[INIT]' to '[ITER]' -- this loop
    # is the sampling phase, not initialization.
    for iteration in xrange(max_iter):
        tic = time.time()
        for di in xrange(len(docs)):
            doc = docs[di]
            authors = doc_authors[di]

            for wi in xrange(len(doc)):
                w = doc[wi]
                old_z = self.topic_assigned[di][wi]
                old_a = self.author_assigned[di][wi]

                # remove the current assignment from the statistics
                self.TW[old_z, w] -= 1
                self.AT[old_a, old_z] -= 1
                self.sum_T[old_z] -= 1
                self.sum_A[old_a] -= 1

                # conditional of topic given word, and of topic given each
                # candidate author (NOTE(review): AT/sum_A already include
                # alpha from initialization, so adding alpha here smooths
                # twice -- confirm against the base class's convention)
                wt = (self.TW[:, w] + self.beta) / (self.sum_T + self.n_voca * self.beta)
                at = (self.AT[authors, :] + self.alpha) / (
                    self.sum_A[authors].repeat(self.n_topic).reshape(len(authors),
                                                                     self.n_topic) + self.n_topic * self.alpha)

                # joint (author, topic) distribution, flattened then normalized
                pdf = at * wt
                pdf = pdf.reshape(len(authors) * self.n_topic)
                pdf = pdf / pdf.sum()

                # sampling author and topic from the flat index
                idx = np.random.multinomial(1, pdf).argmax()

                new_ai = int(idx / self.n_topic)
                new_topic = idx % self.n_topic

                new_author = authors[new_ai]
                self.TW[new_topic, w] += 1
                self.AT[new_author, new_topic] += 1
                self.sum_T[new_topic] += 1
                self.sum_A[new_author] += 1
                self.topic_assigned[di][wi] = new_topic
                self.author_assigned[di][wi] = new_author

        ll = self.log_likelihood()
        logger.info('[ITER] %d\telapsed_time:%.2f\tlog_likelihood:%.2f', iteration, time.time() - tic, ll)
140+
141+
def log_likelihood(self):
    """Return the joint log likelihood of the current state.

    Combines the Dirichlet prior normalizers with one log-Beta term per
    author row of AT and per topic row of TW (both already smoothed by
    their hyperparameters).
    """
    # Prior normalization constants for the author-topic and
    # word-topic Dirichlets.
    ll = self.n_author * gammaln(self.alpha * self.n_topic)
    ll -= self.n_author * self.n_topic * gammaln(self.alpha)
    ll += self.n_topic * gammaln(self.beta * self.n_voca)
    ll -= self.n_topic * self.n_voca * gammaln(self.beta)

    # Per-author term over its topic counts.
    for author_row in self.AT:
        ll += gammaln(author_row).sum() - gammaln(author_row.sum())

    # Per-topic term over its word counts.
    for topic_row in self.TW:
        ll += gammaln(topic_row).sum() - gammaln(topic_row.sum())

    return ll

ptm/lda_gibbs.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,7 @@ def log_likelihood(self, docs):
9191
"""
9292
likelihood function
9393
"""
94-
ll = 0
95-
96-
ll += len(docs) * gammaln(self.alpha * self.n_topic)
94+
ll = len(docs) * gammaln(self.alpha * self.n_topic)
9795
ll -= len(docs) * self.n_topic * gammaln(self.alpha)
9896
ll += self.n_topic * gammaln(self.beta * self.n_voca)
9997
ll -= self.n_topic * self.n_voca * gammaln(self.beta)

0 commit comments

Comments
 (0)