-import numpy as n
-import os
-
-
-class AuthorTopicModel:
-    def __init__(self, vocab, K, A, docList, authorList, alpha=0.1, eta=0.01):
-        """
-        Initialize at_model
-
-        vocab = vocabulary list
-        K = number of topics
-        A = number of authors
-        alpha = author-topic distribution dirichlet parameter
-        eta = word-topic distribution dirichlet parameter
-
-        docList
-            list of documents, constructed based on the vocab
-            format = list(list(words))
-            ex) [[0,2,2,3],[1,3,3,4]]
-            tokens of 1st document= 0,2,2,3 (note that 2 appears twice becase word 2 used twice in the first document)
-        authorList
-            format = list(list(authors))
-            at least one author should be exist for each document
-            ex) [[0,1],[1,2]]
-            authors of 1st doc = 0, 1
-        """
-
-        self._vocab = vocab
-        self._W = len(vocab)
-        self._K = K
-        self._A = A
-        self._D = len(docList)
-        self._docList = docList
-        self._authorList = authorList
-        self._alpha = alpha
-        self._eta = eta
-
-        self.c_wt = n.zeros([self._W, self._K])
-        self.c_at = n.zeros([self._A, self._K])
+from __future__ import print_function
+
+import numpy as np
+from scipy.special import gammaln
+import time
+
+from six.moves import xrange
+
+from .base import BaseGibbsParamTopicModel
+from .formatted_logger import formatted_logger
+
+logger = formatted_logger('AuthorTopicModel', 'info')
+
+
+class AuthorTopicModel(BaseGibbsParamTopicModel):
+    """Author Topic Model
+
+    Implementation of `The Author-Topic Model for Authors and Documents` by Rosen-Zvi et al. (UAI 2004).
+
+    Attributes
+    ----------
+    n_doc:
+        number of documents
+    n_voca:
+        vocabulary size
+    n_topic:
+        number of topics
+    n_author:
+        number of authors
+    alpha:
+        author-topic distribution Dirichlet parameter
+    beta:
+        word-topic distribution Dirichlet parameter
+    docs:
+        list of documents, constructed from the vocabulary; passed to `fit`
+        format = list(list(words))
+        ex) [[0,2,2,3],[1,3,3,4]]
+        tokens of the 1st document = 0,2,2,3 (note that 2 appears twice because word 2 is used twice in the first document)
+    doc_authors:
+        format = list(list(authors)); passed to `fit`
+        at least one author must exist for each document
+        ex) [[0,1],[1,2]]
+        authors of the 1st doc = 0, 1
+    """
+
+    def __init__(self, n_doc, n_voca, n_topic, n_author, alpha=0.1, beta=0.01, **kwargs):
+        super(AuthorTopicModel, self).__init__(n_doc, n_voca, n_topic, alpha, beta, **kwargs)
+        self.n_author = n_author
+
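+        # author-topic count matrix with the Dirichlet prior folded in, so
+        # AT[a, k] holds the count n_{a,k} plus alpha throughout sampling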
+        self.AT = np.zeros([self.n_author, self.n_topic]) + self.alpha
        self.topic_assigned = list()
        self.author_assigned = list()
-        self.topic_sum = n.zeros(self._K)
-        self.author_sum = n.zeros(self._A)
-
-        # initialization
-        for di in xrange(0, self._D):
+        # running row sums of AT; each row starts at n_topic * alpha because
+        # the Dirichlet prior is folded into AT above
+        self.sum_A = np.zeros(self.n_author) + self.alpha * self.n_topic
+
+    def fit(self, docs, doc_authors, max_iter=100):
+        # coerce token ids to plain ints (inputs may be numpy scalars or strings)
+        if type(docs[0][0]) != int:
+            _docs = list()
+            for doc in docs:
+                _doc = list()
+                for word in doc:
+                    _doc.append(int(word))
+                _docs.append(_doc)
+            docs = _docs
+
+        if type(doc_authors[0][0]) != int:
+            _doc_authors = list()
+            for doc in doc_authors:
+                _doc = list()
+                for author in doc:
+                    _doc.append(int(author))
+                _doc_authors.append(_doc)
+            doc_authors = _doc_authors
+
+        self.random_init(docs, doc_authors)
+        self.gibbs_sampling(docs, doc_authors, max_iter)
+
+    def random_init(self, docs, doc_authors):
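+        # seed the chain: give every token a uniformly random topic and an
+        # author drawn uniformly from its document's author list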
+        for di in xrange(self.n_doc):
            self.author_assigned.append(list())
            self.topic_assigned.append(list())
-            doc = self._docList[di]
-            authors = self._authorList[di]
-            for wi in xrange(0, len(doc)):
-                w = doc[wi]
+            doc = docs[di]
+            authors = doc_authors[di]
+            for w in doc:
                # random sampling topic
-                z = n.random.choice(self._K, 1)[0]
+                z = np.random.choice(self.n_topic, 1)[0]
                # random sampling author
-                a = n.random.choice(len(authors), 1)[0]
+                a = np.random.choice(len(authors), 1)[0]

                # assigning sampled value (sufficient statistics)
-                self.c_wt[w, z] += 1
-                self.c_at[authors[a], z] += 1
-                self.topic_sum[z] += 1
-                self.author_sum[authors[a]] += 1
+                self.TW[z, w] += 1
+                self.AT[authors[a], z] += 1
+                self.sum_T[z] += 1
+                self.sum_A[authors[a]] += 1

                # keep sampled value for future sampling
                self.topic_assigned[di].append(z)
                self.author_assigned[di].append(authors[a])

-    def sampling_topics(self, max_iter):
-        for iter in xrange(0, max_iter):
-            for di in xrange(0, len(self._docList)):
-                doc = self._docList[di]
-                authors = self._authorList[di]
+    def gibbs_sampling(self, docs, doc_authors, max_iter):
+        for iter in xrange(max_iter):
+            tic = time.time()
+            for di in xrange(len(docs)):
+                doc = docs[di]
+                authors = doc_authors[di]

-                for wi in xrange(0, len(doc)):
+                for wi in xrange(len(doc)):
                    w = doc[wi]
                    old_z = self.topic_assigned[di][wi]
                    old_a = self.author_assigned[di][wi]

-                    self.c_wt[w, old_z] -= 1
-                    self.c_at[old_a, old_z] -= 1
-                    self.topic_sum[old_z] -= 1
-                    self.author_sum[old_a] -= 1
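+                    # take the current token out of the sufficient statistics
+                    # before computing its conditional distribution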
+                    self.TW[old_z, w] -= 1
+                    self.AT[old_a, old_z] -= 1
+                    self.sum_T[old_z] -= 1
+                    self.sum_A[old_a] -= 1

-                    wt = (self.c_wt[w, :] + self._eta) / (self.topic_sum + self._W * self._eta)
-                    at = (self.c_at[authors, :] + self._alpha) / (
-                        self.author_sum[authors].repeat(self._K).reshape(len(authors), self._K) + self._K * self._alpha)
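+                    # collapsed joint conditional: p(z, a | rest) is proportional to
+                    # p(w | z) * p(z | a); AT/sum_A carry the alpha smoothing (see
+                    # __init__) and TW/sum_T are assumed beta-smoothed by the base
+                    # class (log_likelihood below depends on that), so plain ratios suffice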
+                    wt = self.TW[:, w] / self.sum_T
+                    at = self.AT[authors, :] / self.sum_A[authors].repeat(self.n_topic).reshape(len(authors), self.n_topic)

                    pdf = at * wt
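+                    # flatten the len(authors) x n_topic grid into one categorical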
-                    pdf = pdf.reshape(len(authors) * self._K)
+                    pdf = pdf.reshape(len(authors) * self.n_topic)
                    pdf = pdf / pdf.sum()

                    # sampling author and topic
-                    idx = n.random.multinomial(1, pdf).argmax()
-
-                    new_ai = idx / self._K
-                    new_z = idx % self._K
-
-                    new_a = authors[new_ai]
-                    self.c_wt[w, new_z] += 1
-                    self.c_at[new_a, new_z] += 1
-                    self.topic_sum[new_z] += 1
-                    self.author_sum[new_a] += 1
-                    self.topic_assigned[di][wi] = new_z
-                    self.author_assigned[di][wi] = new_a
-
-
-if __name__ == '__main__':
-    # test case
-    atm = at_model([0, 1, 2, 3, 4], 2, 3, [[0, 0, 2, 2, 3], [1, 3, 3, 4, 4]], [[0, 1], [1, 2]])
-    atm.sampling_topics(10)
-
-    folder = 'at-result'
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-    n.savetxt(folder + '/word-topic.dat', atm.c_wt)
-    n.savetxt(folder + '/author-topic.dat', atm.c_at)
+                    idx = np.random.multinomial(1, pdf).argmax()
+
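+                    # unflatten the drawn index: row picks the author slot in this
+                    # document's author list, column picks the topic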
+                    new_ai = idx // self.n_topic
+                    new_topic = idx % self.n_topic
+
+                    new_author = authors[new_ai]
+                    self.TW[new_topic, w] += 1
+                    self.AT[new_author, new_topic] += 1
+                    self.sum_T[new_topic] += 1
+                    self.sum_A[new_author] += 1
+                    self.topic_assigned[di][wi] = new_topic
+                    self.author_assigned[di][wi] = new_author
+
+            ll = self.log_likelihood()
+            logger.info('[ITER] %d\telapsed_time:%.2f\tlog_likelihood:%.2f', iter, time.time() - tic, ll)
+
+    def log_likelihood(self):
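+        # log of the collapsed joint p(w, z, x): Dirichlet-multinomial marginals
+        # for the author-topic and topic-word blocks; AT and TW already store
+        # counts plus their priors, so gammaln applies to them directly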
+        ll = self.n_author * gammaln(self.alpha * self.n_topic)
+        ll -= self.n_author * self.n_topic * gammaln(self.alpha)
+        ll += self.n_topic * gammaln(self.beta * self.n_voca)
+        ll -= self.n_topic * self.n_voca * gammaln(self.beta)
+
+        for ai in xrange(self.n_author):
+            ll += gammaln(self.AT[ai, :]).sum() - gammaln(self.AT[ai, :].sum())
+        for ti in xrange(self.n_topic):
+            ll += gammaln(self.TW[ti, :]).sum() - gammaln(self.TW[ti, :].sum())
+
+        return ll
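
A minimal usage sketch of the rewritten class, mirroring the test case from the removed __main__ block. The import path is hypothetical (the relative imports above imply the module lives in a package next to base.py), and it assumes BaseGibbsParamTopicModel initializes TW as an [n_topic, n_voca] array of beta-smoothed counts with matching sum_T totals; those names are inferred from how this file uses them, not from the base class itself.

    import numpy as np
    from author_topic_model import AuthorTopicModel  # hypothetical import path

    # two documents over a 5-word vocabulary, three authors (same data as the old test case)
    docs = [[0, 0, 2, 2, 3], [1, 3, 3, 4, 4]]
    doc_authors = [[0, 1], [1, 2]]

    atm = AuthorTopicModel(n_doc=2, n_voca=5, n_topic=2, n_author=3, alpha=0.1, beta=0.01)
    atm.fit(docs, doc_authors, max_iter=10)

    # the smoothed count matrices live on the model, as the old script saved them
    np.savetxt('author-topic.dat', atm.AT)
    np.savetxt('word-topic.dat', atm.TW)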