-import numpy as n
-import os
-
-
-class AuthorTopicModel:
-    def __init__(self, vocab, K, A, docList, authorList, alpha=0.1, eta=0.01):
-        """
-        Initialize at_model
-
-        vocab = vocabulary list
-        K = number of topics
-        A = number of authors
-        alpha = author-topic distribution dirichlet parameter
-        eta = word-topic distribution dirichlet parameter
-
-        docList
-            list of documents, constructed based on the vocab
-            format = list(list(words))
-            ex) [[0,2,2,3],[1,3,3,4]]
-            tokens of 1st document= 0,2,2,3 (note that 2 appears twice becase word 2 used twice in the first document)
-        authorList
-            format = list(list(authors))
-            at least one author should be exist for each document
-            ex) [[0,1],[1,2]]
-            authors of 1st doc = 0, 1
-        """
-
-        self._vocab = vocab
-        self._W = len(vocab)
-        self._K = K
-        self._A = A
-        self._D = len(docList)
-        self._docList = docList
-        self._authorList = authorList
-        self._alpha = alpha
-        self._eta = eta
-
-        self.c_wt = n.zeros([self._W, self._K])
-        self.c_at = n.zeros([self._A, self._K])
+from __future__ import print_function
+
+import numpy as np
+from scipy.special import gammaln
+import time
+
+from six.moves import xrange
+
+from .base import BaseGibbsParamTopicModel
+from .formatted_logger import formatted_logger
+
+logger = formatted_logger('AuthorTopicModel', 'info')
+
+
+class AuthorTopicModel(BaseGibbsParamTopicModel):
+    """Author Topic Model
+
+    Implementation of `The Author-Topic Model for Authors and Documents` by Rosen-Zvi et al. (UAI 2004).
+
+    Attributes
+    ----------
+    n_doc:
+        number of documents
+    n_voca:
+        vocabulary size
+    n_topic:
+        number of topics
+    n_author:
+        number of authors
+    alpha:
+        author-topic distribution Dirichlet parameter
+    beta:
+        word-topic distribution Dirichlet parameter
+    docs:
+        list of documents, constructed from the vocabulary; passed to `fit`
+        format = list(list(words))
+        ex) [[0,2,2,3],[1,3,3,4]]
+        tokens of the 1st document = 0,2,2,3 (note that 2 appears twice because word 2 is used twice in the first document)
+    doc_authors:
+        format = list(list(authors)); passed to `fit`
+        at least one author must exist for each document
+        ex) [[0,1],[1,2]]
+        authors of the 1st doc = 0, 1
+    """
+
+    def __init__(self, n_doc, n_voca, n_topic, n_author, alpha=0.1, beta=0.01, **kwargs):
+        super(AuthorTopicModel, self).__init__(n_doc, n_voca, n_topic, alpha, beta, **kwargs)
+        self.n_author = n_author
+
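+        # author-topic count matrix with the Dirichlet prior folded in, so
+        # AT[a, k] holds the count n_{a,k} plus alpha throughout sampling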
+        self.AT = np.zeros([self.n_author, self.n_topic]) + self.alpha
        self.topic_assigned = list()
        self.author_assigned = list()
-        self.topic_sum = n.zeros(self._K)
-        self.author_sum = n.zeros(self._A)
-
-        # initialization
-        for di in xrange(0, self._D):
+        # running row sums of AT; each row starts at n_topic * alpha because
+        # the Dirichlet prior is folded into AT above
+        self.sum_A = np.zeros(self.n_author) + self.alpha * self.n_topic
+
+    def fit(self, docs, doc_authors, max_iter=100):
+        # coerce token ids to plain ints (inputs may be numpy scalars or strings)
+        if type(docs[0][0]) != int:
+            _docs = list()
+            for doc in docs:
+                _doc = list()
+                for word in doc:
+                    _doc.append(int(word))
+                _docs.append(_doc)
+            docs = _docs
+
+        if type(doc_authors[0][0]) != int:
+            _doc_authors = list()
+            for doc in doc_authors:
+                _doc = list()
+                for author in doc:
+                    _doc.append(int(author))
+                _doc_authors.append(_doc)
+            doc_authors = _doc_authors
+
+        self.random_init(docs, doc_authors)
+        self.gibbs_sampling(docs, doc_authors, max_iter)
+
+    def random_init(self, docs, doc_authors):
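+        # seed the chain: give every token a uniformly random topic and an
+        # author drawn uniformly from its document's author list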
+        for di in xrange(self.n_doc):
            self.author_assigned.append(list())
            self.topic_assigned.append(list())
-            doc = self._docList[di]
-            authors = self._authorList[di]
-            for wi in xrange(0, len(doc)):
-                w = doc[wi]
+            doc = docs[di]
+            authors = doc_authors[di]
+            for w in doc:
                # random sampling topic
-                z = n.random.choice(self._K, 1)[0]
+                z = np.random.choice(self.n_topic, 1)[0]
                # random sampling author
-                a = n.random.choice(len(authors), 1)[0]
+                a = np.random.choice(len(authors), 1)[0]

                # assigning sampled value (sufficient statistics)
-                self.c_wt[w, z] += 1
-                self.c_at[authors[a], z] += 1
-                self.topic_sum[z] += 1
-                self.author_sum[authors[a]] += 1
+                self.TW[z, w] += 1
+                self.AT[authors[a], z] += 1
+                self.sum_T[z] += 1
+                self.sum_A[authors[a]] += 1

                # keep sampled value for future sampling
                self.topic_assigned[di].append(z)
                self.author_assigned[di].append(authors[a])

-    def sampling_topics(self, max_iter):
-        for iter in xrange(0, max_iter):
-            for di in xrange(0, len(self._docList)):
-                doc = self._docList[di]
-                authors = self._authorList[di]
+    def gibbs_sampling(self, docs, doc_authors, max_iter):
+        for iter in xrange(max_iter):
+            tic = time.time()
+            for di in xrange(len(docs)):
+                doc = docs[di]
+                authors = doc_authors[di]

-                for wi in xrange(0, len(doc)):
+                for wi in xrange(len(doc)):
                    w = doc[wi]
                    old_z = self.topic_assigned[di][wi]
                    old_a = self.author_assigned[di][wi]

-                    self.c_wt[w, old_z] -= 1
-                    self.c_at[old_a, old_z] -= 1
-                    self.topic_sum[old_z] -= 1
-                    self.author_sum[old_a] -= 1
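+                    # take the current token out of the sufficient statistics
+                    # before computing its conditional distribution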
+                    self.TW[old_z, w] -= 1
+                    self.AT[old_a, old_z] -= 1
+                    self.sum_T[old_z] -= 1
+                    self.sum_A[old_a] -= 1

-                    wt = (self.c_wt[w, :] + self._eta) / (self.topic_sum + self._W * self._eta)
-                    at = (self.c_at[authors, :] + self._alpha) / (
-                        self.author_sum[authors].repeat(self._K).reshape(len(authors), self._K) + self._K * self._alpha)
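+                    # collapsed joint conditional: p(z, a | rest) is proportional to
+                    # p(w | z) * p(z | a); AT/sum_A carry the alpha smoothing (see
+                    # __init__) and TW/sum_T are assumed beta-smoothed by the base
+                    # class (log_likelihood below depends on that), so plain ratios suffice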
+                    wt = self.TW[:, w] / self.sum_T
+                    at = self.AT[authors, :] / self.sum_A[authors].repeat(self.n_topic).reshape(len(authors), self.n_topic)

                    pdf = at * wt
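+                    # flatten the len(authors) x n_topic grid into one categorical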
-                    pdf = pdf.reshape(len(authors) * self._K)
+                    pdf = pdf.reshape(len(authors) * self.n_topic)
                    pdf = pdf / pdf.sum()

                    # sampling author and topic
-                    idx = n.random.multinomial(1, pdf).argmax()
-
-                    new_ai = idx / self._K
-                    new_z = idx % self._K
-
-                    new_a = authors[new_ai]
-                    self.c_wt[w, new_z] += 1
-                    self.c_at[new_a, new_z] += 1
-                    self.topic_sum[new_z] += 1
-                    self.author_sum[new_a] += 1
-                    self.topic_assigned[di][wi] = new_z
-                    self.author_assigned[di][wi] = new_a
-
-
-if __name__ == '__main__':
-    # test case
-    atm = at_model([0, 1, 2, 3, 4], 2, 3, [[0, 0, 2, 2, 3], [1, 3, 3, 4, 4]], [[0, 1], [1, 2]])
-    atm.sampling_topics(10)
-
-    folder = 'at-result'
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-    n.savetxt(folder + '/word-topic.dat', atm.c_wt)
-    n.savetxt(folder + '/author-topic.dat', atm.c_at)
+                    idx = np.random.multinomial(1, pdf).argmax()
+
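+                    # unflatten the drawn index: row picks the author slot in this
+                    # document's author list, column picks the topic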
+                    new_ai = idx // self.n_topic
+                    new_topic = idx % self.n_topic
+
+                    new_author = authors[new_ai]
+                    self.TW[new_topic, w] += 1
+                    self.AT[new_author, new_topic] += 1
+                    self.sum_T[new_topic] += 1
+                    self.sum_A[new_author] += 1
+                    self.topic_assigned[di][wi] = new_topic
+                    self.author_assigned[di][wi] = new_author
+
+            ll = self.log_likelihood()
+            logger.info('[ITER] %d\telapsed_time:%.2f\tlog_likelihood:%.2f', iter, time.time() - tic, ll)
+
+    def log_likelihood(self):
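+        # log of the collapsed joint p(w, z, x): Dirichlet-multinomial marginals
+        # for the author-topic and topic-word blocks; AT and TW already store
+        # counts plus their priors, so gammaln applies to them directly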
+        ll = self.n_author * gammaln(self.alpha * self.n_topic)
+        ll -= self.n_author * self.n_topic * gammaln(self.alpha)
+        ll += self.n_topic * gammaln(self.beta * self.n_voca)
+        ll -= self.n_topic * self.n_voca * gammaln(self.beta)
+
+        for ai in xrange(self.n_author):
+            ll += gammaln(self.AT[ai, :]).sum() - gammaln(self.AT[ai, :].sum())
+        for ti in xrange(self.n_topic):
+            ll += gammaln(self.TW[ti, :]).sum() - gammaln(self.TW[ti, :].sum())
+
+        return ll
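
A minimal usage sketch of the rewritten class, mirroring the test case from the removed __main__ block. The import path is hypothetical (the relative imports above imply the module lives in a package next to base.py), and it assumes BaseGibbsParamTopicModel initializes TW as an [n_topic, n_voca] array of beta-smoothed counts with matching sum_T totals; those names are inferred from how this file uses them, not from the base class itself.

    import numpy as np
    from author_topic_model import AuthorTopicModel  # hypothetical import path

    # two documents over a 5-word vocabulary, three authors (same data as the old test case)
    docs = [[0, 0, 2, 2, 3], [1, 3, 3, 4, 4]]
    doc_authors = [[0, 1], [1, 2]]

    atm = AuthorTopicModel(n_doc=2, n_voca=5, n_topic=2, n_author=3, alpha=0.1, beta=0.01)
    atm.fit(docs, doc_authors, max_iter=10)

    # the smoothed count matrices live on the model, as the old script saved them
    np.savetxt('author-topic.dat', atm.AT)
    np.savetxt('word-topic.dat', atm.TW)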