
Commit f811aff

Try to use the same notation across the different models.
1 parent 489aaeb

21 files changed (+1242, -1143 lines)

ptm/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+from .lda_gibbs import GibbsLDA
+from .slda_gibbs import GibbsSupervisedLDA
+from .ctm import CorrelatedTopicModel
+from .rtm import RelationalTopicModel
+
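With these exports, the renamed model classes can be imported from the package root. A minimal usage sketch, mirroring the main() test case in ptm/ctm.py further down in this diff (arguments: n_topic, n_voca, n_user, n_item, doc_item, doc_cnt, ratings):

# Usage sketch based on the ctm.py test case below; not part of the commit itself.
from ptm import CorrelatedTopicModel

doc_word = [[0, 1, 2, 4, 5], [2, 3, 5, 6, 7, 8, 9]]  # word ids per document
doc_cnt = [[1, 2, 3, 2, 1], [3, 4, 5, 1, 2, 3, 4]]   # matching word counts
rate_user = [[0, 1, 2], [2, 3]]                      # users who rated each item
model = CorrelatedTopicModel(3, 10, 4, 2, doc_word, doc_cnt, rate_user)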

ptm/at_model.py

Lines changed: 21 additions & 20 deletions
@@ -1,8 +1,8 @@
 import numpy as n
 import os
 
-class at_model:
 
+class AuthorTopicModel:
     def __init__(self, vocab, K, A, docList, authorList, alpha=0.1, eta=0.01):
         """
         Initialize at_model
@@ -42,26 +42,26 @@ def __init__(self, vocab, K, A, docList, authorList, alpha=0.1, eta=0.01):
         self.topic_sum = n.zeros(self._K)
         self.author_sum = n.zeros(self._A)
 
-        #initialization
+        # initialization
         for di in xrange(0, self._D):
             self.author_assigned.append(list())
             self.topic_assigned.append(list())
             doc = self._docList[di]
             authors = self._authorList[di]
             for wi in xrange(0, len(doc)):
                 w = doc[wi]
-                #random sampling topic
+                # random sampling topic
                 z = n.random.choice(self._K, 1)[0]
-                #random sampling author
-                a = n.random.choice(len(authors),1)[0]
+                # random sampling author
+                a = n.random.choice(len(authors), 1)[0]
 
-                #assigning sampled value (sufficient statistics)
-                self.c_wt[w,z] += 1
-                self.c_at[authors[a],z] += 1
+                # assigning sampled value (sufficient statistics)
+                self.c_wt[w, z] += 1
+                self.c_at[authors[a], z] += 1
                 self.topic_sum[z] += 1
                 self.author_sum[authors[a]] += 1
 
-                #keep sampled value for future sampling
+                # keep sampled value for future sampling
                 self.topic_assigned[di].append(z)
                 self.author_assigned[di].append(authors[a])
 
@@ -81,21 +81,22 @@ def sampling_topics(self, max_iter):
                 self.topic_sum[old_z] -= 1
                 self.author_sum[old_a] -= 1
 
-                wt = (self.c_wt[w, :]+ self._eta)/(self.topic_sum+self._W*self._eta)
-                at = (self.c_at[authors,:] + self._alpha)/(self.author_sum[authors].repeat(self._K).reshape(len(authors),self._K)+self._K*self._alpha)
+                wt = (self.c_wt[w, :] + self._eta) / (self.topic_sum + self._W * self._eta)
+                at = (self.c_at[authors, :] + self._alpha) / (
+                    self.author_sum[authors].repeat(self._K).reshape(len(authors), self._K) + self._K * self._alpha)
 
-                pdf = at*wt
-                pdf = pdf.reshape(len(authors)*self._K)
-                pdf = pdf/pdf.sum()
+                pdf = at * wt
+                pdf = pdf.reshape(len(authors) * self._K)
+                pdf = pdf / pdf.sum()
 
-                #sampling author and topic
+                # sampling author and topic
                 idx = n.random.multinomial(1, pdf).argmax()
 
-                new_ai = idx/self._K
-                new_z = idx%self._K
+                new_ai = idx / self._K
+                new_z = idx % self._K
 
                 new_a = authors[new_ai]
-                self.c_wt[w,new_z] += 1
+                self.c_wt[w, new_z] += 1
                 self.c_at[new_a, new_z] += 1
                 self.topic_sum[new_z] += 1
                 self.author_sum[new_a] += 1
@@ -104,8 +105,8 @@ def sampling_topics(self, max_iter):
 
 
 if __name__ == '__main__':
-    #test case
-    atm = at_model([0,1,2,3,4], 2, 3, [[0,0,2,2,3],[1,3,3,4,4]], [[0,1],[1,2]])
+    # test case
+    atm = AuthorTopicModel([0, 1, 2, 3, 4], 2, 3, [[0, 0, 2, 2, 3], [1, 3, 3, 4, 4]], [[0, 1], [1, 2]])
     atm.sampling_topics(10)
 
     folder = 'at-result'
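In sampling_topics above, the author and topic for a token are drawn jointly: the |authors| x K table at * wt is flattened into one categorical distribution, a single index is sampled, and that index is decoded back into an (author, topic) pair. A standalone sketch of that flatten-and-decode step, with illustrative values only (it uses // for the decode, matching the Python 2 integer division the class relies on):

# Illustrative sketch of the joint (author, topic) draw; not part of the repository.
import numpy as np

K = 4                                      # number of topics
authors = np.array([0, 2])                 # candidate authors for this document
at = np.random.random((len(authors), K))   # stand-in for the author-topic term
wt = np.random.random(K)                   # stand-in for the word-topic term

pdf = (at * wt).reshape(len(authors) * K)  # flatten the |authors| x K table
pdf = pdf / pdf.sum()

idx = np.random.multinomial(1, pdf).argmax()
new_ai, new_z = idx // K, idx % K          # decode flat index to (author, topic)
print(authors[new_ai], new_z)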

ptm/base.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+import numpy as np
+
+class BaseTopicModel():
+    """
+    Attributes
+    ----------
+    n_doc: int
+        the number of total documents in the corpus
+    n_voca: int
+        the vocabulary size of the corpus
+    """
+    def __init__(self, n_doc, n_voca):
+        self.n_doc = n_doc
+        self.n_voca = n_voca
+
+
+class BaseGibbsParamTopicModel(BaseTopicModel):
+    """ Base class of parametric topic models with Gibbs sampling inference
+
+    Attributes
+    ----------
+    n_topic: int
+        a number of topics to be inferred through the Gibbs sampling
+    WT: ndarray, shape (n_voca, n_topic)
+        word-topic matrix, keeps the number of assigned word tokens for each word-topic pair
+    DT: ndarray, shape (n_doc, n_topic)
+        document-topic matrix, keeps the number of assigned word tokens for each document-topic pair
+    sum_T: ndarray, shape (n_topic)
+        number of word tokens assigned for each topic
+    alpha: float
+        symmetric parameter of Dirichlet prior for document-topic distribution
+    beta: float
+        symmetric parameter of Dirichlet prior for topic-word distribution
+    """
+
+    def __init__(self, n_doc, n_voca, n_topic, alpha, beta):
+        super(BaseGibbsParamTopicModel, self).__init__(n_doc=n_doc, n_voca=n_voca)
+        self.n_topic = n_topic
+        self.WT = np.zeros([self.n_voca, self.n_topic])
+        self.DT = np.zeros([self.n_doc, self.n_topic])
+        self.sum_T = np.zeros(self.n_topic)
+
+        self.alpha = alpha
+        self.beta = beta
+
+        self.topic_assignment = list()
+
+        self.WT += self.beta
+        self.sum_T += self.beta * self.n_voca
+        self.DT += self.alpha
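Because the base class pre-adds the Dirichlet pseudo-counts (beta to WT and sum_T, alpha to DT), a collapsed Gibbs sampler built on top of it can form the conditional p(z = k | rest) directly from the smoothed counts as DT[d, k] * WT[w, k] / sum_T[k], assuming the current token has already been decremented from those counts. A hypothetical sketch of such a subclass; the real sampler lives in ptm/lda_gibbs.py and is not shown in this commit:

# Hypothetical sketch only -- illustrates how WT, DT and sum_T are meant to be used.
import numpy as np
from ptm.base import BaseGibbsParamTopicModel

class ToyGibbsLDA(BaseGibbsParamTopicModel):
    def sample_topic(self, d, w):
        # conditional p(z = k | rest) from the smoothed sufficient statistics,
        # assuming the current token's counts were decremented beforehand
        prob = self.DT[d, :] * self.WT[w, :] / self.sum_T
        prob = prob / prob.sum()
        return np.random.multinomial(1, prob).argmax()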

ptm/ctm.py

Lines changed: 56 additions & 49 deletions
@@ -1,74 +1,77 @@
 import time
 import numpy as np
 import numpy.linalg
-import simplex_projection
 import scipy.optimize
+from .simplex_projection import euclidean_proj_simplex
 
 e = 1e-100
 error_diff = 10
 
-class CTM:
+
+class CorrelatedTopicModel():
     """
     Correlated topic models,
     Blei, David and Lafferty, John,
     2006
     """
-
-    def __init__(self, topic_size, voca_size, user_size, item_size, doc_item, doc_cnt, ratings=None):
+
+    def __init__(self, n_topic, n_voca, n_user, n_item, doc_item, doc_cnt, ratings=None):
         self.lambda_u = 0.01
         self.lambda_v = 0.01
         self.alpha = 1
         self.eta = 0.01
         self.a = 1
         self.b = 0.01
 
-        self.topic_size = topic_size
-        self.voca_size = voca_size
-        self.user_size = user_size
-        self.item_size = item_size
-
-        #U = user_topic matrix, U x K
-        self.U = np.random.multivariate_normal(np.zeros(topic_size), np.identity(topic_size)*(1./self.lambda_u), size=self.user_size)
-        #V = item(doc)_topic matrix, V x K
-        self.V = np.random.multivariate_normal(np.zeros(topic_size), np.identity(topic_size)*(1./self.lambda_u), size=self.item_size)
-        self.theta = np.random.random([item_size,topic_size])
-        self.theta = self.theta/self.theta.sum(1)[:,np.newaxis] #normalize
-        self.beta = np.random.random([voca_size,topic_size])
-        self.beta = self.beta/self.beta.sum(0) #normalize
+        self.n_topic = n_topic
+        self.n_voca = n_voca
+        self.n_user = n_user
+        self.n_item = n_item
+
+        # U = user_topic matrix, U x K
+        self.U = np.random.multivariate_normal(np.zeros(n_topic), np.identity(n_topic) * (1. / self.lambda_u),
+                                               size=self.n_user)
+        # V = item(doc)_topic matrix, V x K
+        self.V = np.random.multivariate_normal(np.zeros(n_topic), np.identity(n_topic) * (1. / self.lambda_u),
+                                               size=self.n_item)
+        self.theta = np.random.random([n_item, n_topic])
+        self.theta = self.theta / self.theta.sum(1)[:, np.newaxis] # normalize
+        self.beta = np.random.random([n_voca, n_topic])
+        self.beta = self.beta / self.beta.sum(0) # normalize
 
         self.doc_item = doc_item
         self.doc_cnt = doc_cnt
 
-        self.C = np.zeros([user_size, item_size]) + self.b
-        self.R = np.zeros([user_size, item_size]) #user_size x item_size
+        self.C = np.zeros([n_user, n_item]) + self.b
+        self.R = np.zeros([n_user, n_item]) # user_size x item_size
 
         if ratings:
             for di in xrange(len(ratings)):
                 rate = ratings[di]
                 for user in rate:
-                    self.C[user,di] += self.a - self.b
-                    self.R[user,di] = 1
+                    self.C[user, di] += self.a - self.b
+                    self.R[user, di] = 1
+
+        self.phi_sum = np.zeros([n_voca, n_topic]) + self.eta
 
-        self.phi_sum = np.zeros([voca_size, topic_size]) + self.eta
-
     def learning_fixed_theta(self, max_iter):
         old_err = 0
         for iteration in xrange(max_iter):
             prev = time.clock()
             self.update_u()
             self.update_v()
             err = self.sqr_error()
-            print 'Iteration-', iteration, time.clock() - prev, err
+            print('Iteration-', iteration, time.clock() - prev, err)
             if abs(old_err - err) < error_diff:
                 break
 
-    #reconstructing matrix for prediction
+    # reconstructing matrix for prediction
     def predict_item(self):
         return np.dot(self.U, self.V.T)
-
-    #reconstruction error
-    def sqr_error(self):
-        err = (self.R - self.predict_item())**2
+
+    # reconstruction error
+    def sqr_error(self):
+        err = (self.R - self.predict_item()) ** 2
         err = err.sum()
 
         return err
@@ -80,39 +83,43 @@ def do_e_step(self):
 
     def update_theta(self):
         def func(x, v, phi, beta, lambda_v):
-            return 0.5 * lambda_v * np.dot((v-x).T, v-x) - np.sum(np.sum(phi * ( np.log(x*beta) - np.log(phi) )))
-
-        for vi in xrange(self.item_size):
+            return 0.5 * lambda_v * np.dot((v - x).T, v - x) - np.sum(np.sum(phi * (np.log(x * beta) - np.log(phi))))
+
+        for vi in xrange(self.n_item):
             W = np.array(self.doc_item[vi])
-            word_beta = self.beta[W,:]
-            phi = self.theta[vi,:] * word_beta + e # W x K
-            phi = phi/phi.sum(1)[:,np.newaxis]
-            result = scipy.optimize.minimize(func, self.theta[vi,:], method='nelder-mead', args=(self.V[vi,:], phi, word_beta, self.lambda_v))
-            self.theta[vi,:] = simplex_projection.euclidean_proj_simplex(result.x)
-            self.phi_sum[W,:] += np.array(self.doc_cnt[vi])[:,np.newaxis] * phi
+            word_beta = self.beta[W, :]
+            phi = self.theta[vi, :] * word_beta + e # W x K
+            phi = phi / phi.sum(1)[:, np.newaxis]
+            result = scipy.optimize.minimize(func, self.theta[vi, :], method='nelder-mead',
+                                             args=(self.V[vi, :], phi, word_beta, self.lambda_v))
+            self.theta[vi, :] = euclidean_proj_simplex(result.x)
+            self.phi_sum[W, :] += np.array(self.doc_cnt[vi])[:, np.newaxis] * phi
 
     def update_u(self):
-        for ui in xrange(self.user_size):
-            left = np.dot(self.V.T * self.C[ui,:], self.V) + self.lambda_u * np.identity(self.topic_size)
+        for ui in xrange(self.n_user):
+            left = np.dot(self.V.T * self.C[ui, :], self.V) + self.lambda_u * np.identity(self.n_topic)
 
-            self.U[ui,:] = numpy.linalg.solve(left, np.dot(self.V.T * self.C[ui,:],self.R[ui,:]))
+            self.U[ui, :] = numpy.linalg.solve(left, np.dot(self.V.T * self.C[ui, :], self.R[ui, :]))
 
     def update_v(self):
-        for vi in xrange(self.item_size):
-            left = np.dot(self.U.T * self.C[:,vi], self.U) + self.lambda_v * np.identity(self.topic_size)
+        for vi in xrange(self.n_item):
+            left = np.dot(self.U.T * self.C[:, vi], self.U) + self.lambda_v * np.identity(self.n_topic)
 
-            self.V[vi,:] = numpy.linalg.solve(left, np.dot(self.U.T * self.C[:,vi],self.R[:,vi] ) + self.lambda_v * self.theta[vi,:])
+            self.V[vi, :] = numpy.linalg.solve(left, np.dot(self.U.T * self.C[:, vi],
+                                                            self.R[:, vi]) + self.lambda_v * self.theta[vi, :])
 
     def do_m_step(self):
         self.beta = self.phi_sum / self.phi_sum.sum(0)
-        self.phi_sum = np.zeros([self.voca_size, self.topic_size]) + self.eta
+        self.phi_sum = np.zeros([self.n_voca, self.n_topic]) + self.eta
+
 
 def main():
-    doc_word = [[0,1,2,4,5], [2,3,5,6,7,8,9]]
-    doc_cnt = [[1,2,3,2,1], [3,4,5,1,2,3,4]]
-    rate_user = [[0,1,2],[2,3]]
-    model = CTM(3, 10, 4, 2, doc_word, doc_cnt, rate_user)
+    doc_word = [[0, 1, 2, 4, 5], [2, 3, 5, 6, 7, 8, 9]]
+    doc_cnt = [[1, 2, 3, 2, 1], [3, 4, 5, 1, 2, 3, 4]]
+    rate_user = [[0, 1, 2], [2, 3]]
+    model = CorrelatedTopicModel(3, 10, 4, 2, doc_word, doc_cnt, rate_user)
     model.learning(10)
 
+
 if __name__ == '__main__':
     main()
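The update_u and update_v steps above solve a small regularized least-squares system per user and per item: left = V^T C_u V + lambda_u I, then the user factor is left^{-1} V^T C_u r_u. A standalone sketch of the per-user closed-form step on toy data, just to make the shapes concrete (values and names here are illustrative, not taken from the repository):

# Standalone sketch of the per-user update used in update_u; illustrative data only.
import numpy as np

n_item, n_topic, lambda_u = 5, 3, 0.01
V = np.random.random((n_item, n_topic))    # item-topic factors
C_u = np.full(n_item, 0.01)                # confidence weights for one user (b = 0.01)
C_u[[1, 3]] = 1.0                          # rated items get weight a = 1
r_u = np.zeros(n_item)
r_u[[1, 3]] = 1.0                          # binary implicit ratings

# left = V^T C_u V + lambda_u * I ; right = V^T C_u r_u
left = np.dot(V.T * C_u, V) + lambda_u * np.identity(n_topic)
u = np.linalg.solve(left, np.dot(V.T * C_u, r_u))
print(u.shape)                             # (n_topic,)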
