import time

import numpy as np
import numpy.linalg
import scipy.optimize

from .simplex_projection import euclidean_proj_simplex

e = 1e-100
error_diff = 10


class CorrelatedTopicModel:
    """
    Correlated topic models,
    Blei, David and Lafferty, John,
    2006
    """

    def __init__(self, n_topic, n_voca, n_user, n_item, doc_item, doc_cnt, ratings=None):
        self.lambda_u = 0.01
        self.lambda_v = 0.01
        self.alpha = 1
        self.eta = 0.01
        self.a = 1
        self.b = 0.01

        self.n_topic = n_topic
        self.n_voca = n_voca
        self.n_user = n_user
        self.n_item = n_item

        # U = user_topic matrix, U x K
        self.U = np.random.multivariate_normal(np.zeros(n_topic), np.identity(n_topic) * (1. / self.lambda_u),
                                               size=self.n_user)
        # V = item(doc)_topic matrix, V x K
        self.V = np.random.multivariate_normal(np.zeros(n_topic), np.identity(n_topic) * (1. / self.lambda_v),
                                               size=self.n_item)
        self.theta = np.random.random([n_item, n_topic])
        self.theta = self.theta / self.theta.sum(1)[:, np.newaxis]  # normalize
        self.beta = np.random.random([n_voca, n_topic])
        self.beta = self.beta / self.beta.sum(0)  # normalize

        self.doc_item = doc_item
        self.doc_cnt = doc_cnt

        self.C = np.zeros([n_user, n_item]) + self.b
        self.R = np.zeros([n_user, n_item])  # n_user x n_item

        if ratings:
            for di in range(len(ratings)):
                rate = ratings[di]
                for user in rate:
                    self.C[user, di] += self.a - self.b
                    self.R[user, di] = 1

        self.phi_sum = np.zeros([n_voca, n_topic]) + self.eta
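
    # learning_fixed_theta: alternate the closed-form updates for U and V with the topic
    # proportions theta held fixed, stopping once the change in squared reconstruction
    # error falls below error_diff.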
    def learning_fixed_theta(self, max_iter):
        old_err = 0
        for iteration in range(max_iter):
            prev = time.time()
            self.update_u()
            self.update_v()
            err = self.sqr_error()
            print('Iteration-', iteration, time.time() - prev, err)
            if abs(old_err - err) < error_diff:
                break
            old_err = err

    # reconstructing matrix for prediction
    def predict_item(self):
        return np.dot(self.U, self.V.T)

    # reconstruction error
    def sqr_error(self):
        err = (self.R - self.predict_item()) ** 2
        err = err.sum()

        return err
@@ -80,39 +83,43 @@ def do_e_step(self):
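
    # update_theta: per-document optimization of the topic proportions theta[vi].
    # For each document, phi (word-topic responsibilities, W x K) is computed from the
    # current theta and beta; theta[vi] is then re-estimated by minimizing func() with
    # Nelder-Mead (squared distance to the item factor V[vi] weighted by lambda_v, plus
    # the variational word term) and projected back onto the probability simplex.
    # Expected word-topic counts are accumulated into phi_sum for use in do_m_step().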
    def update_theta(self):
        def func(x, v, phi, beta, lambda_v):
            return 0.5 * lambda_v * np.dot((v - x).T, v - x) - np.sum(np.sum(phi * (np.log(x * beta) - np.log(phi))))

        for vi in range(self.n_item):
            W = np.array(self.doc_item[vi])
            word_beta = self.beta[W, :]
            phi = self.theta[vi, :] * word_beta + e  # W x K
            phi = phi / phi.sum(1)[:, np.newaxis]
            result = scipy.optimize.minimize(func, self.theta[vi, :], method='nelder-mead',
                                             args=(self.V[vi, :], phi, word_beta, self.lambda_v))
            self.theta[vi, :] = euclidean_proj_simplex(result.x)
            self.phi_sum[W, :] += np.array(self.doc_cnt[vi])[:, np.newaxis] * phi
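
    # update_u: closed-form update for each user's latent vector.
    # Solves the regularized weighted least-squares system
    #   (V^T C_i V + lambda_u * I) u_i = V^T C_i r_i
    # where C_i is the diagonal confidence matrix for user i and r_i is that user's rating row.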
    def update_u(self):
        for ui in range(self.n_user):
            left = np.dot(self.V.T * self.C[ui, :], self.V) + self.lambda_u * np.identity(self.n_topic)

            self.U[ui, :] = numpy.linalg.solve(left, np.dot(self.V.T * self.C[ui, :], self.R[ui, :]))
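
    # update_v: analogous closed-form update for each item's latent vector, with the
    # item's topic proportions theta[vi] acting as a prior mean:
    #   (U^T C_j U + lambda_v * I) v_j = U^T C_j r_j + lambda_v * theta_j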
    def update_v(self):
        for vi in range(self.n_item):
            left = np.dot(self.U.T * self.C[:, vi], self.U) + self.lambda_v * np.identity(self.n_topic)

            self.V[vi, :] = numpy.linalg.solve(left, np.dot(self.U.T * self.C[:, vi],
                                                            self.R[:, vi]) + self.lambda_v * self.theta[vi, :])
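
    # do_m_step: re-estimate the word-topic distributions beta by normalizing the
    # accumulated expected counts, then reset the accumulator (smoothed by eta).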
    def do_m_step(self):
        self.beta = self.phi_sum / self.phi_sum.sum(0)
        self.phi_sum = np.zeros([self.n_voca, self.n_topic]) + self.eta
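

# Small toy example: 2 documents over a 10-word vocabulary, 3 topics, 4 users, 2 items.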
def main():
    doc_word = [[0, 1, 2, 4, 5], [2, 3, 5, 6, 7, 8, 9]]
    doc_cnt = [[1, 2, 3, 2, 1], [3, 4, 5, 1, 2, 3, 4]]
    rate_user = [[0, 1, 2], [2, 3]]
    model = CorrelatedTopicModel(3, 10, 4, 2, doc_word, doc_cnt, rate_user)
    model.learning(10)


if __name__ == '__main__':
    main()