
Commit 6f93c3f

add setup.py for local install

1 parent 72818a8 commit 6f93c3f

17 files changed (+180, -8 lines)

ptm/__init__.py

Whitespace-only changes.
File renamed without changes.

ctm.py renamed to ptm/ctm.py

File renamed without changes.

diln.py renamed to ptm/diln.py

File renamed without changes.

ptm/formatted_logger.py

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+import logging
+import os
+import time
+
+default_log_path = './logs'
+
+def formatted_logger(label, level=None, format=None, date_format=None, file_path=None):
+    log = logging.getLogger(label)
+    if level is None:
+        level = logging.INFO
+    elif level.lower() == 'debug':
+        level = logging.DEBUG
+    elif level.lower() == 'info':
+        level = logging.INFO
+    elif level.lower() == 'warn':
+        level = logging.WARN
+    elif level.lower() == 'error':
+        level = logging.ERROR
+    elif level.lower() == 'critical':
+        level = logging.CRITICAL
+    log.setLevel(level)
+
+    if format is None:
+        format = '%(asctime)s %(levelname)s:%(name)s:%(message)s'
+    if date_format is None:
+        date_format = '%Y-%m-%d %H:%M:%S'
+    if file_path is None:
+        if not os.path.exists(default_log_path):
+            os.makedirs(default_log_path)
+        file_path = '%s/%s.%s.log.txt' % (default_log_path, label, time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()))
+
+    formatter = logging.Formatter(format, date_format)
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(formatter)
+    file_handler = logging.FileHandler(file_path)
+    file_handler.setFormatter(formatter)
+    log.addHandler(file_handler)
+    log.addHandler(stream_handler)
+    return log
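
A quick usage sketch of the new helper; the 'demo' label and message are illustrative, and the package-level import path assumes the ptm/ layout introduced by this commit:

    from ptm.formatted_logger import formatted_logger

    log = formatted_logger('demo', 'debug')
    log.info('hello')  # echoed to the console (stderr) and appended to ./logs/demo.<timestamp>.log.txt
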
File renamed without changes.

hdsp.py renamed to ptm/hdsp.py

File renamed without changes.

hmm_lda.py renamed to ptm/hmm_lda.py

Lines changed: 4 additions & 0 deletions

@@ -1,6 +1,10 @@
 import numpy as np

 class HMM_LDA:
+    """ implementation of HMM-LDA proposed by Griffiths et al. (2004)
+    Original reference : Integrating topics and syntax, Griffiths, Thomas L and Steyvers, Mark and Blei, David M and Tenenbaum, Joshua B, NIPS 2004
+    """
+
     def __init__(self, num_class, num_topic, num_voca, docs):
         self.C = num_class
         self.K = num_topic
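
For orientation, a toy instantiation based only on the constructor signature shown above; the format of docs (a list of word-id lists) is an assumption, not confirmed by this diff:

    from ptm.hmm_lda import HMM_LDA

    docs = [[0, 1, 2, 1], [2, 3, 0]]  # assumed format: one list of word ids per document
    model = HMM_LDA(num_class=3, num_topic=2, num_voca=4, docs=docs)
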
File renamed without changes.

lda_vb.py renamed to ptm/lda_vb.py

File renamed without changes.
File renamed without changes.

rtm.py renamed to ptm/rtm.py

Lines changed: 25 additions & 7 deletions

@@ -1,7 +1,11 @@
-import numpy as np
+import numpy as np
+import utils
 from scipy.special import gammaln, psi
+from formatted_logger import formatted_logger

-eps = 1e-10
+eps = 1e-20
+
+log = formatted_logger('RTM', 'info')

 class rtm:
     """ implementation of relational topic model by Chang and Blei (2009)
@@ -13,7 +17,7 @@ def __init__(self, num_topic, num_doc, num_voca, doc_ids, doc_cnt, doc_links, rho):
         self.K = num_topic
         self.V = num_voca

-        self.alpha = 1.
+        self.alpha = .1

         self.gamma = np.random.gamma(100., 1./100, [self.D, self.K])
         self.beta = np.random.dirichlet([5]*self.V, self.K)
@@ -35,19 +39,21 @@ def __init__(self, num_topic, num_doc, num_voca, doc_ids, doc_cnt, doc_links, rho):
         self.doc_links = doc_links
         self.rho = rho  # regularization parameter

+        log.info('Initialize RTM: num_voca:%d, num_topic:%d, num_doc:%d' % (self.V, self.K, self.D))
+
     def posterior_inference(self, max_iter):
         for iter in xrange(max_iter):
             self.variation_update()
             self.parameter_estimation()
-            print self.compute_elbo()
+            log.info('%d iter: ELBO = %.3f' % (iter, self.compute_elbo()))

     def compute_elbo(self):
         """ compute evidence lower bound for trained model
         """
         elbo = 0

         e_log_theta = psi(self.gamma) - psi(np.sum(self.gamma, 1))[:,np.newaxis]  # D x K
-        log_beta = np.log(self.beta)
+        log_beta = np.log(self.beta+eps)

         for di in xrange(self.D):
             words = self.doc_ids[di]
@@ -62,7 +68,7 @@ def compute_elbo(self):
             elbo += - np.sum(cnt * self.phi[di] * np.log(self.phi[di]))  # - E_q[log q(z|phi)]

             for adi in self.doc_links[di]:
-                elbo += np.dot(self.eta, self.pi[di]*self.pi[adi])  # E_q[log p(y_{d1,d2}|z_{d1},z_{d2},\eta,\nu)]
+                elbo += np.dot(self.eta, self.pi[di]*self.pi[adi]) + self.nu  # E_q[log p(y_{d1,d2}|z_{d1},z_{d2},\eta,\nu)]

         return elbo

@@ -77,7 +83,7 @@ def variation_update(self):
             cnt = self.doc_cnt[di]
             doc_len = np.sum(cnt)

-            new_phi = np.log(self.beta[:,words]) + e_log_theta[di,:][:,np.newaxis]
+            new_phi = np.log(self.beta[:,words]+eps) + e_log_theta[di,:][:,np.newaxis]

             gradient = np.zeros(self.K)
             for adi in self.doc_links[di]:
@@ -114,6 +120,18 @@ def parameter_estimation(self):
         self.nu = np.log(num_links-np.sum(pi_sum)) - np.log(self.rho*(self.K-1)/self.K + num_links - np.sum(pi_sum))
         self.eta = np.log(pi_sum) - np.log(pi_sum + self.rho * pi_alpha) - self.nu

+    def save_model(self, output_directory, vocab=None):
+        import os
+        if not os.path.exists(output_directory):
+            os.mkdir(output_directory)
+
+        np.savetxt(output_directory+'/eta.txt', self.eta, delimiter='\t')
+        with open(output_directory+'/nu.txt', 'w') as f:
+            f.write('%f\n' % self.nu)
+        np.savetxt(output_directory+'/beta.txt', self.beta, delimiter='\t')
+        np.savetxt(output_directory+'/gamma.txt', self.gamma, delimiter='\t')
+        if vocab is not None:
+            utils.write_top_words(self.beta, vocab, output_directory+'/top_words.csv')

 def main():
     rho = 1
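
A minimal end-to-end sketch exercising the new logging and save_model paths; the toy corpus, link structure, and output directory are illustrative assumptions, not from the repository (Python 2, matching the xrange usage above):

    import numpy as np
    from ptm.rtm import rtm

    doc_ids = [[0, 1, 4], [2, 3], [0, 3, 4]]  # word ids occurring in each document
    doc_cnt = [[2, 1, 1], [3, 1], [1, 2, 1]]  # counts for those word ids
    doc_links = [[1], [0, 2], [1]]            # document adjacency lists

    model = rtm(3, 3, 5, doc_ids, doc_cnt, doc_links, rho=1)  # K=3 topics, D=3 docs, V=5 words
    model.posterior_inference(10)  # logs one ELBO line per iteration via formatted_logger
    model.save_model('rtm_output', vocab=np.array(['a', 'b', 'c', 'd', 'e']))
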
File renamed without changes.
File renamed without changes.

utils.py renamed to ptm/utils.py

Lines changed: 9 additions & 0 deletions

@@ -143,3 +143,12 @@ def convert_wrdcnt_wrdlist(corpus_ids, corpus_cnt):
         corpus.append(doc)
     return corpus

+
+def write_top_words(topic_word_matrix, vocab, filepath, top_words=20, delimiter=',', newline='\n'):
+    with open(filepath, 'w') as f:
+        for ti in xrange(topic_word_matrix.shape[0]):
+            words = vocab[topic_word_matrix[ti, :].argsort()[::-1][:top_words]]
+            f.write('%d' % ti)
+            for word in words:
+                f.write(delimiter + word)
+            f.write(newline)
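
A hypothetical call, assuming a K x V topic-word matrix (as rtm.save_model passes self.beta) and a numpy array of word strings for vocab:

    import numpy as np
    from ptm import utils

    beta = np.random.dirichlet([1.0] * 5, 3)  # 3 topics over a 5-word vocabulary
    vocab = np.array(['apple', 'bee', 'cat', 'dog', 'egg'])
    utils.write_top_words(beta, vocab, 'top_words.csv', top_words=3)
    # each output line: a topic id followed by its 3 highest-probability words
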

whdsp.py renamed to ptm/whdsp.py

Lines changed: 1 addition & 1 deletion

@@ -460,7 +460,7 @@ def __init__(self, vocab, word_ids, word_cnt, num_topics, labels, label_names =

         if type(vocab) == list:
             self.vocab = np.array(vocab)
-        else
+        else:
             self.vocab = vocab

         if type(word_ids[0]) != np.ndarray:

setup.py

Lines changed: 102 additions & 0 deletions

@@ -0,0 +1,102 @@
+from setuptools import setup, find_packages  # Always prefer setuptools over distutils
+from codecs import open  # To use a consistent encoding
+from os import path
+
+here = path.abspath(path.dirname(__file__))
+
+# Get the long description from the relevant file
+# with open(path.join(here, 'DESCRIPTION.rst'), encoding='utf-8') as f:
+#     long_description = f.read()
+long_description = open('README.md').read()
+
+setup(
+    name='ptm',
+
+    # Versions should comply with PEP440. For a discussion on single-sourcing
+    # the version across setup.py and the project code, see
+    # https://packaging.python.org/en/latest/development.html#single-sourcing-the-version
+    version='0.0.1',
+
+    description='Probabilistic topic model',
+    long_description=long_description,
+
+    # The project's main homepage.
+    url='https://github.com/arongdari/python-topic-model/',
+
+    # Author details
+    author='Dongwoo Kim',
+    author_email='arongdari@gmail.com',
+
+    # Choose your license
+    license='Apache License 2.0',
+
+    # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
+    classifiers=[
+        # How mature is this project? Common values are
+        #   3 - Alpha
+        #   4 - Beta
+        #   5 - Production/Stable
+        'Development Status :: 3 - Alpha',
+
+        # Indicate who your project is intended for
+        'Intended Audience :: Developers',
+        'Topic :: Software Development :: Build Tools',
+
+        # Pick your license as you wish (should match "license" above)
+        'License :: OSI Approved :: Apache Software License',
+
+        # Specify the Python versions you support here. In particular, ensure
+        # that you indicate whether you support Python 2, Python 3 or both.
+        # 'Programming Language :: Python :: 2',
+        # 'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        # 'Programming Language :: Python :: 3',
+        # 'Programming Language :: Python :: 3.2',
+        # 'Programming Language :: Python :: 3.3',
+        # 'Programming Language :: Python :: 3.4',
+    ],
+
+    # What does your project relate to?
+    keywords='topic model lda',
+
+    # You can just specify the packages manually here if your project is
+    # simple. Or you can use find_packages().
+    # packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
+    packages=find_packages(),
+
+    # List run-time dependencies here. These will be installed by pip when your
+    # project is installed. For an analysis of "install_requires" vs pip's
+    # requirements files see:
+    # https://packaging.python.org/en/latest/technical.html#install-requires-vs-requirements-files
+    install_requires=['numpy', 'scipy'],
+
+    # List additional groups of dependencies here (e.g. development dependencies).
+    # You can install these using the following syntax, for example:
+    # $ pip install -e .[dev,test]
+    extras_require={
+        'dev': [],  # 'check-manifest'
+        'test': [],  # 'coverage'
+    },
+
+    # If there are data files included in your packages that need to be
+    # installed, specify them here. If using Python 2.6 or less, then these
+    # have to be included in MANIFEST.in as well.
+    package_data={
+        # 'sample': ['package_data.dat'],
+    },
+
+    # Although 'package_data' is the preferred approach, in some case you may
+    # need to place data files outside of your packages.
+    # see http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files
+    # In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
+    data_files=[],  # [('my_data', ['data/data_file'])]
+
+    # To provide executable scripts, use entry points in preference to the
+    # "scripts" keyword. Entry points provide cross-platform support and allow
+    # pip to create the appropriate form of executable for the target platform.
+    entry_points={
+        # 'console_scripts': [
+        #     'sample=sample:main',
+        # ],
+    },
+)
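
With setup.py in place, the local install named in the commit message is typically "pip install -e ." (editable) or "python setup.py install" run from the repository root; numpy and scipy are pulled in through install_requires, and find_packages() picks up the new ptm package.
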
