machine-intelligence-laboratory · Alvant · Jun 24, 2020 · May 21, 2020 · May 23, 2020 · Jun 24, 2020
@@ -2044,9 +2044,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "topicnet",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "topicnet"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -2058,7 +2058,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.10"
+   "version": "3.6.7"
   }
  },
  "nbformat": 4,

@@ -0,0 +1,16 @@
+
+name: 20NewsGroups
+batches_prefix: 20NG
+dataset_path: '/data_mil/datasets/20_News_dataset/20NG_BOW.csv'
+
+word: "@word"
+
+min_num_topics: 10
+max_num_topics: 30
+
+num_topics_interval: 3
+num_fit_iterations: 40
+num_restarts: 6
+
+
+
@@ -0,0 +1,16 @@
+
+name: Brown
+batches_prefix: Brown
+dataset_path: '/data_mil/datasets/Brown/Brown.csv'
+
+word: "@word"
+
+min_num_topics: 5
+max_num_topics: 25
+
+num_topics_interval: 3
+num_fit_iterations: 30
+num_restarts: 6
+
+
+
@@ -0,0 +1,16 @@
+
+name: PostNauka
+batches_prefix: PN
+dataset_path: '/data_mil/datasets/postnauka/postnauka.csv'
+
+word: "@word"
+
+min_num_topics: 5
+max_num_topics: 50
+
+num_topics_interval: 3
+num_fit_iterations: 40
+num_restarts: 6
+
+
+
@@ -0,0 +1,16 @@
+
+name: Reuters
+batches_prefix: Reuters
+dataset_path: '/data_mil/datasets/Reuters/Reuters.csv'
+
+word: "@word"
+
+min_num_topics: 5
+max_num_topics: 50
+
+num_topics_interval: 3
+num_fit_iterations: 40
+num_restarts: 6
+
+
+
@@ -0,0 +1,24 @@
+name: StackOverflow
+
+dataset_path: '/data_mil/datasets/StackOverflow/SO_vw_bow.txt'
+batches_prefix: SO
+
+word: "@lemmatized"
+
+# https://link.springer.com/article/10.1007/s10664-012-9231-y
+# Anton Barua, Stephen W. Thomas & Ahmed E. Hassan 2012
+# used just 40 topics
+#
+# Rosen, C., Shihab, E. 2016
+# What are mobile developers asking about? A large scale study using stack overflow.
+# used 40 topics (but merged them down to 32)
+
+min_num_topics: 5
+max_num_topics: 60
+
+num_topics_interval: 5
+num_fit_iterations: 40
+num_restarts: 6
+
+
+
@@ -0,0 +1,13 @@
+name: WikiRef220
+
+dataset_path: '/data_mil/datasets/WikiRef220/wiki_ref220_bow.csv'
+batches_prefix: WRef
+
+word: "@lemmatized"
+
+min_num_topics: 2
+max_num_topics: 20
+
+num_topics_interval: 1
+num_fit_iterations: 40
+num_restarts: 6
@@ -0,0 +1,21 @@
+
+name: RuWikiGood
+batches_prefix: RWG
+dataset_path: '/data_mil/datasets/ruwiki_good/good_ruwiki_vw.txt'
+
+word: "@lemmatized"
+
+min_num_topics: 5
+
+# around 10 main categories
+# around 87 `ul b` tags
+# around 238 <b> tags in total
+# max_num_topics: 300?
+max_num_topics: 100
+
+num_topics_interval: 5
+num_fit_iterations: 40
+num_restarts: 4
+
+
+
@@ -154,6 +154,14 @@ def init_lda(
         dataset, modalities_to_use, main_modality, num_topics
     )
 
+    # TODO: implement this LDA also
+    # Found in doi.org/10.1007/s10664-015-9379-3
+    # Rosen, C., Shihab, E. 2016
+    # What are mobile developers asking about? A large scale study using stack overflow.
+    #
+    # "We use the defacto standard heuristics of α=50/K and β=0.01
+    # (Biggers et al. 2014) for our hyperparameter values"
+
     # what GenSim returns by default (everything is 'symmetric')
     # see https://github.com/RaRe-Technologies/gensim/blob/master/gensim/models/ldamodel.py#L521
     if prior == "symmetric":

@@ -1,6 +1,7 @@
 import logging
 import os
 import pandas as pd
+from  numpy.random import RandomState
 import uuid
 import warnings
 
@@ -87,6 +88,7 @@ def __init__(
             self._keys_mean_many.append(key)
             self._keys_std_many.append(key)
 
+    # TODO: accept either VowpalWabbitTextCollection or Dataset with modalities
     def search_for_optimum(
             self,
             text_collection: VowpalWabbitTextCollection) -> None:
@@ -95,8 +97,9 @@ def search_for_optimum(
 
         dataset = text_collection._to_dataset()
 
-        # seed == None is too similar to seed == 0
-        seeds = [None] + list(range(1, self._num_restarts))
+        # TODO: if these more sophisticated seeds don't make the models differ,
+        #  revert to the simpler seeds (0, 1, 2, ...)
+        seeds = [None] + [abs(RandomState(i).tomaxint()) for i in range(1, self._num_restarts)]
 
         nums_topics = list(range(
             self._min_num_topics,