@@ -94,36 +94,21 @@ class EMLDAOptimizer extends LDAOptimizer {
  /**
   * Compute bipartite term/doc graph.
   */
-  private[clustering] override def initialize(docs: RDD[(Long, Vector)], lda: LDA):
-  LDAOptimizer = {
+  private[clustering] override def initialize(docs: RDD[(Long, Vector)], lda: LDA): LDAOptimizer = {

    val docConcentration = lda.getDocConcentration
    val topicConcentration = lda.getTopicConcentration
    val k = lda.getK

-    /**
-     * Note: The restriction > 1.0 may be relaxed in the future (allowing sparse solutions),
-     * but values in (0,1) are not yet supported.
-     */
+    // Note: The restriction > 1.0 may be relaxed in the future (allowing sparse solutions),
+    // but values in (0,1) are not yet supported.
    require(docConcentration > 1.0 || docConcentration == -1.0, s"LDA docConcentration must be" +
      s" > 1.0 (or -1 for auto) for EM Optimizer, but was set to $docConcentration")
    require(topicConcentration > 1.0 || topicConcentration == -1.0, s"LDA topicConcentration " +
      s"must be > 1.0 (or -1 for auto) for EM Optimizer, but was set to $topicConcentration")

-    /**
-     * - For EM: default = (50 / k) + 1.
-     * - The 50/k is common in LDA libraries.
-     * - The +1 follows Asuncion et al. (2009), who recommend a +1 adjustment for EM.
-     */
    this.docConcentration = if (docConcentration == -1) (50.0 / k) + 1.0 else docConcentration
-
-    /**
-     * - For EM: default = 0.1 + 1.
-     * - The 0.1 gives a small amount of smoothing.
-     * - The +1 follows Asuncion et al. (2009), who recommend a +1 adjustment for EM.
-     */
    this.topicConcentration = if (topicConcentration == -1) 1.1 else topicConcentration
-
    val randomSeed = lda.getSeed

    // For each document, create an edge (Document -> Term) for each unique term in the document.
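For context on the defaults kept in the code above, a quick worked instance (k = 20 is an arbitrary illustrative value, not something this patch sets):

    // EM defaults evaluated for an example topic count of k = 20:
    val k = 20
    val docConcentration = (50.0 / k) + 1.0   // 3.5; 50/k is the common LDA heuristic, +1 for EM
    val topicConcentration = 0.1 + 1.0        // 1.1; light smoothing, +1 per Asuncion et al. (2009)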
@@ -230,8 +215,8 @@ class EMLDAOptimizer extends LDAOptimizer {
/**
 * :: Experimental ::
 *
- * An online optimizer for LDA. The Optimizer implements the Online LDA algorithm, which
- * processes a subset of the corpus by each call to next, and update the term-topic
+ * An online optimizer for LDA. The Optimizer implements the Online variational Bayes LDA
+ * algorithm, which processes a subset of the corpus on each iteration, and updates the term-topic
 * distribution adaptively.
 *
 * References:
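As a reading aid for the hunks that follow, a minimal sketch of the per-iteration weight this optimizer relies on. The outline and names here are illustrative only, not the class's actual private API; the real work happens in next() and update() below:

    // Each iteration: (1) sample a minibatch, (2) E-step per document against expElogbeta,
    // (3) M-step folding the minibatch statistics into lambda with a decaying weight:
    def weight(tau_0: Double, kappa: Double, iteration: Int): Double =
      math.pow(tau_0 + iteration, -kappa)  // rho_t from Hoffman et al. (2010)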
@@ -242,39 +227,35 @@ class OnlineLDAOptimizer extends LDAOptimizer {
  // LDA common parameters
  private var k: Int = 0
-  private var D: Int = 0
+  private var corpusSize: Long = 0
  private var vocabSize: Int = 0
  private var alpha: Double = 0
  private var eta: Double = 0
-  private var randomSeed: Long = 0
+  private var randomGenerator: java.util.Random = null

  // Online LDA specific parameters
-  private var tau_0: Double = -1
-  private var kappa: Double = -1
-  private var batchSize: Int = -1
+  private var tau_0: Double = 1024
+  private var kappa: Double = 0.51
+  private var minibatchFraction: Double = 0.01

  // internal data structure
  private var docs: RDD[(Long, Vector)] = null
  private var lambda: BDM[Double] = null
  private var Elogbeta: BDM[Double] = null
  private var expElogbeta: BDM[Double] = null

-  // count of invocation to next, used to help deciding the weight for each iteration
+  // count of invocations of next(), which helps decide the weight for each iteration
  private var iteration = 0

  /**
-   * A (positive) learning parameter that downweights early iterations
+   * A (positive) learning parameter that downweights early iterations. Larger values make early
+   * iterations count less.
   */
-  def getTau_0: Double = {
-    if (this.tau_0 == -1) {
-      1024
-    } else {
-      this.tau_0
-    }
-  }
+  def getTau_0: Double = this.tau_0

  /**
-   * A (positive) learning parameter that downweights early iterations
+   * A (positive) learning parameter that downweights early iterations. Larger values make early
+   * iterations count less.
   * Automatic setting of parameter:
   * - default = 1024, which follows the recommendation from the OnlineLDA paper.
   */
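To make "larger values make early iterations count less" concrete, compare the first-iteration weight for two tau_0 settings, with kappa fixed at the 0.51 default (the smaller value is purely illustrative):

    math.pow(1024.0 + 1, -0.51)  // ~0.029: the default tau_0 damps iteration 1 heavily
    math.pow(64.0 + 1, -0.51)    // ~0.12: a smaller tau_0 lets early batches move lambda more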
@@ -287,18 +268,12 @@ class OnlineLDAOptimizer extends LDAOptimizer {
  /**
   * Learning rate: exponential decay rate
   */
-  def getKappa: Double = {
-    if (this.kappa == -1) {
-      0.5
-    } else {
-      this.kappa
-    }
-  }
+  def getKappa: Double = this.kappa

  /**
   * Learning rate: exponential decay rate---should be between
   * (0.5, 1.0] to guarantee asymptotic convergence.
-   * - default = 0.5, which follows the recommendation from OnlineLDA paper.
+   * - default = 0.51, which follows the recommendation from the OnlineLDA paper.
   */
  def setKappa(kappa: Double): this.type = {
    require(kappa >= 0 || kappa == -1.0,
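The (0.5, 1.0] requirement comes from the standard stochastic-approximation conditions: the weights rho_t must sum to infinity while their squares stay summable. A quick comparison at a late iteration, with the default tau_0 = 1024 (illustrative numbers):

    math.pow(1024.0 + 1000, -0.51)  // ~0.021: kappa = 0.51 keeps learning slowly but steadily
    math.pow(1024.0 + 1000, -1.0)   // ~0.00049: kappa = 1.0 decays much faster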
@@ -310,52 +285,44 @@ class OnlineLDAOptimizer extends LDAOptimizer {
  /**
   * Mini-batch fraction, which controls the fraction of the corpus used in each iteration
   */
-  def getBatchSize: Int = {
-    if (this.batchSize == -1) {
-      D / 100
-    } else {
-      this.batchSize
-    }
-  }
+  def getMiniBatchFraction: Double = this.minibatchFraction

  /**
   * Mini-batch fraction, which controls the fraction of the corpus used in each iteration
   * default = 0.01, i.e. 1% of the total documents.
   */
-  def setBatchSize(batchSize: Int): this.type = {
-    this.batchSize = batchSize
+  def setMiniBatchFraction(miniBatchFraction: Double): this.type = {
+    this.minibatchFraction = miniBatchFraction
    this
  }

-  private[clustering] override def initialize(docs: RDD[(Long, Vector)], lda: LDA): LDAOptimizer = {
+  private[clustering] override def initialize(docs: RDD[(Long, Vector)], lda: LDA): LDAOptimizer = {

    this.k = lda.getK
-    this.D = docs.count().toInt
+    this.corpusSize = docs.count()
    this.vocabSize = docs.first()._2.size
    this.alpha = if (lda.getDocConcentration == -1) 1.0 / k else lda.getDocConcentration
    this.eta = if (lda.getTopicConcentration == -1) 1.0 / k else lda.getTopicConcentration
-    this.randomSeed = randomSeed
+    this.randomGenerator = new Random(lda.getSeed)

    this.docs = docs

    // Initialize the variational distribution q(beta|lambda)
    this.lambda = getGammaMatrix(k, vocabSize)
-    this.Elogbeta = dirichlet_expectation(lambda)
+    this.Elogbeta = dirichletExpectation(lambda)
    this.expElogbeta = exp(Elogbeta)
    this.iteration = 0
    this
  }

  /**
-   * Submit a a subset (like 1%, decide by the batchSize) of the corpus to the Online LDA model,
-   * and it will update the topic distribution adaptively for the terms appearing in the subset.
-   *
-   * @return Inferred LDA model
+   * Submit a subset (like 1%, decided by miniBatchFraction) of the corpus to the Online LDA
+   * model, and it will update the topic distribution adaptively for the terms appearing in the
+   * subset.
   */
  private[clustering] override def next(): OnlineLDAOptimizer = {
    iteration += 1
-    val batchSize = getBatchSize
-    val batch = docs.sample(true, batchSize.toDouble / D, randomSeed).cache()
+    val batch = docs.sample(true, minibatchFraction, randomGenerator.nextLong())
    if (batch.isEmpty()) return this

    val k = this.k
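One consequence of the sampling call above is worth noting: with withReplacement = true, RDD.sample draws each document a Poisson(minibatchFraction) number of times, so the realized batch size fluctuates around its expectation rather than being fixed. A rough sanity check with made-up values (not from the patch):

    val corpusSize = 1000000L
    val minibatchFraction = 0.01
    val expectedBatchSize = (minibatchFraction * corpusSize).toInt  // ~10000 docs on average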
@@ -406,15 +373,17 @@ class OnlineLDAOptimizer extends LDAOptimizer {
    })

    val batchResult = stats.reduce(_ += _)
-    update(batchResult, iteration, batchSize)
-    batch.unpersist()
+    update(batchResult, iteration, (minibatchFraction * corpusSize).toInt)
    this
  }

  private[clustering] override def getLDAModel(iterationTimes: Array[Double]): LDAModel = {
    new LocalLDAModel(Matrices.fromBreeze(lambda).transpose)
  }

+  /**
+   * Update lambda based on the batch submitted. batchSize can be different for each iteration.
+   */
  private def update(raw: BDM[Double], iter: Int, batchSize: Int): Unit = {

    val tau_0 = this.getTau_0
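Schematically, the blend inside update() is the Hoffman et al. (2010) update, lambda_new = (1 - rho_t) * lambda + rho_t * (eta + (D / S) * stat), where D is the corpus size and S the batch size. A minimal Breeze sketch with made-up dimensions and values:

    import breeze.linalg.DenseMatrix
    val (corpusSize, batchSize, eta) = (100000.0, 1000.0, 0.05)
    val rho = math.pow(1024.0 + 1, -0.51)        // weight for this (first) iteration
    val lambda = DenseMatrix.fill(2, 3)(1.0)     // current topic-term parameters
    val stat = DenseMatrix.fill(2, 3)(0.5)       // minibatch sufficient statistics
    val updated = lambda * (1 - rho) + (stat * (corpusSize / batchSize) + eta) * rho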
@@ -427,18 +396,26 @@ class OnlineLDAOptimizer extends LDAOptimizer {
    val stat = raw :* expElogbeta

    // Update lambda based on documents.
-    lambda = lambda * (1 - weight) + (stat * D.toDouble / batchSize.toDouble + eta) * weight
-    Elogbeta = dirichlet_expectation(lambda)
+    lambda = lambda * (1 - weight) +
+      (stat * (corpusSize.toDouble / batchSize.toDouble) + eta) * weight
+    Elogbeta = dirichletExpectation(lambda)
    expElogbeta = exp(Elogbeta)
  }

+  /**
+   * Get a random matrix to initialize lambda.
+   */
  private def getGammaMatrix(row: Int, col: Int): BDM[Double] = {
    val gammaRandomGenerator = new Gamma(100, 1.0 / 100.0)
    val temp = gammaRandomGenerator.sample(row * col).toArray
    (new BDM[Double](col, row, temp)).t
  }

-  private def dirichlet_expectation(alpha: BDM[Double]): BDM[Double] = {
+  /**
+   * For theta ~ Dir(alpha), computes E[log(theta)] given alpha. Currently the implementation
+   * uses digamma which is accurate but expensive.
+   */
+  private def dirichletExpectation(alpha: BDM[Double]): BDM[Double] = {
    val rowSum = sum(alpha(breeze.linalg.*, ::))
    val digAlpha = digamma(alpha)
    val digRowSum = digamma(rowSum)
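The function body is cut off here, but the quantity it computes is standard: for theta ~ Dirichlet(alpha), E[log theta_k] = digamma(alpha_k) - digamma(sum_j alpha_j). A single-row Breeze illustration with arbitrary values:

    import breeze.linalg.{DenseVector, sum}
    import breeze.numerics.digamma
    val alphaRow = DenseVector(0.1, 0.2, 0.7)
    val eLogTheta = digamma(alphaRow) - digamma(sum(alphaRow))  // elementwise psi minus row psi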