Skip to content

Commit bf4e685

Browse files
author
Feynman Liang
committed
Asymmetric docConcentration
1 parent 4cab972 commit bf4e685

File tree

4 files changed

+36
-21
lines changed

4 files changed

+36
-21
lines changed

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ class LDA private (
7979
*
8080
* This is the parameter to a Dirichlet distribution.
8181
*/
82-
def getDocConcentration: Vector = this.docConcentration
82+
def getAsymmetricDocConcentration: Vector = this.docConcentration
8383

8484
/**
8585
* Concentration parameter (commonly named "alpha") for the prior placed on documents'
@@ -105,22 +105,37 @@ class LDA private (
105105
* - default = uniformly (1.0 / k), following the implementation from
106106
* [[https://github.com/Blei-Lab/onlineldavb]].
107107
*/
108-
def setDocConcentration(docConcentration: Vector): this.type = {
108+
def setAsymmetricDocConcentration(docConcentration: Vector): this.type = {
109109
this.docConcentration = docConcentration
110110
this
111111
}
112112

113-
/** Replicates Double to create a symmetric prior */
113+
/**
114+
* Gets the concentration parameter, assuming the document-topic Dirichlet distribution is
115+
* symmetric. Included for backwards compatibility. This method should fail if
116+
* [[docConcentration]] is asymmetric.
117+
*/
118+
def getDocConcentration: Double = {
119+
val parameter = docConcentration(0)
120+
if (docConcentration.size == 1) {
121+
parameter
122+
} else {
123+
require(docConcentration.toArray.forall(_ == parameter))
124+
parameter
125+
}
126+
}
127+
128+
/** Replicates a [[Double]] docConcentration to create a symmetric prior. */
114129
def setDocConcentration(docConcentration: Double): this.type = {
115130
this.docConcentration = Vectors.dense(docConcentration)
116131
this
117132
}
118133

119-
/** Alias for [[getDocConcentration]] */
120-
def getAlpha: Vector = getDocConcentration
134+
/** Alias for [[getAsymmetricDocConcentration]] */
135+
def getAlpha: Vector = getAsymmetricDocConcentration
121136

122-
/** Alias for [[setDocConcentration()]] */
123-
def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha)
137+
/** Alias for [[setAsymmetricDocConcentration()]] */
138+
def setAlpha(alpha: Vector): this.type = setAsymmetricDocConcentration(alpha)
124139

125140
/** Alias for [[setDocConcentration()]] */
126141
def setAlpha(alpha: Double): this.type = setDocConcentration(alpha)

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -757,7 +757,7 @@ object DistributedLDAModel extends Loader[DistributedLDAModel] {
757757
val graph: Graph[LDA.TopicCounts, LDA.TokenCount] = Graph(vertices, edges)
758758

759759
new DistributedLDAModel(graph, globalTopicTotals, globalTopicTotals.length, vocabSize,
760-
docConcentration, topicConcentration, gammaShape, iterationTimes)
760+
docConcentration, topicConcentration, iterationTimes, gammaShape)
761761
}
762762

763763
}

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,8 @@ final class EMLDAOptimizer extends LDAOptimizer {
9595
* Compute bipartite term/doc graph.
9696
*/
9797
override private[clustering] def initialize(docs: RDD[(Long, Vector)], lda: LDA): LDAOptimizer = {
98-
val docConcentration = lda.getDocConcentration(0)
99-
require({
100-
lda.getDocConcentration.toArray.forall(_ == docConcentration)
101-
}, "EMLDAOptimizer currently only supports symmetric document-topic priors")
98+
// EMLDAOptimizer currently only supports symmetric document-topic priors
99+
val docConcentration = lda.getDocConcentration
102100

103101
val topicConcentration = lda.getTopicConcentration
104102
val k = lda.getK
@@ -378,18 +376,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
378376
this.k = lda.getK
379377
this.corpusSize = docs.count()
380378
this.vocabSize = docs.first()._2.size
381-
this.alpha = if (lda.getDocConcentration.size == 1) {
382-
if (lda.getDocConcentration(0) == -1) Vectors.dense(Array.fill(k)(1.0 / k))
379+
this.alpha = if (lda.getAsymmetricDocConcentration.size == 1) {
380+
if (lda.getAsymmetricDocConcentration(0) == -1) Vectors.dense(Array.fill(k)(1.0 / k))
383381
else {
384-
require(lda.getDocConcentration(0) >= 0, s"all entries in alpha must be >=0, got: $alpha")
385-
Vectors.dense(Array.fill(k)(lda.getDocConcentration(0)))
382+
require(lda.getAsymmetricDocConcentration(0) >= 0,
383+
s"all entries in alpha must be >=0, got: $alpha")
384+
Vectors.dense(Array.fill(k)(lda.getAsymmetricDocConcentration(0)))
386385
}
387386
} else {
388-
require(lda.getDocConcentration.size == k, s"alpha must have length k, got: $alpha")
389-
lda.getDocConcentration.foreachActive { case (_, x) =>
387+
require(lda.getAsymmetricDocConcentration.size == k,
388+
s"alpha must have length k, got: $alpha")
389+
lda.getAsymmetricDocConcentration.foreachActive { case (_, x) =>
390390
require(x >= 0, s"all entries in alpha must be >= 0, got: $alpha")
391391
}
392-
lda.getDocConcentration
392+
lda.getAsymmetricDocConcentration
393393
}
394394
this.eta = if (lda.getTopicConcentration == -1) 1.0 / k else lda.getTopicConcentration
395395
this.randomGenerator = new Random(lda.getSeed)

mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
161161
test("setter alias") {
162162
val lda = new LDA().setAlpha(2.0).setBeta(3.0)
163163
assert(lda.getAlpha.toArray.forall(_ === 2.0))
164-
assert(lda.getDocConcentration.toArray.forall(_ === 2.0))
164+
assert(lda.getAsymmetricDocConcentration.toArray.forall(_ === 2.0))
165165
assert(lda.getBeta === 3.0)
166166
assert(lda.getTopicConcentration === 3.0)
167167
}
@@ -364,7 +364,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
364364
val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
365365
.setGammaShape(1e10)
366366
val lda = new LDA().setK(2)
367-
.setDocConcentration(Vectors.dense(0.00001, 0.1))
367+
.setAsymmetricDocConcentration(Vectors.dense(0.00001, 0.1))
368368
.setTopicConcentration(0.01)
369369
.setMaxIterations(100)
370370
.setOptimizer(op)

0 commit comments

Comments
 (0)