[SPARK-6065] Optimize word2vec.findSynonyms using BLAS calls

MechCoder · MechCoder · commit 1350cf375de4 · 2015-04-17T12:08:53.000+05:30
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -479,9 +479,23 @@ class Word2VecModel private[mllib] (
    */
   def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = {
     require(num > 0, "Number of similar words should > 0")
-    // TODO: optimize top-k
-    val fVector = vector.toArray.map(_.toFloat)
-    model.mapValues(vec => cosineSimilarity(fVector, vec))
+
+    val fVector = vector.toArray
+    val flatVec = model.toSeq.flatMap { case(w, v) =>
+      v.map(_.toDouble)}.toArray
+
+    val numDim = model.head._2.size
+    val numWords = model.size
+    val cosineArray = Array.fill[Double](numWords)(0)
+
+    blas.dgemv(
+      "T", numDim, numWords, 1.0, flatVec, numDim, fVector, 1, 0.0, cosineArray, 1)
+
+    // No need to divide by the norm of the given vector: it is the same for every word, so it does not affect the ranking.
+    val updatedCosines = model.zipWithIndex.map { case (vec, ind) =>
+      cosineArray(ind) / blas.snrm2(numDim, vec._2, 1) }
+
+    model.keys.zip(updatedCosines)
       .toSeq
       .sortBy(- _._2)
       .take(num + 1)