@@ -34,7 +34,7 @@ import org.apache.spark.SparkContext
 import org.apache.spark.SparkContext._
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.linalg.{Vector, Vectors, DenseMatrix, BLAS, DenseVector}
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd._
 import org.apache.spark.util.Utils
@@ -431,6 +431,14 @@ class Word2Vec extends Serializable with Logging {
 class Word2VecModel private[mllib] (
     private val model: Map[String, Array[Float]]) extends Serializable with Saveable {

+  private val numDim = model.head._2.size
+  private val numWords = model.size
+  private val flatVec = model.toSeq.flatMap { case (w, v) =>
+    v.map(_.toDouble) }.toArray
+  private val wordVecMat = new DenseMatrix(numWords, numDim, flatVec, isTransposed = true)
+  private val wordVecNorms = model.map { case (word, vec) =>
+    blas.snrm2(numDim, vec, 1) }.toArray
+
   private def cosineSimilarity(v1: Array[Float], v2: Array[Float]): Double = {
     require(v1.length == v2.length, "Vectors should have the same length")
     val n = v1.length
@@ -481,19 +489,13 @@ class Word2VecModel private[mllib] (
     require(num > 0, "Number of similar words should > 0")

     val fVector = vector.toArray
-    val flatVec = model.toSeq.flatMap { case (w, v) =>
-      v.map(_.toDouble) }.toArray
-
-    val numDim = model.head._2.size
-    val numWords = model.size
-    val cosineArray = Array.fill[Double](numWords)(0)

-    blas.dgemv(
-      "T", numDim, numWords, 1.0, flatVec, numDim, fVector, 1, 0.0, cosineArray, 1)
+    val cosineVec = new DenseVector(Array.fill[Double](numWords)(0))
+    BLAS.gemv(1.0, wordVecMat, vector.asInstanceOf[DenseVector], 0.0, cosineVec)

     // Need not divide with the norm of the given vector since it is constant.
     val updatedCosines = model.zipWithIndex.map { case (vec, ind) =>
-      cosineArray(ind) / blas.snrm2(numDim, vec._2, 1) }
+      cosineVec(ind) / wordVecNorms(ind) }

     model.keys.zip(updatedCosines)
       .toSeq
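
The patch does two things: the flattening of the model into a row-major matrix and the per-word norms move out of `findSynonyms` and into `Word2VecModel`'s fields (computed once per model instead of once per query), and the raw `blas.dgemv` call on bare arrays becomes a `BLAS.gemv` on a `DenseMatrix` built with `isTransposed = true`, i.e. with each word vector stored as a row. Below is a minimal, dependency-free sketch of that idea; the toy vocabulary, the object and method names (`CosineViaGemvSketch`, `dotAllRows`), and the hand-rolled loop standing in for Spark's internal `BLAS.gemv` are illustrative assumptions, not Spark code.

```scala
// Sketch: rank a whole vocabulary by cosine similarity against one query by
// flattening the word vectors into a single row-major matrix, precomputing
// per-word norms once, and reducing each query to one matrix-vector multiply.
object CosineViaGemvSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical toy model standing in for Word2VecModel's internal map.
    val model: Map[String, Array[Float]] = Map(
      "king"  -> Array(0.9f, 0.1f, 0.0f),
      "queen" -> Array(0.8f, 0.2f, 0.1f),
      "apple" -> Array(0.0f, 0.1f, 0.9f))

    val numDim   = model.head._2.length
    val numWords = model.size
    val words    = model.keys.toArray

    // Row-major layout: row i holds the vector of words(i). This mirrors
    // DenseMatrix(numWords, numDim, flatVec, isTransposed = true) in the patch.
    val flat = words.flatMap(w => model(w).map(_.toDouble))

    // Norms are query-independent, so compute them once up front
    // (the patch's wordVecNorms, done there with blas.snrm2).
    val norms = words.map(w => math.sqrt(model(w).map(x => x.toDouble * x).sum))

    // Plain-Scala stand-in for BLAS.gemv: out(i) = row_i . query.
    def dotAllRows(query: Array[Double]): Array[Double] = {
      val out = new Array[Double](numWords)
      var i = 0
      while (i < numWords) {
        var j = 0
        var s = 0.0
        while (j < numDim) { s += flat(i * numDim + j) * query(j); j += 1 }
        out(i) = s
        i += 1
      }
      out
    }

    val query = model("king").map(_.toDouble)
    // As in the patch, dividing by the query's own norm is skipped: it scales
    // every score by the same constant and so cannot change the ranking.
    val cosines = dotAllRows(query).zip(norms).map { case (d, n) => d / n }
    words.zip(cosines).sortBy(-_._2).foreach { case (w, c) => println(f"$w%-6s $c%.4f") }
  }
}
```

The design choice this illustrates: a single gemv over a precomputed matrix replaces one dot product and one `snrm2` per word per query, so the O(vocab x dim) setup cost is paid once at model construction and each `findSynonyms` call becomes a single BLAS-accelerated multiply.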