package org.apache.spark.mllib.recommendation

import scala.collection.mutable.{ArrayBuffer, BitSet}
+import scala.math.{abs, sqrt}
import scala.util.Random
import scala.util.Sorting

@@ -63,7 +64,7 @@ case class Rating(val user: Int, val product: Int, val rating: Double)
 * Alternating Least Squares matrix factorization.
 *
 * ALS attempts to estimate the ratings matrix `R` as the product of two lower-rank matrices,
- * `X` and `Y`, i.e. `Xt * Y = R`. Typically these approximations are called 'factor' matrices.
+ * `X` and `Y`, i.e. `X * Yt = R`. Typically these approximations are called 'factor' matrices.
 * The general approach is iterative. During each iteration, one of the factor matrices is held
 * constant, while the other is solved for using least squares. The newly-solved factor matrix is
 * then held constant while solving for the other factor matrix.
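For orientation, here is a minimal sketch (not part of this patch) of what the `X * Yt = R` approximation means at the level of a single entry: a predicted rating is the dot product of one row of `X` (a user factor) with one row of `Y` (a product factor). The `predictEntry` helper below is hypothetical.

```scala
// Hypothetical illustration: R(u, p) is approximated by the dot product
// of user factor X(u, :) and product factor Y(p, :).
def predictEntry(userFactor: Array[Double], productFactor: Array[Double]): Double =
  userFactor.zip(productFactor).map { case (x, y) => x * y }.sum
```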
@@ -80,17 +81,22 @@ case class Rating(val user: Int, val product: Int, val rating: Double)
 *
 * For implicit preference data, the algorithm used is based on
 * "Collaborative Filtering for Implicit Feedback Datasets", available at
- * [[http://research.yahoo.com/pub/2433]], adapted for the blocked approach used here.
+ * [[http://dx.doi.org/10.1109/ICDM.2008.22]], adapted for the blocked approach used here.
 *
 * Essentially instead of finding the low-rank approximations to the rating matrix `R`,
 * this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if r > 0
 * and 0 if r = 0. The ratings then act as 'confidence' values related to strength of indicated user
 * preferences rather than explicit ratings given to items.
 */
-class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var lambda: Double,
-    var implicitPrefs: Boolean, var alpha: Double)
-  extends Serializable with Logging
-{
+class ALS private (
+    var numBlocks: Int,
+    var rank: Int,
+    var iterations: Int,
+    var lambda: Double,
+    var implicitPrefs: Boolean,
+    var alpha: Double,
+    var seed: Long = System.nanoTime()
+  ) extends Serializable with Logging {

  def this() = this(-1, 10, 10, 0.01, false, 1.0)
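As a concrete reading of the `P`-matrix description above, here is an illustrative sketch (not from this patch; both helpers are hypothetical) of how a raw implicit rating `r` maps to a preference and a confidence, using the class's `alpha` parameter:

```scala
import scala.math.abs

// P entries: 1 where a positive interaction was observed, 0 otherwise.
def preference(r: Double): Double = if (r > 0) 1.0 else 0.0

// The raw rating only scales how confident we are in that entry;
// using |r| matches the negative-rating extension introduced later in this patch.
def confidence(r: Double, alpha: Double): Double = 1.0 + alpha * abs(r)
```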
/**
@@ -130,6 +136,12 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
    this
  }

+  /** Sets a random seed to have deterministic results. */
+  def setSeed(seed: Long): ALS = {
+    this.seed = seed
+    this
+  }
+
  /**
   * Run ALS with the configured parameters on an input RDD of (user, product, rating) triples.
   * Returns a MatrixFactorizationModel with feature vectors for each user and product.
@@ -151,9 +163,9 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
    val (userInLinks, userOutLinks) = makeLinkRDDs(numBlocks, ratingsByUserBlock)
    val (productInLinks, productOutLinks) = makeLinkRDDs(numBlocks, ratingsByProductBlock)

-    // Initialize user and product factors randomly, but use a deterministic seed for each partition
-    // so that fault recovery works
-    val seedGen = new Random()
+    // Initialize user and product factors randomly, but use a deterministic seed for each
+    // partition so that fault recovery works
+    val seedGen = new Random(seed)
    val seed1 = seedGen.nextInt()
    val seed2 = seedGen.nextInt()
    // Hash an integer to propagate random bits at all positions, similar to java.util.Hashtable
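The hash itself lies outside this hunk; as a sketch (assumed here, not quoted from the patch), a `java.util.HashMap`-style supplemental hash spreads an int's high bits into its low bits so that per-partition seeds differ at all positions:

```scala
// XOR-shift supplemental hash in the style of java.util.HashMap (JDK 6):
// spreads the high bits of h into the low bits.
def spreadBits(h: Int): Int = {
  val r = h ^ (h >>> 20) ^ (h >>> 12)
  r ^ (r >>> 7) ^ (r >>> 4)
}
```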
@@ -208,21 +220,46 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
   */
  def computeYtY(factors: RDD[(Int, Array[Array[Double]])]) = {
    if (implicitPrefs) {
-      Option(
-        factors.flatMapValues { case factorArray =>
-          factorArray.view.map { vector =>
-            val x = new DoubleMatrix(vector)
-            x.mmul(x.transpose())
-          }
-        }.reduceByKeyLocally((a, b) => a.addi(b))
-          .values
-          .reduce((a, b) => a.addi(b))
-      )
+      val n = rank * (rank + 1) / 2
+      val LYtY = factors.values.aggregate(new DoubleMatrix(n))(seqOp = (L, Y) => {
+        Y.foreach(y => dspr(1.0, new DoubleMatrix(y), L))
+        L
+      }, combOp = (L1, L2) => {
+        L1.addi(L2)
+      })
+      val YtY = new DoubleMatrix(rank, rank)
+      fillFullMatrix(LYtY, YtY)
+      Option(YtY)
    } else {
      None
    }
  }

+  /**
+   * Adds alpha * x * x.t to a matrix in-place. This is the same as BLAS's DSPR.
+   *
+   * @param L the lower triangular part of the matrix packed in an array (row major)
+   */
+  private def dspr(alpha: Double, x: DoubleMatrix, L: DoubleMatrix) = {
+    val n = x.length
+    var i = 0
+    var j = 0
+    var idx = 0
+    var axi = 0.0
+    val xd = x.data
+    val Ld = L.data
+    while (i < n) {
+      axi = alpha * xd(i)
+      j = 0
+      while (j <= i) {
+        Ld(idx) += axi * xd(j)
+        j += 1
+        idx += 1
+      }
+      i += 1
+    }
+  }
+
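For clarity on the packed layout `dspr` writes into: the lower triangle is stored row-major as (0,0), (1,0), (1,1), (2,0), (2,1), (2,2), ..., so a rank-`r` matrix needs `r * (r + 1) / 2` slots, matching `n` in `computeYtY`. A self-contained sketch of the same rank-1 update on plain arrays (illustrative, not the jblas-based code above):

```scala
// Packed lower-triangular rank-1 update mirroring dspr: for rank 3,
// L has 3 * 4 / 2 = 6 entries: [(0,0), (1,0), (1,1), (2,0), (2,1), (2,2)].
def packedRankOneUpdate(alpha: Double, x: Array[Double], L: Array[Double]): Unit = {
  var idx = 0
  for (i <- x.indices; j <- 0 to i) {
    L(idx) += alpha * x(i) * x(j)
    idx += 1
  }
}
```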
  /**
   * Flatten out blocked user or product factors into an RDD of (id, factor vector) pairs
   */
@@ -301,7 +338,14 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
   * Make a random factor vector with the given random.
   */
  private def randomFactor(rank: Int, rand: Random): Array[Double] = {
-    Array.fill(rank)(rand.nextDouble)
+    // Choose a unit vector uniformly at random from the unit sphere, but from the
+    // "first quadrant" where all elements are nonnegative. This can be done by choosing
+    // elements distributed as Normal(0,1) and taking the absolute value, and then normalizing.
+    // This appears to create factorizations that have a slightly better reconstruction
+    // (<1%) compared to picking elements uniformly at random in [0,1].
+    val factor = Array.fill(rank)(abs(rand.nextGaussian()))
+    val norm = sqrt(factor.map(x => x * x).sum)
+    factor.map(x => x / norm)
  }
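A quick standalone check of the sampling described in that comment (illustrative, not part of the patch): every generated factor should have nonnegative entries and unit Euclidean norm.

```scala
import scala.math.{abs, sqrt}
import scala.util.Random

val rand = new Random(42L)
val factor = Array.fill(10)(abs(rand.nextGaussian()))
val norm = sqrt(factor.map(x => x * x).sum)
val unit = factor.map(_ / norm)
assert(unit.forall(_ >= 0.0))                            // "first quadrant"
assert(abs(sqrt(unit.map(x => x * x).sum) - 1.0) < 1e-9) // unit length
```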
/**
@@ -365,51 +409,41 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
    for (productBlock <- 0 until numBlocks) {
      for (p <- 0 until blockFactors(productBlock).length) {
        val x = new DoubleMatrix(blockFactors(productBlock)(p))
-        fillXtX(x, tempXtX)
+        tempXtX.fill(0.0)
+        dspr(1.0, x, tempXtX)
        val (us, rs) = inLinkBlock.ratingsForBlock(productBlock)(p)
        for (i <- 0 until us.length) {
          implicitPrefs match {
            case false =>
              userXtX(us(i)).addi(tempXtX)
              SimpleBlas.axpy(rs(i), x, userXy(us(i)))
            case true =>
-              userXtX(us(i)).addi(tempXtX.mul(alpha * rs(i)))
-              SimpleBlas.axpy(1 + alpha * rs(i), x, userXy(us(i)))
+              // Extension to the original paper to handle rs(i) < 0. confidence is a function
+              // of |rs(i)| instead so that it is never negative:
+              val confidence = 1 + alpha * abs(rs(i))
+              SimpleBlas.axpy(confidence - 1.0, tempXtX, userXtX(us(i)))
+              // For rs(i) < 0, the corresponding entry in P is 0 now, not 1 -- negative rs(i)
+              // means we try to reconstruct 0. We add terms only where P = 1, so the term below
+              // is now only added for rs(i) > 0:
+              if (rs(i) > 0) {
+                SimpleBlas.axpy(confidence, x, userXy(us(i)))
+              }
          }
        }
      }
    }

    // Solve the least-squares problem for each user and return the new feature vectors
-    userXtX.zipWithIndex.map{ case (triangularXtX, index) =>
+    Array.range(0, numUsers).map { index =>
      // Compute the full XtX matrix from the lower-triangular part we got above
-      fillFullMatrix(triangularXtX, fullXtX)
+      fillFullMatrix(userXtX(index), fullXtX)
      // Add regularization
      (0 until rank).foreach(i => fullXtX.data(i*rank + i) += lambda)
      // Solve the resulting matrix, which is symmetric and positive-definite
      implicitPrefs match {
        case false => Solve.solvePositive(fullXtX, userXy(index)).data
-        case true => Solve.solvePositive(fullXtX.add(YtY.value.get), userXy(index)).data
-      }
-    }
-  }
-
-  /**
-   * Set xtxDest to the lower-triangular part of x transpose * x. For efficiency in summing
-   * these matrices, we store xtxDest as only rank * (rank+1) / 2 values, namely the values
-   * at (0,0), (1,0), (1,1), (2,0), (2,1), (2,2), etc in that order.
-   */
-  private def fillXtX(x: DoubleMatrix, xtxDest: DoubleMatrix) {
-    var i = 0
-    var pos = 0
-    while (i < x.length) {
-      var j = 0
-      while (j <= i) {
-        xtxDest.data(pos) = x.data(i) * x.data(j)
-        pos += 1
-        j += 1
+        case true => Solve.solvePositive(fullXtX.addi(YtY.value.get), userXy(index)).data
      }
-      i += 1
    }
  }
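To make the negative-rating extension concrete, a standalone sketch (example values only; the real code accumulates jblas matrices): with alpha = 1.0, ratings +3 and -3 both get confidence 1 + 1.0 * 3 = 4, but only the positive rating contributes to the right-hand side, since its `P` entry is 1 while the negative one reconstructs 0.

```scala
import scala.math.abs

// Per-rating weights as described above: (weight on userXtX, weight on userXy).
def weights(r: Double, alpha: Double): (Double, Double) = {
  val confidence = 1 + alpha * abs(r)
  (confidence - 1.0, if (r > 0) confidence else 0.0)
}

// weights(3.0, 1.0)  == (3.0, 4.0)
// weights(-3.0, 1.0) == (3.0, 0.0)
```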
@@ -436,9 +470,10 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l


/**
- * Top-level methods for calling Alternating Least Squares (ALS) matrix factorizaton.
+ * Top-level methods for calling Alternating Least Squares (ALS) matrix factorization.
 */
object ALS {
+
  /**
   * Train a matrix factorization model given an RDD of ratings given by users to some products,
   * in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
@@ -451,15 +486,39 @@ object ALS {
   * @param iterations number of iterations of ALS (recommended: 10-20)
   * @param lambda regularization factor (recommended: 0.01)
   * @param blocks level of parallelism to split computation into
+   * @param seed random seed
   */
  def train(
      ratings: RDD[Rating],
      rank: Int,
      iterations: Int,
      lambda: Double,
-      blocks: Int)
-    : MatrixFactorizationModel =
-  {
+      blocks: Int,
+      seed: Long
+    ): MatrixFactorizationModel = {
+    new ALS(blocks, rank, iterations, lambda, false, 1.0, seed).run(ratings)
+  }
+
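A hedged usage sketch of the new seeded overload (`ratings` is an assumed, pre-built `RDD[Rating]`; the other values are illustrative): two runs with the same seed now initialize factors identically, so they produce the same model.

```scala
// Illustrative only: same seed, same factor initialization, same model.
// Arguments: ratings, rank, iterations, lambda, blocks, seed.
val model1 = ALS.train(ratings, 10, 10, 0.01, -1, 42L)
val model2 = ALS.train(ratings, 10, 10, 0.01, -1, 42L)
```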
+  /**
+   * Train a matrix factorization model given an RDD of ratings given by users to some products,
+   * in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
+   * product of two lower-rank matrices of a given rank (number of features). To solve for these
+   * features, we run a given number of iterations of ALS. This is done using a level of
+   * parallelism given by `blocks`.
+   *
+   * @param ratings RDD of (userID, productID, rating) pairs
+   * @param rank number of features to use
+   * @param iterations number of iterations of ALS (recommended: 10-20)
+   * @param lambda regularization factor (recommended: 0.01)
+   * @param blocks level of parallelism to split computation into
+   */
+  def train(
+      ratings: RDD[Rating],
+      rank: Int,
+      iterations: Int,
+      lambda: Double,
+      blocks: Int
+    ): MatrixFactorizationModel = {
    new ALS(blocks, rank, iterations, lambda, false, 1.0).run(ratings)
  }
@@ -476,8 +535,7 @@ object ALS {
   * @param lambda regularization factor (recommended: 0.01)
   */
  def train(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double)
-    : MatrixFactorizationModel =
-  {
+    : MatrixFactorizationModel = {
    train(ratings, rank, iterations, lambda, -1)
  }
@@ -493,8 +551,7 @@ object ALS {
   * @param iterations number of iterations of ALS (recommended: 10-20)
   */
  def train(ratings: RDD[Rating], rank: Int, iterations: Int)
-    : MatrixFactorizationModel =
-  {
+    : MatrixFactorizationModel = {
    train(ratings, rank, iterations, 0.01, -1)
  }
@@ -511,16 +568,42 @@ object ALS {
   * @param lambda regularization factor (recommended: 0.01)
   * @param blocks level of parallelism to split computation into
   * @param alpha confidence parameter (only applies when implicitPrefs = true)
+   * @param seed random seed
   */
  def trainImplicit(
      ratings: RDD[Rating],
      rank: Int,
      iterations: Int,
      lambda: Double,
      blocks: Int,
-      alpha: Double)
-    : MatrixFactorizationModel =
-  {
+      alpha: Double,
+      seed: Long
+    ): MatrixFactorizationModel = {
+    new ALS(blocks, rank, iterations, lambda, true, alpha, seed).run(ratings)
+  }
+
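Similarly, a hedged sketch for the seeded implicit-feedback overload (`views` is an assumed `RDD[Rating]` of, say, view counts; alpha = 0.01 is illustrative):

```scala
// Illustrative only: deterministic implicit-feedback training.
// Arguments: ratings, rank, iterations, lambda, blocks, alpha, seed.
val model = ALS.trainImplicit(views, 10, 10, 0.01, -1, 0.01, 42L)
```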
+  /**
+   * Train a matrix factorization model given an RDD of 'implicit preferences' given by users
+   * to some products, in the form of (userID, productID, preference) pairs. We approximate the
+   * ratings matrix as the product of two lower-rank matrices of a given rank (number of features).
+   * To solve for these features, we run a given number of iterations of ALS. This is done using
+   * a level of parallelism given by `blocks`.
+   *
+   * @param ratings RDD of (userID, productID, rating) pairs
+   * @param rank number of features to use
+   * @param iterations number of iterations of ALS (recommended: 10-20)
+   * @param lambda regularization factor (recommended: 0.01)
+   * @param blocks level of parallelism to split computation into
+   * @param alpha confidence parameter (only applies when implicitPrefs = true)
+   */
+  def trainImplicit(
+      ratings: RDD[Rating],
+      rank: Int,
+      iterations: Int,
+      lambda: Double,
+      blocks: Int,
+      alpha: Double
+    ): MatrixFactorizationModel = {
    new ALS(blocks, rank, iterations, lambda, true, alpha).run(ratings)
  }
@@ -537,8 +620,7 @@ object ALS {
   * @param lambda regularization factor (recommended: 0.01)
   */
  def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double, alpha: Double)
-    : MatrixFactorizationModel =
-  {
+    : MatrixFactorizationModel = {
    trainImplicit(ratings, rank, iterations, lambda, -1, alpha)
  }
@@ -555,8 +637,7 @@ object ALS {
   * @param iterations number of iterations of ALS (recommended: 10-20)
   */
  def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int)
-    : MatrixFactorizationModel =
-  {
+    : MatrixFactorizationModel = {
    trainImplicit(ratings, rank, iterations, 0.01, -1, 1.0)
  }