@@ -20,8 +20,6 @@ package org.apache.spark.mllib.fpm
20
20
import org .apache .spark .Logging
21
21
import org .apache .spark .annotation .Experimental
22
22
23
- import scala .collection .mutable .ArrayBuffer
24
-
25
23
/**
26
24
*
27
25
* :: Experimental ::
@@ -36,80 +34,71 @@ private[fpm] object LocalPrefixSpan extends Logging with Serializable {
36
34
* @param minCount minimum count
37
35
* @param maxPatternLength maximum pattern length
38
36
* @param prefix prefix
39
- * @param projectedDatabase the projected dabase
37
+ * @param database the projected database
40
38
* @return a set of sequential pattern pairs,
41
39
* the key of pair is sequential pattern (a list of items),
42
40
* the value of pair is the pattern's count.
43
41
*/
44
42
def run(
    minCount: Long,
    maxPatternLength: Int,
    prefix: List[Int],
    database: Iterable[Array[Int]]): Iterator[(Array[Int], Long)] = {
  // `prefix` is kept in reverse order (most recently appended item first), so extending it is a
  // cheap `::`; it is reversed only when a pattern is emitted.
  if (database.isEmpty) {
    Iterator.empty
  } else {
    // getFreqItemAndCounts returns a single-pass Iterator, but the pairs are needed several
    // times below (pattern emission, membership filtering, recursion). Materialize them once;
    // re-reading the iterator itself would silently yield nothing after the first traversal.
    val frequentItemAndCounts = getFreqItemAndCounts(minCount, database).toArray
    val frequentItems = frequentItemAndCounts.map(_._1)

    // Patterns obtained by appending each frequent item to the current prefix.
    val frequentPatternAndCounts = frequentItemAndCounts.iterator.map { case (item, count) =>
      ((item :: prefix).reverse.toArray, count)
    }

    if (prefix.length + 1 < maxPatternLength) {
      // Infrequent items cannot be part of any longer frequent pattern, so strip them before
      // projecting. A Set keeps the per-element membership test constant time.
      val frequentItemSet = frequentItems.toSet
      val filteredProjectedDatabase = database.map(x => x.filter(frequentItemSet.contains))

      frequentPatternAndCounts ++ frequentItems.iterator.flatMap { item =>
        val nextProjected = project(filteredProjectedDatabase, item)
        run(minCount, maxPatternLength, item :: prefix, nextProjected)
      }
    } else {
      // Maximum pattern length reached: emit the extensions but do not recurse further.
      frequentPatternAndCounts
    }
  }
}
63
66
64
67
/**
65
- * calculate suffix sequence following a prefix in a sequence
66
- * @param prefix prefix
67
- * @param sequence sequence
68
+ * Calculate suffix sequence immediately after the first occurrence of an item.
69
+ * @param item item to get suffix after
70
+ * @param sequence sequence to extract suffix from
68
71
* @return suffix sequence
69
72
*/
70
def getSuffix(item: Int, sequence: Array[Int]): Array[Int] = {
  // Position of the first occurrence of `item`; -1 when the item is absent.
  val index = sequence.indexOf(item)
  if (index == -1) {
    // Item not present: there is no suffix to project onto.
    Array.empty[Int]
  } else {
    // Everything strictly after the first occurrence (empty when the item is the last element).
    sequence.drop(index + 1)
  }
}
78
81
82
/**
 * Project a database of sequences onto a single item.
 * Each sequence is replaced by its suffix after the first occurrence of the item; sequences
 * that do not contain the item, or that end with it, yield an empty suffix and are dropped.
 * @param database sequences to project
 * @param prefix item to project onto
 * @return the non-empty suffixes
 */
def project(database: Iterable[Array[Int]], prefix: Int): Iterable[Array[Int]] = {
  database
    .map(candidateSeq => getSuffix(prefix, candidateSeq))
    .filter(_.nonEmpty)
}
87
+
79
88
/**
80
89
* Generates frequent items by filtering the input data using minimal count level.
81
- * @param minCount the absolute minimum count
82
- * @param sequences sequences data
83
- * @return array of item and count pair
90
+ * @param minCount the minimum count for an item to be frequent
91
+ * @param database database of sequences
92
+ * @return item and count pairs
84
93
*/
85
94
private def getFreqItemAndCounts(
    minCount: Long,
    database: Iterable[Array[Int]]): Iterator[(Int, Long)] = {
  // `distinct` makes each sequence contribute at most once per item, so the tally is the number
  // of sequences containing the item rather than the number of occurrences.
  // The fold runs left-to-right over an iterator: counting is order-independent, and this avoids
  // both the intermediate flattened collection and the deep recursion a right fold can incur on
  // linear sequences. The order of the returned pairs is unspecified.
  database.iterator
    .flatMap(_.distinct)
    .foldLeft(Map[Int, Long]().withDefaultValue(0L)) { case (ctr, item) =>
      ctr + (item -> (ctr(item) + 1))
    }
    .filter(_._2 >= minCount)
    .iterator
}
115
104
}
0 commit comments