
Commit a920865

Merge branch 'sliding' into auc

2 parents: c1c6c22 + a9b250a

File tree: 5 files changed, +115 -50 lines

core/src/main/scala/org/apache/spark/rdd/RDD.scala
Lines changed: 0 additions & 16 deletions

@@ -951,22 +951,6 @@ abstract class RDD[T: ClassTag](
    */
   def takeOrdered(num: Int)(implicit ord: Ordering[T]): Array[T] = top(num)(ord.reverse)
 
-  /**
-   * Returns a RDD from grouping items of its parent RDD in fixed size blocks by passing a sliding
-   * window over them. The ordering is first based on the partition index and then the ordering of
-   * items within each partition. This is similar to sliding in Scala collections, except that it
-   * becomes an empty RDD if the window size is greater than the total number of items. It needs to
-   * trigger a Spark job if the parent RDD has more than one partitions and the window size is
-   * greater than 1.
-   */
-  def sliding(windowSize: Int): RDD[Array[T]] = {
-    if (windowSize == 1) {
-      this.map(Array(_))
-    } else {
-      new SlidedRDD[T](this, windowSize)
-    }
-  }
-
   /**
    * Save this RDD as a text file, using string representations of elements.
    */
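
The doc comment removed above draws a distinction worth seeing concretely: Scala collections keep a trailing partial group when there are fewer items than the window size, while the RDD variant is specified to become an empty RDD. A minimal local sketch of that difference, using only the standard library (no Spark needed); the comments show what each expression evaluates to:

  val data = List(0, 1, 2)

  // Plain Scala keeps one partial window when data.size < windowSize:
  data.sliding(5).toList                               // List(List(0, 1, 2))

  // The RDD semantics described above yield nothing instead; locally the same
  // effect comes from dropping partial groups on an iterator:
  data.iterator.sliding(5).withPartial(false).toList   // List()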

core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
Lines changed: 0 additions & 14 deletions

@@ -553,18 +553,4 @@ class RDDSuite extends FunSuite with SharedSparkContext {
     val ids = ranked.map(_._1).distinct().collect()
     assert(ids.length === n)
   }
-
-  test("sliding") {
-    val data = 0 until 6
-    for (numPartitions <- 1 to 8) {
-      val rdd = sc.parallelize(data, numPartitions)
-      for (windowSize <- 1 to 6) {
-        val slided = rdd.sliding(windowSize).collect().map(_.toList).toList
-        val expected = data.sliding(windowSize).map(_.toList).toList
-        assert(slided === expected)
-      }
-      assert(rdd.sliding(7).collect().isEmpty,
-        "Should return an empty RDD if the window size is greater than the number of items.")
-    }
-  }
 }

mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala (new file)
Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.rdd
+
+import scala.reflect.ClassTag
+
+import org.apache.spark.rdd.RDD
+
+/**
+ * Machine learning specific RDD functions.
+ */
+private[mllib]
+class RDDFunctions[T: ClassTag](self: RDD[T]) {
+
+  /**
+   * Returns a RDD from grouping items of its parent RDD in fixed size blocks by passing a sliding
+   * window over them. The ordering is first based on the partition index and then the ordering of
+   * items within each partition. This is similar to sliding in Scala collections, except that it
+   * becomes an empty RDD if the window size is greater than the total number of items. It needs to
+   * trigger a Spark job if the parent RDD has more than one partitions and the window size is
+   * greater than 1.
+   */
+  def sliding(windowSize: Int): RDD[Seq[T]] = {
+    require(windowSize > 0, s"Sliding window size must be positive, but got $windowSize.")
+    if (windowSize == 1) {
+      self.map(Seq(_))
+    } else {
+      new SlidingRDD[T](self, windowSize)
+    }
+  }
+}
+
+private[mllib]
+object RDDFunctions {
+
+  /** Implicit conversion from an RDD to RDDFunctions. */
+  implicit def fromRDD[T: ClassTag](rdd: RDD[T]) = new RDDFunctions[T](rdd)
+}
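
As a usage sketch (not part of this commit): the implicit conversion in the companion object is what lets `sliding` be called directly on an RDD once `RDDFunctions._` is imported, exactly as the test suite below does. Here `sc` stands for an already-created SparkContext, and the adjacent-difference computation is only an illustrative assumption:

  import org.apache.spark.mllib.rdd.RDDFunctions._

  val rdd = sc.parallelize(0 until 6, 2)      // assumes a live SparkContext `sc`
  val windows = rdd.sliding(2)                // RDD[Seq[Int]] of adjacent pairs
  val diffs = windows.map { case Seq(a, b) => b - a }
  // diffs.collect() would be Array(1, 1, 1, 1, 1)

  rdd.sliding(7).collect()                    // empty: window larger than the item count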

core/src/main/scala/org/apache/spark/rdd/SlidedRDD.scala renamed to mllib/src/main/scala/org/apache/spark/mllib/rdd/SlidingRDD.scala

Lines changed: 22 additions & 20 deletions

@@ -15,15 +15,16 @@
  * limitations under the License.
  */
 
-package org.apache.spark.rdd
+package org.apache.spark.mllib.rdd
 
 import scala.collection.mutable
 import scala.reflect.ClassTag
 
 import org.apache.spark.{TaskContext, Partition}
+import org.apache.spark.rdd.RDD
 
-private[spark]
-class SlidedRDDPartition[T](val idx: Int, val prev: Partition, val tail: Array[T])
+private[mllib]
+class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T])
   extends Partition with Serializable {
   override val index: Int = idx
 }
@@ -33,49 +34,50 @@ class SlidedRDDPartition[T](val idx: Int, val prev: Partition, val tail: Array[T
  * window over them. The ordering is first based on the partition index and then the ordering of
  * items within each partition. This is similar to sliding in Scala collections, except that it
  * becomes an empty RDD if the window size is greater than the total number of items. It needs to
- * trigger a Spark job if the parent RDD has more than one partitions.
+ * trigger a Spark job if the parent RDD has more than one partitions. To make this operation
+ * efficient, the number of items per partition should be larger than the window size and the
+ * window size should be small, e.g., 2.
 *
 * @param parent the parent RDD
 * @param windowSize the window size, must be greater than 1
 *
- * @see [[org.apache.spark.rdd.RDD#sliding]]
+ * @see [[org.apache.spark.mllib.rdd.RDDFunctions#sliding]]
 */
-private[spark]
-class SlidedRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int)
-  extends RDD[Array[T]](parent) {
+private[mllib]
+class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int)
+  extends RDD[Seq[T]](parent) {
 
-  require(windowSize > 1, "Window size must be greater than 1.")
+  require(windowSize > 1, s"Window size must be greater than 1, but got $windowSize.")
 
-  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
-    val part = split.asInstanceOf[SlidedRDDPartition[T]]
+  override def compute(split: Partition, context: TaskContext): Iterator[Seq[T]] = {
+    val part = split.asInstanceOf[SlidingRDDPartition[T]]
     (firstParent[T].iterator(part.prev, context) ++ part.tail)
       .sliding(windowSize)
-      .map(_.toArray)
-      .filter(_.size == windowSize)
+      .withPartial(false)
   }
 
   override def getPreferredLocations(split: Partition): Seq[String] =
-    firstParent[T].preferredLocations(split.asInstanceOf[SlidedRDDPartition[T]].prev)
+    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)
 
   override def getPartitions: Array[Partition] = {
     val parentPartitions = parent.partitions
     val n = parentPartitions.size
     if (n == 0) {
       Array.empty
     } else if (n == 1) {
-      Array(new SlidedRDDPartition[T](0, parentPartitions(0), Array.empty))
+      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty))
     } else {
       val n1 = n - 1
       val w1 = windowSize - 1
       // Get the first w1 items of each partition, starting from the second partition.
       val nextHeads =
         parent.context.runJob(parent, (iter: Iterator[T]) => iter.take(w1).toArray, 1 until n, true)
-      val partitions = mutable.ArrayBuffer[SlidedRDDPartition[T]]()
+      val partitions = mutable.ArrayBuffer[SlidingRDDPartition[T]]()
       var i = 0
       var partitionIndex = 0
       while (i < n1) {
         var j = i
-        val tail = mutable.ArrayBuffer[T]()
+        val tail = mutable.ListBuffer[T]()
         // Keep appending to the current tail until appended a head of size w1.
         while (j < n1 && nextHeads(j).size < w1) {
           tail ++= nextHeads(j)
@@ -85,14 +87,14 @@ class SlidedRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int)
           tail ++= nextHeads(j)
           j += 1
         }
-        partitions += new SlidedRDDPartition[T](partitionIndex, parentPartitions(i), tail.toArray)
+        partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail)
         partitionIndex += 1
         // Skip appended heads.
         i = j
       }
       // If the head of last partition has size w1, we also need to add this partition.
-      if (nextHeads(n1 - 1).size == w1) {
-        partitions += new SlidedRDDPartition[T](partitionIndex, parentPartitions(n1), Array.empty)
+      if (nextHeads.last.size == w1) {
+        partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(n1), Seq.empty)
       }
       partitions.toArray
     }
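
To make the compute path above easier to follow, here is a small local sketch of the window stitching, with illustrative values: the partition's own iterator is concatenated with the tail (the first windowSize - 1 items that getPartitions collected from the following partition(s)), and withPartial(false) replaces the old `.map(_.toArray).filter(_.size == windowSize)` by dropping incomplete groups:

  val windowSize = 3
  val partitionItems = Iterator(0, 1, 2)   // items of this partition (illustrative)
  val tail = Seq(3, 4)                     // first windowSize - 1 items of the next partition
  val windows = (partitionItems ++ tail).sliding(windowSize).withPartial(false)
  windows.map(_.toList).toList             // List(List(0, 1, 2), List(1, 2, 3), List(2, 3, 4))

  // When fewer than windowSize items remain in total, withPartial(false) drops
  // the incomplete group, so such a partition contributes no windows:
  Iterator(5).sliding(windowSize).withPartial(false).toList   // List()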

mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala (new file)
Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.rdd
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.util.LocalSparkContext
+import org.apache.spark.mllib.rdd.RDDFunctions._
+
+class RDDFunctionsSuite extends FunSuite with LocalSparkContext {
+
+  test("sliding") {
+    val data = 0 until 6
+    for (numPartitions <- 1 to 8) {
+      val rdd = sc.parallelize(data, numPartitions)
+      for (windowSize <- 1 to 6) {
+        val slided = rdd.sliding(windowSize).collect().map(_.toList).toList
+        val expected = data.sliding(windowSize).map(_.toList).toList
+        assert(slided === expected)
+      }
+      assert(rdd.sliding(7).collect().isEmpty,
+        "Should return an empty RDD if the window size is greater than the number of items.")
+    }
+  }
+}
