Commit 69da6cf

Merge pull request #1 from apache/master
merging upstream updates
2 parents 41c4a33 + 1abbde0 commit 69da6cf

86 files changed: +2564 −239 lines


bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -80,7 +80,7 @@ class BagelSuite extends FunSuite with Assertions with BeforeAndAfter with Timeo
   test("large number of iterations") {
     // This tests whether jobs with a large number of iterations finish in a reasonable time,
     // because non-memoized recursion in RDD or DAGScheduler used to cause them to hang
-    failAfter(10 seconds) {
+    failAfter(30 seconds) {
       sc = new SparkContext("local", "test")
       val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0))))
       val msgs = sc.parallelize(Array[(String, TestMessage)]())
@@ -101,7 +101,7 @@ class BagelSuite extends FunSuite with Assertions with BeforeAndAfter with Timeo
       sc = new SparkContext("local", "test")
       val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0))))
       val msgs = sc.parallelize(Array[(String, TestMessage)]())
-      val numSupersteps = 50
+      val numSupersteps = 20
       val result =
         Bagel.run(sc, verts, msgs, sc.defaultParallelism, StorageLevel.DISK_ONLY) {
           (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) =>

bin/pyspark

Lines changed: 5 additions & 1 deletion
@@ -86,6 +86,10 @@ else
   if [[ "$IPYTHON" = "1" ]]; then
     exec ipython $IPYTHON_OPTS
   else
-    exec "$PYSPARK_PYTHON"
+    if [[ -n $SPARK_TESTING ]]; then
+      exec "$PYSPARK_PYTHON" -m doctest
+    else
+      exec "$PYSPARK_PYTHON"
+    fi
   fi
 fi

bin/run-example

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ if [[ ! $EXAMPLE_CLASS == org.apache.spark.examples* ]]; then
   EXAMPLE_CLASS="org.apache.spark.examples.$EXAMPLE_CLASS"
 fi
 
-./bin/spark-submit \
+"$FWDIR"/bin/spark-submit \
   --master $EXAMPLE_MASTER \
   --class $EXAMPLE_CLASS \
   "$SPARK_EXAMPLES_JAR" \

core/src/main/scala/org/apache/spark/Partitioner.scala

Lines changed: 19 additions & 1 deletion
@@ -83,11 +83,17 @@ class HashPartitioner(partitions: Int) extends Partitioner {
     case _ =>
       false
   }
+
+  override def hashCode: Int = numPartitions
 }
 
 /**
  * A [[org.apache.spark.Partitioner]] that partitions sortable records by range into roughly
  * equal ranges. The ranges are determined by sampling the content of the RDD passed in.
+ *
+ * Note that the actual number of partitions created by the RangePartitioner might not be the same
+ * as the `partitions` parameter, in the case where the number of sampled records is less than
+ * the value of `partitions`.
  */
 class RangePartitioner[K : Ordering : ClassTag, V](
     partitions: Int,
@@ -119,7 +125,7 @@ class RangePartitioner[K : Ordering : ClassTag, V](
     }
   }
 
-  def numPartitions = partitions
+  def numPartitions = rangeBounds.length + 1
 
   private val binarySearch: ((Array[K], K) => Int) = CollectionsUtils.makeBinarySearch[K]
 
@@ -155,4 +161,16 @@ class RangePartitioner[K : Ordering : ClassTag, V](
     case _ =>
       false
   }
+
+  override def hashCode(): Int = {
+    val prime = 31
+    var result = 1
+    var i = 0
+    while (i < rangeBounds.length) {
+      result = prime * result + rangeBounds(i).hashCode
+      i += 1
+    }
+    result = prime * result + ascending.hashCode
+    result
+  }
 }
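
Editor's note: the hashCode overrides above keep the partitioners consistent with their existing equals implementations, so two partitioners that compare equal also hash the same when used in hash-based collections. A minimal sketch of the contract being preserved (illustration only, not part of the commit):

// Illustration of the equals/hashCode contract the overrides above preserve.
// HashPartitioner is the real Spark class; the assertions are just a sketch.
import org.apache.spark.HashPartitioner

object HashContractSketch {
  def main(args: Array[String]): Unit = {
    val a = new HashPartitioner(4)
    val b = new HashPartitioner(4)
    assert(a == b)                    // equals already held before this change
    assert(a.hashCode == b.hashCode)  // now guaranteed by `hashCode = numPartitions`
  }
}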

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 4 additions & 4 deletions
@@ -455,7 +455,7 @@ class SparkContext(config: SparkConf) extends Logging {
    */
   def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = {
     hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
-      minPartitions).map(pair => pair._2.toString)
+      minPartitions).map(pair => pair._2.toString).setName(path)
   }
 
   /**
@@ -496,7 +496,7 @@ class SparkContext(config: SparkConf) extends Logging {
       classOf[String],
       classOf[String],
       updateConf,
-      minPartitions)
+      minPartitions).setName(path)
   }
 
   /**
@@ -551,7 +551,7 @@ class SparkContext(config: SparkConf) extends Logging {
       inputFormatClass,
       keyClass,
       valueClass,
-      minPartitions)
+      minPartitions).setName(path)
   }
 
   /**
@@ -623,7 +623,7 @@ class SparkContext(config: SparkConf) extends Logging {
     val job = new NewHadoopJob(conf)
     NewFileInputFormat.addInputPath(job, new Path(path))
     val updatedConf = job.getConfiguration
-    new NewHadoopRDD(this, fClass, kClass, vClass, updatedConf)
+    new NewHadoopRDD(this, fClass, kClass, vClass, updatedConf).setName(path)
   }
 
   /**
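
Editor's note: the chained .setName(path) calls work because RDD.setName returns the RDD itself; the net effect is that file-based RDDs carry their input path as a name in the web UI and in lineage dumps. A small sketch of the observable behavior, assuming an existing SparkContext sc (the HDFS path is hypothetical):

// Sketch only: shows the effect of the setName(path) calls above.
// Assumes an existing SparkContext `sc`; the path is made up for illustration.
val lines = sc.textFile("hdfs:///data/events.log")
println(lines.name)          // after this commit: "hdfs:///data/events.log"
println(lines.toDebugString) // the path now also appears in the lineage output
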
core/src/main/scala/org/apache/spark/api/python/PythonHadoopUtil.scala

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.python
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.Logging
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.io._
+import scala.util.{Failure, Success, Try}
+import org.apache.spark.annotation.Experimental
+
+
+/**
+ * :: Experimental ::
+ * A trait for use with reading custom classes in PySpark. Implement this trait and add custom
+ * transformation code by overriding the convert method.
+ */
+@Experimental
+trait Converter[T, U] extends Serializable {
+  def convert(obj: T): U
+}
+
+private[python] object Converter extends Logging {
+
+  def getInstance(converterClass: Option[String]): Converter[Any, Any] = {
+    converterClass.map { cc =>
+      Try {
+        val c = Class.forName(cc).newInstance().asInstanceOf[Converter[Any, Any]]
+        logInfo(s"Loaded converter: $cc")
+        c
+      } match {
+        case Success(c) => c
+        case Failure(err) =>
+          logError(s"Failed to load converter: $cc")
+          throw err
+      }
+    }.getOrElse { new DefaultConverter }
+  }
+}
+
+/**
+ * A converter that handles conversion of common [[org.apache.hadoop.io.Writable]] objects.
+ * Other objects are passed through without conversion.
+ */
+private[python] class DefaultConverter extends Converter[Any, Any] {
+
+  /**
+   * Converts a [[org.apache.hadoop.io.Writable]] to the underlying primitive, String or
+   * object representation
+   */
+  private def convertWritable(writable: Writable): Any = {
+    import collection.JavaConversions._
+    writable match {
+      case iw: IntWritable => iw.get()
+      case dw: DoubleWritable => dw.get()
+      case lw: LongWritable => lw.get()
+      case fw: FloatWritable => fw.get()
+      case t: Text => t.toString
+      case bw: BooleanWritable => bw.get()
+      case byw: BytesWritable => byw.getBytes
+      case n: NullWritable => null
+      case aw: ArrayWritable => aw.get().map(convertWritable(_))
+      case mw: MapWritable => mapAsJavaMap(mw.map { case (k, v) =>
+        (convertWritable(k), convertWritable(v))
+      }.toMap)
+      case other => other
+    }
+  }
+
+  def convert(obj: Any): Any = {
+    obj match {
+      case writable: Writable =>
+        convertWritable(writable)
+      case _ =>
+        obj
+    }
+  }
+}
+
+/** Utilities for working with Python objects <-> Hadoop-related objects */
+private[python] object PythonHadoopUtil {
+
+  /**
+   * Convert a [[java.util.Map]] of properties to a [[org.apache.hadoop.conf.Configuration]]
+   */
+  def mapToConf(map: java.util.Map[String, String]): Configuration = {
+    import collection.JavaConversions._
+    val conf = new Configuration()
+    map.foreach{ case (k, v) => conf.set(k, v) }
+    conf
+  }
+
+  /**
+   * Merges two configurations, returns a copy of left with keys from right overwriting
+   * any matching keys in left
+   */
+  def mergeConfs(left: Configuration, right: Configuration): Configuration = {
+    import collection.JavaConversions._
+    val copy = new Configuration(left)
+    right.iterator().foreach(entry => copy.set(entry.getKey, entry.getValue))
+    copy
+  }
+
+  /**
+   * Converts an RDD of key-value pairs, where key and/or value could be instances of
+   * [[org.apache.hadoop.io.Writable]], into an RDD[(K, V)]
+   */
+  def convertRDD[K, V](rdd: RDD[(K, V)],
+                       keyConverter: Converter[Any, Any],
+                       valueConverter: Converter[Any, Any]): RDD[(Any, Any)] = {
+    rdd.map { case (k, v) => (keyConverter.convert(k), valueConverter.convert(v)) }
+  }
+
+}
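
Editor's note: the Converter trait above is the public extension point; a converter class is loaded reflectively by name via Converter.getInstance (shown in the diff), with DefaultConverter as the fallback. A hedged sketch of what a custom converter could look like; the class below is an illustration, not part of the commit:

// Hypothetical converter built on the Converter trait introduced above:
// it unwraps Hadoop Text values into upper-cased Strings and passes
// everything else through unchanged. Needs a no-arg constructor so that
// Converter.getInstance can instantiate it reflectively.
import org.apache.hadoop.io.Text
import org.apache.spark.api.python.Converter

class UpperCaseTextConverter extends Converter[Any, Any] {
  override def convert(obj: Any): Any = obj match {
    case t: Text => t.toString.toUpperCase  // unwrap and transform the Writable
    case other   => other                   // non-Text values are left as-is
  }
}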

core/src/main/scala/org/apache/spark/api/python/PythonPartitioner.scala

Lines changed: 2 additions & 0 deletions
@@ -50,4 +50,6 @@ private[spark] class PythonPartitioner(
     case _ =>
       false
   }
+
+  override def hashCode: Int = 31 * numPartitions + pyPartitionFunctionId.hashCode
 }
