Commit c3fdbab

Merge pull request #3 from markhamstra/master-csd
SPY-138 DAGScheduler job <--> stage accounting
2 parents 499fa12 + e5684e5

84 files changed: 5576 additions, 580 deletions


CHANGES.txt

Lines changed: 559 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 3 additions & 3 deletions
@@ -55,7 +55,7 @@ versions without YARN, use:
     # Cloudera CDH 4.2.0 with MapReduce v1
     $ SPARK_HADOOP_VERSION=2.0.0-mr1-cdh4.2.0 sbt/sbt assembly

-For Apache Hadoop 2.x, 0.23.x, Cloudera CDH MRv2, and other Hadoop versions
+For Apache Hadoop 2.0.X, 2.1.X, 2.2.X, 0.23.x, Cloudera CDH MRv2, and other Hadoop versions
 with YARN, also set `SPARK_YARN=true`:

     # Apache Hadoop 2.0.5-alpha
@@ -64,8 +64,8 @@ with YARN, also set `SPARK_YARN=true`:
     # Cloudera CDH 4.2.0 with MapReduce v2
     $ SPARK_HADOOP_VERSION=2.0.0-cdh4.2.0 SPARK_YARN=true sbt/sbt assembly

-For convenience, these variables may also be set through the `conf/spark-env.sh` file
-described below.
+    # Apache Hadoop 2.2.0 with YARN
+    $ SPARK_HADOOP_VERSION=2.2.0 SPARK_YARN=true sbt/sbt assembly

 When developing a Spark application, specify the Hadoop version by adding the
 "hadoop-client" artifact to your project's dependencies. For example, if you're

assembly/pom.xml

Lines changed: 28 additions & 0 deletions
@@ -46,6 +46,23 @@
   </repositories>

   <dependencies>
+    <dependency>
+      <groupId>${akka.group}</groupId>
+      <artifactId>akka-actor</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>${akka.group}</groupId>
+      <artifactId>akka-remote</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>${akka.group}</groupId>
+      <artifactId>akka-slf4j</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>${akka.group}</groupId>
+      <artifactId>akka-zeromq</artifactId>
+    </dependency>
+
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_2.9.3</artifactId>
@@ -140,6 +157,17 @@
         </dependency>
       </dependencies>
     </profile>
+    <profile>
+      <id>new-yarn</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-yarn_2.9.3</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
+
     <profile>
       <id>bigtop-dist</id>
       <!-- This profile uses the assembly plugin to create a special "dist" package for BigTop

bagel/pom.xml

Lines changed: 12 additions & 0 deletions
@@ -32,6 +32,18 @@
   <url>http://spark.incubator.apache.org/</url>

   <dependencies>
+    <dependency>
+      <groupId>${akka.group}</groupId>
+      <artifactId>akka-actor</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>${akka.group}</groupId>
+      <artifactId>akka-remote</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>${akka.group}</groupId>
+      <artifactId>akka-slf4j</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_2.9.3</artifactId>

core/pom.xml

Lines changed: 3 additions & 7 deletions
@@ -95,15 +95,15 @@
       <version>0.3.1</version>
     </dependency>
     <dependency>
-      <groupId>com.typesafe.akka</groupId>
+      <groupId>${akka.group}</groupId>
       <artifactId>akka-actor</artifactId>
     </dependency>
     <dependency>
-      <groupId>com.typesafe.akka</groupId>
+      <groupId>${akka.group}</groupId>
       <artifactId>akka-remote</artifactId>
     </dependency>
     <dependency>
-      <groupId>com.typesafe.akka</groupId>
+      <groupId>${akka.group}</groupId>
       <artifactId>akka-slf4j</artifactId>
     </dependency>
     <dependency>
@@ -126,10 +126,6 @@
       <groupId>colt</groupId>
       <artifactId>colt</artifactId>
     </dependency>
-    <dependency>
-      <groupId>com.github.scala-incubator.io</groupId>
-      <artifactId>scala-io-file_2.9.2</artifactId>
-    </dependency>
     <dependency>
       <groupId>org.apache.mesos</groupId>
       <artifactId>mesos</artifactId>

core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 6 additions & 2 deletions
@@ -247,7 +247,7 @@ private[spark] class MapOutputTracker extends Logging {
         case Some(bytes) =>
           return bytes
         case None =>
-          statuses = mapStatuses(shuffleId)
+          statuses = mapStatuses.getOrElse(shuffleId, Array[MapStatus]())
           epochGotten = epoch
       }
     }
@@ -261,9 +261,13 @@ private[spark] class MapOutputTracker extends Logging {
         cachedSerializedStatuses(shuffleId) = bytes
       }
     }
-    return bytes
+    bytes
   }

+  def has(shuffleId: Int): Boolean = {
+    cachedSerializedStatuses.get(shuffleId).isDefined || mapStatuses.contains(shuffleId)
+  }
+
   // Serialize an array of map output locations into an efficient byte format so that we can send
   // it to reduce tasks. We do this by compressing the serialized bytes using GZIP. They will
   // generally be pretty compressible because many map outputs will be on the same hostname.
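The new `has(shuffleId)` accessor reports whether the tracker already holds output statuses for a shuffle, either as cached serialized bytes or as raw MapStatus entries, without forcing serialization. A minimal sketch of a caller, assuming the pre-existing `registerShuffle` API and hypothetical names `tracker`, `shuffleId`, and `numMaps`:

    // Scala sketch: register a shuffle only if the tracker has no record of it,
    // so map outputs that already exist can be reused rather than recomputed.
    if (!tracker.has(shuffleId)) {
      tracker.registerShuffle(shuffleId, numMaps)
    }

This is the kind of guard the DAGScheduler's job <--> stage accounting (SPY-138) can apply when deciding whether a stage must be resubmitted.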

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 26 additions & 2 deletions
@@ -24,7 +24,6 @@ import java.util.concurrent.atomic.AtomicInteger

 import scala.collection.Map
 import scala.collection.generic.Growable
-import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable.HashMap

@@ -82,7 +81,7 @@ class SparkContext(
     val sparkHome: String = null,
     val jars: Seq[String] = Nil,
     val environment: Map[String, String] = Map(),
-    // This is used only by yarn for now, but should be relevant to other cluster types (mesos, etc)
+    // This is used only by YARN for now, but should be relevant to other cluster types (Mesos, etc)
     // too. This is typically generated from InputFormatInfo.computePreferredLocations .. host, set
     // of data-local splits on host
     val preferredNodeLocationData: scala.collection.Map[String, scala.collection.Set[SplitInfo]] =
@@ -227,6 +226,31 @@ class SparkContext(
         scheduler.initialize(backend)
         scheduler

+      case "yarn-client" =>
+        val scheduler = try {
+          val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClientClusterScheduler")
+          val cons = clazz.getConstructor(classOf[SparkContext])
+          cons.newInstance(this).asInstanceOf[ClusterScheduler]
+
+        } catch {
+          case th: Throwable => {
+            throw new SparkException("YARN mode not available ?", th)
+          }
+        }
+
+        val backend = try {
+          val clazz = Class.forName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend")
+          val cons = clazz.getConstructor(classOf[ClusterScheduler], classOf[SparkContext])
+          cons.newInstance(scheduler, this).asInstanceOf[CoarseGrainedSchedulerBackend]
+        } catch {
+          case th: Throwable => {
+            throw new SparkException("YARN mode not available ?", th)
+          }
+        }
+
+        scheduler.initialize(backend)
+        scheduler
+
       case _ =>
         if (MESOS_REGEX.findFirstIn(master).isEmpty) {
           logWarning("Master %s does not match expected format, parsing as Mesos URL".format(master))
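Loading the scheduler and backend reflectively keeps spark-core free of a compile-time dependency on the YARN module; if the classes are missing at runtime, the user gets the SparkException above rather than a linkage error. A minimal, hypothetical driver exercising the new master string (the app name and job are illustrative):

    // Scala sketch: "yarn-client" selects the reflectively loaded
    // YarnClientClusterScheduler and YarnClientSchedulerBackend.
    import org.apache.spark.SparkContext

    object YarnClientExample {
      def main(args: Array[String]) {
        val sc = new SparkContext("yarn-client", "YarnClientExample")
        val sum = sc.parallelize(1 to 1000).reduce(_ + _)  // trivial job to verify scheduling
        println("sum = " + sum)
        sc.stop()
      }
    }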

core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala

Lines changed: 14 additions & 14 deletions
@@ -1,19 +1,19 @@
 /*
  *
- * * Licensed to the Apache Software Foundation (ASF) under one or more
- * * contributor license agreements. See the NOTICE file distributed with
- * * this work for additional information regarding copyright ownership.
- * * The ASF licenses this file to You under the Apache License, Version 2.0
- * * (the "License"); you may not use this file except in compliance with
- * * the License. You may obtain a copy of the License at
- * *
- * *    http://www.apache.org/licenses/LICENSE-2.0
- * *
- * * Unless required by applicable law or agreed to in writing, software
- * * distributed under the License is distributed on an "AS IS" BASIS,
- * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * * See the License for the specific language governing permissions and
- * * limitations under the License.
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  *
  */

core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala

Lines changed: 3 additions & 3 deletions
@@ -17,12 +17,12 @@

 package org.apache.spark.deploy

-import akka.actor.{ActorRef, Props, Actor, ActorSystem, Terminated}
+import akka.actor.ActorSystem

 import org.apache.spark.deploy.worker.Worker
 import org.apache.spark.deploy.master.Master
-import org.apache.spark.util.{Utils, AkkaUtils}
-import org.apache.spark.{Logging}
+import org.apache.spark.util.Utils
+import org.apache.spark.Logging

 import scala.collection.mutable.ArrayBuffer

core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ import java.nio.ByteBuffer
 import akka.actor.{ActorRef, Actor, Props, Terminated}
 import akka.remote.{RemoteClientLifeCycleEvent, RemoteClientShutdown, RemoteClientDisconnected}

-import org.apache.spark.{Logging, SparkEnv}
+import org.apache.spark.Logging
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
 import org.apache.spark.util.{Utils, AkkaUtils}
