Commit 5329091

Author: Davies Liu (committed)

Merge branch 'master' of github.com:apache/spark into fix_conf

Conflicts:
    core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

2 parents: 63f2972 + 46c6341

120 files changed: +1401 additions, -578 deletions

LICENSE
Lines changed: 0 additions & 12 deletions

@@ -712,18 +712,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-========================================================================
-For colt:
-========================================================================
-
-Copyright (c) 1999 CERN - European Organization for Nuclear Research.
-Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose is hereby granted without fee, provided that the above copyright notice appear in all copies and that both that copyright notice and this permission notice appear in supporting documentation. CERN makes no representations about the suitability of this software for any purpose. It is provided "as is" without expressed or implied warranty.
-
-Packages hep.aida.*
-
-Written by Pavel Binko, Dino Ferrero Merlino, Wolfgang Hoschek, Tony Johnson, Andreas Pfeiffer, and others. Check the FreeHEP home page for more info. Permission to use and/or redistribute this work is granted under the terms of the LGPL License, with the exception that any usage related to military applications is expressly forbidden. The software and documentation made available under the terms of this license are provided with no warranty.
-
-
 ========================================================================
 For SnapTree:
 ========================================================================

README.md
Lines changed: 1 addition & 1 deletion

@@ -84,7 +84,7 @@ storage systems. Because the protocols have changed in different versions of
 Hadoop, you must build Spark against the same version that your cluster runs.
 
 Please refer to the build documentation at
-["Specifying the Hadoop Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version)
+["Specifying the Hadoop Version"](http://spark.apache.org/docs/latest/building-with-maven.html#specifying-the-hadoop-version)
 for detailed guidance on building for a particular distribution of Hadoop, including
 building for particular Hive and Hive Thriftserver distributions. See also
 ["Third Party Hadoop Distributions"](http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html)

assembly/pom.xml
Lines changed: 4 additions & 0 deletions

@@ -146,6 +146,10 @@
                 <exclude>com/google/common/base/Present*</exclude>
               </excludes>
             </relocation>
+            <relocation>
+              <pattern>org.apache.commons.math3</pattern>
+              <shadedPattern>org.spark-project.commons.math3</shadedPattern>
+            </relocation>
           </relocations>
           <transformers>
             <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />

core/pom.xml
Lines changed: 5 additions & 6 deletions

@@ -85,8 +85,6 @@
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-math3</artifactId>
-      <version>3.3</version>
-      <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>com.google.code.findbugs</groupId>
@@ -162,10 +160,6 @@
       <artifactId>json4s-jackson_${scala.binary.version}</artifactId>
       <version>3.2.10</version>
     </dependency>
-    <dependency>
-      <groupId>colt</groupId>
-      <artifactId>colt</artifactId>
-    </dependency>
     <dependency>
       <groupId>org.apache.mesos</groupId>
       <artifactId>mesos</artifactId>
@@ -247,6 +241,11 @@
         </exclusion>
       </exclusions>
     </dependency>
+    <dependency>
+      <groupId>org.seleniumhq.selenium</groupId>
+      <artifactId>selenium-java</artifactId>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_${scala.binary.version}</artifactId>

core/src/main/scala/org/apache/spark/TestUtils.scala
Lines changed: 2 additions & 7 deletions

@@ -23,8 +23,8 @@ import java.util.jar.{JarEntry, JarOutputStream}
 
 import scala.collection.JavaConversions._
 
+import com.google.common.io.{ByteStreams, Files}
 import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider}
-import com.google.common.io.Files
 
 import org.apache.spark.util.Utils
 
@@ -64,12 +64,7 @@ private[spark] object TestUtils {
       jarStream.putNextEntry(jarEntry)
 
       val in = new FileInputStream(file)
-      val buffer = new Array[Byte](10240)
-      var nRead = 0
-      while (nRead <= 0) {
-        nRead = in.read(buffer, 0, buffer.length)
-        jarStream.write(buffer, 0, nRead)
-      }
+      ByteStreams.copy(in, jarStream)
      in.close()
     }
     jarStream.close()

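The loop removed above exited as soon as a single read() returned data (its condition was `nRead <= 0`), so only the first chunk of each file ever reached the jar; Guava's ByteStreams.copy streams until EOF. Below is a minimal, stand-alone sketch of the same pattern, not the Spark code itself; the file names are hypothetical and only Guava is assumed on the classpath.

```scala
// Sketch: copy a file into a jar entry with Guava's ByteStreams.copy
// instead of a hand-rolled read loop. File names are hypothetical.
import java.io.{File, FileInputStream, FileOutputStream}
import java.util.jar.{JarEntry, JarOutputStream}

import com.google.common.io.ByteStreams

object JarCopySketch {
  def addFileToJar(file: File, jarStream: JarOutputStream): Unit = {
    jarStream.putNextEntry(new JarEntry(file.getName))
    val in = new FileInputStream(file)
    try {
      // Copies until EOF regardless of file size; returns the byte count.
      ByteStreams.copy(in, jarStream)
    } finally {
      in.close()
    }
    jarStream.closeEntry()
  }

  def main(args: Array[String]): Unit = {
    val jar = new JarOutputStream(new FileOutputStream("example.jar"))
    addFileToJar(new File("SomeClass.class"), jar)
    jar.close()
  }
}
```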
core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala
Lines changed: 20 additions & 2 deletions

@@ -20,6 +20,8 @@ package org.apache.spark.broadcast
 import java.io.Serializable
 
 import org.apache.spark.SparkException
+import org.apache.spark.Logging
+import org.apache.spark.util.Utils
 
 import scala.reflect.ClassTag
 
@@ -52,14 +54,16 @@ import scala.reflect.ClassTag
  * @param id A unique identifier for the broadcast variable.
  * @tparam T Type of the data contained in the broadcast variable.
  */
-abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable {
+abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable with Logging {
 
   /**
    * Flag signifying whether the broadcast variable is valid
   * (that is, not already destroyed) or not.
    */
   @volatile private var _isValid = true
 
+  private var _destroySite = ""
+
   /** Get the broadcasted value. */
   def value: T = {
     assertValid()
@@ -84,13 +88,26 @@ abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable {
     doUnpersist(blocking)
   }
 
+
+  /**
+   * Destroy all data and metadata related to this broadcast variable. Use this with caution;
+   * once a broadcast variable has been destroyed, it cannot be used again.
+   * This method blocks until destroy has completed
+   */
+  def destroy() {
+    destroy(blocking = true)
+  }
+
   /**
   * Destroy all data and metadata related to this broadcast variable. Use this with caution;
   * once a broadcast variable has been destroyed, it cannot be used again.
+   * @param blocking Whether to block until destroy has completed
   */
   private[spark] def destroy(blocking: Boolean) {
     assertValid()
     _isValid = false
+    _destroySite = Utils.getCallSite().shortForm
+    logInfo("Destroying %s (from %s)".format(toString, _destroySite))
     doDestroy(blocking)
   }
 
@@ -124,7 +141,8 @@ abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable {
   /** Check if this broadcast is valid. If not valid, exception is thrown. */
   protected def assertValid() {
     if (!_isValid) {
-      throw new SparkException("Attempted to use %s after it has been destroyed!".format(toString))
+      throw new SparkException(
+        "Attempted to use %s after it was destroyed (%s) ".format(toString, _destroySite))
     }
   }

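With this change destroy() becomes callable from user code, and a destroyed broadcast now reports the call site where it was destroyed. A hedged usage sketch follows; it assumes a local SparkContext, and the printed message is only paraphrased from the new exception text.

```scala
// Illustrative sketch of the new destroy() behaviour; not code from the commit.
import org.apache.spark.{SparkConf, SparkContext, SparkException}

object BroadcastDestroySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("destroy-sketch"))
    val lookup = sc.broadcast(Map("a" -> 1, "b" -> 2))

    // Normal use: tasks read the broadcast value.
    val counts = sc.parallelize(Seq("a", "b", "a"))
      .map(word => lookup.value.getOrElse(word, 0))
      .collect()
    println(counts.mkString(","))

    // destroy() blocks until all data and metadata for the broadcast are removed.
    lookup.destroy()

    // Any further use throws a SparkException whose message now includes the
    // call site recorded in _destroySite.
    try {
      lookup.value
    } catch {
      case e: SparkException => println(e.getMessage)
    }

    sc.stop()
  }
}
```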
core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
Lines changed: 30 additions & 0 deletions

@@ -20,12 +20,15 @@ package org.apache.spark.deploy
 import java.security.PrivilegedExceptionAction
 
 import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.fs.FileSystem.Statistics
 import org.apache.hadoop.mapred.JobConf
 import org.apache.hadoop.security.Credentials
 import org.apache.hadoop.security.UserGroupInformation
 
 import org.apache.spark.{Logging, SparkContext, SparkConf, SparkException}
 import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.util.Utils
 
 import scala.collection.JavaConversions._
 
@@ -121,6 +124,33 @@ class SparkHadoopUtil extends Logging {
     UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename)
   }
 
+  /**
+   * Returns a function that can be called to find Hadoop FileSystem bytes read. If
+   * getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will
+   * return the bytes read on r since t. Reflection is required because thread-level FileSystem
+   * statistics are only available as of Hadoop 2.5 (see HADOOP-10688).
+   * Returns None if the required method can't be found.
+   */
+  private[spark] def getFSBytesReadOnThreadCallback(path: Path, conf: Configuration)
+    : Option[() => Long] = {
+    val qualifiedPath = path.getFileSystem(conf).makeQualified(path)
+    val scheme = qualifiedPath.toUri().getScheme()
+    val stats = FileSystem.getAllStatistics().filter(_.getScheme().equals(scheme))
+    try {
+      val threadStats = stats.map(Utils.invoke(classOf[Statistics], _, "getThreadStatistics"))
+      val statisticsDataClass =
+        Class.forName("org.apache.hadoop.fs.FileSystem$Statistics$StatisticsData")
+      val getBytesReadMethod = statisticsDataClass.getDeclaredMethod("getBytesRead")
+      val f = () => threadStats.map(getBytesReadMethod.invoke(_).asInstanceOf[Long]).sum
+      val baselineBytesRead = f()
+      Some(() => f() - baselineBytesRead)
+    } catch {
+      case e: NoSuchMethodException => {
+        logDebug("Couldn't find method for retrieving thread-level FileSystem input data", e)
+        None
+      }
+    }
+  }
 }
 
 object SparkHadoopUtil {

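The returned callback is a delta against a baseline captured when the callback is created, so each reader thread gets its own running count of bytes read. A rough sketch of how a caller might use it; because the helper is private[spark], the sketch lives under the org.apache.spark package, and the surrounding metric-recording code is illustrative, not part of this commit.

```scala
// Sketch of using the thread-level bytes-read callback added above.
// Placed in org.apache.spark because getFSBytesReadOnThreadCallback is private[spark].
package org.apache.spark

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil

object BytesReadSketch {
  def readWithMetrics(pathStr: String): Unit = {
    val conf = new Configuration()
    val path = new Path(pathStr)

    // None on Hadoop < 2.5, where thread-level FileSystem statistics
    // (HADOOP-10688) are not available.
    val bytesReadCallback: Option[() => Long] =
      SparkHadoopUtil.get.getFSBytesReadOnThreadCallback(path, conf)

    // ... perform the actual read on this thread ...

    // The callback reports bytes read on this thread since it was created.
    val bytesRead = bytesReadCallback.map(f => f()).getOrElse(-1L)
    println(s"bytes read on this thread: $bytesRead")
  }
}
```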
core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala
Lines changed: 0 additions & 1 deletion

@@ -169,7 +169,6 @@ case class InputMetrics(readMethod: DataReadMethod.Value) {
   var bytesRead: Long = 0L
 }
 
-
 /**
  * :: DeveloperApi ::
  * Metrics pertaining to shuffle data read in a given task.

core/src/main/scala/org/apache/spark/network/ManagedBuffer.scala
Lines changed: 8 additions & 2 deletions

@@ -81,7 +81,13 @@ final class FileSegmentManagedBuffer(val file: File, val offset: Long, val lengt
     // Just copy the buffer if it's sufficiently small, as memory mapping has a high overhead.
     if (length < MIN_MEMORY_MAP_BYTES) {
       val buf = ByteBuffer.allocate(length.toInt)
-      channel.read(buf, offset)
+      channel.position(offset)
+      while (buf.remaining() != 0) {
+        if (channel.read(buf) == -1) {
+          throw new IOException("Reached EOF before filling buffer\n" +
+            s"offset=$offset\nfile=${file.getAbsolutePath}\nbuf.remaining=${buf.remaining}")
+        }
+      }
       buf.flip()
       buf
     } else {
@@ -106,7 +112,7 @@ final class FileSegmentManagedBuffer(val file: File, val offset: Long, val lengt
     var is: FileInputStream = null
     try {
       is = new FileInputStream(file)
-      is.skip(offset)
+      ByteStreams.skipFully(is, offset)
       ByteStreams.limit(is, length)
     } catch {
       case e: IOException =>

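The old single call to channel.read(buf, offset) could legally return before the buffer was full, silently producing a short read; the fix positions the channel and loops until the buffer is filled, failing loudly on premature EOF (likewise, InputStream.skip may skip fewer bytes than asked, hence ByteStreams.skipFully). A self-contained sketch of that read-fully pattern on a plain FileChannel follows; the class name and file path are made up for illustration.

```scala
// Stand-alone sketch of the read-fully pattern used in the fix; not the Spark class itself.
import java.io.{File, IOException, RandomAccessFile}
import java.nio.ByteBuffer

object ReadFullySketch {
  /** Read exactly `length` bytes starting at `offset`, or throw if EOF comes first. */
  def readSegment(file: File, offset: Long, length: Int): ByteBuffer = {
    val channel = new RandomAccessFile(file, "r").getChannel
    try {
      val buf = ByteBuffer.allocate(length)
      channel.position(offset)
      // A single read() may return fewer bytes than requested, so keep reading
      // until the buffer is full; read() returning -1 means EOF came too early.
      while (buf.remaining() != 0) {
        if (channel.read(buf) == -1) {
          throw new IOException(
            s"Reached EOF before filling buffer: offset=$offset, remaining=${buf.remaining()}")
        }
      }
      buf.flip()
      buf
    } finally {
      channel.close()
    }
  }
}
```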
core/src/main/scala/org/apache/spark/network/nio/NioBlockTransferService.scala
Lines changed: 15 additions & 10 deletions

@@ -95,16 +95,21 @@ final class NioBlockTransferService(conf: SparkConf, securityManager: SecurityMa
     future.onSuccess { case message =>
       val bufferMessage = message.asInstanceOf[BufferMessage]
       val blockMessageArray = BlockMessageArray.fromBufferMessage(bufferMessage)
-
-      for (blockMessage <- blockMessageArray) {
-        if (blockMessage.getType != BlockMessage.TYPE_GOT_BLOCK) {
-          listener.onBlockFetchFailure(
-            new SparkException(s"Unexpected message ${blockMessage.getType} received from $cmId"))
-        } else {
-          val blockId = blockMessage.getId
-          val networkSize = blockMessage.getData.limit()
-          listener.onBlockFetchSuccess(
-            blockId.toString, new NioByteBufferManagedBuffer(blockMessage.getData))
+      // SPARK-4064: In some cases(eg. Remote block was removed) blockMessageArray may be empty.
+      if (blockMessageArray.isEmpty) {
+        listener.onBlockFetchFailure(
+          new SparkException(s"Received empty message from $cmId"))
+      } else {
+        for (blockMessage <- blockMessageArray) {
+          val msgType = blockMessage.getType
+          if (msgType != BlockMessage.TYPE_GOT_BLOCK) {
+            listener.onBlockFetchFailure(
+              new SparkException(s"Unexpected message ${msgType} received from $cmId"))
+          } else {
+            val blockId = blockMessage.getId
+            listener.onBlockFetchSuccess(
+              blockId.toString, new NioByteBufferManagedBuffer(blockMessage.getData))
+          }
         }
       }
     }(cm.futureExecContext)

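The SPARK-4064 guard matters because an empty decoded message list would otherwise invoke neither the success nor the failure callback, leaving the fetching side waiting indefinitely. A toy sketch of the same guard with simplified stand-in types (these are not Spark's classes):

```scala
// Toy illustration of the SPARK-4064 guard: when a fetch response decodes to an
// empty message list, report a failure instead of silently doing nothing.
final case class Message(kind: String, blockId: String, data: Array[Byte])

trait FetchListener {
  def onSuccess(blockId: String, data: Array[Byte]): Unit
  def onFailure(error: Exception): Unit
}

object EmptyResponseGuardSketch {
  def handleResponse(messages: Seq[Message], from: String, listener: FetchListener): Unit = {
    if (messages.isEmpty) {
      // Without this branch an empty response would invoke neither callback,
      // so the caller would never learn the fetch failed.
      listener.onFailure(new Exception(s"Received empty message from $from"))
    } else {
      for (msg <- messages) {
        if (msg.kind != "GOT_BLOCK") {
          listener.onFailure(new Exception(s"Unexpected message ${msg.kind} received from $from"))
        } else {
          listener.onSuccess(msg.blockId, msg.data)
        }
      }
    }
  }
}
```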