[SPARK-1133] add small files input in MLlib #164


Closed
wants to merge 61 commits into from
61 commits
fd93e59
add small text files input API
yinxusen Mar 18, 2014
9bf87d4
Merge branch 'master' into small-files-input
yinxusen Mar 18, 2014
e3681f2
Spark 1246 add min max to stat counter
dwmclary Mar 18, 2014
e7423d4
Revert "SPARK-1236 - Upgrade Jetty to 9.1.3.v20140225."
pwendell Mar 18, 2014
c27a7ab
fix errors and refine code
yinxusen Mar 18, 2014
2fa26ec
SPARK-1102: Create a saveAsNewAPIHadoopDataset method
CodingCat Mar 18, 2014
79e547f
Update copyright year in NOTICE to 2014
mateiz Mar 18, 2014
e108b9a
[SPARK-1260]: faster construction of features with intercept
mengxr Mar 18, 2014
f9d8a83
[SPARK-1266] persist factors in implicit ALS
mengxr Mar 19, 2014
cc2655a
Fix SPARK-1256: Master web UI and Worker web UI returns a 404 error
witgo Mar 19, 2014
a18ea00
Bundle tachyon: SPARK-1269
nicklan Mar 19, 2014
d55ec86
bugfix: Wrong "Duration" in "Active Stages" in stages page
BlackNiuza Mar 19, 2014
6112270
SPARK-1203 fix saving to hdfs from yarn
tgravescs Mar 19, 2014
ab747d3
Bugfixes/improvements to scheduler
mridulm Mar 19, 2014
79d07d6
[SPARK-1132] Persisting Web UI through refactoring the SparkListener …
andrewor14 Mar 19, 2014
67fa71c
Added doctest for map function in rdd.py
jyotiska Mar 19, 2014
1678931
SPARK-1099: Spark's local mode should probably respect spark.cores.max…
Mar 19, 2014
ffe272d
Revert "SPARK-1099:Spark's local mode should probably respect spark.c…
aarondav Mar 20, 2014
7d22941
remove merge process from smallTextFiles interface
yinxusen Mar 20, 2014
66a03e5
Principal Component Analysis
rezazadeh Mar 20, 2014
ca76423
[Hot Fix #42] Do not stop SparkUI if bind() is not called
andrewor14 Mar 20, 2014
9aadcff
SPARK-1251 Support for optimizing and executing structured queries
marmbrus Mar 21, 2014
e09139d
Fix maven jenkins: Add explicit init for required tables in SQLQueryS…
marmbrus Mar 21, 2014
78c0f25
remove useless code and consideration, neaten the code style
yinxusen Mar 21, 2014
7e17fe6
Add hive test files to repository. Remove download script.
marmbrus Mar 21, 2014
2c0aa22
SPARK-1279: Fix improper use of SimpleDateFormat
zsxwing Mar 21, 2014
dab5439
Make SQL keywords case-insensitive
mateiz Mar 21, 2014
d780983
Add asCode function for dumping raw tree representations.
marmbrus Mar 21, 2014
646e554
Fix to Stage UI to display numbers on progress bar
emtiazahmed Mar 22, 2014
d348362
fix code style problems and rewrite the test suite for simplicity.
yinxusen Mar 22, 2014
abf6714
SPARK-1254. Supplemental fix for HTTPS on Maven Central
srowen Mar 23, 2014
57a4379
[SPARK-1292] In-memory columnar representation for Spark SQL
liancheng Mar 23, 2014
8265dc7
Fixed coding style issues in Spark SQL
liancheng Mar 23, 2014
80c2968
[SPARK-1212] Adding sparse data support and update KMeans
mengxr Mar 24, 2014
eae90e4
refine code documentation
yinxusen Mar 24, 2014
839bd3f
remove the use of commons-io
yinxusen Mar 24, 2014
21109fb
SPARK-1144 Added license and RAT to check licenses.
ScrapCodes Mar 24, 2014
56db8a2
HOT FIX: Exclude test files from RAT
pwendell Mar 24, 2014
8043b7b
SPARK-1294 Fix resolution of uppercase field names using a HiveContext.
marmbrus Mar 25, 2014
dc126f2
SPARK-1094 Support MiMa for reporting binary compatibility across ve…
pwendell Mar 25, 2014
5140598
SPARK-1128: set hadoop task properties when constructing HadoopRDD
CodingCat Mar 25, 2014
b637f2d
Unify the logic for column pruning, projection, and filtering of tabl…
marmbrus Mar 25, 2014
007a733
SPARK-1286: Make usage of spark-env.sh idempotent
aarondav Mar 25, 2014
05ed628
move wholefile interface from MLUtils to MLContext
yinxusen Mar 25, 2014
134ace7
Add more hive compatibility tests to whitelist
marmbrus Mar 25, 2014
71d4ed2
SPARK-1316. Remove use of Commons IO
srowen Mar 25, 2014
f8111ea
SPARK-1319: Fix scheduler to account for tasks using > 1 CPUs.
shivaram Mar 25, 2014
8237df8
Avoid Option while generating call site
witgo Mar 25, 2014
f87dab8
fix logic error
yinxusen Mar 26, 2014
4f7d547
Initial experimentation with Travis CI configuration
marmbrus Mar 26, 2014
b859853
SPARK-1321 Use Guava's top k implementation rather than our BoundedPr…
rxin Mar 26, 2014
3b69987
change from Java code to Scala
yinxusen Mar 26, 2014
a0853a3
SPARK-1322, top in pyspark should sort result in descending order.
ScrapCodes Mar 26, 2014
345825d
Unified package definition format in Spark SQL
liancheng Mar 26, 2014
b0ea02a
modify scala doc, and add space after 'if'
yinxusen Mar 26, 2014
32cbdfd
[SQL] Un-ignore a test that is now passing.
marmbrus Mar 27, 2014
e15e574
[SQL] Add a custom serializer for maps since they do not have a no-ar…
marmbrus Mar 27, 2014
be6d96c
SPARK-1324: SparkUI Should Not Bind to SPARK_PUBLIC_DNS
pwendell Mar 27, 2014
3e63d98
Spark 1095: Adding explicit return types to all public methods
NirmalReddy Mar 27, 2014
1fa48d9
SPARK-1325. The maven build error for Spark Tools
srowen Mar 27, 2014
4ed60d1
rebase to the latest trunk to merge
yinxusen Mar 27, 2014
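
For context, the yinxusen commits above implement the PR's feature: an input API that reads a directory of many small text files as whole-file records instead of per-line records, first added to MLUtils and later moved to MLContext (commit 05ed628). A minimal usage sketch follows; the MLContext wrapper, the smallTextFiles name, and the RDD[(String, String)] result type are assumptions inferred from the commit messages, not confirmed by the truncated diff below.

import org.apache.spark.SparkContext
import org.apache.spark.mllib.MLContext  // assumed package and class, per commit 05ed628

val sc  = new SparkContext("local", "small-files-demo")
val mlc = new MLContext(sc)  // assumed wrapper around SparkContext

// Each record is a (filePath, fileContent) pair, so a small file is never
// split into per-line records the way sc.textFile would split it.
val files = mlc.smallTextFiles("hdfs://namenode:9000/user/data/small-files")

// Example: report the content length of each file.
files.map { case (path, content) => (path, content.length) }
  .collect()
  .foreach(println)

Reading whole files as single records is the point of SPARK-1133; the same shape later shipped in Spark core as SparkContext.wholeTextFiles.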
Note: this diff is too large to display in full; only the first 3000 changed files are loaded.
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,6 +7,7 @@
sbt/*.jar
.settings
.cache
.mima-excludes
/build/
work/
out/
@@ -45,3 +46,5 @@ dist/
spark-*-bin.tar.gz
unit-tests.log
/lib/
rat-results.txt
mllib/build/
41 changes: 41 additions & 0 deletions .rat-excludes
@@ -0,0 +1,41 @@
target
.gitignore
.project
.classpath
.mima-excludes
.rat-excludes
.*md
derby.log
TAGS
RELEASE
control
docs
fairscheduler.xml.template
log4j.properties
log4j.properties.template
metrics.properties.template
slaves
spark-env.sh
spark-env.sh.template
log4j-defaults.properties
sorttable.js
.*txt
.*data
.*log
cloudpickle.py
join.py
SparkExprTyper.scala
SparkILoop.scala
SparkILoopInit.scala
SparkIMain.scala
SparkImports.scala
SparkJLineCompletion.scala
SparkJLineReader.scala
SparkMemberHandlers.scala
sbt
sbt-launch-lib.bash
plugins.sbt
work
.*\.q
golden
test.out/*
37 changes: 37 additions & 0 deletions .travis.yml
@@ -0,0 +1,37 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

language: scala
scala:
  - "2.10.3"
jdk:
  - oraclejdk7
env:
  matrix:
    - TEST=sql/test
    - TEST=hive/test
    - TEST=catalyst/test
    - TEST=streaming/test
    - TEST=graphx/test
    - TEST=mllib/test
    - TEST=graphx/test
    - TEST=bagel/test
cache:
  directories:
    - $HOME/.m2
    - $HOME/.ivy2
    - $HOME/.sbt
script:
  - "sbt ++$TRAVIS_SCALA_VERSION scalastyle $TEST"
11 changes: 10 additions & 1 deletion NOTICE
@@ -1,5 +1,14 @@
Apache Spark
Copyright 2013 The Apache Software Foundation.
Copyright 2014 The Apache Software Foundation.

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

In addition, this product includes:

- JUnit (http://www.junit.org) is a testing framework for Java. We included it
under the terms of the Eclipse Public License v1.0.

- JTransforms (https://sites.google.com/site/piotrwendykier/software/jtransforms)
provides fast transforms in Java. It is tri-licensed, and we included it under
the terms of the Mozilla Public License v1.1.
5 changes: 5 additions & 0 deletions assembly/pom.xml
@@ -79,6 +79,11 @@
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.py4j</groupId>
<artifactId>py4j</artifactId>
37 changes: 29 additions & 8 deletions bin/compute-classpath.sh
@@ -25,31 +25,49 @@ SCALA_VERSION=2.10
# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"

# Load environment variables from conf/spark-env.sh, if it exists
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
. $FWDIR/conf/spark-env.sh
fi
. $FWDIR/bin/load-spark-env.sh

# Build up classpath
CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"

# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
# the future.
if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
echo "Hive assembly found, including hive support. If this isn't desired run sbt hive/clean."

# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS

ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
else
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
fi

# First check if we have a dependencies jar. If so, include binary classes with the deps jar
if [ -f "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar ]; then
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/tools/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"

DEPS_ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar`
DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
else
# Else use spark-assembly jar from either RELEASE or assembly directory
if [ -f "$FWDIR/RELEASE" ]; then
ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark-assembly*.jar`
ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark*-assembly*.jar`
else
ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar`
ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*.jar`
fi
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
fi
@@ -62,6 +80,9 @@ if [[ $SPARK_TESTING == 1 ]]; then
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/test-classes"
fi

# Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !
35 changes: 35 additions & 0 deletions bin/load-spark-env.sh
@@ -0,0 +1,35 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This script loads spark-env.sh if it exists, and ensures it is only loaded once.
# spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's
# conf/ subdirectory.

if [ -z "$SPARK_ENV_LOADED" ]; then
export SPARK_ENV_LOADED=1

# Returns the parent of the directory this script lives in.
parent_dir="$(cd `dirname $0`/..; pwd)"

use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}

if [ -f "${use_conf_dir}/spark-env.sh" ]; then
. "${use_conf_dir}/spark-env.sh"
fi
fi
5 changes: 1 addition & 4 deletions bin/pyspark
@@ -36,10 +36,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
fi
fi

# Load environment variables from conf/spark-env.sh, if it exists
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
. $FWDIR/conf/spark-env.sh
fi
. $FWDIR/bin/load-spark-env.sh

# Figure out which Python executable to use
if [ -z "$PYSPARK_PYTHON" ] ; then
5 changes: 1 addition & 4 deletions bin/run-example
@@ -30,10 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
# Export this as SPARK_HOME
export SPARK_HOME="$FWDIR"

# Load environment variables from conf/spark-env.sh, if it exists
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
. $FWDIR/conf/spark-env.sh
fi
. $FWDIR/bin/load-spark-env.sh

if [ -z "$1" ]; then
echo "Usage: run-example <example-class> [<args>]" >&2
8 changes: 2 additions & 6 deletions bin/spark-class
@@ -30,10 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
# Export this as SPARK_HOME
export SPARK_HOME="$FWDIR"

# Load environment variables from conf/spark-env.sh, if it exists
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
. $FWDIR/conf/spark-env.sh
fi
. $FWDIR/bin/load-spark-env.sh

if [ -z "$1" ]; then
echo "Usage: spark-class <class> [<args>]" >&2
@@ -137,8 +134,7 @@ fi

# Compute classpath using external script
CLASSPATH=`$FWDIR/bin/compute-classpath.sh`

if [ "$1" == "org.apache.spark.tools.JavaAPICompletenessChecker" ]; then
if [[ "$1" =~ org.apache.spark.tools.* ]]; then
CLASSPATH="$CLASSPATH:$SPARK_TOOLS_JAR"
fi

4 changes: 1 addition & 3 deletions bin/spark-shell
@@ -81,9 +81,7 @@ done
# Set MASTER from spark-env if possible
DEFAULT_SPARK_MASTER_PORT=7077
if [ -z "$MASTER" ]; then
if [ -e "$FWDIR/conf/spark-env.sh" ]; then
. "$FWDIR/conf/spark-env.sh"
fi
. $FWDIR/bin/load-spark-env.sh
if [ "x" != "x$SPARK_MASTER_IP" ]; then
if [ "y" != "y$SPARK_MASTER_PORT" ]; then
SPARK_MASTER_PORT="${SPARK_MASTER_PORT}"
5 changes: 0 additions & 5 deletions core/pom.xml
@@ -200,11 +200,6 @@
<artifactId>derby</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId>
2 changes: 0 additions & 2 deletions core/src/main/scala/org/apache/spark/Aggregator.scala
@@ -17,8 +17,6 @@

package org.apache.spark

import scala.{Option, deprecated}

import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap}

/**