Commit 9555b73

Merge branch 'master' into kafka-rdd-count
2 parents 253031d + e41e2fd commit 9555b73

547 files changed, +16053 −9257 lines changed

.rat-excludes

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ spark-env.sh
 spark-env.cmd
 spark-env.sh.template
 log4j-defaults.properties
+log4j-defaults-repl.properties
 bootstrap-tooltip.js
 jquery-1.11.1.min.js
 d3.min.js

LICENSE

Lines changed: 1 addition & 0 deletions
@@ -950,3 +950,4 @@ The following components are provided under the MIT License. See project link for details.
 (MIT License) scopt (com.github.scopt:scopt_2.10:3.2.0 - https://github.com/scopt/scopt)
 (The MIT License) Mockito (org.mockito:mockito-all:1.8.5 - http://www.mockito.org)
 (MIT License) jquery (https://jquery.org/license/)
+(MIT License) AnchorJS (https://github.com/bryanbraun/anchorjs)

R/create-docs.sh

Lines changed: 4 additions & 4 deletions
@@ -23,14 +23,14 @@
 # After running this script the html docs can be found in
 # $SPARK_HOME/R/pkg/html
 
+set -o pipefail
+set -e
+
 # Figure out where the script is
 export FWDIR="$(cd "`dirname "$0"`"; pwd)"
 pushd $FWDIR
 
-# Generate Rd file
-Rscript -e 'library(devtools); devtools::document(pkg="./pkg", roclets=c("rd"))'
-
-# Install the package
+# Install the package (this will also generate the Rd files)
 ./install-dev.sh
 
 # Now create HTML files

R/install-dev.sh

Lines changed: 10 additions & 1 deletion
@@ -26,11 +26,20 @@
 # NOTE(shivaram): Right now we use $SPARK_HOME/R/lib to be the installation directory
 # to load the SparkR package on the worker nodes.
 
+set -o pipefail
+set -e
 
 FWDIR="$(cd `dirname $0`; pwd)"
 LIB_DIR="$FWDIR/lib"
 
 mkdir -p $LIB_DIR
 
-# Install R
+pushd $FWDIR
+
+# Generate Rd files if devtools is installed
+Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
+
+# Install SparkR to $LIB_DIR
 R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
+
+popd

R/log4j.properties

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 log4j.rootCategory=INFO, file
 log4j.appender.file=org.apache.log4j.FileAppender
 log4j.appender.file.append=true
-log4j.appender.file.file=R-unit-tests.log
+log4j.appender.file.file=R/target/unit-tests.log
 log4j.appender.file.layout=org.apache.log4j.PatternLayout
 log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n

R/pkg/R/SQLContext.R

Lines changed: 10 additions & 4 deletions
@@ -452,7 +452,7 @@ dropTempTable <- function(sqlContext, tableName) {
 #' df <- read.df(sqlContext, "path/to/file.json", source = "json")
 #' }
 
-read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
+read.df <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
     options[['path']] <- path
@@ -462,15 +462,21 @@ read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
     source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
                           "org.apache.spark.sql.parquet")
   }
-  sdf <- callJMethod(sqlContext, "load", source, options)
+  if (!is.null(schema)) {
+    stopifnot(class(schema) == "structType")
+    sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sqlContext, source,
+                       schema$jobj, options)
+  } else {
+    sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sqlContext, source, options)
+  }
   dataFrame(sdf)
 }
 
 #' @aliases loadDF
 #' @export
 
-loadDF <- function(sqlContext, path = NULL, source = NULL, ...) {
-  read.df(sqlContext, path, source, ...)
+loadDF <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
+  read.df(sqlContext, path, source, schema, ...)
 }
 
 #' Create an external table
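
The read.df() change above adds an optional schema argument, so column types can be declared up front instead of inferred from the data. A minimal usage sketch in R, assuming an active SparkR session (sqlContext already created) and a hypothetical people.json file; the schema fields mirror the test added later in this commit:

# "people.json" is a hypothetical path; structType/structField are SparkR's schema builders.
schema <- structType(structField("name", type = "string"),
                     structField("age", type = "double"))
df <- read.df(sqlContext, "people.json", source = "json", schema = schema)
dtypes(df)  # expected: list(c("name", "string"), c("age", "double"))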

R/pkg/R/serialize.R

Lines changed: 8 additions & 0 deletions
@@ -37,6 +37,14 @@ writeObject <- function(con, object, writeType = TRUE) {
 # passing in vectors as arrays and instead require arrays to be passed
 # as lists.
 type <- class(object)[[1]] # class of POSIXlt is c("POSIXlt", "POSIXt")
+# Checking types is needed here, since 'is.na' only handles atomic vectors,
+# lists and pairlists
+if (type %in% c("integer", "character", "logical", "double", "numeric")) {
+  if (is.na(object)) {
+    object <- NULL
+    type <- "NULL"
+  }
+}
 if (writeType) {
   writeType(con, type)
 }
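
The check above rewrites a scalar NA as NULL before it is written to the JVM, so it arrives as a SQL null instead of failing serialization. A small sketch of the observable behavior, assuming a running SparkR session with sc and sqlContext, mirroring the tests added below:

# Taken from the test case added in this commit: an NA field becomes a null column value.
rdd <- parallelize(sc, list(list(1L, 2L), list(NA, 4L)))
df <- createDataFrame(sqlContext, rdd, list("a", "b"))
collected <- collect(df)
is.na(collected[2, "a"])  # TRUE: the NA round-trips as a null
collected[2, "b"]         # 4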

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 50 additions & 0 deletions
@@ -101,6 +101,43 @@ test_that("create DataFrame from RDD", {
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
 })
 
+test_that("convert NAs to null type in DataFrames", {
+  rdd <- parallelize(sc, list(list(1L, 2L), list(NA, 4L)))
+  df <- createDataFrame(sqlContext, rdd, list("a", "b"))
+  expect_true(is.na(collect(df)[2, "a"]))
+  expect_equal(collect(df)[2, "b"], 4L)
+
+  l <- data.frame(x = 1L, y = c(1L, NA_integer_, 3L))
+  df <- createDataFrame(sqlContext, l)
+  expect_equal(collect(df)[2, "x"], 1L)
+  expect_true(is.na(collect(df)[2, "y"]))
+
+  rdd <- parallelize(sc, list(list(1, 2), list(NA, 4)))
+  df <- createDataFrame(sqlContext, rdd, list("a", "b"))
+  expect_true(is.na(collect(df)[2, "a"]))
+  expect_equal(collect(df)[2, "b"], 4)
+
+  l <- data.frame(x = 1, y = c(1, NA_real_, 3))
+  df <- createDataFrame(sqlContext, l)
+  expect_equal(collect(df)[2, "x"], 1)
+  expect_true(is.na(collect(df)[2, "y"]))
+
+  l <- list("a", "b", NA, "d")
+  df <- createDataFrame(sqlContext, l)
+  expect_true(is.na(collect(df)[3, "_1"]))
+  expect_equal(collect(df)[4, "_1"], "d")
+
+  l <- list("a", "b", NA_character_, "d")
+  df <- createDataFrame(sqlContext, l)
+  expect_true(is.na(collect(df)[3, "_1"]))
+  expect_equal(collect(df)[4, "_1"], "d")
+
+  l <- list(TRUE, FALSE, NA, TRUE)
+  df <- createDataFrame(sqlContext, l)
+  expect_true(is.na(collect(df)[3, "_1"]))
+  expect_equal(collect(df)[4, "_1"], TRUE)
+})
+
 test_that("toDF", {
   rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
   df <- toDF(rdd, list("a", "b"))
@@ -504,6 +541,19 @@ test_that("read.df() from json file", {
   df <- read.df(sqlContext, jsonPath, "json")
   expect_true(inherits(df, "DataFrame"))
   expect_true(count(df) == 3)
+
+  # Check if we can apply a user defined schema
+  schema <- structType(structField("name", type = "string"),
+                       structField("age", type = "double"))
+
+  df1 <- read.df(sqlContext, jsonPath, "json", schema)
+  expect_true(inherits(df1, "DataFrame"))
+  expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))
+
+  # Run the same with loadDF
+  df2 <- loadDF(sqlContext, jsonPath, "json", schema)
+  expect_true(inherits(df2, "DataFrame"))
+  expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))
 })
 
 test_that("write.df() as parquet file", {

bin/pyspark

Lines changed: 1 addition & 15 deletions
@@ -17,24 +17,10 @@
 # limitations under the License.
 #
 
-# Figure out where Spark is installed
 export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
 
 source "$SPARK_HOME"/bin/load-spark-env.sh
-
-function usage() {
-  if [ -n "$1" ]; then
-    echo $1
-  fi
-  echo "Usage: ./bin/pyspark [options]" 1>&2
-  "$SPARK_HOME"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  exit $2
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage
-fi
+export _SPARK_CMD_USAGE="Usage: ./bin/pyspark [options]"
 
 # In Spark <= 1.1, setting IPYTHON=1 would cause the driver to be launched using the `ipython`
 # executable, while the worker would still be launched using PYSPARK_PYTHON.

bin/pyspark2.cmd

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ rem Figure out where the Spark framework is installed
 set SPARK_HOME=%~dp0..
 
 call %SPARK_HOME%\bin\load-spark-env.cmd
+set _SPARK_CMD_USAGE=Usage: bin\pyspark.cmd [options]
 
 rem Figure out which Python to use.
 if "x%PYSPARK_DRIVER_PYTHON%"=="x" (

bin/spark-class

Lines changed: 1 addition & 30 deletions
@@ -16,18 +16,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-set -e
 
 # Figure out where Spark is installed
 export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
 
 . "$SPARK_HOME"/bin/load-spark-env.sh
 
-if [ -z "$1" ]; then
-  echo "Usage: spark-class <class> [<args>]" 1>&2
-  exit 1
-fi
-
 # Find the java binary
 if [ -n "${JAVA_HOME}" ]; then
   RUNNER="${JAVA_HOME}/bin/java"
@@ -64,24 +58,6 @@ fi
 
 SPARK_ASSEMBLY_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"
 
-# Verify that versions of java used to build the jars and run Spark are compatible
-if [ -n "$JAVA_HOME" ]; then
-  JAR_CMD="$JAVA_HOME/bin/jar"
-else
-  JAR_CMD="jar"
-fi
-
-if [ $(command -v "$JAR_CMD") ] ; then
-  jar_error_check=$("$JAR_CMD" -tf "$SPARK_ASSEMBLY_JAR" nonexistent/class/path 2>&1)
-  if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
-    echo "Loading Spark jar with '$JAR_CMD' failed. " 1>&2
-    echo "This is likely because Spark was compiled with Java 7 and run " 1>&2
-    echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark " 1>&2
-    echo "or build Spark with Java 6." 1>&2
-    exit 1
-  fi
-fi
-
 LAUNCH_CLASSPATH="$SPARK_ASSEMBLY_JAR"
 
 # Add the launcher build dir to the classpath if requested.
@@ -98,9 +74,4 @@ CMD=()
 while IFS= read -d '' -r ARG; do
   CMD+=("$ARG")
 done < <("$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@")
-
-if [ "${CMD[0]}" = "usage" ]; then
-  "${CMD[@]}"
-else
-  exec "${CMD[@]}"
-fi
+exec "${CMD[@]}"

bin/spark-shell

Lines changed: 1 addition & 14 deletions
@@ -29,20 +29,7 @@ esac
 set -o posix
 
 export FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
-
-usage() {
-  if [ -n "$1" ]; then
-    echo "$1"
-  fi
-  echo "Usage: ./bin/spark-shell [options]"
-  "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  exit "$2"
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage "" 0
-fi
+export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]"
 
 # SPARK-4161: scala does not assume use of the java classpath,
 # so we need to add the "-Dscala.usejavacp=true" flag manually. We

bin/spark-shell2.cmd

Lines changed: 2 additions & 19 deletions
@@ -18,12 +18,7 @@ rem limitations under the License.
 rem
 
 set SPARK_HOME=%~dp0..
-
-echo "%*" | findstr " \<--help\> \<-h\>" >nul
-if %ERRORLEVEL% equ 0 (
-  call :usage
-  exit /b 0
-)
+set _SPARK_CMD_USAGE=Usage: .\bin\spark-shell.cmd [options]
 
 rem SPARK-4161: scala does not assume use of the java classpath,
 rem so we need to add the "-Dscala.usejavacp=true" flag manually. We
@@ -37,16 +32,4 @@ if "x%SPARK_SUBMIT_OPTS%"=="x" (
 set SPARK_SUBMIT_OPTS="%SPARK_SUBMIT_OPTS% -Dscala.usejavacp=true"
 
 :run_shell
-call %SPARK_HOME%\bin\spark-submit2.cmd --class org.apache.spark.repl.Main %*
-set SPARK_ERROR_LEVEL=%ERRORLEVEL%
-if not "x%SPARK_LAUNCHER_USAGE_ERROR%"=="x" (
-  call :usage
-  exit /b 1
-)
-exit /b %SPARK_ERROR_LEVEL%
-
-:usage
-echo %SPARK_LAUNCHER_USAGE_ERROR%
-echo "Usage: .\bin\spark-shell.cmd [options]" >&2
-call %SPARK_HOME%\bin\spark-submit2.cmd --help 2>&1 | findstr /V "Usage" 1>&2
-goto :eof
+%SPARK_HOME%\bin\spark-submit2.cmd --class org.apache.spark.repl.Main %*

bin/spark-sql

Lines changed: 2 additions & 37 deletions
@@ -17,41 +17,6 @@
 # limitations under the License.
 #
 
-#
-# Shell script for starting the Spark SQL CLI
-
-# Enter posix mode for bash
-set -o posix
-
-# NOTE: This exact class name is matched downstream by SparkSubmit.
-# Any changes need to be reflected there.
-export CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver"
-
-# Figure out where Spark is installed
 export FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
-
-function usage {
-  if [ -n "$1" ]; then
-    echo "$1"
-  fi
-  echo "Usage: ./bin/spark-sql [options] [cli option]"
-  pattern="usage"
-  pattern+="\|Spark assembly has been built with Hive"
-  pattern+="\|NOTE: SPARK_PREPEND_CLASSES is set"
-  pattern+="\|Spark Command: "
-  pattern+="\|--help"
-  pattern+="\|======="
-
-  "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  echo
-  echo "CLI options:"
-  "$FWDIR"/bin/spark-class "$CLASS" --help 2>&1 | grep -v "$pattern" 1>&2
-  exit "$2"
-}
-export -f usage
-
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  usage "" 0
-fi
-
-exec "$FWDIR"/bin/spark-submit --class "$CLASS" "$@"
+export _SPARK_CMD_USAGE="Usage: ./bin/spark-sql [options] [cli option]"
+exec "$FWDIR"/bin/spark-submit --class org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver "$@"

bin/spark-submit

Lines changed: 0 additions & 12 deletions
@@ -22,16 +22,4 @@ SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
 # disable randomized hash for string in Python 3.3+
 export PYTHONHASHSEED=0
 
-# Only define a usage function if an upstream script hasn't done so.
-if ! type -t usage >/dev/null 2>&1; then
-  usage() {
-    if [ -n "$1" ]; then
-      echo "$1"
-    fi
-    "$SPARK_HOME"/bin/spark-class org.apache.spark.deploy.SparkSubmit --help
-    exit "$2"
-  }
-  export -f usage
-fi
-
 exec "$SPARK_HOME"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"

bin/spark-submit2.cmd

Lines changed: 1 addition & 12 deletions
@@ -24,15 +24,4 @@ rem disable randomized hash for string in Python 3.3+
 set PYTHONHASHSEED=0
 
 set CLASS=org.apache.spark.deploy.SparkSubmit
-call %~dp0spark-class2.cmd %CLASS% %*
-set SPARK_ERROR_LEVEL=%ERRORLEVEL%
-if not "x%SPARK_LAUNCHER_USAGE_ERROR%"=="x" (
-  call :usage
-  exit /b 1
-)
-exit /b %SPARK_ERROR_LEVEL%
-
-:usage
-echo %SPARK_LAUNCHER_USAGE_ERROR%
-call %SPARK_HOME%\bin\spark-class2.cmd %CLASS% --help
-goto :eof
+%~dp0spark-class2.cmd %CLASS% %*
