Commit f2ceacd

Merge remote-tracking branch 'upstream/master'
2 parents: 126a51e + 66a7637

316 files changed (+8663, -4775 lines)
.github/workflows/build_and_test.yml

Lines changed: 65 additions & 48 deletions
@@ -23,7 +23,7 @@ jobs:
       fail-fast: false
       matrix:
         java:
-          - 1.8
+          - 8
         hadoop:
           - hadoop3.2
         hive:
@@ -49,26 +49,26 @@ jobs:
         include:
           # Hive tests
           - modules: hive
-            java: 1.8
+            java: 8
             hadoop: hadoop3.2
             hive: hive2.3
             included-tags: org.apache.spark.tags.SlowHiveTest
             comment: "- slow tests"
           - modules: hive
-            java: 1.8
+            java: 8
             hadoop: hadoop3.2
             hive: hive2.3
             excluded-tags: org.apache.spark.tags.SlowHiveTest
             comment: "- other tests"
           # SQL tests
           - modules: sql
-            java: 1.8
+            java: 8
             hadoop: hadoop3.2
             hive: hive2.3
             included-tags: org.apache.spark.tags.ExtendedSQLTest
             comment: "- slow tests"
           - modules: sql
-            java: 1.8
+            java: 8
             hadoop: hadoop3.2
             hive: hive2.3
             excluded-tags: org.apache.spark.tags.ExtendedSQLTest
@@ -101,24 +101,18 @@ jobs:
           build/zinc-*
           build/scala-*
           build/*.jar
+          ~/.sbt
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Maven local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.m2/repository
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
     - name: Cache Coursier local repository
       uses: actions/cache@v2
       with:
         path: ~/.cache/coursier
         key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
           ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
-    - name: Install JDK ${{ matrix.java }}
+    - name: Install Java ${{ matrix.java }}
       uses: actions/setup-java@v1
       with:
         java-version: ${{ matrix.java }}
@@ -139,11 +133,9 @@ jobs:
     # Run the tests.
     - name: Run tests
       run: |
-        # Hive tests become flaky when running in parallel as it's too intensive.
-        if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
-        mkdir -p ~/.m2
+        # Hive and SQL tests become flaky when running in parallel as it's too intensive.
+        if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi
         ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
-        rm -rf ~/.m2/repository/org/apache/spark
     - name: Upload test results to report
       if: always()
       uses: actions/upload-artifact@v2
@@ -196,16 +188,10 @@ jobs:
           build/zinc-*
           build/scala-*
           build/*.jar
+          ~/.sbt
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Maven local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.m2/repository
-        key: pyspark-maven-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          pyspark-maven-
     - name: Cache Coursier local repository
       uses: actions/cache@v2
       with:
@@ -228,24 +214,22 @@ jobs:
     # Run the tests.
     - name: Run tests
       run: |
-        mkdir -p ~/.m2
         ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
-        rm -rf ~/.m2/repository/org/apache/spark
     - name: Upload test results to report
       if: always()
       uses: actions/upload-artifact@v2
       with:
-        name: test-results-${{ matrix.modules }}--1.8-hadoop3.2-hive2.3
+        name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3
         path: "**/target/test-reports/*.xml"
     - name: Upload unit tests log files
       if: failure()
       uses: actions/upload-artifact@v2
       with:
-        name: unit-tests-log-${{ matrix.modules }}--1.8-hadoop3.2-hive2.3
+        name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3
         path: "**/target/unit-tests.log"
 
   sparkr:
-    name: Build modules - sparkr
+    name: "Build modules: sparkr"
     runs-on: ubuntu-20.04
     container:
       image: dongjoon/apache-spark-github-action-image:20201025
@@ -272,16 +256,10 @@ jobs:
           build/zinc-*
           build/scala-*
           build/*.jar
+          ~/.sbt
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Maven local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.m2/repository
-        key: sparkr-maven-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          sparkr-maven-
     - name: Cache Coursier local repository
       uses: actions/cache@v2
       with:
@@ -291,18 +269,16 @@ jobs:
           sparkr-coursier-
     - name: Run tests
       run: |
-        mkdir -p ~/.m2
         # The followings are also used by `r-lib/actions/setup-r` to avoid
         # R issues at docker environment
         export TZ=UTC
         export _R_CHECK_SYSTEM_CLOCK_=FALSE
         ./dev/run-tests --parallelism 2 --modules sparkr
-        rm -rf ~/.m2/repository/org/apache/spark
     - name: Upload test results to report
       if: always()
       uses: actions/upload-artifact@v2
       with:
-        name: test-results-sparkr--1.8-hadoop3.2-hive2.3
+        name: test-results-sparkr--8-hadoop3.2-hive2.3
         path: "**/target/test-reports/*.xml"
 
   # Static analysis, and documentation build
@@ -312,17 +288,37 @@ jobs:
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
+    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+    - name: Cache Scala, SBT, Maven and Zinc
+      uses: actions/cache@v2
+      with:
+        path: |
+          build/apache-maven-*
+          build/zinc-*
+          build/scala-*
+          build/*.jar
+          ~/.sbt
+        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+        restore-keys: |
+          build-
+    - name: Cache Coursier local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.cache/coursier
+        key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          docs-coursier-
     - name: Cache Maven local repository
       uses: actions/cache@v2
       with:
         path: ~/.m2/repository
-        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
+        key: docs-maven-${{ hashFiles('**/pom.xml') }}
         restore-keys: |
           docs-maven-
-    - name: Install JDK 1.8
+    - name: Install Java 8
       uses: actions/setup-java@v1
       with:
-        java-version: 1.8
+        java-version: 8
     - name: Install Python 3.6
       uses: actions/setup-python@v2
       with:
@@ -373,8 +369,8 @@ jobs:
         cd docs
         jekyll build
 
-  java11:
-    name: Java 11 build
+  java-11:
+    name: Java 11 build with Maven
     runs-on: ubuntu-20.04
     steps:
     - name: Checkout Spark repository
@@ -394,12 +390,12 @@ jobs:
       run: |
         export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
         export MAVEN_CLI_OPTS="--no-transfer-progress"
-        mkdir -p ~/.m2
+        # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
         ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
         rm -rf ~/.m2/repository/org/apache/spark
 
   scala-213:
-    name: Scala 2.13 build
+    name: Scala 2.13 build with SBT
     runs-on: ubuntu-20.04
     steps:
     - name: Checkout Spark repository
@@ -411,11 +407,32 @@ jobs:
         key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
           scala-213-coursier-
-    - name: Install Java 11
+    - name: Install Java 8
       uses: actions/setup-java@v1
       with:
-        java-version: 11
+        java-version: 8
     - name: Build with SBT
       run: |
         ./dev/change-scala-version.sh 2.13
         ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Djava.version=11 -Pscala-2.13 compile test:compile
+
+  hadoop-2:
+    name: Hadoop 2 build with SBT
+    runs-on: ubuntu-20.04
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache Coursier local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.cache/coursier
+        key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          hadoop-2-coursier-
+    - name: Install Java 8
+      uses: actions/setup-java@v1
+      with:
+        java-version: 8
+    - name: Build with SBT
+      run: |
+        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile

R/pkg/NAMESPACE

Lines changed: 2 additions & 0 deletions
@@ -292,6 +292,7 @@ exportMethods("%<=>%",
               "floor",
               "format_number",
               "format_string",
+              "from_avro",
               "from_csv",
               "from_json",
               "from_unixtime",
@@ -416,6 +417,7 @@ exportMethods("%<=>%",
               "timestamp_seconds",
               "toDegrees",
               "toRadians",
+              "to_avro",
               "to_csv",
               "to_date",
               "to_json",

R/pkg/R/functions.R

Lines changed: 101 additions & 0 deletions
@@ -361,6 +361,50 @@ NULL
 #' }
 NULL
 
+#' Avro processing functions for Column operations
+#'
+#' Avro processing functions defined for \code{Column}.
+#'
+#' @param x Column to compute on.
+#' @param jsonFormatSchema character Avro schema in JSON string format
+#' @param ... additional argument(s) passed as parser options.
+#' @name column_avro_functions
+#' @rdname column_avro_functions
+#' @family avro functions
+#' @note Avro is built-in but external data source module since Spark 2.4.
+#'   Please deploy the application as per
+#'   \href{https://spark.apache.org/docs/latest/sql-data-sources-avro.html#deploying}{
+#'     the deployment section
+#'   } of "Apache Avro Data Source Guide".
+#' @examples
+#' \dontrun{
+#' df <- createDataFrame(iris)
+#' schema <- paste(
+#'   c(
+#'     '{"type": "record", "namespace": "example.avro", "name": "Iris", "fields": [',
+#'     '{"type": ["double", "null"], "name": "Sepal_Length"},',
+#'     '{"type": ["double", "null"], "name": "Sepal_Width"},',
+#'     '{"type": ["double", "null"], "name": "Petal_Length"},',
+#'     '{"type": ["double", "null"], "name": "Petal_Width"},',
+#'     '{"type": ["string", "null"], "name": "Species"}]}'
+#'   ),
+#'   collapse="\\n"
+#' )
+#'
+#' df_serialized <- select(
+#'   df,
+#'   alias(to_avro(alias(struct(column("*")), "fields")), "payload")
+#' )
+#'
+#' df_deserialized <- select(
+#'   df_serialized,
+#'   from_avro(df_serialized$payload, schema)
+#' )
+#'
+#' head(df_deserialized)
+#' }
+NULL
+
 #' @details
 #' \code{lit}: A new Column is created to represent the literal value.
 #' If the parameter is a Column, it is returned unchanged.
@@ -4547,3 +4591,60 @@ setMethod("vector_to_array",
             )
             column(jc)
           })
+
+#' @details
+#' \code{from_avro} Converts a binary column of Avro format into its corresponding catalyst value.
+#' The specified schema must match the read data, otherwise the behavior is undefined:
+#' it may fail or return arbitrary result.
+#' To deserialize the data with a compatible and evolved schema, the expected Avro schema can be
+#' set via the option avroSchema.
+#'
+#' @rdname column_avro_functions
+#' @aliases from_avro from_avro,Column-method
+#' @note from_avro since 3.1.0
+setMethod("from_avro",
+          signature(x = "characterOrColumn"),
+          function(x, jsonFormatSchema, ...) {
+            x <- if (is.character(x)) {
+              column(x)
+            } else {
+              x
+            }
+
+            options <- varargsToStrEnv(...)
+            jc <- callJStatic(
+              "org.apache.spark.sql.avro.functions", "from_avro",
+              x@jc,
+              jsonFormatSchema,
+              options
+            )
+            column(jc)
+          })
+
+#' @details
+#' \code{to_avro} Converts a column into binary of Avro format.
+#'
+#' @rdname column_avro_functions
+#' @aliases to_avro to_avro,Column-method
+#' @note to_avro since 3.1.0
+setMethod("to_avro",
+          signature(x = "characterOrColumn"),
+          function(x, jsonFormatSchema = NULL) {
+            x <- if (is.character(x)) {
+              column(x)
+            } else {
+              x
+            }
+
+            jc <- if (is.null(jsonFormatSchema)) {
+              callJStatic("org.apache.spark.sql.avro.functions", "to_avro", x@jc)
+            } else {
+              callJStatic(
+                "org.apache.spark.sql.avro.functions",
+                "to_avro",
+                x@jc,
+                jsonFormatSchema
+              )
+            }
+            column(jc)
+          })
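
For reference, the usage example embedded in the roxygen block above, restated as a plain SparkR session sketch. This is illustrative only and not part of the commit; it assumes SparkR 3.1.0 or later, the external spark-avro module deployed as described in the Avro data source guide linked in the @note, and an already configured Spark installation.

library(SparkR)
sparkR.session()  # assumes the spark-avro package is available on the classpath

# createDataFrame() renames iris columns such as Sepal.Length to Sepal_Length,
# which is why the Avro schema below uses underscored field names.
df <- createDataFrame(iris)
schema <- paste(
  c(
    '{"type": "record", "namespace": "example.avro", "name": "Iris", "fields": [',
    '{"type": ["double", "null"], "name": "Sepal_Length"},',
    '{"type": ["double", "null"], "name": "Sepal_Width"},',
    '{"type": ["double", "null"], "name": "Petal_Length"},',
    '{"type": ["double", "null"], "name": "Petal_Width"},',
    '{"type": ["string", "null"], "name": "Species"}]}'
  ),
  collapse = "\n"
)

# Pack all columns into a struct and encode it as a single Avro binary column ...
df_serialized <- select(df, alias(to_avro(alias(struct(column("*")), "fields")), "payload"))
# ... then decode the binary column back into a struct using the matching schema.
df_deserialized <- select(df_serialized, from_avro(df_serialized$payload, schema))
head(df_deserialized)

Both methods are registered for the characterOrColumn signature, so a column name string works in place of a Column object, and from_avro additionally forwards parser options (for example avroSchema, for reading with a compatible evolved schema) to the underlying org.apache.spark.sql.avro.functions call.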
