[iceberg] upgrade iceberg spark runtime to 1.4.2 (#26) #151
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
name: Build and test
on:
  push:
    branches:
    - '**'
  workflow_call:
    inputs:
      ansi_enabled:
        required: false
        type: boolean
        default: false
jobs:
  configure-jobs:
    name: Configure jobs
    runs-on: ubuntu-20.04
    outputs:
      java: ${{ steps.set-outputs.outputs.java }}
      branch: ${{ steps.set-outputs.outputs.branch }}
      hadoop: ${{ steps.set-outputs.outputs.hadoop }}
      type: ${{ steps.set-outputs.outputs.type }}
      envs: ${{ steps.set-outputs.outputs.envs }}
    steps:
    - name: Configure branch and additional environment variables
      id: set-outputs
      run: |
        if [ "${{ github.event.schedule }}" = "0 1 * * *" ]; then
          echo '::set-output name=java::8'
          echo '::set-output name=branch::master'
          echo '::set-output name=type::scheduled'
          echo '::set-output name=envs::{}'
          echo '::set-output name=hadoop::hadoop2'
        elif [ "${{ github.event.schedule }}" = "0 4 * * *" ]; then
          echo '::set-output name=java::8'
          echo '::set-output name=branch::master'
          echo '::set-output name=type::scheduled'
          echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}'
          echo '::set-output name=hadoop::hadoop3'
        elif [ "${{ github.event.schedule }}" = "0 7 * * *" ]; then
          echo '::set-output name=java::8'
          echo '::set-output name=branch::branch-3.2'
          echo '::set-output name=type::scheduled'
          echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}'
          echo '::set-output name=hadoop::hadoop3.2'
        elif [ "${{ github.event.schedule }}" = "0 10 * * *" ]; then
          echo '::set-output name=java::8'
          echo '::set-output name=branch::master'
          echo '::set-output name=type::pyspark-coverage-scheduled'
          echo '::set-output name=envs::{"PYSPARK_CODECOV": "true"}'
          echo '::set-output name=hadoop::hadoop3'
        elif [ "${{ github.event.schedule }}" = "0 13 * * *" ]; then
          echo '::set-output name=java::11'
          echo '::set-output name=branch::master'
          echo '::set-output name=type::scheduled'
          echo '::set-output name=envs::{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}'
          echo '::set-output name=hadoop::hadoop3'
        elif [ "${{ github.event.schedule }}" = "0 16 * * *" ]; then
          echo '::set-output name=java::17'
          echo '::set-output name=branch::master'
          echo '::set-output name=type::scheduled'
          echo '::set-output name=envs::{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}'
          echo '::set-output name=hadoop::hadoop3'
        else
          echo '::set-output name=java::8'
          echo '::set-output name=branch::branch-3.3' # Default branch to run on. CHANGE here when a branch is cut out.
          echo '::set-output name=type::regular'
          echo '::set-output name=envs::{"SPARK_ANSI_SQL_MODE": "${{ inputs.ansi_enabled }}"}'
          echo '::set-output name=hadoop::hadoop3'
        fi
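        # Illustrative note (not part of the upstream script): for a regular push build, the
        # outputs above end up roughly as java=8, branch=branch-3.3, type=regular, hadoop=hadoop3,
        # and envs='{"SPARK_ANSI_SQL_MODE": "false"}'. The 'envs' output is a JSON string that
        # downstream jobs expand into step-level environment variables via fromJSON(...).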
  precondition:
    name: Check changes
    runs-on: ubuntu-20.04
    env:
      GITHUB_PREV_SHA: ${{ github.event.before }}
    outputs:
      required: ${{ steps.set-outputs.outputs.required }}
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.3
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Check all modules
      id: set-outputs
      run: |
        build=`./dev/is-changed.py -m avro,build,catalyst,core,docker-integration-tests,examples,graphx,hadoop-cloud,hive,hive-thriftserver,kubernetes,kvstore,launcher,mesos,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,spark-ganglia-lgpl,sparkr,sql,sql-kafka-0-10,streaming,streaming-kafka-0-10,streaming-kinesis-asl,tags,unsafe,yarn`
        pyspark=`./dev/is-changed.py -m avro,build,catalyst,core,graphx,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,sql,tags,unsafe`
        sparkr=`./dev/is-changed.py -m avro,build,catalyst,core,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,repl,sketch,sparkr,sql,tags,unsafe`
        tpcds=`./dev/is-changed.py -m build,catalyst,core,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe`
        docker=`./dev/is-changed.py -m build,catalyst,core,docker-integration-tests,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe`
        echo "{\"build\": \"$build\", \"pyspark\": \"$pyspark\", \"sparkr\": \"$sparkr\", \"tpcds\": \"$tpcds\", \"docker\": \"$docker\"}" > required.json
        cat required.json
        echo "::set-output name=required::$(cat required.json)"
  # Build: build Spark and run the tests for specified modules.
  build:
    name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
    needs: [configure-jobs, precondition]
    # Run scheduled jobs for Apache Spark only.
    # Run regular jobs for commits in both Apache Spark and forked repositories.
    if: >-
      (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'scheduled')
      || (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true')
    # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        java:
          - ${{ needs.configure-jobs.outputs.java }}
        hadoop:
          - ${{ needs.configure-jobs.outputs.hadoop }}
        hive:
          - hive2.3
        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now
        # because the Kinesis tests depend on the external Amazon Kinesis service.
        # Note that the modules below are from sparktestsupport/modules.py.
        modules:
          - >-
            core, unsafe, kvstore, avro,
            network-common, network-shuffle, repl, launcher,
            examples, sketch, graphx
          - >-
            catalyst, hive-thriftserver
          - >-
            streaming, sql-kafka-0-10, streaming-kafka-0-10,
            mllib-local, mllib,
            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
        # Here, we split the Hive and SQL tests into the slow ones and the rest.
        included-tags: [""]
        excluded-tags: [""]
        comment: [""]
        include:
          # Hive tests
          - modules: hive
            java: ${{ needs.configure-jobs.outputs.java }}
            hadoop: ${{ needs.configure-jobs.outputs.hadoop }}
            hive: hive2.3
            included-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- slow tests"
          - modules: hive
            java: ${{ needs.configure-jobs.outputs.java }}
            hadoop: ${{ needs.configure-jobs.outputs.hadoop }}
            hive: hive2.3
            excluded-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- other tests"
          # SQL tests
          - modules: sql
            java: ${{ needs.configure-jobs.outputs.java }}
            hadoop: ${{ needs.configure-jobs.outputs.hadoop }}
            hive: hive2.3
            included-tags: org.apache.spark.tags.ExtendedSQLTest
            comment: "- slow tests"
          - modules: sql
            java: ${{ needs.configure-jobs.outputs.java }}
            hadoop: ${{ needs.configure-jobs.outputs.hadoop }}
            hive: hive2.3
            excluded-tags: org.apache.spark.tags.ExtendedSQLTest
            comment: "- other tests"
    env:
      MODULES_TO_TEST: ${{ matrix.modules }}
      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
      INCLUDED_TAGS: ${{ matrix.included-tags }}
      HADOOP_PROFILE: ${{ matrix.hadoop }}
      HIVE_PROFILE: ${{ matrix.hive }}
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      # In order to fetch changed files
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: ${{ needs.configure-jobs.outputs.branch }}
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    # Cache local repositories. Note that the GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
    - name: Install Java ${{ matrix.java }}
      uses: actions/setup-java@v1
      with:
        java-version: ${{ matrix.java }}
    - name: Install Python 3.8
      uses: actions/setup-python@v2
      # We should install one Python 3 interpreter (3.8 here) for SQL and Yarn because:
      # - The SQL component also has Python-related tests, for example, IntegratedUDFTestUtils.
      # - Yarn has a Python-specific test too, for example, YarnClusterSuite.
      if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      with:
        python-version: 3.8
        architecture: x64
    - name: Install Python packages (Python 3.8)
      if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      run: |
        python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy xmlrunner
        python3.8 -m pip list
    # Run the tests.
    - name: Run tests
      env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }}
      run: |
        # The Hive "other tests" run needs a larger metaspace size, based on experiments.
        if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi
        export SERIAL_SBT_TESTS=1
        ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
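        # Illustrative example (not part of the upstream script): for the "hive - slow tests"
        # matrix entry above, this invocation expands to roughly
        #   ./dev/run-tests --parallelism 1 --modules "hive" --included-tags "org.apache.spark.tags.SlowHiveTest" --excluded-tags ""
        # so only tests annotated with SlowHiveTest run in that job.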
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
        path: "**/target/unit-tests.log"
  pyspark:
    needs: [configure-jobs, precondition]
    # Run PySpark coverage scheduled jobs for Apache Spark only.
    # Run scheduled jobs with JDK 17 in Apache Spark only.
    # Run regular jobs for commits in both Apache Spark and forked repositories.
    if: >-
      (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'pyspark-coverage-scheduled')
      || (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'scheduled' && needs.configure-jobs.outputs.java == '17')
      || (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).pyspark == 'true')
    name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }}"
    runs-on: ubuntu-20.04
    container:
      image: dongjoon/apache-spark-github-action-image:20220207
    strategy:
      fail-fast: false
      matrix:
        java:
          - ${{ needs.configure-jobs.outputs.java }}
        modules:
          - >-
            pyspark-sql, pyspark-mllib, pyspark-resource
          - >-
            pyspark-core, pyspark-streaming, pyspark-ml
          - >-
            pyspark-pandas
          - >-
            pyspark-pandas-slow
    env:
      MODULES_TO_TEST: ${{ matrix.modules }}
      HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }}
      HIVE_PROFILE: hive2.3
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
      SKIP_UNIDOC: true
      SKIP_MIMA: true
      METASPACE_SIZE: 1g
      SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }}
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      # In order to fetch changed files
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.3
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    # Cache local repositories. Note that the GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          pyspark-coursier-
    - name: Install Java ${{ matrix.java }}
      uses: actions/setup-java@v1
      with:
        java-version: ${{ matrix.java }}
    - name: List Python packages (Python 3.9, PyPy3)
      run: |
        python3.9 -m pip list
        pypy3 -m pip list
    - name: Install Conda for pip packaging test
      run: |
        curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh
        bash miniconda.sh -b -p $HOME/miniconda
    # Run the tests.
    - name: Run tests
      env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }}
      run: |
        export PATH=$PATH:$HOME/miniconda/bin
        ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
    - name: Upload coverage to Codecov
      if: needs.configure-jobs.outputs.type == 'pyspark-coverage-scheduled'
      uses: codecov/codecov-action@v2
      with:
        files: ./python/coverage.xml
        flags: unittests
        name: PySpark
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-${{ matrix.modules }}--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-${{ matrix.modules }}--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
        path: "**/target/unit-tests.log"
  sparkr:
    needs: [configure-jobs, precondition]
    if: >-
      (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).sparkr == 'true')
      || (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'scheduled' && needs.configure-jobs.outputs.java == '17')
    name: "Build modules: sparkr"
    runs-on: ubuntu-20.04
    container:
      image: dongjoon/apache-spark-github-action-image:20220207
    env:
      HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }}
      HIVE_PROFILE: hive2.3
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
      SKIP_MIMA: true
      SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }}
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      # In order to fetch changed files
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.3
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    # Cache local repositories. Note that the GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          sparkr-coursier-
    - name: Install Java ${{ needs.configure-jobs.outputs.java }}
      uses: actions/setup-java@v1
      with:
        java-version: ${{ needs.configure-jobs.outputs.java }}
    - name: Run tests
      run: |
        # The following are also used by `r-lib/actions/setup-r` to avoid
        # R issues in the Docker environment.
        export TZ=UTC
        export _R_CHECK_SYSTEM_CLOCK_=FALSE
        ./dev/run-tests --parallelism 1 --modules sparkr
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-sparkr--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
        path: "**/target/test-reports/*.xml"
  # Static analysis and documentation build
  lint:
    needs: configure-jobs
    if: needs.configure-jobs.outputs.type == 'regular'
    name: Linters, licenses, dependencies and documentation generation
    runs-on: ubuntu-20.04
    env:
      LC_ALL: C.UTF-8
      LANG: C.UTF-8
      PYSPARK_DRIVER_PYTHON: python3.9
      PYSPARK_PYTHON: python3.9
      GITHUB_PREV_SHA: ${{ github.event.before }}
    container:
      image: dongjoon/apache-spark-github-action-image:20220207
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.3
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    # Cache local repositories. Note that the GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          docs-coursier-
    - name: Cache Maven local repository
      uses: actions/cache@v2
      with:
        path: ~/.m2/repository
        key: docs-maven-${{ hashFiles('**/pom.xml') }}
        restore-keys: |
          docs-maven-
    - name: Install Python linter dependencies
      run: |
        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
        # Jinja2 3.0.0+ causes errors when building with Sphinx.
        # See also https://issues.apache.org/jira/browse/SPARK-35375.
        python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==21.12b0'
        python3.9 -m pip install 'pandas-stubs==1.2.0.53'
    - name: Install R linter dependencies and SparkR
      run: |
        apt update
        apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \
          libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \
          libtiff5-dev libjpeg-dev
        Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
        Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')"
        ./R/install-dev.sh
    - name: Install JavaScript linter dependencies
      run: |
        apt update
        apt-get install -y nodejs npm
    - name: Install dependencies for documentation generation
      run: |
        # pandoc is required to generate the PySpark API docs with nbsphinx as well.
        apt-get install -y libcurl4-openssl-dev pandoc
        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
        # See also https://github.com/sphinx-doc/sphinx/issues/7551.
        # Jinja2 3.0.0+ causes errors when building with Sphinx.
        # See also https://issues.apache.org/jira/browse/SPARK-35375.
        # Pin MarkupSafe to 2.0.1 to resolve the CI error.
        # See also https://issues.apache.org/jira/browse/SPARK-38279.
        python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0'
        python3.9 -m pip install ipython_genutils # See SPARK-38517
        python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
        python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
        apt-get update -y
        apt-get install -y ruby ruby-dev
        Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')"
        Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
        Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
        gem install bundler
        cd docs
        bundle install
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Scala linter
      run: ./dev/lint-scala
    - name: Java linter
      run: ./dev/lint-java
    - name: Python linter
      run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
    - name: R linter
      run: ./dev/lint-r
    - name: JS linter
      run: ./dev/lint-js
    - name: License test
      run: ./dev/check-license
    - name: Dependencies test
      run: ./dev/test-dependencies.sh
    - name: Run documentation build
      run: |
        if [ -f "./dev/is-changed.py" ]; then
          # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
          pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
          if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi
          if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi
        fi
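        # Illustrative note (not part of the upstream script): pyspark_modules above evaluates to a
        # comma-separated list such as "pyspark-core,pyspark-sql,pyspark-mllib,..." (the exact contents
        # come from sparktestsupport/modules.py), so the PySpark and SparkR docs are skipped only when
        # is-changed.py reports no changes in those modules.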
        cd docs
        bundle exec jekyll build
  java-11-17:
    needs: [configure-jobs, precondition]
    if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true'
    name: Java ${{ matrix.java }} build with Maven
    strategy:
      fail-fast: false
      matrix:
        java:
          - 11
          - 17
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.3
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Maven local repository
      uses: actions/cache@v2
      with:
        path: ~/.m2/repository
        key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
        restore-keys: |
          java${{ matrix.java }}-maven-
    - name: Install Java ${{ matrix.java }}
      uses: actions/setup-java@v1
      with:
        java-version: ${{ matrix.java }}
    - name: Build with Maven
      run: |
        export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
        export MAVEN_CLI_OPTS="--no-transfer-progress"
        export JAVA_VERSION=${{ matrix.java }}
        # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install
        rm -rf ~/.m2/repository/org/apache/spark
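        # Illustrative note (not part of the upstream script): the rm above is presumably there so the
        # freshly 'install'-ed Spark artifacts do not end up in the cached ~/.m2/repository, keeping the
        # Maven cache restored by the "Cache Maven local repository" step small and stable across runs.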
  scala-213:
    needs: [configure-jobs, precondition]
    if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true'
    name: Scala 2.13 build with SBT
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.3
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          scala-213-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Build with SBT
      run: |
        ./dev/change-scala-version.sh 2.13
        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile
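        # Illustrative note (not part of the upstream script): change-scala-version.sh rewrites the
        # Scala binary version recorded in the POMs to 2.13, and the -Pscala-2.13 profile then makes
        # SBT compile main and test sources against Scala 2.13; no tests are executed in this job.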
  tpcds-1g:
    needs: [configure-jobs, precondition]
    if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).tpcds == 'true'
    name: Run TPC-DS queries with SF=1
    runs-on: ubuntu-20.04
    env:
      SPARK_LOCAL_IP: localhost
      SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }}
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.3
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          tpcds-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Cache TPC-DS generated data
      id: cache-tpcds-sf-1
      uses: actions/cache@v2
      with:
        path: ./tpcds-sf-1
        key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
    - name: Checkout tpcds-kit repository
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      uses: actions/checkout@v2
      with:
        repository: databricks/tpcds-kit
        ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
        path: ./tpcds-kit
    - name: Build tpcds-kit
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      run: cd tpcds-kit/tools && make OS=LINUX
    - name: Generate TPC-DS (SF=1) table data
      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
      run: build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"
    - name: Run TPC-DS queries (Sort merge join)
      run: |
        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
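        # Illustrative note (not part of the upstream script): this step and the two below run the same
        # TPCDSQueryTestSuite; only the SPARK_TPCDS_JOIN_CONF env block differs. It appears to carry extra
        # SQL configs (one key=value per line) that steer the planner toward sort merge join here,
        # broadcast hash join, and shuffled hash join respectively.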
      env:
        SPARK_TPCDS_JOIN_CONF: |
          spark.sql.autoBroadcastJoinThreshold=-1
          spark.sql.join.preferSortMergeJoin=true
    - name: Run TPC-DS queries (Broadcast hash join)
      run: |
        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
      env:
        SPARK_TPCDS_JOIN_CONF: |
          spark.sql.autoBroadcastJoinThreshold=10485760
    - name: Run TPC-DS queries (Shuffled hash join)
      run: |
        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
      env:
        SPARK_TPCDS_JOIN_CONF: |
          spark.sql.autoBroadcastJoinThreshold=-1
          spark.sql.join.forceApplyShuffledHashJoin=true
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-tpcds--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-tpcds--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
        path: "**/target/unit-tests.log"
  docker-integration-tests:
    needs: [configure-jobs, precondition]
    if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).docker == 'true'
    name: Run Docker integration tests
    runs-on: ubuntu-20.04
    env:
      HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }}
      HIVE_PROFILE: hive2.3
      GITHUB_PREV_SHA: ${{ github.event.before }}
      SPARK_LOCAL_IP: localhost
      ORACLE_DOCKER_IMAGE_NAME: gvenzl/oracle-xe:18.4.0
      SKIP_MIMA: true
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
      with:
        fetch-depth: 0
        repository: apache/spark
        ref: branch-3.3
    - name: Sync the current branch with the latest in Apache Spark
      if: github.repository != 'apache/spark'
      run: |
        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
    - name: Cache Scala, SBT and Maven
      uses: actions/cache@v2
      with:
        path: |
          build/apache-maven-*
          build/scala-*
          build/*.jar
          ~/.sbt
        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
        restore-keys: |
          build-
    - name: Cache Coursier local repository
      uses: actions/cache@v2
      with:
        path: ~/.cache/coursier
        key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          docker-integration-coursier-
    - name: Install Java 8
      uses: actions/setup-java@v1
      with:
        java-version: 8
    - name: Run tests
      run: |
        ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
    - name: Upload test results to report
      if: always()
      uses: actions/upload-artifact@v2
      with:
        name: test-results-docker-integration--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
        path: "**/target/test-reports/*.xml"
    - name: Upload unit tests log files
      if: failure()
      uses: actions/upload-artifact@v2
      with:
        name: unit-tests-log-docker-integration--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
        path: "**/target/unit-tests.log"