178 changes: 9 additions & 169 deletions .github/workflows/master.yml
@@ -9,172 +9,6 @@ on:
- master

jobs:
# TODO(SPARK-32248): Recover JDK 11 builds
# Build: build Spark and run the tests for specified modules.
build:
name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
java:
- 1.8
hadoop:
- hadoop3.2
hive:
- hive2.3
# TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
# Kinesis tests depend on an external Amazon Kinesis service.
# Note that the modules below are from sparktestsupport/modules.py.
modules:
- |-
core, unsafe, kvstore, avro,
network-common, network-shuffle, repl, launcher,
examples, sketch, graphx
- |-
catalyst, hive-thriftserver
- |-
streaming, sql-kafka-0-10, streaming-kafka-0-10,
mllib-local, mllib,
yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
- |-
pyspark-sql, pyspark-mllib, pyspark-resource
- |-
pyspark-core, pyspark-streaming, pyspark-ml
- |-
sparkr
# Here, we split the Hive and SQL tests into the slow ones and the rest.
included-tags: [""]
excluded-tags: [""]
comment: [""]
include:
# Hive tests
- modules: hive
java: 1.8
hadoop: hadoop3.2
hive: hive2.3
included-tags: org.apache.spark.tags.SlowHiveTest
comment: "- slow tests"
- modules: hive
java: 1.8
hadoop: hadoop3.2
hive: hive2.3
excluded-tags: org.apache.spark.tags.SlowHiveTest
comment: "- other tests"
# SQL tests
- modules: sql
java: 1.8
hadoop: hadoop3.2
hive: hive2.3
included-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- slow tests"
- modules: sql
java: 1.8
hadoop: hadoop3.2
hive: hive2.3
excluded-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- other tests"
env:
MODULES_TO_TEST: ${{ matrix.modules }}
EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
INCLUDED_TAGS: ${{ matrix.included-tags }}
HADOOP_PROFILE: ${{ matrix.hadoop }}
HIVE_PROFILE: ${{ matrix.hive }}
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
GITHUB_PREV_SHA: ${{ github.event.before }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT, Maven and Zinc
uses: actions/cache@v1
with:
path: build
key: build-${{ hashFiles('**/pom.xml') }}
restore-keys: |
build-
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-maven-
- name: Cache Ivy local repository
uses: actions/cache@v2
with:
path: ~/.ivy2/cache
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
- name: Install JDK ${{ matrix.java }}
uses: actions/setup-java@v1
with:
java-version: ${{ matrix.java }}
# PySpark
- name: Install PyPy3
# Note that the order of Python installations here matters because the default
# python3 is overridden by pypy3.
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark')
with:
python-version: pypy3
architecture: x64
- name: Install Python 3.6
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark')
with:
python-version: 3.6
architecture: x64
- name: Install Python 3.8
uses: actions/setup-python@v2
# We should install one Python that is higher than 3 for SQL and Yarn because:
# - The SQL component also has Python-related tests, for example, IntegratedUDFTestUtils.
# - Yarn has a Python-specific test too, for example, YarnClusterSuite.
if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 3.8
architecture: x64
- name: Install Python packages (Python 3.6 and PyPy3)
if: contains(matrix.modules, 'pyspark')
# PyArrow is not supported in PyPy yet, see ARROW-2651.
# TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
run: |
python3.6 -m pip install numpy pyarrow pandas scipy
python3.6 -m pip list
pypy3 -m pip install numpy pandas
pypy3 -m pip list
- name: Install Python packages (Python 3.8)
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
run: |
python3.8 -m pip install numpy pyarrow pandas scipy
python3.8 -m pip list
# SparkR
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
if: contains(matrix.modules, 'sparkr')
with:
r-version: 3.6
- name: Install R packages
if: contains(matrix.modules, 'sparkr')
run: |
sudo apt-get install -y libcurl4-openssl-dev
sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
# Show installed packages in R.
sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
# Run the tests.
- name: "Run tests: ${{ matrix.modules }}"
run: |
# Hive tests become flaky when run in parallel, as they are too intensive.
if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
mkdir -p ~/.m2
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
rm -rf ~/.m2/repository/org/apache/spark

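Editor's note: the matrix deleted above splits the Hive and SQL modules into "slow" and "other" groups by way of test tags, which the "Run tests" step forwards to ./dev/run-tests. As a rough illustration only (MatrixEntry and build_command are hypothetical names, not part of this PR or of dev/run-tests.py), the following Python sketch shows how each matrix combination maps onto that command line:

from dataclasses import dataclass
from typing import List


@dataclass
class MatrixEntry:
    """Hypothetical mirror of one build-matrix combination from the deleted job."""
    modules: str
    included_tags: str = ""
    excluded_tags: str = ""


def build_command(entry: MatrixEntry) -> List[str]:
    """Compose the ./dev/run-tests invocation issued by the deleted "Run tests" step."""
    cmd = ["./dev/run-tests", "--parallelism", "2", "--modules", entry.modules]
    # The deleted step always passes both tag flags, even when empty; empty values
    # are skipped here purely for readability.
    if entry.included_tags:
        cmd += ["--included-tags", entry.included_tags]
    if entry.excluded_tags:
        cmd += ["--excluded-tags", entry.excluded_tags]
    return cmd


# The two Hive combinations from the matrix: slow tests vs. everything else.
print(build_command(MatrixEntry("hive", included_tags="org.apache.spark.tags.SlowHiveTest")))
print(build_command(MatrixEntry("hive", excluded_tags="org.apache.spark.tags.SlowHiveTest")))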
# Static analysis, and documentation build
lint:
name: Linters, licenses, dependencies and documentation generation
@@ -204,9 +38,15 @@ jobs:
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
with:
r-version: 3.6
run: |
sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
sudo apt-get update
sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install dependencies for documentation generation
run: |
Rscript -e 'print(version)'
ls /usr/local/bin
- name: Install R linter dependencies and SparkR
run: |
sudo apt-get install -y libcurl4-openssl-dev
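Editor's note: the lint job now provisions R through apt from the cloud.r-project.org bionic-cran40 repository instead of r-lib/actions/setup-r, and the following "Install dependencies for documentation generation" step prints the installed version with Rscript -e 'print(version)'. As a hedged illustration (the helper below is hypothetical and not part of this PR), an equivalent check from Python could look like:

import shutil
import subprocess


def installed_r_version() -> str:
    """Return the version of the R interpreter on PATH (hypothetical helper, not in this PR)."""
    rscript = shutil.which("Rscript")
    if rscript is None:
        raise RuntimeError("R was not found in PATH; did the apt-get install step run?")
    result = subprocess.run(
        [rscript, "-e", "cat(paste(R.version$major, R.version$minor, sep='.'))"],
        capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()


if __name__ == "__main__":
    print("Installed R version:", installed_r_version())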
14 changes: 8 additions & 6 deletions dev/run-tests.py
@@ -609,11 +609,14 @@ def main():
" install one and retry.")
sys.exit(2)

# install SparkR
if which("R"):
run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
else:
print("Cannot install SparkR as R was not found in PATH")
# Install SparkR
should_only_test_modules = opts.modules is not None
if not should_only_test_modules:
# If test modules are specified, we will not run the R linter.
if which("R"):
run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
else:
print("Cannot install SparkR as R was not found in PATH")

if os.environ.get("AMPLAB_JENKINS"):
# if we're on the Amplab Jenkins build servers setup variables
@@ -643,7 +646,6 @@ def main():
changed_modules = None
test_modules = None
changed_files = None
should_only_test_modules = opts.modules is not None
included_tags = []
excluded_tags = []
if should_only_test_modules:
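Editor's note: the two run-tests.py hunks above move the should_only_test_modules computation before the SparkR install, so the same flag gates both the install and the later module-selection logic. Below is a minimal, self-contained sketch of that resulting flow, under the assumption that which and run_cmd stand in for the helpers the real script imports and that a plain modules argument stands in for opts.modules; it illustrates the control flow and is not the actual dev/run-tests.py code.

import os
import shutil
import subprocess
import sys

# Assumption: SPARK_HOME points at the repository root, as it does in dev/run-tests.py.
SPARK_HOME = os.environ.get("SPARK_HOME", os.getcwd())


def which(program):
    """Stand-in for the helper the real script imports; shutil.which behaves the same here."""
    return shutil.which(program)


def run_cmd(cmd):
    """Stand-in for the helper the real script imports; run a command and fail loudly."""
    subprocess.run(cmd, check=True)


def main(modules=None):
    # 'modules' stands in for opts.modules from the real option parser.
    should_only_test_modules = modules is not None

    # Install SparkR only for full runs; when specific test modules are requested,
    # the R build is skipped, which is the behaviour this hunk introduces.
    if not should_only_test_modules:
        if which("R"):
            run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
        else:
            print("Cannot install SparkR as R was not found in PATH")

    # Later in the real main(), the same flag (no longer recomputed) decides whether
    # to resolve the explicitly requested modules or to detect changed modules.
    return should_only_test_modules


if __name__ == "__main__":
    main(modules=sys.argv[1] if len(sys.argv) > 1 else None)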