From 95305bf5e96f5b43337553998a64fe08ddeb7b4d Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Sat, 11 Jul 2020 13:09:06 -0700
Subject: [PATCH] [SPARK-32245][INFRA] Run Spark tests in Github Actions

This PR aims to run the Spark tests in GitHub Actions.

To briefly explain the main idea:
- Reuse `dev/run-tests.py` with the SBT build.
- Reuse the modules in `dev/sparktestsupport/modules.py` to test each module.
- Pass the modules to test into `dev/run-tests.py` directly via the `TEST_ONLY_MODULES` environment variable, for example, `pyspark-sql,core,sql,hive`.
- `dev/run-tests.py` _does not_ take the dependent modules into account; it tests only the specified modules.

Another thing to note is the `SlowHiveTest` annotation. Running the tests in the Hive modules takes too long, so the slow tests are extracted and run as a separate job. The split was derived from the actual elapsed times in Jenkins:

![Screen Shot 2020-07-09 at 7 48 13 PM](https://user-images.githubusercontent.com/6477701/87050238-f6098e80-c238-11ea-9c4a-ab505af61381.png)

So, the Hive tests are separated into two jobs: one runs the slow test cases, and the other runs the remaining test cases.

_Note that_ the current GitHub Actions build virtually copies what the default PR builder on Jenkins does (without other profiles such as JDK 11, Hadoop 2, etc.). The only exception is Kinesis:
https://github.com/apache/spark/pull/29057/files#diff-04eb107ee163a50b61281ca08f4e4c7bR23

From last week onwards, the Jenkins machines have become very unstable for many reasons:
- Apparently, the machines became extremely slow. Almost no tests can pass.
- One machine (worker 4) started to have a corrupt `.m2` repository, which fails the build.
- The documentation build fails from time to time, for an unknown reason specific to the Jenkins machines. It is disabled for now at https://github.com/apache/spark/pull/29017.
- Almost all PRs are currently blocked by this instability.

The advantages of using GitHub Actions:
- It avoids depending on the few people who have access to the cluster.
- It reduces the elapsed build time: we can split the tests (e.g., SQL, ML, CORE) and run them in parallel, so the total build time is significantly reduced.
- It lets us control the environment more flexibly.
- Other contributors can test and propose fixes to the GitHub Actions configuration, so we can distribute the build-management cost.

Note that:
- The current build in Jenkins takes _more than 7 hours_. With GitHub Actions it takes _less than 2 hours_.
- We can now control the environments, especially for Python, easily.
- The tests and build look more stable than on Jenkins.

There is no user-facing change; this is a dev-only change.

Tested at https://github.com/HyukjinKwon/spark/pull/4

Closes #29057 from HyukjinKwon/migrate-to-github-actions.
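As a minimal sketch of how a single matrix job drives `dev/run-tests.py` through the environment variables introduced in this patch (the module name, tag, and Hadoop profile below are example values mirroring the "hive - slow tests" job in the workflow, not a prescribed configuration):

```sh
# Rough local equivalent of one GitHub Actions matrix job (illustrative sketch).
export TEST_ONLY_MODULES="hive"                                      # comma-separated names from dev/sparktestsupport/modules.py
export TEST_ONLY_INCLUDED_TAGS="org.apache.spark.tags.SlowHiveTest"  # run only the suites tagged as slow Hive tests
export HADOOP_PROFILE="hadoop2.6"                                    # Hadoop profile chosen by the matrix
export SERIAL_SBT_TESTS=1                                            # Hive tests run serially to avoid flakiness
./dev/run-tests --parallelism 2
```

Dropping `TEST_ONLY_INCLUDED_TAGS` and setting `TEST_ONLY_EXCLUDED_TAGS` to the same tag instead would correspond to the "hive - other tests" job.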
Authored-by: HyukjinKwon Signed-off-by: Dongjoon Hyun --- .github/workflows/branch-2.4.yml | 104 --------- .github/workflows/master.yml | 220 ++++++++++++++++++ .../org/apache/spark/tags/SlowHiveTest.java | 30 +++ dev/run-pip-tests | 11 +- dev/run-tests.py | 89 ++++--- dev/sparktestsupport/modules.py | 93 ++++++-- project/SparkBuild.scala | 11 + python/pyspark/sql/tests.py | 3 +- python/pyspark/streaming/tests.py | 10 + python/run-tests.py | 2 +- .../execution/HiveCompatibilitySuite.scala | 2 + .../HiveExternalCatalogVersionsSuite.scala | 3 + .../spark/sql/hive/HiveSparkSubmitSuite.scala | 3 + .../spark/sql/hive/client/VersionsSuite.scala | 3 +- .../execution/AggregationQuerySuite.scala | 2 + .../sql/hive/execution/HiveDDLSuite.scala | 3 + .../sql/hive/execution/HiveQuerySuite.scala | 2 + .../hive/execution/Hive_2_1_DDLSuite.scala | 3 +- .../sql/hive/execution/SQLQuerySuite.scala | 2 + 19 files changed, 437 insertions(+), 159 deletions(-) delete mode 100644 .github/workflows/branch-2.4.yml create mode 100644 .github/workflows/master.yml create mode 100644 common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java diff --git a/.github/workflows/branch-2.4.yml b/.github/workflows/branch-2.4.yml deleted file mode 100644 index 77e8f27d35b8d..0000000000000 --- a/.github/workflows/branch-2.4.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: branch-2.4 - -on: - push: - branches: - - branch-2.4 - pull_request: - branches: - - branch-2.4 - -jobs: - build: - - runs-on: ubuntu-latest - strategy: - matrix: - scala: [ '2.11', '2.12' ] - hadoop: [ 'hadoop-2.6', 'hadoop-2.7' ] - name: Build Spark with Scala ${{ matrix.scala }} / Hadoop ${{ matrix.hadoop }} - - steps: - - uses: actions/checkout@master - # We split caches because GitHub Action Cache has a 400MB-size limit. 
- - uses: actions/cache@v1 - with: - path: build - key: build-${{ hashFiles('**/pom.xml') }} - restore-keys: | - build- - - uses: actions/cache@v1 - with: - path: ~/.m2/repository/com - key: ${{ matrix.scala }}-${{ matrix.hadoop }}-maven-com-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.scala }}-${{ matrix.hadoop }}-maven-com- - - uses: actions/cache@v1 - with: - path: ~/.m2/repository/org - key: ${{ matrix.scala }}-${{ matrix.hadoop }}-maven-org-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.scala }}-${{ matrix.hadoop }}-maven-org- - - name: Set up JDK 8 - uses: actions/setup-java@v1 - with: - java-version: '1.8' - - name: Change to Scala ${{ matrix.scala }} - run: | - dev/change-scala-version.sh ${{ matrix.scala }} - - name: Build with Maven - run: | - export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" - export MAVEN_CLI_OPTS="--no-transfer-progress" - mkdir -p ~/.m2 - ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Pscala-${{ matrix.scala }} -P${{ matrix.hadoop }} -Phadoop-cloud install - rm -rf ~/.m2/repository/org/apache/spark - - - lint: - runs-on: ubuntu-latest - name: Linters (Java/Scala/Python), licenses, dependencies - steps: - - uses: actions/checkout@master - - uses: actions/setup-java@v1 - with: - java-version: '1.8' - - uses: actions/setup-python@v1 - with: - python-version: '3.7' - architecture: 'x64' - - name: Scala - run: ./dev/lint-scala - - name: Java - run: ./dev/lint-java - - name: Python - run: | - pip install flake8 sphinx numpy - ./dev/lint-python - - name: License - run: ./dev/check-license - - name: Dependencies - run: ./dev/test-dependencies.sh - - lintr: - runs-on: ubuntu-latest - name: Linter (R) - steps: - - uses: actions/checkout@master - - uses: actions/setup-java@v1 - with: - java-version: '1.8' - - uses: r-lib/actions/setup-r@v1 - with: - r-version: '3.6.2' - - name: install lib - run: | - sudo apt-get install -y libcurl4-openssl-dev - - name: install R packages - run: | - sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" - sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')" - - name: package and install SparkR - run: ./R/install-dev.sh - - name: lint-r - run: ./dev/lint-r diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml new file mode 100644 index 0000000000000..ff90324acdc05 --- /dev/null +++ b/.github/workflows/master.yml @@ -0,0 +1,220 @@ +name: master + +on: + pull_request: + branches: + - branch-2.4 + +jobs: + # TODO(SPARK-32248): Recover JDK 11 builds + # Build: build Spark and run the tests for specified modules. + build: + name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }})" + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + java: + - 1.8 + hadoop: + - hadoop2.6 + # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now. + # Kinesis tests depends on external Amazon kinesis service. + # Note that the modules below are from sparktestsupport/modules.py. 
+ modules: + - |- + core, unsafe, kvstore, avro, + network_common, network_shuffle, repl, launcher + examples, sketch, graphx + - |- + catalyst, hive-thriftserver + - |- + streaming, sql-kafka-0-10, streaming-kafka-0-10, + mllib-local, mllib, + yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl + - |- + pyspark-sql, pyspark-mllib + - |- + pyspark-core, pyspark-streaming, pyspark-ml + - |- + sparkr + # Here, we split Hive and SQL tests into some of slow ones and the rest of them. + included-tags: [""] + excluded-tags: [""] + comment: [""] + include: + # Hive tests + - modules: hive + java: 1.8 + hadoop: hadoop2.6 + included-tags: org.apache.spark.tags.SlowHiveTest + comment: "- slow tests" + - modules: hive + java: 1.8 + hadoop: hadoop2.6 + excluded-tags: org.apache.spark.tags.SlowHiveTest + comment: "- other tests" + # SQL tests + - modules: sql + java: 1.8 + hadoop: hadoop2.6 + included-tags: org.apache.spark.tags.ExtendedSQLTest + comment: "- slow tests" + - modules: sql + java: 1.8 + hadoop: hadoop2.6 + excluded-tags: org.apache.spark.tags.ExtendedSQLTest + comment: "- other tests" + env: + TEST_ONLY_MODULES: ${{ matrix.modules }} + TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }} + TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }} + HADOOP_PROFILE: ${{ matrix.hadoop }} + # GitHub Actions' default miniconda to use in pip packaging test. + CONDA_PREFIX: /usr/share/miniconda + steps: + - name: Checkout Spark repository + uses: actions/checkout@v2 + # Cache local repositories. Note that GitHub Actions cache has a 2G limit. + - name: Cache Scala, SBT, Maven and Zinc + uses: actions/cache@v1 + with: + path: build + key: build-${{ hashFiles('**/pom.xml') }} + restore-keys: | + build- + - name: Cache Maven local repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ matrix.java }}-${{ matrix.hadoop }}-maven- + - name: Cache Ivy local repository + uses: actions/cache@v2 + with: + path: ~/.ivy2/cache + key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }} + restore-keys: | + ${{ matrix.java }}-${{ matrix.hadoop }}-ivy- + - name: Install JDK ${{ matrix.java }} + uses: actions/setup-java@v1 + with: + java-version: ${{ matrix.java }} + # PySpark + - name: Install PyPy3 + # SQL component also has Python related tests, for example, IntegratedUDFTestUtils. + # Note that order of Python installations here matters because default python3 is + # overridden by pypy3. + uses: actions/setup-python@v2 + if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + with: + python-version: pypy3 + architecture: x64 + - name: Install Python 2.7 + uses: actions/setup-python@v2 + if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + with: + python-version: 2.7 + architecture: x64 + - name: Install Python 3.6 + uses: actions/setup-python@v2 + if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + with: + python-version: 3.6 + architecture: x64 + - name: Install Python packages + if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + # PyArrow is not supported in PyPy yet, see ARROW-2651. + # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason. 
+ run: | + python3 -m pip install numpy pyarrow pandas scipy + python3 -m pip list + python2 -m pip install numpy pyarrow pandas scipy + python2 -m pip list + pypy3 -m pip install numpy pandas + pypy3 -m pip list + # SparkR + - name: Install R 3.6 + uses: r-lib/actions/setup-r@v1 + if: contains(matrix.modules, 'sparkr') + with: + r-version: 3.6 + - name: Install R packages + if: contains(matrix.modules, 'sparkr') + run: | + sudo apt-get install -y libcurl4-openssl-dev + sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')" + # Show installed packages in R. + sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]' + # Run the tests. + - name: "Run tests: ${{ matrix.modules }}" + run: | + # Hive tests become flaky when running in parallel as it's too intensive. + if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi + mkdir -p ~/.m2 + ./dev/run-tests --parallelism 2 + rm -rf ~/.m2/repository/org/apache/spark + + # Static analysis, and documentation build + lint: + name: Linters, licenses, dependencies and documentation generation + runs-on: ubuntu-latest + steps: + - name: Checkout Spark repository + uses: actions/checkout@v2 + - name: Cache Maven local repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: docs-maven-repo-${{ hashFiles('**/pom.xml') }} + restore-keys: | + docs-maven- + - name: Install JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + - name: Install Python 3.6 + uses: actions/setup-python@v2 + with: + python-version: 3.6 + architecture: x64 + - name: Install Python linter dependencies + run: | + pip3 install flake8 sphinx numpy + - name: Install R 3.6 + uses: r-lib/actions/setup-r@v1 + with: + r-version: 3.6 + - name: Install R linter dependencies and SparkR + run: | + sudo apt-get install -y libcurl4-openssl-dev + sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" + sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')" + ./R/install-dev.sh + - name: Install Ruby 2.7 for documentation generation + uses: actions/setup-ruby@v1 + with: + ruby-version: 2.7 + - name: Install dependencies for documentation generation + run: | + sudo apt-get install -y libcurl4-openssl-dev pandoc + pip install sphinx mkdocs numpy + gem install jekyll jekyll-redirect-from rouge + sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')" + - name: Scala linter + run: ./dev/lint-scala + - name: Java linter + run: ./dev/lint-java + - name: Python linter + run: ./dev/lint-python + - name: R linter + run: ./dev/lint-r + - name: License test + run: ./dev/check-license + - name: Dependencies test + run: ./dev/test-dependencies.sh + - name: Run documentation build + run: | + cd docs + jekyll build diff --git a/common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java b/common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java new file mode 100644 index 0000000000000..a7e6f352667d7 --- /dev/null +++ b/common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.tags; + +import org.scalatest.TagAnnotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@TagAnnotation +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.METHOD, ElementType.TYPE}) +public @interface SlowHiveTest { } diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 60cf4d8209416..5b0569f81d8af 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -68,7 +68,7 @@ fi PYSPARK_VERSION=$(python3 -c "exec(open('python/pyspark/version.py').read());print(__version__)") PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz" # The pip install options we use for all the pip commands -PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall " +PIP_OPTIONS="--user --upgrade --no-cache-dir --force-reinstall " # Test both regular user and edit/dev install modes. PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST" "pip install $PIP_OPTIONS -e python/") @@ -81,8 +81,12 @@ for python in "${PYTHON_EXECS[@]}"; do VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python rm -rf "$VIRTUALENV_PATH" if [ -n "$USE_CONDA" ]; then + if [ -f "$CONDA_PREFIX/etc/profile.d/conda.sh" ]; then + # See also https://github.com/conda/conda/issues/7980 + source "$CONDA_PREFIX/etc/profile.d/conda.sh" + fi conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip setuptools - source activate "$VIRTUALENV_PATH" + conda activate "$VIRTUALENV_PATH" || (echo "Falling back to 'source activate'" && source activate "$VIRTUALENV_PATH") else mkdir -p "$VIRTUALENV_PATH" virtualenv --python=$python "$VIRTUALENV_PATH" @@ -115,6 +119,7 @@ for python in "${PYTHON_EXECS[@]}"; do cd / echo "Run basic sanity check on pip installed version with spark-submit" + export PATH="$(python3 -m site --user-base)/bin:$PATH" spark-submit "$FWDIR"/dev/pip-sanity-check.py echo "Run basic sanity check with import based" python "$FWDIR"/dev/pip-sanity-check.py @@ -125,7 +130,7 @@ for python in "${PYTHON_EXECS[@]}"; do # conda / virtualenv environments need to be deactivated differently if [ -n "$USE_CONDA" ]; then - source deactivate + conda deactivate || (echo "Falling back to 'source deactivate'" && source deactivate) else deactivate fi diff --git a/dev/run-tests.py b/dev/run-tests.py index 5915b52c8ef5c..124180ccda135 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -111,10 +111,14 @@ def determine_modules_to_test(changed_modules): ['root'] >>> [x.name for x in determine_modules_to_test([modules.build])] ['root'] + >>> [x.name for x in determine_modules_to_test([modules.core])] + ['root'] + >>> [x.name for x in determine_modules_to_test([modules.launcher])] + ['root'] >>> [x.name for x in determine_modules_to_test([modules.graphx])] ['graphx', 'examples'] - >>> x = [x.name for x in determine_modules_to_test([modules.sql])] - >>> x # doctest: +NORMALIZE_WHITESPACE + >>> [x.name for x in 
determine_modules_to_test([modules.sql])] + ... # doctest: +NORMALIZE_WHITESPACE ['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver', 'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml'] """ @@ -421,7 +425,7 @@ def run_scala_tests_sbt(test_modules, test_profiles): exec_sbt(profiles_and_goals) -def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): +def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags, included_tags): """Function to properly execute all tests passed in as a set from the `determine_test_suites` function""" set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") @@ -431,6 +435,8 @@ def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): test_profiles = get_hadoop_profiles(hadoop_version) + \ list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) + if included_tags: + test_profiles += ['-Dtest.include.tags=' + ",".join(included_tags)] if excluded_tags: test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)] @@ -538,7 +544,23 @@ def main(): changed_modules = None changed_files = None - if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): + should_only_test_modules = "TEST_ONLY_MODULES" in os.environ + included_tags = [] + if should_only_test_modules: + str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")] + test_modules = [m for m in modules.all_modules if m.name in str_test_modules] + # Directly uses test_modules as changed modules to apply tags and environments + # as if all specified test modules are changed. + changed_modules = test_modules + str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None) + str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None) + excluded_tags = [] + if str_excluded_tags: + excluded_tags = [t.strip() for t in str_excluded_tags.split(",")] + included_tags = [] + if str_included_tags: + included_tags = [t.strip() for t in str_included_tags.split(",")] + elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): target_branch = os.environ["ghprbTargetBranch"] changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) @@ -558,33 +580,34 @@ def main(): test_environ.update(m.environ) setup_test_environ(test_environ) - test_modules = determine_modules_to_test(changed_modules) - - # license checks - run_apache_rat_checks() - - # style checks - if not changed_files or any(f.endswith(".scala") - or f.endswith("scalastyle-config.xml") - for f in changed_files): - run_scala_style_checks() should_run_java_style_checks = False - if not changed_files or any(f.endswith(".java") - or f.endswith("checkstyle.xml") - or f.endswith("checkstyle-suppressions.xml") - for f in changed_files): - # Run SBT Checkstyle after the build to prevent a side-effect to the build. 
- should_run_java_style_checks = True - if not changed_files or any(f.endswith("lint-python") - or f.endswith("tox.ini") - or f.endswith(".py") - for f in changed_files): - run_python_style_checks() - if not changed_files or any(f.endswith(".R") - or f.endswith("lint-r") - or f.endswith(".lintr") - for f in changed_files): - run_sparkr_style_checks() + if not should_only_test_modules: + test_modules = determine_modules_to_test(changed_modules) + + # license checks + run_apache_rat_checks() + + # style checks + if not changed_files or any(f.endswith(".scala") + or f.endswith("scalastyle-config.xml") + for f in changed_files): + run_scala_style_checks() + if not changed_files or any(f.endswith(".java") + or f.endswith("checkstyle.xml") + or f.endswith("checkstyle-suppressions.xml") + for f in changed_files): + # Run SBT Checkstyle after the build to prevent a side-effect to the build. + should_run_java_style_checks = True + if not changed_files or any(f.endswith("lint-python") + or f.endswith("tox.ini") + or f.endswith(".py") + for f in changed_files): + run_python_style_checks() + if not changed_files or any(f.endswith(".R") + or f.endswith("lint-r") + or f.endswith(".lintr") + for f in changed_files): + run_sparkr_style_checks() # determine if docs were changed and if we're inside the amplab environment # note - the below commented out until *all* Jenkins workers can get `jekyll` installed @@ -606,7 +629,7 @@ def main(): build_spark_assembly_sbt(hadoop_version, should_run_java_style_checks) # run the test suites - run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags) + run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags, included_tags) modules_with_python_tests = [m for m in test_modules if m.python_test_goals] if modules_with_python_tests: @@ -617,6 +640,10 @@ def main(): def _test(): + if "TEST_ONLY_MODULES" in os.environ: + # TODO(SPARK-32252): Enable doctests back in Github Actions. 
+ return + import doctest failure_count = doctest.testmod()[0] if failure_count: diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 26905279e4bb2..9bc9222abc652 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -100,9 +100,75 @@ def __hash__(self): ] ) +kvstore = Module( + name="kvstore", + dependencies=[tags], + source_file_regexes=[ + "common/kvstore/", + ], + sbt_test_goals=[ + "kvstore/test", + ], +) + +network_common = Module( + name="network-common", + dependencies=[tags], + source_file_regexes=[ + "common/network-common/", + ], + sbt_test_goals=[ + "network-common/test", + ], +) + +network_shuffle = Module( + name="network-shuffle", + dependencies=[tags], + source_file_regexes=[ + "common/network-shuffle/", + ], + sbt_test_goals=[ + "network-shuffle/test", + ], +) + +unsafe = Module( + name="unsafe", + dependencies=[tags], + source_file_regexes=[ + "common/unsafe", + ], + sbt_test_goals=[ + "unsafe/test", + ], +) + +launcher = Module( + name="launcher", + dependencies=[tags], + source_file_regexes=[ + "launcher/", + ], + sbt_test_goals=[ + "launcher/test", + ], +) + +core = Module( + name="core", + dependencies=[kvstore, network_common, network_shuffle, unsafe, launcher], + source_file_regexes=[ + "core/", + ], + sbt_test_goals=[ + "core/test", + ], +) + catalyst = Module( name="catalyst", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "sql/catalyst/", ], @@ -111,7 +177,6 @@ def __hash__(self): ], ) - sql = Module( name="sql", dependencies=[catalyst], @@ -123,7 +188,6 @@ def __hash__(self): ], ) - hive = Module( name="hive", dependencies=[sql], @@ -142,7 +206,6 @@ def __hash__(self): ] ) - repl = Module( name="repl", dependencies=[hive], @@ -154,7 +217,6 @@ def __hash__(self): ], ) - hive_thriftserver = Module( name="hive-thriftserver", dependencies=[hive], @@ -192,7 +254,6 @@ def __hash__(self): ] ) - sketch = Module( name="sketch", dependencies=[tags], @@ -204,10 +265,9 @@ def __hash__(self): ] ) - graphx = Module( name="graphx", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "graphx/", ], @@ -216,10 +276,9 @@ def __hash__(self): ] ) - streaming = Module( name="streaming", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "streaming", ], @@ -235,7 +294,7 @@ def __hash__(self): # fail other PRs. 
streaming_kinesis_asl = Module( name="streaming-kinesis-asl", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "external/kinesis-asl/", "external/kinesis-asl-assembly/", @@ -275,14 +334,16 @@ def __hash__(self): streaming_kafka_0_10 = Module( name="streaming-kafka-0-10", - dependencies=[streaming], + dependencies=[streaming, core], source_file_regexes=[ # The ending "/" is necessary otherwise it will include "sql-kafka" codes "external/kafka-0-10/", "external/kafka-0-10-assembly", + "external/kafka-0-10-token-provider", ], sbt_test_goals=[ "streaming-kafka-0-10/test", + "token-provider-kafka-0-10/test" ] ) @@ -339,7 +400,7 @@ def __hash__(self): mllib_local = Module( name="mllib-local", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "mllib-local", ], @@ -373,10 +434,9 @@ def __hash__(self): ] ) - pyspark_core = Module( name="pyspark-core", - dependencies=[], + dependencies=[core], source_file_regexes=[ "python/(?!pyspark/(ml|mllib|sql|streaming))" ], @@ -396,7 +456,6 @@ def __hash__(self): ] ) - pyspark_sql = Module( name="pyspark-sql", dependencies=[pyspark_core, hive], @@ -574,7 +633,7 @@ def __hash__(self): # No other modules should directly depend on this module. root = Module( name="root", - dependencies=[build], # Changes to build should trigger all tests. + dependencies=[build, core], # Changes to build should trigger all tests. source_file_regexes=[], # In order to run all of the tests, enable every test profile: build_profile_flags=list(set( diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 7ee079cf053a5..c709d9ea4b9bf 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -922,9 +922,20 @@ object TestSettings { sys.props.get("test.exclude.tags").map { tags => Seq("--exclude-categories=" + tags) }.getOrElse(Nil): _*), + // Include tags defined in a system property + testOptions in Test += Tests.Argument(TestFrameworks.ScalaTest, + sys.props.get("test.include.tags").map { tags => + tags.split(",").flatMap { tag => Seq("-n", tag) }.toSeq + }.getOrElse(Nil): _*), + testOptions in Test += Tests.Argument(TestFrameworks.JUnit, + sys.props.get("test.include.tags").map { tags => + Seq("--include-categories=" + tags) + }.getOrElse(Nil): _*), // Show full stack trace and duration in test cases. testOptions in Test += Tests.Argument("-oDF"), testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-v", "-a"), + // Required to detect Junit tests for each project, see also https://github.com/sbt/junit-interface/issues/35 + crossPaths := false, // Enable Junit testing. libraryDependencies += "com.novocode" % "junit-interface" % "0.11" % "test", // `parallelExecutionInTest` controls whether test suites belonging to the same SBT project diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index c144b410bc384..020542bf66469 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3214,7 +3214,8 @@ def assertCollectSuccess(typecode, value): if sys.version_info[0] < 3: all_types = set(['c', 'b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'f', 'd']) else: - all_types = set(array.typecodes) + # PyPy seems not having array.typecodes. 
+ all_types = set(['b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'f', 'd']) unsupported_types = all_types - set(supported_types) # test unsupported types for t in unsupported_types: diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 2f7fa83b575cd..783ca40aa0e9e 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -26,6 +26,7 @@ import struct import shutil from functools import reduce +import platform try: import xmlrunner @@ -167,6 +168,9 @@ def _sort_result_based_on_key(self, outputs): output.sort(key=lambda x: x[0]) +@unittest.skipIf( + "pypy" in platform.python_implementation().lower(), + "The tests fail in PyPy3 implementation for an unknown reason.") class BasicOperationTests(PySparkStreamingTestCase): def test_map(self): @@ -657,6 +661,9 @@ def func(dstream): self.assertEqual(info.numRecords(), 0) +@unittest.skipIf( + "pypy" in platform.python_implementation().lower(), + "The tests fail in PyPy3 implementation for an unknown reason.") class WindowFunctionTests(PySparkStreamingTestCase): timeout = 15 @@ -884,6 +891,9 @@ def test_await_termination_or_timeout(self): self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001)) +@unittest.skipIf( + "pypy" in platform.python_implementation().lower(), + "The tests fail in PyPy3 implementation for an unknown reason.") class CheckpointTests(unittest.TestCase): setupCalled = False diff --git a/python/run-tests.py b/python/run-tests.py index c34e48aad1211..7f4670a8b8a9b 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -160,7 +160,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): - python_execs = [x for x in ["python2.7", "python3.6", "pypy"] if which(x)] + python_execs = [x for x in ["python2.7", "python3.6", "pypy", "pypy3"] if which(x)] if "python2.7" not in python_execs: LOGGER.warning("Not testing against `python2.7` because it could not be found; falling" " back to `python` instead") diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index cebaad5b4ad9b..dfe318c9be5d9 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -26,10 +26,12 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.tags.SlowHiveTest /** * Runs the test cases that are included in the hive distribution. */ +@SlowHiveTest class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // TODO: bundle in jar files... 
get from classpath private lazy val hiveQueryDir = TestHive.getHiveFile( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index e33d8ff5efe7d..f01fc86e57882 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.{QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTableType import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.tags.{ExtendedHiveTest, SlowHiveTest} import org.apache.spark.util.Utils /** @@ -41,6 +42,8 @@ import org.apache.spark.util.Utils * expected version under this local directory, e.g. `/tmp/spark-test/spark-2.0.3`, we will skip the * downloading for this spark version. */ +@SlowHiveTest +@ExtendedHiveTest class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { private val wareHousePath = Utils.createTempDir(namePrefix = "warehouse") private val tmpDataDir = Utils.createTempDir(namePrefix = "test-data") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index a676cf6ce6925..90ba6d80a57c9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -33,11 +33,14 @@ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext} import org.apache.spark.sql.types.{DecimalType, StructType} +import org.apache.spark.tags.{ExtendedHiveTest, SlowHiveTest} import org.apache.spark.util.{ResetSystemProperties, Utils} /** * This suite tests spark-submit with applications using HiveContext. */ +@SlowHiveTest +@ExtendedHiveTest class HiveSparkSubmitSuite extends SparkSubmitTestUtils with Matchers diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index ff4643a78086e..7c66ff699f70f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.hive.test.TestHiveVersion import org.apache.spark.sql.types.IntegerType import org.apache.spark.sql.types.StructType -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, SlowHiveTest} import org.apache.spark.util.{MutableURLClassLoader, Utils} /** @@ -48,6 +48,7 @@ import org.apache.spark.util.{MutableURLClassLoader, Utils} * is not fully tested. 
*/ // TODO: Refactor this to `HiveClientSuite` and make it a subclass of `HiveVersionSuite` +@SlowHiveTest @ExtendedHiveTest class VersionsSuite extends SparkFunSuite with Logging { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index c65bf7c14c7a5..1df5260dc733f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.tags.SlowHiveTest class ScalaAggregateFunction(schema: StructType) extends UserDefinedAggregateFunction { @@ -1024,6 +1025,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te class HashAggregationQuerySuite extends AggregationQuerySuite +@SlowHiveTest class HashAggregationQueryWithControlledFallbackSuite extends AggregationQuerySuite { override protected def checkAnswer(actual: => DataFrame, expectedAnswer: Seq[Row]): Unit = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index d590a2ca2f991..90915e0b4f218 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -44,9 +44,11 @@ import org.apache.spark.sql.internal.SQLConf.ORC_IMPLEMENTATION import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.tags.SlowHiveTest import org.apache.spark.util.Utils // TODO(gatorsmile): combine HiveCatalogedDDLSuite and HiveDDLSuite +@SlowHiveTest class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeAndAfterEach { override def afterEach(): Unit = { try { @@ -262,6 +264,7 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA } } +@SlowHiveTest class HiveDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton with BeforeAndAfterEach { import testImplicits._ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 6a7932f82cb5f..e97941574cad9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.tags.SlowHiveTest case class TestData(a: Int, b: String) @@ -45,6 +46,7 @@ case class TestData(a: Int, b: String) * A set of test cases expressed in Hive QL that are not covered by the tests * included in the hive distribution. 
*/ +@SlowHiveTest class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAndAfter { private val originalTimeZone = TimeZone.getDefault private val originalLocale = Locale.getDefault diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala index eaedac1fa95d8..552a6ac7b53f3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala @@ -30,13 +30,14 @@ import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.sql.types._ -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, SlowHiveTest} import org.apache.spark.util.Utils /** * A separate set of DDL tests that uses Hive 2.1 libraries, which behave a little differently * from the built-in ones. */ +@SlowHiveTest @ExtendedHiveTest class Hive_2_1_DDLSuite extends SparkFunSuite with TestHiveSingleton with BeforeAndAfterEach with BeforeAndAfterAll { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index f69f589447aa7..833a655d6ebf6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -41,6 +41,7 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.tags.SlowHiveTest import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.Utils @@ -67,6 +68,7 @@ case class Order( * Hive to generate them (in contrast to HiveQuerySuite). Often this is because the query is * valid, but Hive currently cannot execute it. */ +@SlowHiveTest class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import hiveContext._ import spark.implicits._