From ba8029e7732400553c32f940439e3b6ab69f115c Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Tue, 7 Jul 2020 16:32:25 +0900 Subject: [PATCH] Migrate PR builders from Jenkins to Github Actions --- .github/workflows/master.yml | 267 ++++++++++++------ .../org/apache/spark/tags/HeavyHiveTest.java | 30 ++ .../spark/deploy/master/MasterSuite.scala | 2 +- dev/run-pip-tests | 8 +- dev/run-tests.py | 92 +++--- dev/sparktestsupport/modules.py | 93 ++++-- project/SparkBuild.scala | 11 + python/pyspark/sql/tests/test_arrow.py | 9 +- python/pyspark/sql/tests/test_types.py | 3 +- .../pyspark/streaming/tests/test_dstream.py | 15 +- python/run-tests.py | 2 +- .../execution/HiveCompatibilitySuite.scala | 2 + .../HiveExternalCatalogVersionsSuite.scala | 3 +- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 3 +- .../spark/sql/hive/client/VersionsSuite.scala | 3 +- .../execution/AggregationQuerySuite.scala | 2 + .../sql/hive/execution/HiveDDLSuite.scala | 3 + .../sql/hive/execution/HiveQuerySuite.scala | 2 + .../hive/execution/Hive_2_1_DDLSuite.scala | 3 +- .../sql/hive/execution/SQLQuerySuite.scala | 3 + 20 files changed, 396 insertions(+), 160 deletions(-) create mode 100644 common/tags/src/test/java/org/apache/spark/tags/HeavyHiveTest.java diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 4282504cc3984..d0614e9b06347 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -9,148 +9,231 @@ on: - master jobs: + # Build: build Spark and run the tests for specified modules. build: - + name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})" runs-on: ubuntu-latest strategy: + fail-fast: false matrix: - java: [ '1.8', '11' ] - hadoop: [ 'hadoop-2.7', 'hadoop-3.2' ] - hive: [ 'hive-1.2', 'hive-2.3' ] - exclude: - - java: '11' - hive: 'hive-1.2' - - hadoop: 'hadoop-3.2' - hive: 'hive-1.2' - name: Build Spark - JDK${{ matrix.java }}/${{ matrix.hadoop }}/${{ matrix.hive }} - + java: + - 1.8 + hadoop: + - hadoop3.2 + hive: + - hive2.3 + # TODO(SPARK-XXXXX): We don't test 'streaming-kinesis-asl' for now. + # Kinesis tests depends on external Amazon kinesis service. + # Note that the modules below are from sparktestsupport/modules.py. + modules: + - |- + core, unsafe, kvstore, avro, + network_common, network_shuffle, repl, launcher + examples, sketch, graphx + - |- + catalyst, sql + - |- + hive-thriftserver + - |- + streaming, sql-kafka-0-10, streaming-kafka-0-10 + - |- + mllib-local, mllib + - |- + pyspark-sql, pyspark-mllib, pyspark-resource + - |- + pyspark-core, pyspark-streaming, pyspark-ml + - |- + sparkr + - |- + yarn, mesos, kubernetes, hadoop-cloud, + spark-ganglia-lgpl + # Here, we split Hive tests into some of heavy ones and the rest of them. + included-tags: [""] + excluded-tags: [""] + comment: ["- running all tests"] + include: + - modules: hive + java: 1.8 + hadoop: hadoop3.2 + hive: hive2.3 + included-tags: org.apache.spark.tags.HeavyHiveTest + comment: "- running heavy tests" + - modules: hive + java: 1.8 + hadoop: hadoop3.2 + hive: hive2.3 + excluded-tags: org.apache.spark.tags.HeavyHiveTest + comment: "- running non-heavy tests" + env: + TEST_ONLY_MODULES: ${{ matrix.modules }} + HADOOP_PROFILE: ${{ matrix.hadoop }} + HIVE_PROFILE: ${{ matrix.hive }} + # Github Actions' default miniconda + CONDA_PREFIX: /usr/share/miniconda + # Don't run the tests in parallel due to flakiness. See SparkParallelTestGrouping. + SERIAL_SBT_TESTS: 1 + TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }} + TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }} steps: - - uses: actions/checkout@master - # We split caches because GitHub Action Cache has a 400MB-size limit. - - uses: actions/cache@v1 + - name: Checkout Spark repository + uses: actions/checkout@v2 + # Cache local repositories. Note that Github Actions cache has a 2G limit. + - name: Cache Scala, SBT, Maven and Zinc + uses: actions/cache@v1 with: path: build key: build-${{ hashFiles('**/pom.xml') }} restore-keys: | build- - - uses: actions/cache@v1 - with: - path: ~/.m2/repository/com - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com- - - uses: actions/cache@v1 + - name: Cache Maven local repository + uses: actions/cache@v2 with: - path: ~/.m2/repository/org - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org- - - uses: actions/cache@v1 - with: - path: ~/.m2/repository/net - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-${{ hashFiles('**/pom.xml') }} + path: ~/.m2/repository + key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net- - - uses: actions/cache@v1 + ${{ matrix.java }}-${{ matrix.hadoop }}-maven- + - name: Cache Ivy local repository + uses: actions/cache@v2 with: - path: ~/.m2/repository/io - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-${{ hashFiles('**/pom.xml') }} + path: ~/.ivy2/cache + key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }} restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io- - - name: Set up JDK ${{ matrix.java }} + ${{ matrix.java }}-${{ matrix.hadoop }}-ivy- + - name: Install JDK ${{ matrix.java }} uses: actions/setup-java@v1 with: java-version: ${{ matrix.java }} - - name: Build with Maven + # PySpark + - name: Install PyPy3 + # SQL component also has Python related tests, for example, IntegratedUDFTestUtils. + # Note that order here matters because default python3 is overridden by pypy3. + uses: actions/setup-python@v2 + if: contains(matrix.modules, 'pyspark') || matrix.modules == 'sql' + with: + python-version: pypy3 + architecture: x64 + - name: Install Python 2.7 + uses: actions/setup-python@v2 + if: contains(matrix.modules, 'pyspark') || matrix.modules == 'sql' + with: + python-version: 2.7 + architecture: x64 + - name: Install Python 3.6 + uses: actions/setup-python@v2 + if: contains(matrix.modules, 'pyspark') || matrix.modules == 'sql' + with: + python-version: 3.6 + architecture: x64 + - name: Install Python packages + if: contains(matrix.modules, 'pyspark') || matrix.modules == 'sql' + # PyArrow is not supported in PyPy yet, see ARROW-2651. + # scipy installation with PyPy fails for an unknown reason. + run: | + python3 -m pip install numpy pyarrow pandas scipy + python3 -m pip list + python2 -m pip install numpy pyarrow pandas scipy + python2 -m pip list + # Installing NumPy is flaky in PyPy. + pypy3 -m pip install numpy pandas + pypy3 -m pip list + # SparkR + - name: Install R 3.6 + uses: r-lib/actions/setup-r@v1 + if: contains(matrix.modules, 'sparkr') + with: + r-version: 3.6 + - name: Install R packages + if: contains(matrix.modules, 'sparkr') + run: | + sudo apt-get install -y libcurl4-openssl-dev + sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')" + # Show installed packages in R. + sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]' + # Run the tests. + - name: "Run tests: ${{ matrix.modules }}" run: | - export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" - export MAVEN_CLI_OPTS="--no-transfer-progress" mkdir -p ~/.m2 - ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install + ./dev/run-tests --parallelism 2 rm -rf ~/.m2/repository/org/apache/spark - + # Linters: run the linters and other static analysis. lint: + name: Linters, licenses, dependencies runs-on: ubuntu-latest - name: Linters (Java/Scala/Python), licenses, dependencies steps: - uses: actions/checkout@master - - uses: actions/setup-java@v1 + - name: Install JDK 11 + uses: actions/setup-java@v1 with: - java-version: '11' - - uses: actions/setup-python@v1 + java-version: 11 + - name: Install Python 3.6 + uses: actions/setup-python@v2 with: - python-version: '3.x' - architecture: 'x64' - - name: Scala - run: ./dev/lint-scala - - name: Java - run: ./dev/lint-java - - name: Python + python-version: 3.6 + architecture: x64 + - name: Install Python linter dependencies run: | - pip install flake8 sphinx numpy - ./dev/lint-python - - name: License - run: ./dev/check-license - - name: Dependencies - run: ./dev/test-dependencies.sh - - lintr: - runs-on: ubuntu-latest - name: Linter (R) - steps: - - uses: actions/checkout@master - - uses: actions/setup-java@v1 + pip3 install flake8 sphinx numpy + - name: Install R 3.6 + uses: r-lib/actions/setup-r@v1 with: - java-version: '11' - - uses: r-lib/actions/setup-r@v1 - with: - r-version: '3.6.2' - - name: Install lib + r-version: 3.6 + - name: Install R linter dependencies and SparkR run: | sudo apt-get install -y libcurl4-openssl-dev - - name: install R packages - run: | - sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" + sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')" - - name: package and install SparkR - run: ./R/install-dev.sh - - name: lint-r + ./R/install-dev.sh + - name: Scala linter + run: ./dev/lint-scala + - name: Java linter + run: ./dev/lint-java + - name: Python linter + run: ./dev/lint-python + - name: R linter run: ./dev/lint-r + - name: License test + run: ./dev/check-license + - name: Dependencies test + run: ./dev/test-dependencies.sh + # Documentation build. docs: + name: Build documentation runs-on: ubuntu-latest - name: Generate documents steps: - uses: actions/checkout@master - - uses: actions/cache@v1 + - name: Cache Maven local repository + uses: actions/cache@v2 with: path: ~/.m2/repository key: docs-maven-repo-${{ hashFiles('**/pom.xml') }} restore-keys: | - docs-maven-repo- - - uses: actions/setup-java@v1 + docs-maven- + - name: Install JDK 1.8 + uses: actions/setup-java@v1 with: - java-version: '1.8' - - uses: actions/setup-python@v1 + java-version: 1.8 + - name: Install Python 3.6 + uses: actions/setup-python@v2 with: - python-version: '3.x' - architecture: 'x64' - - uses: actions/setup-ruby@v1 + python-version: 3.6 + architecture: x64 + - name: Install Ruby 2.7 + uses: actions/setup-ruby@v1 with: - ruby-version: '2.7' - - uses: r-lib/actions/setup-r@v1 + ruby-version: 2.7 + - name: Install R 3.6 + uses: r-lib/actions/setup-r@v1 with: - r-version: '3.6.2' - - name: Install lib and pandoc + r-version: 3.6 + - name: Install dependencies run: | sudo apt-get install -y libcurl4-openssl-dev pandoc - - name: Install packages - run: | pip install sphinx mkdocs numpy gem install jekyll jekyll-redirect-from rouge - sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" - - name: Run jekyll build + sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')" + - name: Run documentation build run: | cd docs jekyll build diff --git a/common/tags/src/test/java/org/apache/spark/tags/HeavyHiveTest.java b/common/tags/src/test/java/org/apache/spark/tags/HeavyHiveTest.java new file mode 100644 index 0000000000000..c01f1564ea171 --- /dev/null +++ b/common/tags/src/test/java/org/apache/spark/tags/HeavyHiveTest.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.tags; + +import org.scalatest.TagAnnotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@TagAnnotation +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.METHOD, ElementType.TYPE}) +public @interface HeavyHiveTest { } diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala index 0cf573c2490b3..c7623ac54f63b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala @@ -685,7 +685,7 @@ class MasterSuite extends SparkFunSuite } } - test("SPARK-27510: Master should avoid dead loop while launching executor failed in Worker") { + ignore("SPARK-27510: Master should avoid dead loop while launching executor failed in Worker") { val master = makeAliveMaster() var worker: MockExecutorLaunchFailWorker = null try { diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 470f21e69d46a..cb5d858eee2b7 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -76,8 +76,12 @@ for python in "${PYTHON_EXECS[@]}"; do VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python rm -rf "$VIRTUALENV_PATH" if [ -n "$USE_CONDA" ]; then + if [ -f "$CONDA_PREFIX/etc/profile.d/conda.sh" ]; then + # See also https://github.com/conda/conda/issues/7980 + source $CONDA_PREFIX/etc/profile.d/conda.sh + fi conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip setuptools - source activate "$VIRTUALENV_PATH" + conda activate "$VIRTUALENV_PATH" else mkdir -p "$VIRTUALENV_PATH" virtualenv --python=$python "$VIRTUALENV_PATH" @@ -120,7 +124,7 @@ for python in "${PYTHON_EXECS[@]}"; do # conda / virtualenv environments need to be deactivated differently if [ -n "$USE_CONDA" ]; then - source deactivate + conda deactivate else deactivate fi diff --git a/dev/run-tests.py b/dev/run-tests.py index 223072cbe7bfb..183a4761fffa7 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -112,10 +112,14 @@ def determine_modules_to_test(changed_modules): ['root'] >>> [x.name for x in determine_modules_to_test([modules.build])] ['root'] + >>> [x.name for x in determine_modules_to_test([modules.core])] + ['root'] + >>> [x.name for x in determine_modules_to_test([modules.launcher])] + ['root'] >>> [x.name for x in determine_modules_to_test([modules.graphx])] ['graphx', 'examples'] - >>> x = [x.name for x in determine_modules_to_test([modules.sql])] - >>> x # doctest: +NORMALIZE_WHITESPACE + >>> [x.name for x in determine_modules_to_test([modules.sql])] + ... # doctest: +NORMALIZE_WHITESPACE ['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver', 'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml'] """ @@ -416,7 +420,7 @@ def run_scala_tests_sbt(test_modules, test_profiles): exec_sbt(profiles_and_goals) -def run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags): +def run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags): """Function to properly execute all tests passed in as a set from the `determine_test_suites` function""" set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") @@ -426,6 +430,8 @@ def run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags): test_profiles = extra_profiles + \ list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) + if included_tags: + test_profiles += ['-Dtest.include.tags=' + ",".join(included_tags)] if excluded_tags: test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)] @@ -595,13 +601,28 @@ def main(): changed_modules = None changed_files = None - if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): + should_only_test_modules = "TEST_ONLY_MODULES" in os.environ + included_tags = [] + if should_only_test_modules: + str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")] + test_modules = [m for m in modules.all_modules if m.name in str_test_modules] + # Directly uses test_modules as changed modules to apply tags and environments + # as if all specified test modules are changed. + changed_modules = test_modules + str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None) + str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None) + excluded_tags = [] + if str_excluded_tags: + excluded_tags = [t.strip() for t in str_excluded_tags.split(",")] + included_tags = [] + if str_included_tags: + included_tags = [t.strip() for t in str_included_tags.split(",")] + elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): target_branch = os.environ["ghprbTargetBranch"] changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) excluded_tags = determine_tags_to_exclude(changed_modules) - - if not changed_modules: + else: changed_modules = [modules.root] excluded_tags = [] print("[info] Found the following changed modules:", @@ -616,33 +637,34 @@ def main(): test_environ.update(m.environ) setup_test_environ(test_environ) - test_modules = determine_modules_to_test(changed_modules) - - # license checks - run_apache_rat_checks() - - # style checks - if not changed_files or any(f.endswith(".scala") - or f.endswith("scalastyle-config.xml") - for f in changed_files): - run_scala_style_checks(extra_profiles) should_run_java_style_checks = False - if not changed_files or any(f.endswith(".java") - or f.endswith("checkstyle.xml") - or f.endswith("checkstyle-suppressions.xml") - for f in changed_files): - # Run SBT Checkstyle after the build to prevent a side-effect to the build. - should_run_java_style_checks = True - if not changed_files or any(f.endswith("lint-python") - or f.endswith("tox.ini") - or f.endswith(".py") - for f in changed_files): - run_python_style_checks() - if not changed_files or any(f.endswith(".R") - or f.endswith("lint-r") - or f.endswith(".lintr") - for f in changed_files): - run_sparkr_style_checks() + if not should_only_test_modules: + test_modules = determine_modules_to_test(changed_modules) + + # license checks + run_apache_rat_checks() + + # style checks + if not changed_files or any(f.endswith(".scala") + or f.endswith("scalastyle-config.xml") + for f in changed_files): + run_scala_style_checks(extra_profiles) + if not changed_files or any(f.endswith(".java") + or f.endswith("checkstyle.xml") + or f.endswith("checkstyle-suppressions.xml") + for f in changed_files): + # Run SBT Checkstyle after the build to prevent a side-effect to the build. + should_run_java_style_checks = True + if not changed_files or any(f.endswith("lint-python") + or f.endswith("tox.ini") + or f.endswith(".py") + for f in changed_files): + run_python_style_checks() + if not changed_files or any(f.endswith(".R") + or f.endswith("lint-r") + or f.endswith(".lintr") + for f in changed_files): + run_sparkr_style_checks() # determine if docs were changed and if we're inside the amplab environment # note - the below commented out until *all* Jenkins workers can get `jekyll` installed @@ -664,7 +686,7 @@ def main(): build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks) # run the test suites - run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags) + run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags) modules_with_python_tests = [m for m in test_modules if m.python_test_goals] if modules_with_python_tests: @@ -679,6 +701,10 @@ def main(): def _test(): + if "TEST_ONLY_MODULES" in os.environ: + # Do not do anything except testing the targeted modules. + return + import doctest failure_count = doctest.testmod()[0] if failure_count: diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 85e6a1e9fadac..2e8b8b57a5b4f 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -100,9 +100,75 @@ def __hash__(self): ] ) +kvstore = Module( + name="kvstore", + dependencies=[tags], + source_file_regexes=[ + "common/kvstore/", + ], + sbt_test_goals=[ + "kvstore/test", + ], +) + +network_common = Module( + name="network-common", + dependencies=[tags], + source_file_regexes=[ + "common/network-common/", + ], + sbt_test_goals=[ + "network-common/test", + ], +) + +network_shuffle = Module( + name="network-shuffle", + dependencies=[tags], + source_file_regexes=[ + "common/network-shuffle/", + ], + sbt_test_goals=[ + "network-shuffle/test", + ], +) + +unsafe = Module( + name="unsafe", + dependencies=[tags], + source_file_regexes=[ + "common/unsafe", + ], + sbt_test_goals=[ + "unsafe/test", + ], +) + +launcher = Module( + name="launcher", + dependencies=[tags], + source_file_regexes=[ + "launcher/", + ], + sbt_test_goals=[ + "launcher/test", + ], +) + +core = Module( + name="core", + dependencies=[kvstore, network_common, network_shuffle, unsafe, launcher], + source_file_regexes=[ + "core/", + ], + sbt_test_goals=[ + "core/test", + ], +) + catalyst = Module( name="catalyst", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "sql/catalyst/", ], @@ -111,7 +177,6 @@ def __hash__(self): ], ) - sql = Module( name="sql", dependencies=[catalyst], @@ -123,7 +188,6 @@ def __hash__(self): ], ) - hive = Module( name="hive", dependencies=[sql], @@ -142,7 +206,6 @@ def __hash__(self): ] ) - repl = Module( name="repl", dependencies=[hive], @@ -154,7 +217,6 @@ def __hash__(self): ], ) - hive_thriftserver = Module( name="hive-thriftserver", dependencies=[hive], @@ -192,7 +254,6 @@ def __hash__(self): ] ) - sketch = Module( name="sketch", dependencies=[tags], @@ -204,10 +265,9 @@ def __hash__(self): ] ) - graphx = Module( name="graphx", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "graphx/", ], @@ -216,10 +276,9 @@ def __hash__(self): ] ) - streaming = Module( name="streaming", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "streaming", ], @@ -235,7 +294,7 @@ def __hash__(self): # fail other PRs. streaming_kinesis_asl = Module( name="streaming-kinesis-asl", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "external/kinesis-asl/", "external/kinesis-asl-assembly/", @@ -254,21 +313,23 @@ def __hash__(self): streaming_kafka_0_10 = Module( name="streaming-kafka-0-10", - dependencies=[streaming], + dependencies=[streaming, core], source_file_regexes=[ # The ending "/" is necessary otherwise it will include "sql-kafka" codes "external/kafka-0-10/", "external/kafka-0-10-assembly", + "external/kafka-0-10-token-provider", ], sbt_test_goals=[ "streaming-kafka-0-10/test", + "token-provider-kafka-0-10/test" ] ) mllib_local = Module( name="mllib-local", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "mllib-local", ], @@ -302,10 +363,9 @@ def __hash__(self): ] ) - pyspark_core = Module( name="pyspark-core", - dependencies=[], + dependencies=[core], source_file_regexes=[ "python/(?!pyspark/(ml|mllib|sql|streaming))" ], @@ -339,7 +399,6 @@ def __hash__(self): ] ) - pyspark_sql = Module( name="pyspark-sql", dependencies=[pyspark_core, hive, avro], @@ -593,7 +652,7 @@ def __hash__(self): # No other modules should directly depend on this module. root = Module( name="root", - dependencies=[build], # Changes to build should trigger all tests. + dependencies=[build, core], # Changes to build should trigger all tests. source_file_regexes=[], # In order to run all of the tests, enable every test profile: build_profile_flags=list(set( diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 60c54dfc98a58..2d7c8bdd83022 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1015,9 +1015,20 @@ object TestSettings { sys.props.get("test.exclude.tags").map { tags => Seq("--exclude-categories=" + tags) }.getOrElse(Nil): _*), + // Include tags defined in a system property + testOptions in Test += Tests.Argument(TestFrameworks.ScalaTest, + sys.props.get("test.include.tags").map { tags => + tags.split(",").flatMap { tag => Seq("-n", tag) }.toSeq + }.getOrElse(Nil): _*), + testOptions in Test += Tests.Argument(TestFrameworks.JUnit, + sys.props.get("test.include.tags").map { tags => + Seq("--include-categories=" + tags) + }.getOrElse(Nil): _*), // Show full stack trace and duration in test cases. testOptions in Test += Tests.Argument("-oDF"), testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-v", "-a"), + // Required to detect Junit tests for each project, see also https://github.com/sbt/junit-interface/issues/35 + crossPaths := false, // Enable Junit testing. libraryDependencies += "com.novocode" % "junit-interface" % "0.11" % "test", // `parallelExecutionInTest` controls whether test suites belonging to the same SBT project diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index 913b43b6ddb5a..a96354e3ecb58 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -21,6 +21,9 @@ import time import unittest import warnings +import sys +if sys.version >= '3': + basestring = unicode = str from pyspark import SparkContext, SparkConf from pyspark.sql import Row, SparkSession @@ -435,12 +438,12 @@ def test_createDateFrame_with_category_type(self): assert_frame_equal(result_spark, result_arrow) # ensure original category elements are string - self.assertIsInstance(category_first_element, str) + self.assertIsInstance(category_first_element, basestring) # spark data frame and arrow execution mode enabled data frame type must match pandas self.assertEqual(spark_type, 'string') self.assertEqual(arrow_type, 'string') - self.assertIsInstance(arrow_first_category_element, str) - self.assertIsInstance(spark_first_category_element, str) + self.assertIsInstance(arrow_first_category_element, basestring) + self.assertIsInstance(spark_first_category_element, basestring) def test_createDataFrame_with_float_index(self): # SPARK-32098: float index should not produce duplicated or truncated Spark DataFrame diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index 81402f52af3b3..016cafd669019 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -725,7 +725,8 @@ def assertCollectSuccess(typecode, value): if sys.version_info[0] < 3: all_types = set(['c', 'b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'f', 'd']) else: - all_types = set(array.typecodes) + # PyPy seems not having array.typecodes. + all_types = set(['b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'Q', 'f', 'd']) unsupported_types = all_types - set(supported_types) # test unsupported types for t in unsupported_types: diff --git a/python/pyspark/streaming/tests/test_dstream.py b/python/pyspark/streaming/tests/test_dstream.py index 7ecdf6b0b12db..89edb23070c69 100644 --- a/python/pyspark/streaming/tests/test_dstream.py +++ b/python/pyspark/streaming/tests/test_dstream.py @@ -30,8 +30,9 @@ @unittest.skipIf( - "pypy" in platform.python_implementation().lower() and "COVERAGE_PROCESS_START" in os.environ, - "PyPy implementation causes to hang DStream tests forever when Coverage report is used.") + "pypy" in platform.python_implementation().lower(), + "The tests fail in PyPy3 implementation for an unknown reason. " + "With PyPy, it causes to hang DStream tests forever when Coverage report is used.") class BasicOperationTests(PySparkStreamingTestCase): def test_map(self): @@ -394,8 +395,9 @@ def failed_func(i): @unittest.skipIf( - "pypy" in platform.python_implementation().lower() and "COVERAGE_PROCESS_START" in os.environ, - "PyPy implementation causes to hang DStream tests forever when Coverage report is used.") + "pypy" in platform.python_implementation().lower(), + "The tests fail in PyPy3 implementation for an unknown reason. " + "With PyPy, it causes to hang DStream tests forever when Coverage report is used.") class WindowFunctionTests(PySparkStreamingTestCase): timeout = 15 @@ -474,8 +476,9 @@ def func(dstream): @unittest.skipIf( - "pypy" in platform.python_implementation().lower() and "COVERAGE_PROCESS_START" in os.environ, - "PyPy implementation causes to hang DStream tests forever when Coverage report is used.") + "pypy" in platform.python_implementation().lower(), + "The tests fail in PyPy3 implementation for an unknown reason. " + "With PyPy, it causes to hang DStream tests forever when Coverage report is used.") class CheckpointTests(unittest.TestCase): setupCalled = False diff --git a/python/run-tests.py b/python/run-tests.py index b677a5134ec93..ff31cc9fe151a 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -161,7 +161,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): - python_execs = [x for x in ["python3.6", "python2.7", "pypy"] if which(x)] + python_execs = [x for x in ["python3.6", "python2.7", "pypy3"] if which(x)] if "python3.6" not in python_execs: p = which("python3") diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 82af7dceb27f2..c99f444ceb4f9 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -26,10 +26,12 @@ import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy +import org.apache.spark.tags.HeavyHiveTest /** * Runs the test cases that are included in the hive distribution. */ +@HeavyHiveTest class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // TODO: bundle in jar files... get from classpath private lazy val hiveQueryDir = TestHive.getHiveFile( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 8be3d26bfc93a..9a336432285d0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTableType import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, HeavyHiveTest} import org.apache.spark.util.Utils /** @@ -46,6 +46,7 @@ import org.apache.spark.util.Utils * expected version under this local directory, e.g. `/tmp/spark-test/spark-2.0.3`, we will skip the * downloading for this spark version. */ +@HeavyHiveTest @ExtendedHiveTest class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { private val isTestAtLeastJava9 = SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_9) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index 8b97489e2d818..a2dc467552efa 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -38,12 +38,13 @@ import org.apache.spark.sql.hive.test.{HiveTestJars, TestHiveContext} import org.apache.spark.sql.internal.SQLConf.SHUFFLE_PARTITIONS import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH import org.apache.spark.sql.types.{DecimalType, StructType} -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, HeavyHiveTest} import org.apache.spark.util.{ResetSystemProperties, Utils} /** * This suite tests spark-submit with applications using HiveContext. */ +@HeavyHiveTest @ExtendedHiveTest class HiveSparkSubmitSuite extends SparkSubmitTestUtils diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 8642a5ff16812..c8d7cd4194750 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -41,7 +41,7 @@ import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.hive.test.TestHiveVersion import org.apache.spark.sql.types.IntegerType import org.apache.spark.sql.types.StructType -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, HeavyHiveTest} import org.apache.spark.util.{MutableURLClassLoader, Utils} /** @@ -51,6 +51,7 @@ import org.apache.spark.util.{MutableURLClassLoader, Utils} * is not fully tested. */ // TODO: Refactor this to `HiveClientSuite` and make it a subclass of `HiveVersionSuite` +@HeavyHiveTest @ExtendedHiveTest class VersionsSuite extends SparkFunSuite with Logging { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index fac981267f4d7..8f19787c04ebb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.tags.HeavyHiveTest import org.apache.spark.unsafe.UnsafeAlignedOffset @@ -1054,6 +1055,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te class HashAggregationQuerySuite extends AggregationQuerySuite +@HeavyHiveTest class HashAggregationQueryWithControlledFallbackSuite extends AggregationQuerySuite { override protected def checkAnswer(actual: => DataFrame, expectedAnswer: Seq[Row]): Unit = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 774fb5b4b9ad5..95d3790d37252 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -45,9 +45,11 @@ import org.apache.spark.sql.internal.SQLConf.ORC_IMPLEMENTATION import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.tags.HeavyHiveTest import org.apache.spark.util.Utils // TODO(gatorsmile): combine HiveCatalogedDDLSuite and HiveDDLSuite +@HeavyHiveTest class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeAndAfterEach { override def afterEach(): Unit = { try { @@ -405,6 +407,7 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA } } +@HeavyHiveTest class HiveDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton with BeforeAndAfterEach { import testImplicits._ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index e5628c33b5ec8..db971807d1276 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.hive.test.{HiveTestJars, TestHive} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.tags.HeavyHiveTest case class TestData(a: Int, b: String) @@ -46,6 +47,7 @@ case class TestData(a: Int, b: String) * A set of test cases expressed in Hive QL that are not covered by the tests * included in the hive distribution. */ +@HeavyHiveTest class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAndAfter { import org.apache.spark.sql.hive.test.TestHive.implicits._ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala index b20ef035594da..7479cfb621147 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala @@ -27,13 +27,14 @@ import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.sql.types._ -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, HeavyHiveTest} import org.apache.spark.util.Utils /** * A separate set of DDL tests that uses Hive 2.1 libraries, which behave a little differently * from the built-in ones. */ +@HeavyHiveTest @ExtendedHiveTest class Hive_2_1_DDLSuite extends SparkFunSuite with TestHiveSingleton with BeforeAndAfterEach with BeforeAndAfterAll { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 2fe6a59a27c1b..fd090fd964c73 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -43,6 +43,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.GLOBAL_TEMP_DATABASE import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.tags.HeavyHiveTest import org.apache.spark.util.Utils case class Nested1(f1: Nested2) @@ -2559,6 +2560,8 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi } } +@HeavyHiveTest class SQLQuerySuite extends SQLQuerySuiteBase with DisableAdaptiveExecutionSuite +@HeavyHiveTest class SQLQuerySuiteAE extends SQLQuerySuiteBase with EnableAdaptiveExecutionSuite