From 27ef3629dd96d5ee3368cdb258561ff96e907880 Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon <gurwls223@apache.org>
Date: Mon, 13 Jul 2020 08:31:39 -0700
Subject: [PATCH] [SPARK-32292][SPARK-32252][INFRA] Run the relevant tests only
 in GitHub Actions

### What changes were proposed in this pull request?

This PR mainly proposes to run only relevant tests just like Jenkins PR builder does. Currently, GitHub Actions always run full tests which wastes the resources.

In addition, this PR also fixes 3 more issues  very closely related together while I am here.

1. The main idea here is: It reuses the existing logic embedded in `dev/run-tests.py` which Jenkins PR builder use in order to run only the related test cases.

2. While I am here, I fixed SPARK-32292 too to run the doc tests. It was because other references were not available when it is cloned via `checkoutv2`. With `fetch-depth: 0`, the history is available.

3. In addition, it fixes the `dev/run-tests.py` to match with `python/run-tests.py` in terms of its options. Environment variables such as `TEST_ONLY_XXX` were moved as proper options. For example,

    ```bash
    dev/run-tests.py --modules sql,core
    ```

    which is consistent with `python/run-tests.py`, for example,

    ```bash
    python/run-tests.py --modules pyspark-core,pyspark-ml
    ```

4. Lastly, also fixed the formatting issue in module specification in the matrix:

    ```diff
    -            network_common, network_shuffle, repl, launcher
    +            network-common, network-shuffle, repl, launcher,
    ```

    which incorrectly runs build/test the modules.

### Why are the changes needed?

By running only related tests, we can hugely save the resources and avoid unrelated flaky tests, etc.
Also, now it runs the doctest of `dev/run-tests.py` properly, the usages are similar between `dev/run-tests.py` and `python/run-tests.py`, and run `network-common`, `network-shuffle`, `launcher` and `examples` modules too.

### Does this PR introduce _any_ user-facing change?

No, dev-only.

### How was this patch tested?

Manually tested in my own forked Spark:

https://github.com/HyukjinKwon/spark/pull/7
https://github.com/HyukjinKwon/spark/pull/8
https://github.com/HyukjinKwon/spark/pull/9
https://github.com/HyukjinKwon/spark/pull/10
https://github.com/HyukjinKwon/spark/pull/11
https://github.com/HyukjinKwon/spark/pull/12

Closes #29086 from HyukjinKwon/SPARK-32292.

Authored-by: Hyukjin Kwon <gurwls223@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
---
 .github/workflows/master.yml |  16 +++--
 dev/run-tests.py             | 109 +++++++++++++++++++++++++++--------
 2 files changed, 96 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index 7bb5481a561b9..d6458bf44fd36 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -29,7 +29,7 @@ jobs:
         modules:
           - |-
             core, unsafe, kvstore, avro,
-            network_common, network_shuffle, repl, launcher
+            network-common, network-shuffle, repl, launcher,
             examples, sketch, graphx
           - |-
             catalyst, hive-thriftserver
@@ -75,16 +75,20 @@ jobs:
             excluded-tags: org.apache.spark.tags.ExtendedSQLTest
             comment: "- other tests"
     env:
-      TEST_ONLY_MODULES: ${{ matrix.modules }}
-      TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
-      TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      INCLUDED_TAGS: ${{ matrix.included-tags }}
       HADOOP_PROFILE: ${{ matrix.hadoop }}
       HIVE_PROFILE: ${{ matrix.hive }}
       # GitHub Actions' default miniconda to use in pip packaging test.
       CONDA_PREFIX: /usr/share/miniconda
+      GITHUB_PREV_SHA: ${{ github.event.before }}
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
+      # In order to fetch changed files
+      with:
+        fetch-depth: 0
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
     - name: Cache Scala, SBT, Maven and Zinc
       uses: actions/cache@v1
@@ -161,9 +165,9 @@ jobs:
     - name: "Run tests: ${{ matrix.modules }}"
       run: |
         # Hive tests become flaky when running in parallel as it's too intensive.
-        if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
+        if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
         mkdir -p ~/.m2
-        ./dev/run-tests --parallelism 2
+        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
         rm -rf ~/.m2/repository/org/apache/spark
 
   # Static analysis, and documentation build
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 03cc3230a65fd..065a27c0e853b 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -101,12 +101,14 @@ def setup_test_environ(environ):
         os.environ[k] = v
 
 
-def determine_modules_to_test(changed_modules):
+def determine_modules_to_test(changed_modules, deduplicated=True):
     """
     Given a set of modules that have changed, compute the transitive closure of those modules'
     dependent modules in order to determine the set of modules that should be tested.
 
     Returns a topologically-sorted list of modules (ties are broken by sorting on module names).
+    If ``deduplicated`` is disabled, the modules are returned without tacking the deduplication
+    by dependencies into account.
 
     >>> [x.name for x in determine_modules_to_test([modules.root])]
     ['root']
@@ -122,11 +124,29 @@ def determine_modules_to_test(changed_modules):
     ... # doctest: +NORMALIZE_WHITESPACE
     ['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver',
      'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml']
+    >>> sorted([x.name for x in determine_modules_to_test(
+    ...     [modules.sparkr, modules.sql], deduplicated=False)])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml',
+     'pyspark-mllib', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    >>> sorted([x.name for x in determine_modules_to_test(
+    ...     [modules.sql, modules.core], deduplicated=False)])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver',
+     'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib',
+     'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
+     'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
+     'streaming-kinesis-asl']
     """
     modules_to_test = set()
     for module in changed_modules:
-        modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
+        modules_to_test = modules_to_test.union(
+            determine_modules_to_test(module.dependent_modules, deduplicated))
     modules_to_test = modules_to_test.union(set(changed_modules))
+
+    if not deduplicated:
+        return modules_to_test
+
     # If we need to run all of the tests, then we should short-circuit and return 'root'
     if modules.root in modules_to_test:
         return [modules.root]
@@ -539,6 +559,24 @@ def parse_opts():
         "-p", "--parallelism", type=int, default=8,
         help="The number of suites to test in parallel (default %(default)d)"
     )
+    parser.add_argument(
+        "-m", "--modules", type=str,
+        default=None,
+        help="A comma-separated list of modules to test "
+             "(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules]))
+    )
+    parser.add_argument(
+        "-e", "--excluded-tags", type=str,
+        default=None,
+        help="A comma-separated list of tags to exclude in the tests, "
+             "e.g., org.apache.spark.tags.ExtendedHiveTest "
+    )
+    parser.add_argument(
+        "-i", "--included-tags", type=str,
+        default=None,
+        help="A comma-separated list of tags to include in the tests, "
+             "e.g., org.apache.spark.tags.ExtendedHiveTest "
+    )
 
     args, unknown = parser.parse_known_args()
     if unknown:
@@ -589,43 +627,74 @@ def main():
         # /home/jenkins/anaconda2/envs/py36/bin
         os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
     else:
-        # else we're running locally and can use local settings
+        # else we're running locally or Github Actions.
         build_tool = "sbt"
         hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
         hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
-        test_env = "local"
+        if "GITHUB_ACTIONS" in os.environ:
+            test_env = "github_actions"
+        else:
+            test_env = "local"
 
     print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
           "and Hive profile", hive_version, "under environment", test_env)
     extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)
 
     changed_modules = None
+    test_modules = None
     changed_files = None
-    should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
+    should_only_test_modules = opts.modules is not None
     included_tags = []
+    excluded_tags = []
     if should_only_test_modules:
-        str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
+        str_test_modules = [m.strip() for m in opts.modules.split(",")]
         test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
-        # Directly uses test_modules as changed modules to apply tags and environments
-        # as if all specified test modules are changed.
+
+        # If we're running the tests in Github Actions, attempt to detect and test
+        # only the affected modules.
+        if test_env == "github_actions":
+            if os.environ["GITHUB_BASE_REF"] != "":
+                # Pull requests
+                changed_files = identify_changed_files_from_git_commits(
+                    os.environ["GITHUB_SHA"], target_branch=os.environ["GITHUB_BASE_REF"])
+            else:
+                # Build for each commit.
+                changed_files = identify_changed_files_from_git_commits(
+                    os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"])
+
+            modules_to_test = determine_modules_to_test(
+                determine_modules_for_files(changed_files), deduplicated=False)
+
+            if modules.root not in modules_to_test:
+                # If root module is not found, only test the intersected modules.
+                # If root module is found, just run the modules as specified initially.
+                test_modules = list(set(modules_to_test).intersection(test_modules))
+
         changed_modules = test_modules
-        str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
-        str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
-        excluded_tags = []
-        if str_excluded_tags:
-            excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
-        included_tags = []
-        if str_included_tags:
-            included_tags = [t.strip() for t in str_included_tags.split(",")]
+        if len(changed_modules) == 0:
+            print("[info] There are no modules to test, exiting without testing.")
+            return
+
+    # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
+    # detect modules to test.
     elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
         target_branch = os.environ["ghprbTargetBranch"]
         changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
         changed_modules = determine_modules_for_files(changed_files)
+        test_modules = determine_modules_to_test(changed_modules)
         excluded_tags = determine_tags_to_exclude(changed_modules)
 
+    # If there is no changed module found, tests all.
     if not changed_modules:
         changed_modules = [modules.root]
-        excluded_tags = []
+    if not test_modules:
+        test_modules = determine_modules_to_test(changed_modules)
+
+    if opts.excluded_tags:
+        excluded_tags.extend([t.strip() for t in opts.excluded_tags.split(",")])
+    if opts.included_tags:
+        included_tags.extend([t.strip() for t in opts.included_tags.split(",")])
+
     print("[info] Found the following changed modules:",
           ", ".join(x.name for x in changed_modules))
 
@@ -640,8 +709,6 @@ def main():
 
     should_run_java_style_checks = False
     if not should_only_test_modules:
-        test_modules = determine_modules_to_test(changed_modules)
-
         # license checks
         run_apache_rat_checks()
 
@@ -702,10 +769,6 @@ def main():
 
 
 def _test():
-    if "TEST_ONLY_MODULES" in os.environ:
-        # TODO(SPARK-32252): Enable doctests back in Github Actions.
-        return
-
     import doctest
     failure_count = doctest.testmod()[0]
     if failure_count: