Commit 74cad54

Detect changed files and run the relevant tests only in GitHub Actions (#8)
Test in master branch
1 parent b6229df commit 74cad54

File tree: 5 files changed (+109, -192 lines)


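At a glance, the change moves module and tag selection out of the TEST_ONLY_* environment variables and into explicit dev/run-tests flags that the workflow fills in from its build matrix. A rough sketch of the resulting invocation, with Python standing in for the workflow's shell step (the environment variable names are the ones introduced below; the wrapper itself is illustrative only):

import os
import subprocess

# Illustrative only: mirrors the "Run tests" step in master.yml after this change.
command = [
    "./dev/run-tests", "--parallelism", "2",
    "--modules", os.environ.get("MODULES_TO_TEST", ""),
    "--included-tags", os.environ.get("INCLUDED_TAGS", ""),
    "--excluded-tags", os.environ.get("EXCLUDED_TAGS", ""),
]
subprocess.run(command, check=True)
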
.github/workflows/master.yml

Lines changed: 7 additions & 133 deletions
@@ -75,156 +75,30 @@ jobs:
             excluded-tags: org.apache.spark.tags.ExtendedSQLTest
             comment: "- other tests"
     env:
-      TEST_ONLY_MODULES: ${{ matrix.modules }}
-      TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
-      TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
+      INCLUDED_TAGS: ${{ matrix.included-tags }}
       HADOOP_PROFILE: ${{ matrix.hadoop }}
       HIVE_PROFILE: ${{ matrix.hive }}
       # GitHub Actions' default miniconda to use in pip packaging test.
       CONDA_PREFIX: /usr/share/miniconda
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v2
-    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT, Maven and Zinc
-      uses: actions/cache@v1
+      # In order to fetch changed files
       with:
-        path: build
-        key: build-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          build-
-    - name: Cache Maven local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.m2/repository
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
-    - name: Cache Ivy local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.ivy2/cache
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
-    - name: Install JDK ${{ matrix.java }}
-      uses: actions/setup-java@v1
-      with:
-        java-version: ${{ matrix.java }}
+        fetch-depth: 0
     # PySpark
-    - name: Install PyPy3
-      # SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
-      # Note that order of Python installations here matters because default python3 is
-      # overridden by pypy3.
-      uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-      with:
-        python-version: pypy3
-        architecture: x64
-    - name: Install Python 2.7
-      uses: actions/setup-python@v2
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-      with:
-        python-version: 2.7
-        architecture: x64
     - name: Install Python 3.6
       uses: actions/setup-python@v2
       if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
       with:
         python-version: 3.6
         architecture: x64
-    - name: Install Python packages
-      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-      # PyArrow is not supported in PyPy yet, see ARROW-2651.
-      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
-      run: |
-        python3 -m pip install numpy pyarrow pandas scipy
-        python3 -m pip list
-        python2 -m pip install numpy pyarrow pandas scipy
-        python2 -m pip list
-        pypy3 -m pip install numpy pandas
-        pypy3 -m pip list
-    # SparkR
-    - name: Install R 3.6
-      uses: r-lib/actions/setup-r@v1
-      if: contains(matrix.modules, 'sparkr')
-      with:
-        r-version: 3.6
-    - name: Install R packages
-      if: contains(matrix.modules, 'sparkr')
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev
-        sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
-        # Show installed packages in R.
-        sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
-    # Run the tests.
     - name: "Run tests: ${{ matrix.modules }}"
       run: |
         # Hive tests become flaky when running in parallel as it's too intensive.
-        if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
+        if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
         mkdir -p ~/.m2
-        ./dev/run-tests --parallelism 2
+        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
         rm -rf ~/.m2/repository/org/apache/spark
-
-  # Static analysis, and documentation build
-  lint:
-    name: Linters, licenses, dependencies and documentation generation
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout Spark repository
-      uses: actions/checkout@v2
-    - name: Cache Maven local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.m2/repository
-        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          docs-maven-
-    - name: Install JDK 1.8
-      uses: actions/setup-java@v1
-      with:
-        java-version: 1.8
-    - name: Install Python 3.6
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.6
-        architecture: x64
-    - name: Install Python linter dependencies
-      run: |
-        pip3 install flake8 sphinx numpy
-    - name: Install R 3.6
-      uses: r-lib/actions/setup-r@v1
-      with:
-        r-version: 3.6
-    - name: Install R linter dependencies and SparkR
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev
-        sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
-        sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
-        ./R/install-dev.sh
-    - name: Install Ruby 2.7 for documentation generation
-      uses: actions/setup-ruby@v1
-      with:
-        ruby-version: 2.7
-    - name: Install dependencies for documentation generation
-      run: |
-        sudo apt-get install -y libcurl4-openssl-dev pandoc
-        pip install sphinx mkdocs numpy
-        gem install jekyll jekyll-redirect-from rouge
-        sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
-    - name: Scala linter
-      run: ./dev/lint-scala
-    - name: Java linter
-      run: ./dev/lint-java
-    - name: Python linter
-      run: ./dev/lint-python
-    - name: R linter
-      run: ./dev/lint-r
-    - name: License test
-      run: ./dev/check-license
-    - name: Dependencies test
-      run: ./dev/test-dependencies.sh
-    - name: Run documentation build
-      run: |
-        cd docs
-        jekyll build

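The checkout step above switches to fetch-depth: 0 because the changed-file detection needs full history on the runner: the pushed commit is diffed against the pull request's base branch, or on its own when there is no base ref. A minimal, self-contained sketch of that detection, assuming only git and the standard library (the behaviour mirrors identify_changed_files_from_git_commits in dev/run-tests.py below; error handling is omitted):

import os
import subprocess

def changed_files(patch_sha, diff_target=None):
    # List the files touched by patch_sha, diffing against diff_target
    # (e.g. the PR's base branch) when one is given; with no target, git diff
    # is run with just patch_sha, as in the updated helper.
    cmd = ["git", "diff", "--name-only", patch_sha]
    if diff_target:
        cmd.append(diff_target)
    out = subprocess.check_output(cmd, universal_newlines=True)
    return [f for f in out.split("\n") if f]

# On a GitHub Actions runner, GITHUB_SHA and GITHUB_BASE_REF are set by Actions:
# changed_files(os.environ["GITHUB_SHA"], os.environ.get("GITHUB_BASE_REF") or None)
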
R/README.md

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-# R on Spark
+# R on Spark.
 
 SparkR is an R package that provides a light-weight frontend to use Spark from R.
 

dev/run-tests.py

Lines changed: 98 additions & 55 deletions
@@ -79,17 +79,20 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
             identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
     True
     """
-    if target_branch is None and target_ref is None:
-        raise AttributeError("must specify either target_branch or target_ref")
-    elif target_branch is not None and target_ref is not None:
+    if target_branch is not None and target_ref is not None:
         raise AttributeError("must specify either target_branch or target_ref, not both")
     if target_branch is not None:
-        diff_target = target_branch
+        diff_target = [target_branch]
         run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
+    elif target_ref is not None:
+        diff_target = [target_ref]
     else:
-        diff_target = target_ref
-    raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
-                                         universal_newlines=True)
+        # If both are not specified, just show the diff from the commit only.
+        diff_target = []
+    raw_output = subprocess.check_output(
+        ['git', 'diff', '--name-only', patch_sha] + diff_target,
+        universal_newlines=True)
+    print(raw_output)
     # Remove any empty strings
     return [f for f in raw_output.split('\n') if f]
 
@@ -539,6 +542,24 @@ def parse_opts():
         "-p", "--parallelism", type=int, default=8,
         help="The number of suites to test in parallel (default %(default)d)"
     )
+    parser.add_argument(
+        "-m", "--modules", type=str,
+        default=None,
+        help="A comma-separated list of modules to test "
+             "(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules]))
+    )
+    parser.add_argument(
+        "-e", "--excluded-tags", type=str,
+        default=None,
+        help="A comma-separated list of tags to exclude in the tests, "
+             "e.g., org.apache.spark.tags.ExtendedHiveTest "
+    )
+    parser.add_argument(
+        "-i", "--included-tags", type=str,
+        default=None,
+        help="A comma-separated list of tags to include in the tests, "
+             "e.g., org.apache.spark.tags.ExtendedHiveTest "
+    )
 
     args, unknown = parser.parse_known_args()
     if unknown:
@@ -589,43 +610,64 @@ def main():
         # /home/jenkins/anaconda2/envs/py36/bin
         os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH")
     else:
-        # else we're running locally and can use local settings
+        # else we're running locally or Github Actions.
         build_tool = "sbt"
         hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
         hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
-        test_env = "local"
+        if "GITHUB_ACTIONS" in os.environ:
+            test_env = "github_actions"
+        else:
+            test_env = "local"
 
     print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
           "and Hive profile", hive_version, "under environment", test_env)
     extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)
 
     changed_modules = None
+    test_modules = None
     changed_files = None
-    should_only_test_modules = "TEST_ONLY_MODULES" in os.environ
+    should_only_test_modules = opts.modules is not None
     included_tags = []
+    excluded_tags = []
     if should_only_test_modules:
-        str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")]
+        str_test_modules = [m.strip() for m in opts.modules.split(",")]
         test_modules = [m for m in modules.all_modules if m.name in str_test_modules]
-        # Directly uses test_modules as changed modules to apply tags and environments
-        # as if all specified test modules are changed.
+
+        # If we're running the tests in Github Actions, attempt to detect and test
+        # only the affected modules.
+        if test_env == "github_actions":
+            base_ref = os.environ["GITHUB_BASE_REF"]
+            changed_files = identify_changed_files_from_git_commits(
+                os.environ["GITHUB_SHA"], target_branch=None if base_ref == "" else base_ref)
+            print("changed_files : %s" % changed_files)
+            test_modules = list(set(determine_modules_to_test(
+                determine_modules_for_files(changed_files))).intersection(test_modules))
+            print("test_modules : %s" % test_modules)
+
         changed_modules = test_modules
-        str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None)
-        str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None)
-        excluded_tags = []
-        if str_excluded_tags:
-            excluded_tags = [t.strip() for t in str_excluded_tags.split(",")]
-        included_tags = []
-        if str_included_tags:
-            included_tags = [t.strip() for t in str_included_tags.split(",")]
+
+    # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and
+    # detect modules to test.
     elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"):
         target_branch = os.environ["ghprbTargetBranch"]
         changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
         changed_modules = determine_modules_for_files(changed_files)
+        test_modules = determine_modules_to_test(changed_modules)
         excluded_tags = determine_tags_to_exclude(changed_modules)
 
+    # If there is no changed module found, tests all.
     if not changed_modules:
         changed_modules = [modules.root]
-        excluded_tags = []
+    if not test_modules:
+        test_modules = determine_modules_to_test(changed_modules)
+
+    str_excluded_tags = opts.excluded_tags
+    str_included_tags = opts.included_tags
+    if str_excluded_tags:
+        excluded_tags.extend([t.strip() for t in str_excluded_tags.split(",")])
+    if str_included_tags:
+        included_tags.extend([t.strip() for t in str_included_tags.split(",")])
+
     print("[info] Found the following changed modules:",
           ", ".join(x.name for x in changed_modules))
 
@@ -640,8 +682,6 @@ def main():
 
     should_run_java_style_checks = False
     if not should_only_test_modules:
-        test_modules = determine_modules_to_test(changed_modules)
-
         # license checks
         run_apache_rat_checks()
 
@@ -672,40 +712,43 @@ def main():
     # if "DOCS" in changed_modules and test_env == "amplab_jenkins":
     #     build_spark_documentation()
 
-    if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
-        run_build_tests()
-
-    # spark build
-    build_apache_spark(build_tool, extra_profiles)
-
-    # backwards compatibility checks
-    if build_tool == "sbt":
-        # Note: compatibility tests only supported in sbt for now
-        detect_binary_inop_with_mima(extra_profiles)
-        # Since we did not build assembly/package before running dev/mima, we need to
-        # do it here because the tests still rely on it; see SPARK-13294 for details.
-        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
-
-    # run the test suites
-    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)
-
-    modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
-    if modules_with_python_tests:
-        # We only run PySpark tests with coverage report in one specific job with
-        # Spark master with SBT in Jenkins.
-        is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
-        run_python_tests(
-            modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
-        run_python_packaging_tests()
-    if any(m.should_run_r_tests for m in test_modules):
-        run_sparkr_tests()
+    print(changed_modules)
+    print(test_modules)
+    print([m for m in test_modules if m.python_test_goals])
+    print([m.should_run_r_tests for m in test_modules])
+    print(excluded_tags)
+    print(included_tags)
+
+    # if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins":
+    #     run_build_tests()
+    #
+    # # spark build
+    # build_apache_spark(build_tool, extra_profiles)
+    #
+    # # backwards compatibility checks
+    # if build_tool == "sbt":
+    #     # Note: compatibility tests only supported in sbt for now
+    #     detect_binary_inop_with_mima(extra_profiles)
+    #     # Since we did not build assembly/package before running dev/mima, we need to
+    #     # do it here because the tests still rely on it; see SPARK-13294 for details.
+    #     build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
+    #
+    # # run the test suites
+    # run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)
+    #
+    # modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
+    # if modules_with_python_tests:
+    #     # We only run PySpark tests with coverage report in one specific job with
+    #     # Spark master with SBT in Jenkins.
+    #     is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ
+    #     run_python_tests(
+    #         modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job)
+    #     run_python_packaging_tests()
+    # if any(m.should_run_r_tests for m in test_modules):
+    #     run_sparkr_tests()
 
 
 def _test():
-    if "TEST_ONLY_MODULES" in os.environ:
-        # TODO(SPARK-32252): Enable doctests back in Github Actions.
-        return
-
     import doctest
     failure_count = doctest.testmod()[0]
     if failure_count:

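Taken together, the run-tests.py changes boil down to: take the modules requested via --modules, intersect them with the modules inferred from the changed files when running under GitHub Actions, and fall back to testing everything when nothing relevant (or nothing at all) is detected. A simplified sketch of that selection, using stand-in names (requested, detected, all_modules) rather than the script's own variables:

def select_test_modules(requested, detected, all_modules):
    # requested: names given via --modules; detected: modules inferred from the
    # changed files (None when detection did not run, e.g. a plain local run).
    if requested and detected is not None:
        # GitHub Actions: run only modules that are both requested and affected.
        selected = set(requested) & set(detected)
    elif requested:
        selected = set(requested)
    else:
        selected = set(detected or [])
    # Nothing selected means nothing relevant changed: fall back to everything.
    return sorted(selected) if selected else sorted(all_modules)

# Example: a matrix entry asking for pyspark-sql while only R files changed
# ends up running the full suite via the fallback:
# select_test_modules(["pyspark-sql"], ["sparkr"], ["root"]) -> ["root"]
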
python/pyspark/worker.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 #
 
 """
-Worker that receives input from Piped RDD.
+Worker that receives input from Piped RDD
 """
 from __future__ import print_function
 from __future__ import absolute_import
