From 6bdd710c4d4125b0801a93d57f53e05e301ebebd Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Tue, 14 Jul 2020 20:44:09 -0700 Subject: [PATCH] [SPARK-32316][TESTS][INFRA] Test PySpark with Python 3.8 in Github Actions ### What changes were proposed in this pull request? This PR aims to test PySpark with Python 3.8 in Github Actions. In the script side, it is already ready: https://github.com/apache/spark/blob/4ad9bfd53b84a6d2497668c73af6899bae14c187/python/run-tests.py#L161 This PR includes small related fixes together: 1. Install Python 3.8 2. Only install one Python implementation instead of installing many for SQL and Yarn test cases because they need one Python executable in their test cases that is higher than Python 2. 3. Do not install Python 2 which is not needed anymore after we dropped Python 2 at SPARK-32138 4. Remove a comment about installing PyPy3 on Jenkins - SPARK-32278. It is already installed. ### Why are the changes needed? Currently, only PyPy3 and Python 3.6 are being tested with PySpark in Github Actions. We should test the latest version of Python as well because some optimizations can be only enabled with Python 3.8+. See also https://github.com/apache/spark/pull/29114 ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? Was not tested. Github Actions build in this PR will test it out. Closes #29116 from HyukjinKwon/test-python3.8-togehter. Authored-by: HyukjinKwon Signed-off-by: Dongjoon Hyun --- .github/workflows/master.yml | 32 ++++++++++++++++++-------------- python/run-tests.py | 1 - 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 5cf00c6ed9e67..fe01b92036377 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -117,38 +117,42 @@ jobs: java-version: ${{ matrix.java }} # PySpark - name: Install PyPy3 - # SQL component also has Python related tests, for example, IntegratedUDFTestUtils. 
# Note that order of Python installations here matters because default python3 is # overridden by pypy3. uses: actions/setup-python@v2 - if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + if: contains(matrix.modules, 'pyspark') with: python-version: pypy3 architecture: x64 - - name: Install Python 2.7 + - name: Install Python 3.6 uses: actions/setup-python@v2 - if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + if: contains(matrix.modules, 'pyspark') with: - python-version: 2.7 + python-version: 3.6 architecture: x64 - - name: Install Python 3.6 + - name: Install Python 3.8 uses: actions/setup-python@v2 - # Yarn has a Python specific test too, for example, YarnClusterSuite. + # We should install one Python that is higher than 3+ for SQL and Yarn because: + # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils. + # - Yarn has a Python specific test too, for example, YarnClusterSuite. if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) with: - python-version: 3.6 + python-version: 3.8 architecture: x64 - - name: Install Python packages - if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + - name: Install Python packages (Python 3.6 and PyPy3) + if: contains(matrix.modules, 'pyspark') # PyArrow is not supported in PyPy yet, see ARROW-2651. # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason. 
run: | - python3 -m pip install numpy pyarrow pandas scipy - python3 -m pip list - python2 -m pip install numpy pyarrow pandas scipy - python2 -m pip list + python3.6 -m pip install numpy pyarrow pandas scipy + python3.6 -m pip list pypy3 -m pip install numpy pandas pypy3 -m pip list + - name: Install Python packages (Python 3.8) + if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + run: | + python3.8 -m pip install numpy pyarrow pandas scipy + python3.8 -m pip list # SparkR - name: Install R 3.6 uses: r-lib/actions/setup-r@v1 diff --git a/python/run-tests.py b/python/run-tests.py index 23076eab1c3e4..357eb8f449beb 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -157,7 +157,6 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): - # TODO(SPARK-32278): install PyPy3 in Jenkins to test python_execs = [x for x in ["python3.6", "python3.8", "pypy3"] if which(x)] if "python3.6" not in python_execs: