[SPARK-52212][PYTHON][INFRA] Upgrade linter image to python 3.11 #50931

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

.github/workflows/build_and_test.yml (23 changes: 18 additions & 5 deletions)

@@ -787,8 +787,6 @@ jobs:
       LC_ALL: C.UTF-8
       LANG: C.UTF-8
       NOLINT_ON_COMPILE: false
-      PYSPARK_DRIVER_PYTHON: python3.9
-      PYSPARK_PYTHON: python3.9
       GITHUB_PREV_SHA: ${{ github.event.before }}
     container:
       image: ${{ needs.precondition.outputs.image_lint_url_link }}
@@ -849,11 +847,18 @@ jobs:
       run: ./dev/mima
     - name: Scala linter
       run: ./dev/lint-scala
-    - name: Scala structured logging check
+    - name: Scala structured logging check for branch-3.5 and branch-4.0
+      if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0'
       run: |
         if [ -f ./dev/structured_logging_style.py ]; then
           python3.9 ./dev/structured_logging_style.py
         fi
+    - name: Scala structured logging check
+      if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0'
+      run: |
+        if [ -f ./dev/structured_logging_style.py ]; then
+          python3.11 ./dev/structured_logging_style.py
+        fi
     - name: Java linter
       run: ./dev/lint-java
     - name: Spark connect jvm client mima check
@@ -865,10 +870,18 @@ jobs:
         # Should delete this section after SPARK 3.5 EOL.
         python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
         python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
-    - name: List Python packages
+    - name: List Python packages for branch-3.5 and branch-4.0
+      if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0'
       run: python3.9 -m pip list
-    - name: Python linter
+    - name: List Python packages
+      if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0'
+      run: python3.11 -m pip list
+    - name: Python linter for branch-3.5 and branch-4.0
+      if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0'
       run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
+    - name: Python linter
+      if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0'
+      run: PYTHON_EXECUTABLE=python3.11 ./dev/lint-python
     # Should delete this section after SPARK 3.5 EOL.
     - name: Install dependencies for Python code generation check for branch-3.5
       if: inputs.branch == 'branch-3.5'

Author comment on the "List Python packages for branch-3.5 and branch-4.0" step: to make sure this does not affect the 4.0 release, we should merge this PR after the 4.0 release.

dev/spark-test-image/lint/Dockerfile (21 changes: 13 additions & 8 deletions)

@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image for Linter"
 # Overwrite this label to avoid exposing the underlying Ubuntu OS version label
 LABEL org.opencontainers.image.version=""

-ENV FULL_REFRESH_DATE=20250312
+ENV FULL_REFRESH_DATE=20250519

 ENV DEBIAN_FRONTEND=noninteractive
 ENV DEBCONF_NONINTERACTIVE_SEEN=true
@@ -51,6 +51,7 @@ RUN apt-get update && apt-get install -y \
     npm \
     pkg-config \
     qpdf \
+    tzdata \
     r-base \
     software-properties-common \
     wget \
@@ -65,13 +66,17 @@ RUN Rscript -e "install.packages(c('devtools', 'knitr', 'markdown', 'rmarkdown',
 # See more in SPARK-39735
 ENV R_LIBS_SITE="/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"

-# Install Python 3.9
+# Install Python 3.11
 RUN add-apt-repository ppa:deadsnakes/ppa
-RUN apt-get update && apt-get install -y python3.9 python3.9-distutils \
+RUN apt-get update && apt-get install -y \
+    python3.11 \
     && apt-get autoremove --purge -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
-RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
-RUN python3.9 -m pip install \
+
+
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
+RUN python3.11 -m pip install \
     'black==23.12.1' \
     'flake8==3.9.0' \
     'googleapis-common-protos-stubs==2.2.0' \
@@ -91,6 +96,6 @@ RUN python3.9 -m pip install \
     'pyarrow>=19.0.0' \
     'pytest-mypy-plugins==1.9.3' \
     'pytest==7.1.3' \
-    && python3.9 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu \
-    && python3.9 -m pip install torcheval \
-    && python3.9 -m pip cache purge
+    && python3.11 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu \
+    && python3.11 -m pip install torcheval \
+    && python3.11 -m pip cache purge
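
The workflow and image changes above move linting to Python 3.11; all remaining files below only update the expected mypy output in PySpark's typing self-tests. The upgraded toolchain renders union types in PEP 604 form ("X | None") rather than "Optional[X]"/"Union[X, Y]", so the expected messages change while the type checks themselves are unchanged. A minimal sketch of the rendering difference (a hypothetical snippet, not part of this diff):

    from typing import Optional

    # "Optional[int]" and "int | None" name the same type; a recent mypy
    # prints it in error messages as "int | None".
    def truncate(line: str, width: Optional[int] = None) -> str:
        return line if width is None else line[:width]

    # truncate("spark", "10")  # mypy: incompatible type "str"; expected "int | None"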

python/pyspark/ml/tests/typing/test_classification.yml (4 changes: 2 additions & 2 deletions)

@@ -23,8 +23,8 @@

     # Should support
     OneVsRest(classifier=LogisticRegression())
-    OneVsRest(classifier=LogisticRegressionModel.load("/foo"))  # E: Argument "classifier" to "OneVsRest" has incompatible type "LogisticRegressionModel"; expected "Optional[Classifier[Never]]" [arg-type]
-    OneVsRest(classifier="foo")  # E: Argument "classifier" to "OneVsRest" has incompatible type "str"; expected "Optional[Classifier[Never]]" [arg-type]
+    OneVsRest(classifier=LogisticRegressionModel.load("/foo"))  # E: Argument "classifier" to "OneVsRest" has incompatible type "LogisticRegressionModel"; expected "Classifier[Never] | None" [arg-type]
+    OneVsRest(classifier="foo")  # E: Argument "classifier" to "OneVsRest" has incompatible type "str"; expected "Classifier[Never] | None" [arg-type]


 - case: fitFMClassifier

python/pyspark/ml/tests/typing/test_feature.yml (8 changes: 4 additions & 4 deletions)

@@ -47,9 +47,9 @@
   out: |
     main:14: error: No overload variant of "StringIndexer" matches argument types "str", "list[str]" [call-overload]
     main:14: note: Possible overload variants:
-    main:14: note: def StringIndexer(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer
-    main:14: note: def StringIndexer(self, *, inputCols: Optional[list[str]] = ..., outputCols: Optional[list[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer
+    main:14: note: def StringIndexer(self, *, inputCol: str | None = ..., outputCol: str | None = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer
+    main:14: note: def StringIndexer(self, *, inputCols: list[str] | None = ..., outputCols: list[str] | None = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer
     main:15: error: No overload variant of "StringIndexer" matches argument types "list[str]", "str" [call-overload]
     main:15: note: Possible overload variants:
-    main:15: note: def StringIndexer(self, *, inputCol: Optional[str] = ..., outputCol: Optional[str] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer
-    main:15: note: def StringIndexer(self, *, inputCols: Optional[list[str]] = ..., outputCols: Optional[list[str]] = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer
+    main:15: note: def StringIndexer(self, *, inputCol: str | None = ..., outputCol: str | None = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer
+    main:15: note: def StringIndexer(self, *, inputCols: list[str] | None = ..., outputCols: list[str] | None = ..., handleInvalid: str = ..., stringOrderType: str = ...) -> StringIndexer

python/pyspark/sql/connect/shell/progress.py (2 changes: 1 addition & 1 deletion)

@@ -30,7 +30,7 @@
     from IPython.utils.terminal import get_terminal_size
 except ImportError:

-    def get_terminal_size(defaultx: Any = None, defaulty: Any = None) -> Any:
+    def get_terminal_size(defaultx: Any = None, defaulty: Any = None) -> Any:  # type: ignore[misc]
        return (80, 25)
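
The added "# type: ignore[misc]" is presumably required by the upgraded mypy: the fallback definition redefines the get_terminal_size imported in the try block with a different signature, and mypy's "conditional function variants must have identical signatures" error is reported under the catch-all misc code.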

python/pyspark/sql/tests/typing/test_dataframe.yml (14 changes: 7 additions & 7 deletions)

@@ -37,8 +37,8 @@
   out: |
     main:16: error: No overload variant of "sample" of "DataFrame" matches argument type "bool" [call-overload]
     main:16: note: Possible overload variants:
-    main:16: note: def sample(self, fraction: float, seed: Optional[int] = ...) -> DataFrame
-    main:16: note: def sample(self, withReplacement: Optional[bool], fraction: float, seed: Optional[int] = ...) -> DataFrame
+    main:16: note: def sample(self, fraction: float, seed: int | None = ...) -> DataFrame
+    main:16: note: def sample(self, withReplacement: bool | None, fraction: float, seed: int | None = ...) -> DataFrame


 - case: selectColumns
@@ -54,7 +54,7 @@
     df.select(["name", "age"])
     df.select([col("name"), col("age")])

-    df.select(["name", col("age")])  # E: Argument 1 to "select" of "DataFrame" has incompatible type "list[object]"; expected "Union[list[Column], list[str]]" [arg-type]
+    df.select(["name", col("age")])  # E: Argument 1 to "select" of "DataFrame" has incompatible type "list[object]"; expected "list[Column] | list[str]" [arg-type]


 - case: groupBy
@@ -71,7 +71,7 @@
     df.groupby(["name", "age"])
     df.groupBy([col("name"), col("age")])
     df.groupby([col("name"), col("age")])
-    df.groupBy(["name", col("age")])  # E: Argument 1 to "groupBy" of "DataFrame" has incompatible type "list[object]"; expected "Union[list[Column], list[str], list[int]]" [arg-type]
+    df.groupBy(["name", col("age")])  # E: Argument 1 to "groupBy" of "DataFrame" has incompatible type "list[object]"; expected "list[Column] | list[str] | list[int]" [arg-type]


 - case: rollup
@@ -88,7 +88,7 @@
     df.rollup([col("name"), col("age")])


-    df.rollup(["name", col("age")])  # E: Argument 1 to "rollup" of "DataFrame" has incompatible type "list[object]"; expected "Union[list[Column], list[str]]" [arg-type]
+    df.rollup(["name", col("age")])  # E: Argument 1 to "rollup" of "DataFrame" has incompatible type "list[object]"; expected "list[Column] | list[str]" [arg-type]


 - case: cube
@@ -105,7 +105,7 @@
     df.cube([col("name"), col("age")])


-    df.cube(["name", col("age")])  # E: Argument 1 to "cube" of "DataFrame" has incompatible type "list[object]"; expected "Union[list[Column], list[str]]" [arg-type]
+    df.cube(["name", col("age")])  # E: Argument 1 to "cube" of "DataFrame" has incompatible type "list[object]"; expected "list[Column] | list[str]" [arg-type]


 - case: dropColumns
@@ -124,7 +124,7 @@
   out: |
     main:10: error: No overload variant of "drop" of "DataFrame" matches argument types "Column", "Column" [call-overload]
     main:10: note: Possible overload variants:
-    main:10: note: def drop(self, cols: Union[Column, str]) -> DataFrame
+    main:10: note: def drop(self, cols: Column | str) -> DataFrame
     main:10: note: def drop(self, *cols: str) -> DataFrame


python/pyspark/sql/tests/typing/test_functions.yml (32 changes: 16 additions & 16 deletions)

@@ -69,33 +69,33 @@
   out: |
     main:29: error: No overload variant of "array" matches argument types "list[Column]", "list[Column]" [call-overload]
     main:29: note: Possible overload variants:
-    main:29: note: def array(*cols: Union[Column, str]) -> Column
-    main:29: note: def array(Union[Sequence[Union[Column, str]], tuple[Union[Column, str], ...]], /) -> Column
+    main:29: note: def array(*cols: Column | str) -> Column
+    main:29: note: def array(Sequence[Column | str] | tuple[Column | str, ...], /) -> Column
     main:30: error: No overload variant of "create_map" matches argument types "list[Column]", "list[Column]" [call-overload]
     main:30: note: Possible overload variants:
-    main:30: note: def create_map(*cols: Union[Column, str]) -> Column
-    main:30: note: def create_map(Union[Sequence[Union[Column, str]], tuple[Union[Column, str], ...]], /) -> Column
+    main:30: note: def create_map(*cols: Column | str) -> Column
+    main:30: note: def create_map(Sequence[Column | str] | tuple[Column | str, ...], /) -> Column
     main:31: error: No overload variant of "map_concat" matches argument types "list[Column]", "list[Column]" [call-overload]
     main:31: note: Possible overload variants:
-    main:31: note: def map_concat(*cols: Union[Column, str]) -> Column
-    main:31: note: def map_concat(Union[Sequence[Union[Column, str]], tuple[Union[Column, str], ...]], /) -> Column
+    main:31: note: def map_concat(*cols: Column | str) -> Column
+    main:31: note: def map_concat(Sequence[Column | str] | tuple[Column | str, ...], /) -> Column
     main:32: error: No overload variant of "struct" matches argument types "list[str]", "list[str]" [call-overload]
     main:32: note: Possible overload variants:
-    main:32: note: def struct(*cols: Union[Column, str]) -> Column
-    main:32: note: def struct(Union[Sequence[Union[Column, str]], tuple[Union[Column, str], ...]], /) -> Column
+    main:32: note: def struct(*cols: Column | str) -> Column
+    main:32: note: def struct(Sequence[Column | str] | tuple[Column | str, ...], /) -> Column
     main:33: error: No overload variant of "array" matches argument types "list[str]", "list[str]" [call-overload]
     main:33: note: Possible overload variants:
-    main:33: note: def array(*cols: Union[Column, str]) -> Column
-    main:33: note: def array(Union[Sequence[Union[Column, str]], tuple[Union[Column, str], ...]], /) -> Column
+    main:33: note: def array(*cols: Column | str) -> Column
+    main:33: note: def array(Sequence[Column | str] | tuple[Column | str, ...], /) -> Column
     main:34: error: No overload variant of "create_map" matches argument types "list[str]", "list[str]" [call-overload]
     main:34: note: Possible overload variants:
-    main:34: note: def create_map(*cols: Union[Column, str]) -> Column
-    main:34: note: def create_map(Union[Sequence[Union[Column, str]], tuple[Union[Column, str], ...]], /) -> Column
+    main:34: note: def create_map(*cols: Column | str) -> Column
+    main:34: note: def create_map(Sequence[Column | str] | tuple[Column | str, ...], /) -> Column
     main:35: error: No overload variant of "map_concat" matches argument types "list[str]", "list[str]" [call-overload]
     main:35: note: Possible overload variants:
-    main:35: note: def map_concat(*cols: Union[Column, str]) -> Column
-    main:35: note: def map_concat(Union[Sequence[Union[Column, str]], tuple[Union[Column, str], ...]], /) -> Column
+    main:35: note: def map_concat(*cols: Column | str) -> Column
+    main:35: note: def map_concat(Sequence[Column | str] | tuple[Column | str, ...], /) -> Column
     main:36: error: No overload variant of "struct" matches argument types "list[str]", "list[str]" [call-overload]
     main:36: note: Possible overload variants:
-    main:36: note: def struct(*cols: Union[Column, str]) -> Column
-    main:36: note: def struct(Union[Sequence[Union[Column, str]], tuple[Union[Column, str], ...]], /) -> Column
+    main:36: note: def struct(*cols: Column | str) -> Column
+    main:36: note: def struct(Sequence[Column | str] | tuple[Column | str, ...], /) -> Column

python/pyspark/sql/tests/typing/test_readwriter.yml (4 changes: 2 additions & 2 deletions)

@@ -26,8 +26,8 @@

     spark.read.load(foo=True)

-    spark.read.load(foo=["a"])  # E: Argument "foo" to "load" of "DataFrameReader" has incompatible type "list[str]"; expected "Union[bool, float, int, str, None]" [arg-type]
-    spark.read.option("foo", (1, ))  # E: Argument 2 to "option" of "DataFrameReader" has incompatible type "tuple[int]"; expected "Union[bool, float, int, str, None]" [arg-type]
+    spark.read.load(foo=["a"])  # E: Argument "foo" to "load" of "DataFrameReader" has incompatible type "list[str]"; expected "bool | float | int | str | None" [arg-type]
+    spark.read.option("foo", (1, ))  # E: Argument 2 to "option" of "DataFrameReader" has incompatible type "tuple[int]"; expected "bool | float | int | str | None" [arg-type]


 - case: readStreamOptions

python/pyspark/sql/tests/typing/test_session.yml (20 changes: 10 additions & 10 deletions)

@@ -76,16 +76,16 @@
     main:14: error: Value of type variable "AtomicValue" of "createDataFrame" of "SparkSession" cannot be "tuple[str, int]" [type-var]
     main:18: error: No overload variant of "createDataFrame" of "SparkSession" matches argument types "list[tuple[str, int]]", "StructType", "float" [call-overload]
     main:18: note: Possible overload variants:
-    main:18: note: def [RowLike in (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: Iterable[RowLike], schema: Union[list[str], tuple[str, ...]] = ..., samplingRatio: Optional[float] = ...) -> DataFrame
-    main:18: note: def [RowLike in (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: RDD[RowLike], schema: Union[list[str], tuple[str, ...]] = ..., samplingRatio: Optional[float] = ...) -> DataFrame
-    main:18: note: def [RowLike in (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: Iterable[RowLike], schema: Union[StructType, str], *, verifySchema: bool = ...) -> DataFrame
-    main:18: note: def [RowLike in (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: RDD[RowLike], schema: Union[StructType, str], *, verifySchema: bool = ...) -> DataFrame
-    main:18: note: def [AtomicValue in (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: RDD[AtomicValue], schema: Union[AtomicType, str], verifySchema: bool = ...) -> DataFrame
-    main:18: note: def [AtomicValue in (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: Iterable[AtomicValue], schema: Union[AtomicType, str], verifySchema: bool = ...) -> DataFrame
-    main:18: note: def createDataFrame(self, data: DataFrame, samplingRatio: Optional[float] = ...) -> DataFrame
-    main:18: note: def createDataFrame(self, data: Any, samplingRatio: Optional[float] = ...) -> DataFrame
-    main:18: note: def createDataFrame(self, data: DataFrame, schema: Union[StructType, str], verifySchema: bool = ...) -> DataFrame
-    main:18: note: def createDataFrame(self, data: Any, schema: Union[StructType, str], verifySchema: bool = ...) -> DataFrame
+    main:18: note: def [RowLike in (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: Iterable[RowLike], schema: list[str] | tuple[str, ...] = ..., samplingRatio: float | None = ...) -> DataFrame
+    main:18: note: def [RowLike in (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: RDD[RowLike], schema: list[str] | tuple[str, ...] = ..., samplingRatio: float | None = ...) -> DataFrame
+    main:18: note: def [RowLike in (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: Iterable[RowLike], schema: StructType | str, *, verifySchema: bool = ...) -> DataFrame
+    main:18: note: def [RowLike in (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: RDD[RowLike], schema: StructType | str, *, verifySchema: bool = ...) -> DataFrame
+    main:18: note: def [AtomicValue in (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: RDD[AtomicValue], schema: AtomicType | str, verifySchema: bool = ...) -> DataFrame
+    main:18: note: def [AtomicValue in (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: Iterable[AtomicValue], schema: AtomicType | str, verifySchema: bool = ...) -> DataFrame
+    main:18: note: def createDataFrame(self, data: DataFrame, samplingRatio: float | None = ...) -> DataFrame
+    main:18: note: def createDataFrame(self, data: Any, samplingRatio: float | None = ...) -> DataFrame
+    main:18: note: def createDataFrame(self, data: DataFrame, schema: StructType | str, verifySchema: bool = ...) -> DataFrame
+    main:18: note: def createDataFrame(self, data: Any, schema: StructType | str, verifySchema: bool = ...) -> DataFrame

 - case: createDataFrameFromEmptyRdd
   main: |