
Commit 4f2630e

[SPARK-40789][PYTHON][TESTS] Separate tests under pyspark.sql.tests
### What changes were proposed in this pull request?

This PR proposes to split the tests into sub-packages:

**Before**

```
tests
├── __init__.py
├── test_arrow.py
├── test_arrow_map.py
├── test_catalog.py
├── test_column.py
├── test_conf.py
├── test_connect_basic.py
├── test_connect_column_expressions.py
├── test_connect_plan_only.py
├── test_connect_select_ops.py
├── test_context.py
├── test_dataframe.py
├── test_datasources.py
├── test_functions.py
├── test_group.py
├── test_pandas_cogrouped_map.py
├── test_pandas_grouped_map.py
├── test_pandas_grouped_map_with_state.py
├── test_pandas_map.py
├── test_pandas_udf.py
├── test_pandas_udf_grouped_agg.py
├── test_pandas_udf_scalar.py
├── test_pandas_udf_typehints.py
├── test_pandas_udf_typehints_with_future_annotations.py
├── test_pandas_udf_window.py
├── test_readwriter.py
├── test_serde.py
├── test_session.py
├── test_streaming.py
├── test_streaming_listener.py
├── test_types.py
├── test_udf.py
├── test_udf_profiler.py
├── test_utils.py
└── typing
    ├── ...
```

**After**

```
tests
├── __init__.py
├── connect
│   ├── __init__.py
│   ├── test_connect_basic.py
│   ├── test_connect_column_expressions.py
│   ├── test_connect_plan_only.py
│   └── test_connect_select_ops.py
├── pandas
│   ├── __init__.py
│   ├── test_pandas_cogrouped_map.py
│   ├── test_pandas_grouped_map.py
│   ├── test_pandas_grouped_map_with_state.py
│   ├── test_pandas_map.py
│   ├── test_pandas_udf.py
│   ├── test_pandas_udf_grouped_agg.py
│   ├── test_pandas_udf_scalar.py
│   ├── test_pandas_udf_typehints.py
│   ├── test_pandas_udf_typehints_with_future_annotations.py
│   └── test_pandas_udf_window.py
├── streaming
│   ├── __init__.py
│   ├── test_streaming.py
│   └── test_streaming_listener.py
├── test_arrow.py
├── test_arrow_map.py
├── test_catalog.py
├── test_column.py
├── test_conf.py
├── test_context.py
├── test_dataframe.py
├── test_datasources.py
├── test_functions.py
├── test_group.py
├── test_readwriter.py
├── test_serde.py
├── test_session.py
├── test_types.py
├── test_udf.py
├── test_udf_profiler.py
├── test_utils.py
└── typing
    ├── ...
```

This layout is consistent with `pyspark.pandas.tests`.

### Why are the changes needed?

To make the tests easier to maintain, track, and extend.

### Does this PR introduce _any_ user-facing change?

No, dev-only.

### How was this patch tested?

CI in this PR should test it out.

Closes #38239 from HyukjinKwon/SPARK-40789.

Lead-authored-by: Hyukjin Kwon <gurwls223@apache.org>
Co-authored-by: Hyukjin Kwon <gurwls223@gmail.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent c4ef9ae commit 4f2630e

22 files changed (+86 / -38 lines)
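Because only the package layout of the tests changes, picking up this commit locally just means switching to the new dotted module names. Below is a minimal sketch, not part of the commit, of loading one relocated module with the standard `unittest` loader; it assumes a working PySpark development environment is already importable.

```python
import unittest

# Old name: pyspark.sql.tests.test_connect_basic
# New name after this PR: pyspark.sql.tests.connect.test_connect_basic
suite = unittest.defaultTestLoader.loadTestsFromName(
    "pyspark.sql.tests.connect.test_connect_basic"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```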

.github/labeler.yml

Lines changed: 5 additions & 5 deletions
@@ -84,12 +84,12 @@ SPARK SHELL:
   - "repl/**/*"
   - "bin/spark-shell*"
 SQL:
-  #- any: ["**/sql/**/*", "!python/pyspark/sql/avro/**/*", "!python/pyspark/sql/streaming/**/*", "!python/pyspark/sql/tests/test_streaming.py"]
+  #- any: ["**/sql/**/*", "!python/pyspark/sql/avro/**/*", "!python/pyspark/sql/streaming/**/*", "!python/pyspark/sql/tests/streaming/test_streaming.py"]
   - "**/sql/**/*"
   - "common/unsafe/**/*"
   #- "!python/pyspark/sql/avro/**/*"
   #- "!python/pyspark/sql/streaming/**/*"
-  #- "!python/pyspark/sql/tests/test_streaming.py"
+  #- "!python/pyspark/sql/tests/streaming/test_streaming.py"
   - "bin/spark-sql*"
   - "bin/beeline*"
   - "sbin/*thriftserver*.sh"
@@ -125,7 +125,7 @@ STRUCTURED STREAMING:
   - "**/sql/**/streaming/**/*"
   - "connector/kafka-0-10-sql/**/*"
   - "python/pyspark/sql/streaming/**/*"
-  - "python/pyspark/sql/tests/test_streaming.py"
+  - "python/pyspark/sql/tests/streaming/test_streaming.py"
   - "**/*streaming.R"
 PYTHON:
   - "bin/pyspark*"
@@ -156,5 +156,5 @@ CONNECT:
   - "**/sql/sparkconnect/**/*"
   - "python/pyspark/sql/**/connect/**/*"
 PROTOBUF:
-  - "connector/protobuf/**/*"
-  - "python/pyspark/sql/protobuf/**/*"
+  - "connector/protobuf/**/*"
+  - "python/pyspark/sql/protobuf/**/*"

dev/sparktestsupport/modules.py

Lines changed: 16 additions & 16 deletions
@@ -461,22 +461,22 @@ def __hash__(self):
         "pyspark.sql.tests.test_datasources",
         "pyspark.sql.tests.test_functions",
         "pyspark.sql.tests.test_group",
-        "pyspark.sql.tests.test_pandas_cogrouped_map",
-        "pyspark.sql.tests.test_pandas_grouped_map",
-        "pyspark.sql.tests.test_pandas_grouped_map_with_state",
-        "pyspark.sql.tests.test_pandas_map",
+        "pyspark.sql.tests.pandas.test_pandas_cogrouped_map",
+        "pyspark.sql.tests.pandas.test_pandas_grouped_map",
+        "pyspark.sql.tests.pandas.test_pandas_grouped_map_with_state",
+        "pyspark.sql.tests.pandas.test_pandas_map",
         "pyspark.sql.tests.test_arrow_map",
-        "pyspark.sql.tests.test_pandas_udf",
-        "pyspark.sql.tests.test_pandas_udf_grouped_agg",
-        "pyspark.sql.tests.test_pandas_udf_scalar",
-        "pyspark.sql.tests.test_pandas_udf_typehints",
-        "pyspark.sql.tests.test_pandas_udf_typehints_with_future_annotations",
-        "pyspark.sql.tests.test_pandas_udf_window",
+        "pyspark.sql.tests.pandas.test_pandas_udf",
+        "pyspark.sql.tests.pandas.test_pandas_udf_grouped_agg",
+        "pyspark.sql.tests.pandas.test_pandas_udf_scalar",
+        "pyspark.sql.tests.pandas.test_pandas_udf_typehints",
+        "pyspark.sql.tests.pandas.test_pandas_udf_typehints_with_future_annotations",
+        "pyspark.sql.tests.pandas.test_pandas_udf_window",
         "pyspark.sql.tests.test_readwriter",
         "pyspark.sql.tests.test_serde",
         "pyspark.sql.tests.test_session",
-        "pyspark.sql.tests.test_streaming",
-        "pyspark.sql.tests.test_streaming_listener",
+        "pyspark.sql.tests.streaming.test_streaming",
+        "pyspark.sql.tests.streaming.test_streaming_listener",
         "pyspark.sql.tests.test_types",
         "pyspark.sql.tests.test_udf",
         "pyspark.sql.tests.test_udf_profiler",
@@ -492,10 +492,10 @@ def __hash__(self):
         # doctests
         # No doctests yet.
         # unittests
-        "pyspark.sql.tests.test_connect_column_expressions",
-        "pyspark.sql.tests.test_connect_plan_only",
-        "pyspark.sql.tests.test_connect_select_ops",
-        "pyspark.sql.tests.test_connect_basic",
+        "pyspark.sql.tests.connect.test_connect_column_expressions",
+        "pyspark.sql.tests.connect.test_connect_plan_only",
+        "pyspark.sql.tests.connect.test_connect_select_ops",
+        "pyspark.sql.tests.connect.test_connect_basic",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and

python/pyspark/sql/connect/README.md

Lines changed: 1 addition & 1 deletion
@@ -46,6 +46,6 @@ To use the release version of Spark Connect:
 ## Run Tests
 
 ```bash
-./python/run-tests --testnames 'pyspark.sql.tests.test_connect_basic'
+./python/run-tests --testnames 'pyspark.sql.tests.connect.test_connect_basic'
 ```
 
python/pyspark/sql/tests/connect/__init__.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
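The new file is a license-header-only `__init__.py`; its only job is to make the new directory a regular Python package so the relocated modules resolve under their dotted paths. A small, hypothetical check, not part of the commit, assuming a PySpark development environment:

```python
import importlib

# With the new package __init__.py in place, the moved test module can be
# imported by its new dotted name.
mod = importlib.import_module("pyspark.sql.tests.connect.test_connect_basic")
print(mod.__file__)  # .../pyspark/sql/tests/connect/test_connect_basic.py
```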

python/pyspark/sql/tests/test_connect_basic.py renamed to python/pyspark/sql/tests/connect/test_connect_basic.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ def test_simple_explain_string(self):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_connect_basic import *  # noqa: F401
+    from pyspark.sql.tests.connect.test_connect_basic import *  # noqa: F401
 
     try:
         import xmlrunner  # type: ignore
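Every renamed test module is touched in the same single place: its `__main__` block re-imports the module by its own (now new) dotted path before handing off to `unittest`, as in the hunk above and the renames that follow. Below is a condensed sketch of that boilerplate; the tail after `import xmlrunner` is truncated in these diffs, so the runner setup here assumes the usual PySpark pattern rather than quoting the file.

```python
import unittest

if __name__ == "__main__":
    # The module re-imports itself by its (new) dotted path so its test
    # classes are collected when the file is executed directly.
    from pyspark.sql.tests.connect.test_connect_basic import *  # noqa: F401

    try:
        # xmlrunner, when installed, writes JUnit-style XML reports for CI.
        import xmlrunner  # type: ignore

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # Fall back to the default text runner when xmlrunner is absent.
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)
```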

python/pyspark/sql/tests/test_connect_column_expressions.py renamed to python/pyspark/sql/tests/connect/test_connect_column_expressions.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ def test_column_literals(self):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_connect_column_expressions import *  # noqa: F401
+    from pyspark.sql.tests.connect.test_connect_column_expressions import *  # noqa: F401
 
     try:
         import xmlrunner  # type: ignore

python/pyspark/sql/tests/test_connect_plan_only.py renamed to python/pyspark/sql/tests/connect/test_connect_plan_only.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ def read_table(x):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_connect_plan_only import *  # noqa: F401
+    from pyspark.sql.tests.connect.test_connect_plan_only import *  # noqa: F401
 
     try:
         import xmlrunner  # type: ignore

python/pyspark/sql/tests/test_connect_select_ops.py renamed to python/pyspark/sql/tests/connect/test_connect_select_ops.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ def test_select_with_literal(self):
 
 if __name__ == "__main__":
     import unittest
-    from pyspark.sql.tests.test_connect_select_ops import *  # noqa: F401
+    from pyspark.sql.tests.connect.test_connect_select_ops import *  # noqa: F401
 
     try:
         import xmlrunner  # type: ignore
python/pyspark/sql/tests/pandas/__init__.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#

python/pyspark/sql/tests/test_pandas_cogrouped_map.py renamed to python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py

Lines changed: 1 addition & 1 deletion
@@ -404,7 +404,7 @@ def merge_pandas(lft, rgt):
 
 
 if __name__ == "__main__":
-    from pyspark.sql.tests.test_pandas_cogrouped_map import *  # noqa: F401
+    from pyspark.sql.tests.pandas.test_pandas_cogrouped_map import *  # noqa: F401
 
     try:
         import xmlrunner  # type: ignore[import]
