Revert "Python: Refactor integration tests (apache#7698)" (apache#7729)
This reverts commit 3a584a2.
Fokko authored May 30, 2023
1 parent 7d3fa50 commit 8858f1c
Showing 5 changed files with 62 additions and 112 deletions.
12 changes: 3 additions & 9 deletions python/Makefile
@@ -34,16 +34,10 @@ test-s3:
 
 test-integration:
 	docker-compose -f dev/docker-compose-integration.yml kill
-	docker-compose -f dev/docker-compose-integration.yml rm -f
-	docker-compose -f dev/docker-compose-integration.yml build
 	docker-compose -f dev/docker-compose-integration.yml up -d
-	sleep 10
-	docker-compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
-	poetry run pytest tests/ -v -m integration ${PYTEST_ARGS}
 
-test-integration-rebuild:
-	docker-compose -f dev/docker-compose-integration.yml kill
-	docker-compose -f dev/docker-compose-integration.yml rm -f
-	docker-compose -f dev/docker-compose-integration.yml build --no-cache
+	sleep 30
+	poetry run pytest tests/ -m integration ${PYTEST_ARGS}
+
 test-adlfs:
 	sh ./dev/run-azurite.sh
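For reference, the restored `test-integration` target selects tests by marker: `pytest -m integration` runs only tests tagged with `@pytest.mark.integration`. A minimal sketch of such a test follows; the marker name comes from the Makefile above, while the catalog name and assertion are illustrative rather than taken from the PyIceberg test suite.

```python
import pytest
from pyiceberg.catalog import load_catalog


@pytest.mark.integration  # selected by `pytest -m integration`
def test_null_nan_table_is_provisioned() -> None:
    catalog = load_catalog("default")  # hypothetical catalog name
    table = catalog.load_table("default.test_null_nan")
    # provision.py creates this table, so a scan should return rows
    assert table.scan().to_arrow().num_rows > 0
```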
4 changes: 1 addition & 3 deletions python/dev/Dockerfile
@@ -37,7 +37,7 @@ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/
 WORKDIR ${SPARK_HOME}
 
 ENV SPARK_VERSION=3.3.2
-ENV ICEBERG_VERSION=1.2.1
+ENV ICEBERG_VERSION=1.2.0
 ENV AWS_SDK_VERSION=2.20.18
 
 RUN curl -s https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
@@ -62,8 +62,6 @@ ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
 RUN chmod u+x /opt/spark/sbin/* && \
     chmod u+x /opt/spark/bin/*
 
-RUN pip3 install -q ipython
-
 COPY entrypoint.sh .
 COPY provision.py .
 
2 changes: 1 addition & 1 deletion python/dev/entrypoint.sh
@@ -22,4 +22,4 @@ start-master.sh -p 7077
 start-worker.sh spark://spark-iceberg:7077
 start-history-server.sh
 
-tail -f /dev/null
+python3 ./provision.py
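With the revert, the entrypoint launches `provision.py` at container startup instead of idling on `tail -f /dev/null` (provisioning was previously triggered separately via `docker-compose exec`). The Makefile's fixed `sleep 30` papers over the resulting race between provisioning and the test run; a more robust alternative, sketched here under the assumption of a configured `default` catalog, would poll until the tables exist:

```python
import time

from pyiceberg.catalog import load_catalog
from pyiceberg.exceptions import NoSuchTableError


def wait_for_provisioning(timeout_s: int = 60) -> None:
    """Block until provision.py has created the integration tables."""
    catalog = load_catalog("default")  # hypothetical catalog name
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            catalog.load_table("default.test_null_nan")
            return  # provisioning finished
        except NoSuchTableError:
            time.sleep(1)
    raise TimeoutError("integration tables were never provisioned")
```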
140 changes: 57 additions & 83 deletions python/dev/provision.py
@@ -14,12 +14,15 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import time
 
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import current_date, date_add, expr
 
 spark = SparkSession.builder.getOrCreate()
 
+print("Create database")
+
 spark.sql(
 """
 CREATE DATABASE IF NOT EXISTS default;
@@ -28,7 +31,19 @@
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_null_nan
+use default;
+"""
+)
+
+spark.sql(
+"""
+DROP TABLE IF EXISTS test_null_nan;
+"""
+)
+
+spark.sql(
+"""
+CREATE TABLE test_null_nan
 USING iceberg
 AS SELECT
 1 AS idx,
@@ -44,122 +59,78 @@
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_null_nan_rewritten
-USING iceberg
-AS SELECT * FROM default.test_null_nan
+DROP TABLE IF EXISTS test_null_nan_rewritten;
 """
 )
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_limit as
-SELECT * LATERAL VIEW explode(ARRAY(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) AS idx;
+CREATE TABLE test_null_nan_rewritten
+USING iceberg
+AS SELECT * FROM test_null_nan
 """
 )
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_positional_mor_deletes (
-dt date,
-number integer,
-letter string
-)
-USING iceberg
-TBLPROPERTIES (
-'write.delete.mode'='merge-on-read',
-'write.update.mode'='merge-on-read',
-'write.merge.mode'='merge-on-read',
-'format-version'='2'
-);
+DROP TABLE IF EXISTS test_limit;
 """
 )
 
-# Partitioning is not really needed, but there is a bug:
-# https://github.com/apache/iceberg/pull/7685
 spark.sql(
 """
-ALTER TABLE default.test_positional_mor_deletes ADD PARTITION FIELD years(dt) AS dt_years
-"""
-)
-
-spark.sql(
+CREATE TABLE test_limit
+USING iceberg
+AS SELECT
+1 AS idx
+UNION ALL SELECT
+2 AS idx
+UNION ALL SELECT
+3 AS idx
+UNION ALL SELECT
+4 AS idx
+UNION ALL SELECT
+5 AS idx
+UNION ALL SELECT
+6 AS idx
+UNION ALL SELECT
+7 AS idx
+UNION ALL SELECT
+8 AS idx
+UNION ALL SELECT
+9 AS idx
+UNION ALL SELECT
+10 AS idx
 """
-INSERT INTO default.test_positional_mor_deletes
-VALUES
-(CAST('2023-03-01' AS date), 1, 'a'),
-(CAST('2023-03-02' AS date), 2, 'b'),
-(CAST('2023-03-03' AS date), 3, 'c'),
-(CAST('2023-03-04' AS date), 4, 'd'),
-(CAST('2023-03-05' AS date), 5, 'e'),
-(CAST('2023-03-06' AS date), 6, 'f'),
-(CAST('2023-03-07' AS date), 7, 'g'),
-(CAST('2023-03-08' AS date), 8, 'h'),
-(CAST('2023-03-09' AS date), 9, 'i'),
-(CAST('2023-03-10' AS date), 10, 'j'),
-(CAST('2023-03-11' AS date), 11, 'k'),
-(CAST('2023-03-12' AS date), 12, 'l');
-"""
 )
 
 spark.sql(
 """
-DELETE FROM default.test_positional_mor_deletes WHERE number = 9
+DROP TABLE IF EXISTS test_deletes;
 """
 )
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_positional_mor_double_deletes (
-dt date,
-number integer,
-letter string
-)
+CREATE TABLE test_deletes
 USING iceberg
 TBLPROPERTIES (
 'write.delete.mode'='merge-on-read',
 'write.update.mode'='merge-on-read',
-'write.merge.mode'='merge-on-read',
-'format-version'='2'
-);
-"""
-)
-
-# Partitioning is not really needed, but there is a bug:
-# https://github.com/apache/iceberg/pull/7685
-spark.sql(
-"""
-ALTER TABLE default.test_positional_mor_double_deletes ADD PARTITION FIELD years(dt) AS dt_years
-"""
-)
-
-spark.sql(
-"""
-INSERT INTO default.test_positional_mor_double_deletes
-VALUES
-(CAST('2023-03-01' AS date), 1, 'a'),
-(CAST('2023-03-02' AS date), 2, 'b'),
-(CAST('2023-03-03' AS date), 3, 'c'),
-(CAST('2023-03-04' AS date), 4, 'd'),
-(CAST('2023-03-05' AS date), 5, 'e'),
-(CAST('2023-03-06' AS date), 6, 'f'),
-(CAST('2023-03-07' AS date), 7, 'g'),
-(CAST('2023-03-08' AS date), 8, 'h'),
-(CAST('2023-03-09' AS date), 9, 'i'),
-(CAST('2023-03-10' AS date), 10, 'j'),
-(CAST('2023-03-11' AS date), 11, 'k'),
-(CAST('2023-03-12' AS date), 12, 'l');
-"""
-)
-
-spark.sql(
-"""
-DELETE FROM default.test_positional_mor_double_deletes WHERE number = 9
+'write.merge.mode'='merge-on-read'
+)
+AS SELECT
+1 AS idx,
+True AS deleted
+UNION ALL SELECT
+2 AS idx,
+False AS deleted;
 """
 )
 
 spark.sql(
 """
-DELETE FROM default.test_positional_mor_double_deletes WHERE letter == 'f'
+DELETE FROM test_deletes WHERE deleted = True;
 """
 )
 
@@ -185,3 +156,6 @@
 all_types_dataframe.writeTo("default.test_all_types").tableProperty("format-version", "2").partitionedBy(
 "intCol"
 ).createOrReplace()
+
+while True:
+    time.sleep(1)
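As a quick sanity check (not part of this commit), the provisioned state can be inspected from a Spark session in the container. `test_deletes` is created with two rows, and the merge-on-read `DELETE` above removes the one flagged `deleted = True`, so a single row with `idx = 2` should remain:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Expect one surviving row: (idx=2, deleted=False).
spark.sql("SELECT * FROM default.test_deletes").show()

# test_null_nan_rewritten is a straight copy of test_null_nan.
spark.sql("SELECT COUNT(*) AS cnt FROM default.test_null_nan_rewritten").show()
```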
16 changes: 0 additions & 16 deletions python/mkdocs/docs/contributing.md
@@ -107,22 +107,6 @@ make test PYTEST_ARGS="--pdb"
 
 To see all available pytest arguments, run `make test PYTEST_ARGS="--help"`.
 
-### Integration tests
-
-PyIceberg has integration tests with Apache Spark. Spark will create a new database and provision some tables that PyIceberg can query against.
-
-```sh
-make test-integration
-```
-
-This will restart the containers, to get to a clean state, and then run the PyTest suite. In case something changed in the Dockerfile or the provision script, you can run:
-
-```sh
-make test-integration-rebuild
-```
-
-To rebuild the containers from scratch.
-
 ## Code standards
 
 Below are the formalized conventions that we adhere to in the PyIceberg project. The goal of this is to have a common agreement on how to evolve the codebase, but also using it as guidelines for newcomers to the project.
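The removed section documented the workflow that the reverted Makefile still provides: `make test-integration` brings up the containers, Spark provisions the tables, and the marked tests query them through PyIceberg. A minimal sketch of that final step, assuming a catalog configured for the test environment (the catalog name is illustrative):

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")  # assumed catalog configuration
table = catalog.load_table("default.test_limit")
print(table.scan().to_arrow())  # the ten rows created by provision.py
```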
