From 3a584a28352e5f13ca128599f4d331aa5eeaa374 Mon Sep 17 00:00:00 2001
From: Fokko Driesprong
Date: Thu, 25 May 2023 13:45:09 -0500
Subject: [PATCH] Python: Refactor integration tests (#7698)

* Python: Refactor integration tests

This splits out running and building the integration tests, which enables
quick development iterations. I've also added ipython, which gives a more
meaningful error when something goes wrong while provisioning the tests.

* Simplify SQL
---
 python/Makefile                    |  12 ++-
 python/dev/Dockerfile              |   4 +-
 python/dev/entrypoint.sh           |   2 +-
 python/dev/provision.py            | 140 +++++++++++++++++------
 python/mkdocs/docs/contributing.md |  16 ++++
 5 files changed, 112 insertions(+), 62 deletions(-)

diff --git a/python/Makefile b/python/Makefile
index ea0a3e82a8fd..444a3785bcc5 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -34,10 +34,16 @@ test-s3:
 
 test-integration:
 	docker-compose -f dev/docker-compose-integration.yml kill
-	docker-compose -f dev/docker-compose-integration.yml build
+	docker-compose -f dev/docker-compose-integration.yml rm -f
 	docker-compose -f dev/docker-compose-integration.yml up -d
-	sleep 30
-	poetry run pytest tests/ -m integration ${PYTEST_ARGS}
+	sleep 10
+	docker-compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+	poetry run pytest tests/ -v -m integration ${PYTEST_ARGS}
+
+test-integration-rebuild:
+	docker-compose -f dev/docker-compose-integration.yml kill
+	docker-compose -f dev/docker-compose-integration.yml rm -f
+	docker-compose -f dev/docker-compose-integration.yml build --no-cache
 
 test-adlfs:
 	sh ./dev/run-azurite.sh
diff --git a/python/dev/Dockerfile b/python/dev/Dockerfile
index 65d5503b579f..c6bbe543d328 100644
--- a/python/dev/Dockerfile
+++ b/python/dev/Dockerfile
@@ -37,7 +37,7 @@ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/
 WORKDIR ${SPARK_HOME}
 
 ENV SPARK_VERSION=3.3.2
-ENV ICEBERG_VERSION=1.2.0
+ENV ICEBERG_VERSION=1.2.1
 ENV AWS_SDK_VERSION=2.20.18
 
 RUN curl -s https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
@@ -62,6 +62,8 @@ ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
 RUN chmod u+x /opt/spark/sbin/* && \
     chmod u+x /opt/spark/bin/*
 
+RUN pip3 install -q ipython
+
 COPY entrypoint.sh .
 COPY provision.py .
 
diff --git a/python/dev/entrypoint.sh b/python/dev/entrypoint.sh
index d777f8f5a284..574e876c7702 100755
--- a/python/dev/entrypoint.sh
+++ b/python/dev/entrypoint.sh
@@ -22,4 +22,4 @@ start-master.sh -p 7077
 start-worker.sh spark://spark-iceberg:7077
 start-history-server.sh
 
-python3 ./provision.py
+tail -f /dev/null
diff --git a/python/dev/provision.py b/python/dev/provision.py
index 81bd094c5826..73ec34fdc109 100644
--- a/python/dev/provision.py
+++ b/python/dev/provision.py
@@ -14,15 +14,12 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import time
 
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import current_date, date_add, expr
 
 spark = SparkSession.builder.getOrCreate()
 
-print("Create database")
-
 spark.sql(
     """
   CREATE DATABASE IF NOT EXISTS default;
@@ -31,19 +28,7 @@
 
 spark.sql(
     """
-  use default;
-"""
-)
-
-spark.sql(
-    """
-  DROP TABLE IF EXISTS test_null_nan;
-"""
-)
-
-spark.sql(
-    """
-  CREATE TABLE test_null_nan
+  CREATE OR REPLACE TABLE default.test_null_nan
   USING iceberg
   AS SELECT
     1 AS idx,
@@ -59,78 +44,122 @@
 
 spark.sql(
     """
-  DROP TABLE IF EXISTS test_null_nan_rewritten;
+  CREATE OR REPLACE TABLE default.test_null_nan_rewritten
+  USING iceberg
+  AS SELECT * FROM default.test_null_nan
 """
 )
 
 spark.sql(
     """
-  CREATE TABLE test_null_nan_rewritten
-  USING iceberg
-  AS SELECT * FROM test_null_nan
+CREATE OR REPLACE TABLE default.test_limit as
+  SELECT * LATERAL VIEW explode(ARRAY(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) AS idx;
 """
 )
 
 spark.sql(
     """
-  DROP TABLE IF EXISTS test_limit;
+CREATE OR REPLACE TABLE default.test_positional_mor_deletes (
+    dt date,
+    number integer,
+    letter string
+)
+USING iceberg
+TBLPROPERTIES (
+    'write.delete.mode'='merge-on-read',
+    'write.update.mode'='merge-on-read',
+    'write.merge.mode'='merge-on-read',
+    'format-version'='2'
+);
 """
 )
 
+# Partitioning is not really needed, but there is a bug:
+# https://github.com/apache/iceberg/pull/7685
 spark.sql(
     """
-  CREATE TABLE test_limit
-  USING iceberg
-  AS SELECT
-    1 AS idx
-  UNION ALL SELECT
-    2 AS idx
-  UNION ALL SELECT
-    3 AS idx
-  UNION ALL SELECT
-    4 AS idx
-  UNION ALL SELECT
-    5 AS idx
-  UNION ALL SELECT
-    6 AS idx
-  UNION ALL SELECT
-    7 AS idx
-  UNION ALL SELECT
-    8 AS idx
-  UNION ALL SELECT
-    9 AS idx
-  UNION ALL SELECT
-    10 AS idx
+  ALTER TABLE default.test_positional_mor_deletes ADD PARTITION FIELD years(dt) AS dt_years
+"""
+)
+
+spark.sql(
 """
+INSERT INTO default.test_positional_mor_deletes
+VALUES
+    (CAST('2023-03-01' AS date), 1, 'a'),
+    (CAST('2023-03-02' AS date), 2, 'b'),
+    (CAST('2023-03-03' AS date), 3, 'c'),
+    (CAST('2023-03-04' AS date), 4, 'd'),
+    (CAST('2023-03-05' AS date), 5, 'e'),
+    (CAST('2023-03-06' AS date), 6, 'f'),
+    (CAST('2023-03-07' AS date), 7, 'g'),
+    (CAST('2023-03-08' AS date), 8, 'h'),
+    (CAST('2023-03-09' AS date), 9, 'i'),
+    (CAST('2023-03-10' AS date), 10, 'j'),
+    (CAST('2023-03-11' AS date), 11, 'k'),
+    (CAST('2023-03-12' AS date), 12, 'l');
+"""
 )
 
 spark.sql(
     """
-  DROP TABLE IF EXISTS test_deletes;
+DELETE FROM default.test_positional_mor_deletes WHERE number = 9
 """
 )
 
 spark.sql(
     """
-  CREATE TABLE test_deletes
+  CREATE OR REPLACE TABLE default.test_positional_mor_double_deletes (
+    dt date,
+    number integer,
+    letter string
+  )
   USING iceberg
   TBLPROPERTIES (
     'write.delete.mode'='merge-on-read',
     'write.update.mode'='merge-on-read',
-    'write.merge.mode'='merge-on-read'
-  )
-  AS SELECT
-    1 AS idx,
-    True AS deleted
-UNION ALL SELECT
-    2 AS idx,
-    False AS deleted;
+    'write.merge.mode'='merge-on-read',
+    'format-version'='2'
+  );
+"""
+)
+
+# Partitioning is not really needed, but there is a bug:
+# https://github.com/apache/iceberg/pull/7685
+spark.sql(
+    """
+  ALTER TABLE default.test_positional_mor_double_deletes ADD PARTITION FIELD years(dt) AS dt_years
+"""
+)
+
+spark.sql(
+    """
+INSERT INTO default.test_positional_mor_double_deletes
+VALUES
+    (CAST('2023-03-01' AS date), 1, 'a'),
+    (CAST('2023-03-02' AS date), 2, 'b'),
+    (CAST('2023-03-03' AS date), 3, 'c'),
+    (CAST('2023-03-04' AS date), 4, 'd'),
+    (CAST('2023-03-05' AS date), 5, 'e'),
+    (CAST('2023-03-06' AS date), 6, 'f'),
+    (CAST('2023-03-07' AS date), 7, 'g'),
+    (CAST('2023-03-08' AS date), 8, 'h'),
+    (CAST('2023-03-09' AS date), 9, 'i'),
+    (CAST('2023-03-10' AS date), 10, 'j'),
+    (CAST('2023-03-11' AS date), 11, 'k'),
+    (CAST('2023-03-12' AS date), 12, 'l');
+"""
+)
+
+spark.sql(
+    """
+  DELETE FROM default.test_positional_mor_double_deletes WHERE number = 9
 """
 )
 
 spark.sql(
     """
-  DELETE FROM test_deletes WHERE deleted = True;
+  DELETE FROM default.test_positional_mor_double_deletes WHERE letter == 'f'
 """
 )
@@ -156,6 +185,3 @@
 all_types_dataframe.writeTo("default.test_all_types").tableProperty("format-version", "2").partitionedBy(
     "intCol"
 ).createOrReplace()
-
-while True:
-    time.sleep(1)
diff --git a/python/mkdocs/docs/contributing.md b/python/mkdocs/docs/contributing.md
index bf6f12872d7d..989cbbea44f8 100644
--- a/python/mkdocs/docs/contributing.md
+++ b/python/mkdocs/docs/contributing.md
@@ -107,6 +107,22 @@ make test PYTEST_ARGS="--pdb"
 
 To see all available pytest arguments, run `make test PYTEST_ARGS="--help"`.
 
+### Integration tests
+
+PyIceberg has integration tests that run against Apache Spark. Spark will create a new database and provision tables that PyIceberg can then query.
+
+```sh
+make test-integration
+```
+
+This will restart the containers to get to a clean state, and then run the pytest suite. In case something changed in the Dockerfile or the provision script, you can run:
+
+```sh
+make test-integration-rebuild
+```
+
+This will rebuild the containers from scratch.
+
 ## Code standards
 
 Below are the formalized conventions that we adhere to in the PyIceberg project. The goal of this is to have a common agreement on how to evolve the codebase, but also using it as guidelines for newcomers to the project.
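
For reference, once `make test-integration` has provisioned the tables above, they can be read back through PyIceberg in a test or an interactive session. The snippet below is only a sketch: the REST catalog URI, the MinIO endpoint, and the credentials are assumptions about a typical local setup and should be checked against `dev/docker-compose-integration.yml`.

```python
# Sketch only: connect to the locally provisioned REST catalog and read one
# of the tables created by dev/provision.py. The URI, endpoint, and
# credentials below are assumptions; verify them against
# dev/docker-compose-integration.yml before relying on this.
from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "default",
    **{
        "type": "rest",
        "uri": "http://localhost:8181",          # assumed REST catalog port
        "s3.endpoint": "http://localhost:9000",  # assumed MinIO endpoint
        "s3.access-key-id": "admin",             # assumed test credentials
        "s3.secret-access-key": "password",
    },
)

# Load one of the provisioned tables and materialize it with PyArrow.
table = catalog.load_table("default.test_null_nan")
print(table.scan().to_arrow())
```

Individual integration tests can also be narrowed down through `PYTEST_ARGS` (for example `-k` expressions), since the `test-integration` target passes them straight to pytest.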