From 3a584a28352e5f13ca128599f4d331aa5eeaa374 Mon Sep 17 00:00:00 2001
From: Fokko Driesprong
Date: Thu, 25 May 2023 13:45:09 -0500
Subject: [PATCH] Python: Refactor integration tests (#7698)

* Python: Refactor integration tests

This splits out running and building the integration tests, which enables
quick development iterations. I've also added ipython, which gives a more
meaningful error when something goes wrong while provisioning the tests.

* Simplify SQL
---
 python/Makefile                    |  12 ++-
 python/dev/Dockerfile              |   4 +-
 python/dev/entrypoint.sh           |   2 +-
 python/dev/provision.py            | 140 +++++++++++++++++------
 python/mkdocs/docs/contributing.md |  16 ++++
 5 files changed, 112 insertions(+), 62 deletions(-)

diff --git a/python/Makefile b/python/Makefile
index ea0a3e82a8fd..444a3785bcc5 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -34,10 +34,16 @@ test-s3:
 
 test-integration:
 	docker-compose -f dev/docker-compose-integration.yml kill
-	docker-compose -f dev/docker-compose-integration.yml build
+	docker-compose -f dev/docker-compose-integration.yml rm -f
 	docker-compose -f dev/docker-compose-integration.yml up -d
-	sleep 30
-	poetry run pytest tests/ -m integration ${PYTEST_ARGS}
+	sleep 10
+	docker-compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+	poetry run pytest tests/ -v -m integration ${PYTEST_ARGS}
+
+test-integration-rebuild:
+	docker-compose -f dev/docker-compose-integration.yml kill
+	docker-compose -f dev/docker-compose-integration.yml rm -f
+	docker-compose -f dev/docker-compose-integration.yml build --no-cache
 
 test-adlfs:
 	sh ./dev/run-azurite.sh
diff --git a/python/dev/Dockerfile b/python/dev/Dockerfile
index 65d5503b579f..c6bbe543d328 100644
--- a/python/dev/Dockerfile
+++ b/python/dev/Dockerfile
@@ -37,7 +37,7 @@ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/
 WORKDIR ${SPARK_HOME}
 
 ENV SPARK_VERSION=3.3.2
-ENV ICEBERG_VERSION=1.2.0
+ENV ICEBERG_VERSION=1.2.1
 ENV AWS_SDK_VERSION=2.20.18
 
 RUN curl -s https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
@@ -62,6 +62,8 @@ ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
 RUN chmod u+x /opt/spark/sbin/* && \
     chmod u+x /opt/spark/bin/*
 
+RUN pip3 install -q ipython
+
 COPY entrypoint.sh .
 COPY provision.py .
 
diff --git a/python/dev/entrypoint.sh b/python/dev/entrypoint.sh
index d777f8f5a284..574e876c7702 100755
--- a/python/dev/entrypoint.sh
+++ b/python/dev/entrypoint.sh
@@ -22,4 +22,4 @@ start-master.sh -p 7077
 start-worker.sh spark://spark-iceberg:7077
 start-history-server.sh
 
-python3 ./provision.py
+tail -f /dev/null
diff --git a/python/dev/provision.py b/python/dev/provision.py
index 81bd094c5826..73ec34fdc109 100644
--- a/python/dev/provision.py
+++ b/python/dev/provision.py
@@ -14,15 +14,12 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import time
 
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import current_date, date_add, expr
 
 spark = SparkSession.builder.getOrCreate()
 
-print("Create database")
-
 spark.sql(
     """
   CREATE DATABASE IF NOT EXISTS default;
@@ -31,19 +28,7 @@
 
 spark.sql(
     """
-  use default;
-"""
-)
-
-spark.sql(
-    """
-  DROP TABLE IF EXISTS test_null_nan;
-"""
-)
-
-spark.sql(
-    """
-  CREATE TABLE test_null_nan
+  CREATE OR REPLACE TABLE default.test_null_nan
   USING iceberg
   AS SELECT
     1 AS idx,
@@ -59,78 +44,122 @@
 
 spark.sql(
     """
-  DROP TABLE IF EXISTS test_null_nan_rewritten;
+  CREATE OR REPLACE TABLE default.test_null_nan_rewritten
+  USING iceberg
+  AS SELECT * FROM default.test_null_nan
 """
 )
 
 spark.sql(
     """
-  CREATE TABLE test_null_nan_rewritten
-  USING iceberg
-  AS SELECT * FROM test_null_nan
+CREATE OR REPLACE TABLE default.test_limit as
+  SELECT * LATERAL VIEW explode(ARRAY(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) AS idx;
 """
 )
 
 spark.sql(
     """
-  DROP TABLE IF EXISTS test_limit;
+CREATE OR REPLACE TABLE default.test_positional_mor_deletes (
+    dt date,
+    number integer,
+    letter string
+)
+USING iceberg
+TBLPROPERTIES (
+    'write.delete.mode'='merge-on-read',
+    'write.update.mode'='merge-on-read',
+    'write.merge.mode'='merge-on-read',
+    'format-version'='2'
+);
 """
 )
 
+# Partitioning is not really needed, but there is a bug:
+# https://github.com/apache/iceberg/pull/7685
 spark.sql(
     """
-  CREATE TABLE test_limit
-  USING iceberg
-  AS SELECT
-    1 AS idx
-  UNION ALL SELECT
-    2 AS idx
-  UNION ALL SELECT
-    3 AS idx
-  UNION ALL SELECT
-    4 AS idx
-  UNION ALL SELECT
-    5 AS idx
-  UNION ALL SELECT
-    6 AS idx
-  UNION ALL SELECT
-    7 AS idx
-  UNION ALL SELECT
-    8 AS idx
-  UNION ALL SELECT
-    9 AS idx
-  UNION ALL SELECT
-    10 AS idx
+  ALTER TABLE default.test_positional_mor_deletes ADD PARTITION FIELD years(dt) AS dt_years
+"""
+)
+
+spark.sql(
 """
+INSERT INTO default.test_positional_mor_deletes
+VALUES
+    (CAST('2023-03-01' AS date), 1, 'a'),
+    (CAST('2023-03-02' AS date), 2, 'b'),
+    (CAST('2023-03-03' AS date), 3, 'c'),
+    (CAST('2023-03-04' AS date), 4, 'd'),
+    (CAST('2023-03-05' AS date), 5, 'e'),
+    (CAST('2023-03-06' AS date), 6, 'f'),
+    (CAST('2023-03-07' AS date), 7, 'g'),
+    (CAST('2023-03-08' AS date), 8, 'h'),
+    (CAST('2023-03-09' AS date), 9, 'i'),
+    (CAST('2023-03-10' AS date), 10, 'j'),
+    (CAST('2023-03-11' AS date), 11, 'k'),
+    (CAST('2023-03-12' AS date), 12, 'l');
+"""
 )
 
 spark.sql(
     """
-  DROP TABLE IF EXISTS test_deletes;
+DELETE FROM default.test_positional_mor_deletes WHERE number = 9
 """
 )
 
 spark.sql(
     """
-  CREATE TABLE test_deletes
+  CREATE OR REPLACE TABLE default.test_positional_mor_double_deletes (
+    dt date,
+    number integer,
+    letter string
+  )
   USING iceberg
   TBLPROPERTIES (
     'write.delete.mode'='merge-on-read',
     'write.update.mode'='merge-on-read',
-    'write.merge.mode'='merge-on-read'
-  )
-  AS SELECT
-    1 AS idx,
-    True AS deleted
-UNION ALL SELECT
-    2 AS idx,
-    False AS deleted;
+    'write.merge.mode'='merge-on-read',
+    'format-version'='2'
+  );
+"""
+)
+
+# Partitioning is not really needed, but there is a bug:
+# https://github.com/apache/iceberg/pull/7685
+spark.sql(
+    """
+  ALTER TABLE default.test_positional_mor_double_deletes ADD PARTITION FIELD years(dt) AS dt_years
+"""
+)
+
+spark.sql(
+    """
+INSERT INTO default.test_positional_mor_double_deletes
+VALUES
+    (CAST('2023-03-01' AS date), 1, 'a'),
+    (CAST('2023-03-02' AS date), 2, 'b'),
+    (CAST('2023-03-03' AS date), 3, 'c'),
+    (CAST('2023-03-04' AS date), 4, 'd'),
+    (CAST('2023-03-05' AS date), 5, 'e'),
+    (CAST('2023-03-06' AS date), 6, 'f'),
+    (CAST('2023-03-07' AS date), 7, 'g'),
+    (CAST('2023-03-08' AS date), 8, 'h'),
+    (CAST('2023-03-09' AS date), 9, 'i'),
+    (CAST('2023-03-10' AS date), 10, 'j'),
+    (CAST('2023-03-11' AS date), 11, 'k'),
+    (CAST('2023-03-12' AS date), 12, 'l');
+"""
+)
+
+spark.sql(
+    """
+  DELETE FROM default.test_positional_mor_double_deletes WHERE number = 9
 """
 )
 
 spark.sql(
     """
-  DELETE FROM test_deletes WHERE deleted = True;
+  DELETE FROM default.test_positional_mor_double_deletes WHERE letter == 'f'
 """
 )
@@ -156,6 +185,3 @@
 all_types_dataframe.writeTo("default.test_all_types").tableProperty("format-version", "2").partitionedBy(
     "intCol"
 ).createOrReplace()
-
-while True:
-    time.sleep(1)
diff --git a/python/mkdocs/docs/contributing.md b/python/mkdocs/docs/contributing.md
index bf6f12872d7d..989cbbea44f8 100644
--- a/python/mkdocs/docs/contributing.md
+++ b/python/mkdocs/docs/contributing.md
@@ -107,6 +107,22 @@ make test PYTEST_ARGS="--pdb"
 
 To see all available pytest arguments, run `make test PYTEST_ARGS="--help"`.
 
+### Integration tests
+
+PyIceberg has integration tests that run against Apache Spark. Spark will create a new database and provision tables that PyIceberg can then query.
+
+```sh
+make test-integration
+```
+
+This will restart the containers to get to a clean state, and then run the pytest suite. In case something changed in the Dockerfile or the provision script, you can run:
+
+```sh
+make test-integration-rebuild
+```
+
+This will rebuild the containers from scratch.
+
 ## Code standards
 
 Below are the formalized conventions that we adhere to in the PyIceberg project. The goal of this is to have a common agreement on how to evolve the codebase, but also using it as guidelines for newcomers to the project.
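
For reference, once `make test-integration` has provisioned the tables above, they can be read back through PyIceberg in a test or an interactive session. The snippet below is only a sketch: the REST catalog URI, the MinIO endpoint, and the credentials are assumptions about a typical local setup and should be checked against `dev/docker-compose-integration.yml`.

```python
# Sketch only: connect to the locally provisioned REST catalog and read one
# of the tables created by dev/provision.py. The URI, endpoint, and
# credentials below are assumptions; verify them against
# dev/docker-compose-integration.yml before relying on this.
from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "default",
    **{
        "type": "rest",
        "uri": "http://localhost:8181",          # assumed REST catalog port
        "s3.endpoint": "http://localhost:9000",  # assumed MinIO endpoint
        "s3.access-key-id": "admin",             # assumed test credentials
        "s3.secret-access-key": "password",
    },
)

# Load one of the provisioned tables and materialize it with PyArrow.
table = catalog.load_table("default.test_null_nan")
print(table.scan().to_arrow())
```

Individual integration tests can also be narrowed down through `PYTEST_ARGS` (for example `-k` expressions), since the `test-integration` target passes them straight to pytest.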