Revert "Python: Refactor integration tests (apache#7698)" (apache#7729)
This reverts commit 3a584a2.
Fokko authored May 30, 2023
1 parent 7d3fa50 commit 8858f1c
Showing 5 changed files with 62 additions and 112 deletions.
12 changes: 3 additions & 9 deletions python/Makefile
@@ -34,16 +34,10 @@ test-s3:
 
 test-integration:
 	docker-compose -f dev/docker-compose-integration.yml kill
-	docker-compose -f dev/docker-compose-integration.yml rm -f
-	docker-compose -f dev/docker-compose-integration.yml build
 	docker-compose -f dev/docker-compose-integration.yml up -d
-	sleep 10
-	docker-compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
-	poetry run pytest tests/ -v -m integration ${PYTEST_ARGS}
 
-test-integration-rebuild:
-	docker-compose -f dev/docker-compose-integration.yml kill
-	docker-compose -f dev/docker-compose-integration.yml rm -f
-	docker-compose -f dev/docker-compose-integration.yml build --no-cache
+	sleep 30
+	poetry run pytest tests/ -m integration ${PYTEST_ARGS}
+
 test-adlfs:
 	sh ./dev/run-azurite.sh
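For reference, the restored `test-integration` target selects tests by marker: `pytest -m integration` runs only tests tagged with `@pytest.mark.integration`. A minimal sketch of such a test follows; the marker name comes from the Makefile above, while the catalog name and assertion are illustrative rather than taken from the PyIceberg test suite.

```python
import pytest
from pyiceberg.catalog import load_catalog


@pytest.mark.integration  # selected by `pytest -m integration`
def test_null_nan_table_is_provisioned() -> None:
    catalog = load_catalog("default")  # hypothetical catalog name
    table = catalog.load_table("default.test_null_nan")
    # provision.py creates this table, so a scan should return rows
    assert table.scan().to_arrow().num_rows > 0
```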
4 changes: 1 addition & 3 deletions python/dev/Dockerfile
@@ -37,7 +37,7 @@ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/
 WORKDIR ${SPARK_HOME}
 
 ENV SPARK_VERSION=3.3.2
-ENV ICEBERG_VERSION=1.2.1
+ENV ICEBERG_VERSION=1.2.0
 ENV AWS_SDK_VERSION=2.20.18
 
 RUN curl -s https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
@@ -62,8 +62,6 @@ ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
 RUN chmod u+x /opt/spark/sbin/* && \
     chmod u+x /opt/spark/bin/*
 
-RUN pip3 install -q ipython
-
 COPY entrypoint.sh .
 COPY provision.py .
 
2 changes: 1 addition & 1 deletion python/dev/entrypoint.sh
@@ -22,4 +22,4 @@ start-master.sh -p 7077
 start-worker.sh spark://spark-iceberg:7077
 start-history-server.sh
 
-tail -f /dev/null
+python3 ./provision.py
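With the revert, the entrypoint launches `provision.py` at container startup instead of idling on `tail -f /dev/null` (provisioning was previously triggered separately via `docker-compose exec`). The Makefile's fixed `sleep 30` papers over the resulting race between provisioning and the test run; a more robust alternative, sketched here under the assumption of a configured `default` catalog, would poll until the tables exist:

```python
import time

from pyiceberg.catalog import load_catalog
from pyiceberg.exceptions import NoSuchTableError


def wait_for_provisioning(timeout_s: int = 60) -> None:
    """Block until provision.py has created the integration tables."""
    catalog = load_catalog("default")  # hypothetical catalog name
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            catalog.load_table("default.test_null_nan")
            return  # provisioning finished
        except NoSuchTableError:
            time.sleep(1)
    raise TimeoutError("integration tables were never provisioned")
```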
140 changes: 57 additions & 83 deletions python/dev/provision.py
@@ -14,12 +14,15 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import time
 
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import current_date, date_add, expr
 
 spark = SparkSession.builder.getOrCreate()
 
+print("Create database")
+
 spark.sql(
 """
 CREATE DATABASE IF NOT EXISTS default;
@@ -28,7 +31,19 @@
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_null_nan
+use default;
+"""
+)
+
+spark.sql(
+"""
+DROP TABLE IF EXISTS test_null_nan;
+"""
+)
+
+spark.sql(
+"""
+CREATE TABLE test_null_nan
 USING iceberg
 AS SELECT
 1 AS idx,
@@ -44,122 +59,78 @@
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_null_nan_rewritten
-USING iceberg
-AS SELECT * FROM default.test_null_nan
+DROP TABLE IF EXISTS test_null_nan_rewritten;
 """
 )
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_limit as
-SELECT * LATERAL VIEW explode(ARRAY(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) AS idx;
+CREATE TABLE test_null_nan_rewritten
+USING iceberg
+AS SELECT * FROM test_null_nan
 """
 )
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_positional_mor_deletes (
-dt date,
-number integer,
-letter string
-)
-USING iceberg
-TBLPROPERTIES (
-'write.delete.mode'='merge-on-read',
-'write.update.mode'='merge-on-read',
-'write.merge.mode'='merge-on-read',
-'format-version'='2'
-);
+DROP TABLE IF EXISTS test_limit;
 """
 )
 
-# Partitioning is not really needed, but there is a bug:
-# https://github.com/apache/iceberg/pull/7685
 spark.sql(
 """
-ALTER TABLE default.test_positional_mor_deletes ADD PARTITION FIELD years(dt) AS dt_years
-"""
-)
-
-spark.sql(
+CREATE TABLE test_limit
+USING iceberg
+AS SELECT
+1 AS idx
+UNION ALL SELECT
+2 AS idx
+UNION ALL SELECT
+3 AS idx
+UNION ALL SELECT
+4 AS idx
+UNION ALL SELECT
+5 AS idx
+UNION ALL SELECT
+6 AS idx
+UNION ALL SELECT
+7 AS idx
+UNION ALL SELECT
+8 AS idx
+UNION ALL SELECT
+9 AS idx
+UNION ALL SELECT
+10 AS idx
 """
-INSERT INTO default.test_positional_mor_deletes
-VALUES
-(CAST('2023-03-01' AS date), 1, 'a'),
-(CAST('2023-03-02' AS date), 2, 'b'),
-(CAST('2023-03-03' AS date), 3, 'c'),
-(CAST('2023-03-04' AS date), 4, 'd'),
-(CAST('2023-03-05' AS date), 5, 'e'),
-(CAST('2023-03-06' AS date), 6, 'f'),
-(CAST('2023-03-07' AS date), 7, 'g'),
-(CAST('2023-03-08' AS date), 8, 'h'),
-(CAST('2023-03-09' AS date), 9, 'i'),
-(CAST('2023-03-10' AS date), 10, 'j'),
-(CAST('2023-03-11' AS date), 11, 'k'),
-(CAST('2023-03-12' AS date), 12, 'l');
-"""
 )
 
 spark.sql(
 """
-DELETE FROM default.test_positional_mor_deletes WHERE number = 9
+DROP TABLE IF EXISTS test_deletes;
 """
 )
 
 spark.sql(
 """
-CREATE OR REPLACE TABLE default.test_positional_mor_double_deletes (
-dt date,
-number integer,
-letter string
-)
+CREATE TABLE test_deletes
 USING iceberg
 TBLPROPERTIES (
 'write.delete.mode'='merge-on-read',
 'write.update.mode'='merge-on-read',
-'write.merge.mode'='merge-on-read',
-'format-version'='2'
-);
-"""
-)
-
-# Partitioning is not really needed, but there is a bug:
-# https://github.com/apache/iceberg/pull/7685
-spark.sql(
-"""
-ALTER TABLE default.test_positional_mor_double_deletes ADD PARTITION FIELD years(dt) AS dt_years
-"""
-)
-
-spark.sql(
-"""
-INSERT INTO default.test_positional_mor_double_deletes
-VALUES
-(CAST('2023-03-01' AS date), 1, 'a'),
-(CAST('2023-03-02' AS date), 2, 'b'),
-(CAST('2023-03-03' AS date), 3, 'c'),
-(CAST('2023-03-04' AS date), 4, 'd'),
-(CAST('2023-03-05' AS date), 5, 'e'),
-(CAST('2023-03-06' AS date), 6, 'f'),
-(CAST('2023-03-07' AS date), 7, 'g'),
-(CAST('2023-03-08' AS date), 8, 'h'),
-(CAST('2023-03-09' AS date), 9, 'i'),
-(CAST('2023-03-10' AS date), 10, 'j'),
-(CAST('2023-03-11' AS date), 11, 'k'),
-(CAST('2023-03-12' AS date), 12, 'l');
-"""
-)
-
-spark.sql(
-"""
-DELETE FROM default.test_positional_mor_double_deletes WHERE number = 9
+'write.merge.mode'='merge-on-read'
+)
+AS SELECT
+1 AS idx,
+True AS deleted
+UNION ALL SELECT
+2 AS idx,
+False AS deleted;
 """
 )
 
 spark.sql(
 """
-DELETE FROM default.test_positional_mor_double_deletes WHERE letter == 'f'
+DELETE FROM test_deletes WHERE deleted = True;
 """
 )
 
@@ -185,3 +156,6 @@
 all_types_dataframe.writeTo("default.test_all_types").tableProperty("format-version", "2").partitionedBy(
 "intCol"
 ).createOrReplace()
+
+while True:
+    time.sleep(1)
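As a quick sanity check (not part of this commit), the provisioned state can be inspected from a Spark session in the container. `test_deletes` is created with two rows, and the merge-on-read `DELETE` above removes the one flagged `deleted = True`, so a single row with `idx = 2` should remain:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Expect one surviving row: (idx=2, deleted=False).
spark.sql("SELECT * FROM default.test_deletes").show()

# test_null_nan_rewritten is a straight copy of test_null_nan.
spark.sql("SELECT COUNT(*) AS cnt FROM default.test_null_nan_rewritten").show()
```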
16 changes: 0 additions & 16 deletions python/mkdocs/docs/contributing.md
@@ -107,22 +107,6 @@ make test PYTEST_ARGS="--pdb"
 
 To see all available pytest arguments, run `make test PYTEST_ARGS="--help"`.
 
-### Integration tests
-
-PyIceberg has integration tests with Apache Spark. Spark will create a new database and provision some tables that PyIceberg can query against.
-
-```sh
-make test-integration
-```
-
-This will restart the containers, to get to a clean state, and then run the PyTest suite. In case something changed in the Dockerfile or the provision script, you can run:
-
-```sh
-make test-integration-rebuild
-```
-
-To rebuild the containers from scratch.
-
 ## Code standards
 
 Below are the formalized conventions that we adhere to in the PyIceberg project. The goal of this is to have a common agreement on how to evolve the codebase, but also using it as guidelines for newcomers to the project.
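The removed section documented the workflow that the reverted Makefile still provides: `make test-integration` brings up the containers, Spark provisions the tables, and the marked tests query them through PyIceberg. A minimal sketch of that final step, assuming a catalog configured for the test environment (the catalog name is illustrative):

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")  # assumed catalog configuration
table = catalog.load_table("default.test_limit")
print(table.scan().to_arrow())  # the ten rows created by provision.py
```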
