18
18
import math
19
19
import os
20
20
import time
21
- from datetime import date , datetime
21
+ from datetime import date , datetime , timedelta
22
22
from pathlib import Path
23
23
from typing import Any , Dict
24
24
from urllib .parse import urlparse
25
25
26
26
import numpy as np
27
27
import pandas as pd
28
28
import pyarrow as pa
29
+ import pyarrow .compute as pc
29
30
import pyarrow .parquet as pq
30
31
import pytest
31
32
import pytz
39
40
from pyiceberg .catalog .rest import RestCatalog
40
41
from pyiceberg .catalog .sql import SqlCatalog
41
42
from pyiceberg .exceptions import NoSuchTableError
42
- from pyiceberg .expressions import GreaterThanOrEqual , In , Not
43
+ from pyiceberg .expressions import And , EqualTo , GreaterThanOrEqual , In , LessThan , Not
43
44
from pyiceberg .io .pyarrow import _dataframe_to_data_files
44
45
from pyiceberg .partitioning import PartitionField , PartitionSpec
45
46
from pyiceberg .schema import Schema
46
47
from pyiceberg .table import TableProperties
47
- from pyiceberg .transforms import DayTransform , IdentityTransform
48
+ from pyiceberg .transforms import DayTransform , HourTransform , IdentityTransform
48
49
from pyiceberg .types import (
49
50
DateType ,
50
51
DoubleType ,
@@ -1344,18 +1345,7 @@ def test_overwrite_all_data_with_filter(session_catalog: Catalog) -> None:
1344
1345
1345
1346
1346
1347
@pytest .mark .integration
1347
- def test_delete_threshold () -> None :
1348
- catalog = load_catalog (
1349
- "local" ,
1350
- ** {
1351
- "type" : "rest" ,
1352
- "uri" : "http://localhost:8181" ,
1353
- "s3.endpoint" : "http://localhost:9000" ,
1354
- "s3.access-key-id" : "admin" ,
1355
- "s3.secret-access-key" : "password" ,
1356
- },
1357
- )
1358
-
1348
+ def test_delete_threshold (session_catalog : Catalog ) -> None :
1359
1349
schema = Schema (
1360
1350
NestedField (field_id = 101 , name = "id" , field_type = LongType (), required = True ),
1361
1351
NestedField (field_id = 103 , name = "created_at" , field_type = DateType (), required = False ),
@@ -1365,13 +1355,13 @@ def test_delete_threshold() -> None:
1365
1355
partition_spec = PartitionSpec (PartitionField (source_id = 103 , field_id = 2000 , transform = DayTransform (), name = "created_at_day" ))
1366
1356
1367
1357
try :
1368
- catalog .drop_table (
1358
+ session_catalog .drop_table (
1369
1359
identifier = "default.scores" ,
1370
1360
)
1371
1361
except NoSuchTableError :
1372
1362
pass
1373
1363
1374
- catalog .create_table (
1364
+ session_catalog .create_table (
1375
1365
identifier = "default.scores" ,
1376
1366
schema = schema ,
1377
1367
partition_spec = partition_spec ,
@@ -1395,7 +1385,7 @@ def test_delete_threshold() -> None:
1395
1385
# Create the dataframe
1396
1386
df = pd .DataFrame ({"id" : id_column , "created_at" : created_at_column , "relevancy_score" : relevancy_score_column })
1397
1387
1398
- iceberg_table = catalog .load_table ("default.scores" )
1388
+ iceberg_table = session_catalog .load_table ("default.scores" )
1399
1389
1400
1390
# Convert the pandas DataFrame to a PyArrow Table with the Iceberg schema
1401
1391
arrow_schema = iceberg_table .schema ().as_arrow ()
@@ -1409,3 +1399,52 @@ def test_delete_threshold() -> None:
1409
1399
assert len (iceberg_table .scan (row_filter = Not (delete_condition )).to_arrow ()) == lower_before
1410
1400
iceberg_table .delete (delete_condition )
1411
1401
assert len (iceberg_table .scan ().to_arrow ()) == lower_before
1402
+
1403
+
1404
@pytest.mark.integration
def test_rewrite_manifest_after_partition_evolution(session_catalog: Catalog) -> None:
    """Evolve the partition spec twice (hour, then category identity) and then
    overwrite a single hour/category slice, exercising manifest rewriting after
    partition evolution."""
    np.random.seed(876)
    row_count = 1440

    # One row per minute of 2023-01-01, each with a random category and value.
    # NOTE: the np.random call order (choice x row_count, then normal) matches
    # the seeded sequence, so the generated data is deterministic.
    minute_stamps = pa.array([datetime(2023, 1, 1, 0, 0, 0) + timedelta(minutes=offset) for offset in range(row_count)])
    categories = pa.array([np.random.choice(["A", "B", "C"]) for _ in range(row_count)])
    values = pa.array(np.random.normal(size=row_count))
    data = pa.Table.from_pydict({"timestamp": minute_stamps, "category": categories, "value": values})

    # Start from a clean slate; the table may linger from a previous run.
    try:
        session_catalog.drop_table(
            identifier="default.test_error_table",
        )
    except NoSuchTableError:
        pass

    table = session_catalog.create_table(
        "default.test_error_table",
        schema=data.schema,
    )

    # First evolution: partition by the hour of the timestamp, then load data.
    with table.update_spec() as spec_update:
        spec_update.add_field("timestamp", transform=HourTransform())

    table.append(data)

    # Second evolution: additionally partition by category identity.
    with table.update_spec() as spec_update:
        spec_update.add_field("category", transform=IdentityTransform())

    # Select exactly the category-A rows within the first hour.
    replacement = data.filter(
        (pc.field("category") == "A")
        & (pc.field("timestamp") >= datetime(2023, 1, 1, 0))
        & (pc.field("timestamp") < datetime(2023, 1, 1, 1))
    )

    # Overwrite only that slice; the filter mirrors the pyarrow predicate above.
    table.overwrite(
        df=replacement,
        overwrite_filter=And(
            And(
                GreaterThanOrEqual("timestamp", datetime(2023, 1, 1, 0).isoformat()),
                LessThan("timestamp", datetime(2023, 1, 1, 1).isoformat()),
            ),
            EqualTo("category", "A"),
        ),
    )
0 commit comments