Use a join for upsert deduplication (#1685)
This changes the deduplication logic to use a join to deduplicate the rows. While the original design wasn't wrong, it is more efficient to push the work down into PyArrow, which gives better multi-threading and avoids the GIL. I did a small benchmark:

```python
import time

import pyarrow as pa

from pyiceberg.catalog import Catalog
from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType, IntegerType


def _drop_table(catalog: Catalog, identifier: str) -> None:
    try:
        catalog.drop_table(identifier)
    except NoSuchTableError:
        pass


def test_vo(session_catalog: Catalog):
    catalog = session_catalog
    identifier = "default.test_upsert_benchmark"
    _drop_table(catalog, identifier)

    schema = Schema(
        NestedField(1, "idx", IntegerType(), required=True),
        NestedField(2, "number", IntegerType(), required=True),
        # Mark idx as the identifier field, also known as the primary-key
        identifier_field_ids=[1],
    )

    tbl = catalog.create_table(identifier, schema=schema)

    arrow_schema = pa.schema(
        [
            pa.field("idx", pa.int32(), nullable=False),
            pa.field("number", pa.int32(), nullable=False),
        ]
    )

    # Write some data
    df = pa.Table.from_pylist(
        [{"idx": idx, "number": idx} for idx in range(1, 100000)],
        schema=arrow_schema,
    )
    tbl.append(df)

    df_upsert = pa.Table.from_pylist(
        # Overlap
        [{"idx": idx, "number": idx} for idx in range(80000, 90000)]
        # Update
        + [{"idx": idx, "number": idx + 1} for idx in range(90000, 100000)]
        # Insert
        + [{"idx": idx, "number": idx} for idx in range(100000, 110000)],
        schema=arrow_schema,
    )

    start = time.time()
    tbl.upsert(df_upsert)
    stop = time.time()
    print(f"Took {stop - start} seconds")
```

And the result was:

```
Took 2.0412521362304688 seconds on the fd-join branch
Took 3.5236432552337646 seconds on latest main
```
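For context, the idea of join-based deduplication can be expressed directly against PyArrow tables. The sketch below is not the code from this commit; it only illustrates letting PyArrow's multi-threaded join kernel drop incoming rows whose keys already exist in the target, using a hypothetical helper (`drop_existing_rows`) and the column names from the benchmark above.

```python
import pyarrow as pa


def drop_existing_rows(incoming: pa.Table, existing: pa.Table, key_columns: list[str]) -> pa.Table:
    # Hypothetical helper, not part of PyIceberg's API: a "left anti" join keeps
    # only the incoming rows whose key is absent from the existing table, so the
    # filtering happens inside PyArrow's parallel join implementation rather than
    # in Python-level loops.
    return incoming.join(existing.select(key_columns), keys=key_columns, join_type="left anti")


# Usage with the benchmark's shape: keys 1..99999 already exist,
# so only the 10,000 rows with keys 100000..109999 survive.
existing = pa.table({"idx": pa.array(range(1, 100000), pa.int32()),
                     "number": pa.array(range(1, 100000), pa.int32())})
incoming = pa.table({"idx": pa.array(range(80000, 110000), pa.int32()),
                     "number": pa.array(range(80000, 110000), pa.int32())})
print(drop_existing_rows(incoming, existing, ["idx"]).num_rows)  # 10000
```

The real upsert path also has to compare value columns to decide which matched rows actually changed, but the anti-join above captures the core design choice: keep the row comparison inside Arrow's compute layer instead of Python.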