@@ -259,7 +259,7 @@ def _apply(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequ
        return self

    def _scan(self, row_filter: Union[str, BooleanExpression] = ALWAYS_TRUE) -> DataScan:
-        """Minimal data scan the table with the current state of the transaction."""
+        """Minimal data scan of the table with the current state of the transaction."""
        return DataScan(
            table_metadata=self.table_metadata,
            io=self._table.io,
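For context, `_scan` is an internal helper: operations staged inside a `Transaction` read the table through it, so they observe updates that have not yet been committed. A minimal sketch of working inside a transaction, assuming a configured catalog named "default" and an existing table `db.events` with a single long column `id`:

```python
import pyarrow as pa
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")        # assumed catalog configuration
table = catalog.load_table("db.events")  # assumed existing table

with table.transaction() as txn:
    # Updates staged here are visible to the transaction's internal _scan
    # before anything is committed to the catalog.
    txn.append(pa.table({"id": pa.array([1, 2, 3], type=pa.int64())}))
```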
@@ -681,6 +681,8 @@ def commit_transaction(self) -> Table:


class CreateTableTransaction(Transaction):
+    """A transaction that involves the creation of a new table."""
+
    def _initial_changes(self, table_metadata: TableMetadata) -> None:
        """Set the initial changes that can reconstruct the initial table metadata when creating the CreateTableTransaction."""
        self._updates += (
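A `CreateTableTransaction` stages the creation itself, so the table only becomes visible in the catalog on commit. A sketch of how one is typically obtained, assuming a configured catalog and an illustrative identifier and schema:

```python
import pyarrow as pa
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")  # assumed catalog configuration
schema = pa.schema([("id", pa.int64()), ("name", pa.string())])

# create_table_transaction returns a CreateTableTransaction; the table
# and the appended data appear atomically when the transaction commits.
txn = catalog.create_table_transaction("db.events", schema=schema)
txn.append(pa.table({"id": [1], "name": ["a"]}, schema=schema))
txn.commit_transaction()
```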
@@ -749,17 +751,23 @@ class TableIdentifier(IcebergBaseModel):


class CommitTableRequest(IcebergBaseModel):
+    """A pydantic BaseModel for a table commit request."""
+
    identifier: TableIdentifier = Field()
    requirements: Tuple[TableRequirement, ...] = Field(default_factory=tuple)
    updates: Tuple[TableUpdate, ...] = Field(default_factory=tuple)


class CommitTableResponse(IcebergBaseModel):
+    """A pydantic BaseModel for a table commit response."""
+
    metadata: TableMetadata
    metadata_location: str = Field(alias="metadata-location")


class Table:
+    """An Iceberg table."""
+
    _identifier: Identifier = Field()
    metadata: TableMetadata
    metadata_location: str = Field()
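These two models describe the commit payload exchanged with a REST catalog: the request carries the target identifier plus preconditions (`requirements`) and metadata changes (`updates`), and the response returns the new metadata. A sketch of constructing a request; the import path and field shapes are assumptions based on the definitions above, and the empty tuples are placeholders:

```python
# Assumption: these models are importable from pyiceberg.table.
from pyiceberg.table import CommitTableRequest, TableIdentifier

request = CommitTableRequest(
    identifier=TableIdentifier(namespace=["db"], name="events"),
    requirements=(),  # e.g. an assertion that the table UUID is unchanged
    updates=(),       # e.g. property or snapshot updates
)
print(request.model_dump_json())  # the JSON body sent to the REST catalog
```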
@@ -785,11 +793,19 @@ def transaction(self) -> Transaction:

    @property
    def inspect(self) -> InspectTable:
-        """Return the InspectTable object to browse the table metadata."""
+        """Return the InspectTable object to browse the table metadata.
+
+        Returns:
+            InspectTable object based on this Table.
+        """
        return InspectTable(self)

    def refresh(self) -> Table:
-        """Refresh the current table metadata."""
+        """Refresh the current table metadata.
+
+        Returns:
+            An updated instance of the same Iceberg table.
+        """
        fresh = self.catalog.load_table(self._identifier)
        self.metadata = fresh.metadata
        self.io = fresh.io
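Together these give a metadata-only view and a way to pick up concurrent changes. A short sketch, assuming `table` is a loaded `Table` as in the earlier sketch:

```python
# Browse metadata without touching data files.
print(table.inspect.snapshots())  # snapshot history as a pyarrow.Table

# Re-read metadata from the catalog to pick up commits made elsewhere.
table.refresh()
```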
@@ -798,7 +814,11 @@ def refresh(self) -> Table:

    @property
    def identifier(self) -> Identifier:
-        """Return the identifier of this table."""
+        """Return the identifier of this table.
+
+        Returns:
+            An Identifier tuple of the table name.
+        """
        deprecation_message(
            deprecated_in="0.8.0",
            removed_in="0.9.0",
@@ -807,7 +827,11 @@ def identifier(self) -> Identifier:
        return (self.catalog.name,) + self._identifier

    def name(self) -> Identifier:
-        """Return the identifier of this table."""
+        """Return the identifier of this table.
+
+        Returns:
+            An Identifier tuple of the table name.
+        """
        return self.identifier

    def scan(
def scan (
@@ -819,6 +843,35 @@ def scan(
819
843
options : Properties = EMPTY_DICT ,
820
844
limit : Optional [int ] = None ,
821
845
) -> DataScan :
846
+ """Fetch a DataScan based on the table's current metadata.
847
+
848
+ The data scan can be used to project the table's data
849
+ that matches the provided row_filter onto the table's
850
+ current schema.
851
+
852
+ Args:
853
+ row_filter:
854
+ A string or BooleanExpression that decsribes the
855
+ desired rows
856
+ selected_fileds:
857
+ A tuple of strings representing the column names
858
+ to return in the output dataframe.
859
+ case_sensitive:
860
+ If True column matching is case sensitive
861
+ snapshot_id:
862
+ Optional Snapshot ID to time travel to. If None,
863
+ scans the table as of the current snapshot ID.
864
+ options:
865
+ Additional Table properties as a dictionary of
866
+ string key value pairs to use for this scan.
867
+ limit:
868
+ An integer representing the number of rows to
869
+ return in the scan result. If None, fetches all
870
+ matching rows.
871
+
872
+ Returns:
873
+ A DataScan based on the table's current metadata.
874
+ """
822
875
return DataScan (
823
876
table_metadata = self .metadata ,
824
877
io = self .io ,
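Putting the documented parameters together, a sketch of a typical scan, assuming illustrative columns `city` and `population`:

```python
scan = table.scan(
    row_filter="population > 1000000",       # string or BooleanExpression
    selected_fields=("city", "population"),  # project only these columns
    case_sensitive=True,
    limit=10,                                # stop after 10 matching rows
)
df = scan.to_arrow()  # materialize; see the read methods further below
```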
@@ -1212,6 +1265,8 @@ class ScanTask(ABC):

@dataclass(init=False)
class FileScanTask(ScanTask):
+    """Task representing a data file and its corresponding delete files."""
+
    file: DataFile
    delete_files: Set[DataFile]
    start: int
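Callers normally obtain `FileScanTask`s from `DataScan.plan_files()`; each task pairs one data file with the delete files that must be applied to it:

```python
for task in table.scan(row_filter="population > 1000000").plan_files():
    # task.file is the DataFile; task.delete_files holds applicable deletes.
    print(task.file.file_path, len(task.delete_files))
```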
@@ -1236,6 +1291,11 @@ def _open_manifest(
    partition_filter: Callable[[DataFile], bool],
    metrics_evaluator: Callable[[DataFile], bool],
) -> List[ManifestEntry]:
+    """Open a manifest file and return matching manifest entries.
+
+    Returns:
+        A list of ManifestEntry that matches the provided filters.
+    """
    return [
        manifest_entry
        for manifest_entry in manifest.fetch_manifest_entry(io, discard_deleted=True)
@@ -1395,13 +1455,30 @@ def plan_files(self) -> Iterable[FileScanTask]:
        ]

    def to_arrow(self) -> pa.Table:
+        """Read an Arrow table eagerly from this DataScan.
+
+        All rows will be loaded into memory at once.
+
+        Returns:
+            pa.Table: Materialized Arrow Table from the Iceberg table's DataScan.
+        """
        from pyiceberg.io.pyarrow import ArrowScan

        return ArrowScan(
            self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
        ).to_table(self.plan_files())

    def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
+        """Return an Arrow RecordBatchReader from this DataScan.
+
+        For large results, using a RecordBatchReader requires less memory than
+        loading an Arrow Table for the same DataScan, because a RecordBatch
+        is read one at a time.
+
+        Returns:
+            pa.RecordBatchReader: Arrow RecordBatchReader from the Iceberg table's DataScan
+                which can be used to read a stream of record batches one by one.
+        """
        import pyarrow as pa

        from pyiceberg.io.pyarrow import ArrowScan, schema_to_pyarrow
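The memory trade-off described above in practice: stream batches instead of materializing the whole table, where `process` is a hypothetical per-batch consumer:

```python
def process(batch):  # hypothetical per-batch consumer
    print(batch.num_rows)

reader = table.scan().to_arrow_batch_reader()
for batch in reader:  # one pa.RecordBatch held in memory at a time
    process(batch)
```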
@@ -1417,9 +1494,19 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
        )

    def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
+        """Read a Pandas DataFrame eagerly from this Iceberg table.
+
+        Returns:
+            pd.DataFrame: Materialized Pandas Dataframe from the Iceberg table.
+        """
        return self.to_arrow().to_pandas(**kwargs)

    def to_duckdb(self, table_name: str, connection: Optional[DuckDBPyConnection] = None) -> DuckDBPyConnection:
+        """Shorthand for loading the Iceberg Table in DuckDB.
+
+        Returns:
+            DuckDBPyConnection: In memory DuckDB connection with the Iceberg table.
+        """
        import duckdb

        con = connection or duckdb.connect(database=":memory:")
@@ -1428,13 +1515,20 @@ def to_duckdb(self, table_name: str, connection: Optional[DuckDBPyConnection] =
        return con

    def to_ray(self) -> ray.data.dataset.Dataset:
+        """Read a Ray Dataset eagerly from this Iceberg table.
+
+        Returns:
+            ray.data.dataset.Dataset: Materialized Ray Dataset from the Iceberg table.
+        """
        import ray

        return ray.data.from_arrow(self.to_arrow())


@dataclass(frozen=True)
class WriteTask:
+    """Task with the parameters for writing a DataFile."""
+
    write_uuid: uuid.UUID
    task_id: int
    schema: Schema
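And `to_ray` hands the materialized Arrow table to Ray for distributed processing. A sketch, assuming ray is installed:

```python
ds = table.scan().to_ray()  # materializes via to_arrow() first
print(ds.count())
```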
@@ -1457,6 +1551,8 @@ def generate_data_file_path(self, extension: str) -> str:


@dataclass(frozen=True)
class AddFileTask:
+    """Task with the parameters for adding a Parquet file as a DataFile."""
+
    file_path: str
    partition_field_value: Record