
Commit 1d9570d

docstrings (apache#1189)
1 parent b8b2f66 commit 1d9570d

File tree

1 file changed: +101 -5 lines

1 file changed

+101
-5
lines changed

pyiceberg/table/__init__.py (+101 -5)
@@ -259,7 +259,7 @@ def _apply(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequ
         return self

     def _scan(self, row_filter: Union[str, BooleanExpression] = ALWAYS_TRUE) -> DataScan:
-        """Minimal data scan the table with the current state of the transaction."""
+        """Minimal data scan of the table with the current state of the transaction."""
         return DataScan(
             table_metadata=self.table_metadata,
             io=self._table.io,
@@ -681,6 +681,8 @@ def commit_transaction(self) -> Table:


 class CreateTableTransaction(Transaction):
+    """A transaction that involves the creation of a new table."""
+
     def _initial_changes(self, table_metadata: TableMetadata) -> None:
         """Set the initial changes that can reconstruct the initial table metadata when creating the CreateTableTransaction."""
         self._updates += (
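
For context, a CreateTableTransaction is normally obtained from a catalog rather than constructed directly. A minimal sketch, assuming a configured catalog named "default" and a hypothetical db.events identifier:

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

catalog = load_catalog("default")  # assumes a catalog named "default" is configured
schema = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="data", field_type=StringType(), required=False),
)

# create_table_transaction stages the table creation; exiting the
# context manager commits the accumulated updates in one operation.
with catalog.create_table_transaction("db.events", schema) as txn:
    txn.set_properties(owner="analytics")  # hypothetical property
```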
@@ -749,17 +751,23 @@ class TableIdentifier(IcebergBaseModel):


 class CommitTableRequest(IcebergBaseModel):
+    """A pydantic BaseModel for a table commit request."""
+
     identifier: TableIdentifier = Field()
     requirements: Tuple[TableRequirement, ...] = Field(default_factory=tuple)
     updates: Tuple[TableUpdate, ...] = Field(default_factory=tuple)


 class CommitTableResponse(IcebergBaseModel):
+    """A pydantic BaseModel for a table commit response."""
+
     metadata: TableMetadata
     metadata_location: str = Field(alias="metadata-location")


 class Table:
+    """An Iceberg table."""
+
     _identifier: Identifier = Field()
     metadata: TableMetadata
     metadata_location: str = Field()
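
As a usage sketch, a Table is normally obtained from a catalog; this assumes a configured catalog named "default" and an existing, hypothetical db.events table:

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")        # assumed catalog name
table = catalog.load_table("db.events")  # hypothetical identifier
print(table.metadata_location)           # path of the current metadata JSON file
```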
@@ -785,11 +793,19 @@ def transaction(self) -> Transaction:

     @property
     def inspect(self) -> InspectTable:
-        """Return the InspectTable object to browse the table metadata."""
+        """Return the InspectTable object to browse the table metadata.
+
+        Returns:
+            InspectTable object based on this Table.
+        """
         return InspectTable(self)

     def refresh(self) -> Table:
-        """Refresh the current table metadata."""
+        """Refresh the current table metadata.
+
+        Returns:
+            An updated instance of the same Iceberg table.
+        """
         fresh = self.catalog.load_table(self._identifier)
         self.metadata = fresh.metadata
         self.io = fresh.io
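
A short sketch of the two members documented above, assuming `table` was loaded as in the earlier example; `inspect.snapshots()` returns the snapshots metadata table as a pyarrow.Table:

```python
table.refresh()  # reload metadata so concurrent commits become visible

snapshots = table.inspect.snapshots()  # pyarrow.Table of the table's snapshots
print(snapshots.num_rows)
```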
@@ -798,7 +814,11 @@ def refresh(self) -> Table:

     @property
     def identifier(self) -> Identifier:
-        """Return the identifier of this table."""
+        """Return the identifier of this table.
+
+        Returns:
+            An Identifier tuple of the table name.
+        """
         deprecation_message(
             deprecated_in="0.8.0",
             removed_in="0.9.0",
@@ -807,7 +827,11 @@ def identifier(self) -> Identifier:
         return (self.catalog.name,) + self._identifier

     def name(self) -> Identifier:
-        """Return the identifier of this table."""
+        """Return the identifier of this table.
+
+        Returns:
+            An Identifier tuple of the table name.
+        """
         return self.identifier

     def scan(
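
Both members return the same tuple; a sketch assuming the table was loaded from a catalog named "default" as db.events:

```python
assert table.name() == ("default", "db", "events")
# table.identifier returns the same tuple but emits a deprecation warning.
```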
@@ -819,6 +843,35 @@ def scan(
         options: Properties = EMPTY_DICT,
         limit: Optional[int] = None,
     ) -> DataScan:
+        """Fetch a DataScan based on the table's current metadata.
+
+        The data scan can be used to project the table's data
+        that matches the provided row_filter onto the table's
+        current schema.
+
+        Args:
+            row_filter:
+                A string or BooleanExpression that describes the
+                desired rows.
+            selected_fields:
+                A tuple of strings representing the column names
+                to return in the output dataframe.
+            case_sensitive:
+                If True, column matching is case sensitive.
+            snapshot_id:
+                Optional snapshot ID to time travel to. If None,
+                scans the table as of the current snapshot ID.
+            options:
+                Additional Table properties as a dictionary of
+                string key-value pairs to use for this scan.
+            limit:
+                An integer representing the number of rows to
+                return in the scan result. If None, fetches all
+                matching rows.
+
+        Returns:
+            A DataScan based on the table's current metadata.
+        """
         return DataScan(
             table_metadata=self.metadata,
             io=self.io,
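
A hedged sketch of the parameters documented above, assuming a table with id and data columns:

```python
from pyiceberg.expressions import GreaterThanOrEqual

scan = table.scan(
    row_filter=GreaterThanOrEqual("id", 100),  # equivalently the string "id >= 100"
    selected_fields=("id", "data"),            # project only these columns
    limit=1_000,                               # stop after 1,000 matching rows
)
arrow_table = scan.to_arrow()
```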
@@ -1212,6 +1265,8 @@ class ScanTask(ABC):

 @dataclass(init=False)
 class FileScanTask(ScanTask):
+    """Task representing a data file and its corresponding delete files."""
+
     file: DataFile
     delete_files: Set[DataFile]
     start: int
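
FileScanTask instances are what DataScan.plan_files() yields; a small sketch that lists the files a scan would touch, using the field names from the dataclass above:

```python
for task in table.scan().plan_files():
    # Each task pairs one data file with the delete files that apply to it.
    print(task.file.file_path, len(task.delete_files))
```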
@@ -1236,6 +1291,11 @@ def _open_manifest(
     partition_filter: Callable[[DataFile], bool],
     metrics_evaluator: Callable[[DataFile], bool],
 ) -> List[ManifestEntry]:
+    """Open a manifest file and return matching manifest entries.
+
+    Returns:
+        A list of ManifestEntry objects that match the provided filters.
+    """
     return [
         manifest_entry
         for manifest_entry in manifest.fetch_manifest_entry(io, discard_deleted=True)
@@ -1395,13 +1455,30 @@ def plan_files(self) -> Iterable[FileScanTask]:
         ]

     def to_arrow(self) -> pa.Table:
+        """Read an Arrow table eagerly from this DataScan.
+
+        All rows will be loaded into memory at once.
+
+        Returns:
+            pa.Table: Materialized Arrow Table from the Iceberg table's DataScan.
+        """
         from pyiceberg.io.pyarrow import ArrowScan

         return ArrowScan(
             self.table_metadata, self.io, self.projection(), self.row_filter, self.case_sensitive, self.limit
         ).to_table(self.plan_files())

     def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
+        """Return an Arrow RecordBatchReader from this DataScan.
+
+        For large results, using a RecordBatchReader requires less memory than
+        loading an Arrow Table for the same DataScan, because a RecordBatch
+        is read one at a time.
+
+        Returns:
+            pa.RecordBatchReader: Arrow RecordBatchReader from the Iceberg table's DataScan
+                which can be used to read a stream of record batches one by one.
+        """
         import pyarrow as pa

         from pyiceberg.io.pyarrow import ArrowScan, schema_to_pyarrow
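
The trade-off between the two methods, as a sketch; `process` is a hypothetical per-batch handler:

```python
# Eager: the entire (limited) result set is materialized in memory.
arrow_table = table.scan(limit=10_000).to_arrow()

# Streaming: batches are produced one at a time, bounding peak memory.
for batch in table.scan().to_arrow_batch_reader():
    process(batch)  # hypothetical handler for each pyarrow.RecordBatch
```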
@@ -1417,9 +1494,19 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
         )

     def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
+        """Read a Pandas DataFrame eagerly from this Iceberg table.
+
+        Returns:
+            pd.DataFrame: Materialized Pandas DataFrame from the Iceberg table.
+        """
         return self.to_arrow().to_pandas(**kwargs)

     def to_duckdb(self, table_name: str, connection: Optional[DuckDBPyConnection] = None) -> DuckDBPyConnection:
+        """Shorthand for loading the Iceberg Table in DuckDB.
+
+        Returns:
+            DuckDBPyConnection: In-memory DuckDB connection with the Iceberg table.
+        """
         import duckdb

         con = connection or duckdb.connect(database=":memory:")
@@ -1428,13 +1515,20 @@ def to_duckdb(self, table_name: str, connection: Optional[DuckDBPyConnection] =
         return con

     def to_ray(self) -> ray.data.dataset.Dataset:
+        """Read a Ray Dataset eagerly from this Iceberg table.
+
+        Returns:
+            ray.data.dataset.Dataset: Materialized Ray Dataset from the Iceberg table.
+        """
         import ray

         return ray.data.from_arrow(self.to_arrow())


 @dataclass(frozen=True)
 class WriteTask:
+    """Task with the parameters for writing a DataFile."""
+
     write_uuid: uuid.UUID
     task_id: int
     schema: Schema
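
And a one-line sketch for the Ray integration (assumes ray is installed):

```python
ds = table.scan().to_ray()  # materializes via Arrow, then wraps as a Ray Dataset
print(ds.count())
```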
@@ -1457,6 +1551,8 @@ def generate_data_file_path(self, extension: str) -> str:

 @dataclass(frozen=True)
 class AddFileTask:
+    """Task with the parameters for adding a Parquet file as a DataFile."""
+
     file_path: str
     partition_field_value: Record
