145145 from pyiceberg_core .datafusion import IcebergDataFusionTable
146146
147147 from pyiceberg .catalog import Catalog
148+ from pyiceberg .catalog .rest .scan_planning import (
149+ RESTContentFile ,
150+ RESTDeleteFile ,
151+ RESTFileScanTask ,
152+ )
148153
149154ALWAYS_TRUE = AlwaysTrue ()
150155DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE = "downcast-ns-timestamp-to-us-on-write"
@@ -1168,6 +1173,8 @@ def scan(
11681173 snapshot_id = snapshot_id ,
11691174 options = options ,
11701175 limit = limit ,
1176+ catalog = self .catalog ,
1177+ table_identifier = self ._identifier ,
11711178 )
11721179
11731180 @property
@@ -1684,6 +1691,8 @@ class TableScan(ABC):
16841691 snapshot_id : int | None
16851692 options : Properties
16861693 limit : int | None
1694+ catalog : Catalog | None
1695+ table_identifier : Identifier | None
16871696
16881697 def __init__ (
16891698 self ,
@@ -1695,6 +1704,8 @@ def __init__(
16951704 snapshot_id : int | None = None ,
16961705 options : Properties = EMPTY_DICT ,
16971706 limit : int | None = None ,
1707+ catalog : Catalog | None = None ,
1708+ table_identifier : Identifier | None = None ,
16981709 ):
16991710 self .table_metadata = table_metadata
17001711 self .io = io
@@ -1704,6 +1715,8 @@ def __init__(
17041715 self .snapshot_id = snapshot_id
17051716 self .options = options
17061717 self .limit = limit
1718+ self .catalog = catalog
1719+ self .table_identifier = table_identifier
17071720
17081721 def snapshot (self ) -> Snapshot | None :
17091722 if self .snapshot_id :
@@ -1798,6 +1811,74 @@ def __init__(
17981811 self .delete_files = delete_files or set ()
17991812 self .residual = residual
18001813
@staticmethod
def from_rest_response(
    rest_task: RESTFileScanTask,
    delete_files: list[RESTDeleteFile],
) -> FileScanTask:
    """Convert a RESTFileScanTask to a FileScanTask.

    Args:
        rest_task: The REST file scan task.
        delete_files: The list of delete files from the ScanTasks response.
            ``rest_task.delete_file_references`` holds positions in this list.

    Returns:
        A FileScanTask with the converted data and delete files. The residual
        falls back to ALWAYS_TRUE when the task carries no residual filter.

    Raises:
        IndexError: If a delete file reference does not point at an entry
            of ``delete_files``.
        NotImplementedError: If equality delete files are encountered.
    """
    from pyiceberg.catalog.rest.scan_planning import RESTEqualityDeleteFile

    data_file = _rest_file_to_data_file(rest_task.data_file)

    delete_file_count = len(delete_files)
    resolved_deletes: set[DataFile] = set()
    for idx in rest_task.delete_file_references or ():
        # Plain list indexing would let a negative reference wrap around and
        # silently attach an unrelated delete file, so reject it explicitly.
        if not 0 <= idx < delete_file_count:
            raise IndexError(
                f"Delete file reference {idx} is out of range for {delete_file_count} delete file(s) "
                f"while planning {rest_task.data_file.file_path}"
            )
        delete_file = delete_files[idx]
        if isinstance(delete_file, RESTEqualityDeleteFile):
            raise NotImplementedError(f"PyIceberg does not yet support equality deletes: {delete_file.file_path}")
        resolved_deletes.add(_rest_file_to_data_file(delete_file))

    return FileScanTask(
        data_file=data_file,
        delete_files=resolved_deletes,
        residual=rest_task.residual_filter or ALWAYS_TRUE,
    )
1848+
1849+
def _rest_file_to_data_file(rest_file: RESTContentFile) -> DataFile:
    """Build a manifest DataFile from a REST content file.

    Column-level metrics (sizes and value/null/NaN counts) are only read
    from RESTDataFile instances; every other content file gets ``None`` for
    all four. The spec id is copied onto the resulting DataFile after
    construction.
    """
    from pyiceberg.catalog.rest.scan_planning import CONTENT_TYPE_MAP, RESTDataFile

    metric_names = ("column_sizes", "value_counts", "null_value_counts", "nan_value_counts")
    metrics = dict.fromkeys(metric_names)
    if isinstance(rest_file, RESTDataFile):
        for metric_name in metric_names:
            rest_metric = getattr(rest_file, metric_name)
            # Missing or empty metrics stay None, as in the per-field checks.
            if rest_metric:
                metrics[metric_name] = rest_metric.to_dict()

    converted = DataFile.from_args(
        content=CONTENT_TYPE_MAP[rest_file.content],
        file_path=rest_file.file_path,
        file_format=rest_file.file_format,
        partition=Record(*rest_file.partition) if rest_file.partition else Record(),
        record_count=rest_file.record_count,
        file_size_in_bytes=rest_file.file_size_in_bytes,
        **metrics,
        split_offsets=rest_file.split_offsets,
        sort_order_id=rest_file.sort_order_id,
    )
    converted.spec_id = rest_file.spec_id
    return converted
1881+
18011882
18021883def _open_manifest (
18031884 io : FileIO ,
@@ -1970,12 +2051,35 @@ def scan_plan_helper(self) -> Iterator[list[ManifestEntry]]:
19702051 ],
19712052 )
19722053
1973- def plan_files (self ) -> Iterable [FileScanTask ]:
1974- """Plans the relevant files by filtering on the PartitionSpecs.
def _should_use_rest_planning(self) -> bool:
    """Tell whether this scan should be planned by the REST catalog.

    Returns False unless the scan's catalog is a RestCatalog; for a
    RestCatalog the answer is whatever ``is_rest_scan_planning_enabled()``
    reports.
    """
    from pyiceberg.catalog.rest import RestCatalog

    catalog = self.catalog
    # Short-circuits so the capability check only runs on a RestCatalog.
    return isinstance(catalog, RestCatalog) and catalog.is_rest_scan_planning_enabled()
2061+
def _plan_files_rest(self) -> Iterable[FileScanTask]:
    """Delegate file planning to the REST catalog.

    Builds a PlanTableScanRequest from this scan's snapshot id, projection,
    row filter and case sensitivity, then hands it to the catalog's
    ``plan_scan`` together with the table identifier.

    Returns:
        The result of ``catalog.plan_scan`` for this table and request.

    Raises:
        TypeError: If the scan's catalog is not a RestCatalog.
        ValueError: If the scan has no table identifier.
    """
    from pyiceberg.catalog.rest import RestCatalog
    from pyiceberg.catalog.rest.scan_planning import PlanTableScanRequest

    catalog = self.catalog
    if not isinstance(catalog, RestCatalog):
        raise TypeError("REST scan planning requires a RestCatalog")

    identifier = self.table_identifier
    if identifier is None:
        raise ValueError("REST scan planning requires a table identifier")

    # A select-all projection and an always-true filter are sent as None.
    projection = None
    if self.selected_fields != ("*",):
        projection = list(self.selected_fields)

    predicate = None
    if self.row_filter != ALWAYS_TRUE:
        predicate = self.row_filter

    plan_request = PlanTableScanRequest(
        snapshot_id=self.snapshot_id,
        select=projection,
        filter=predicate,
        case_sensitive=self.case_sensitive,
    )
    return catalog.plan_scan(identifier, plan_request)
2080+
2081+ def _plan_files_local (self ) -> Iterable [FileScanTask ]:
2082+ """Plan files locally by reading manifests."""
19792083 data_entries : list [ManifestEntry ] = []
19802084 positional_delete_entries = SortedList (key = lambda entry : entry .sequence_number or INITIAL_SEQUENCE_NUMBER )
19812085
@@ -2006,6 +2110,20 @@ def plan_files(self) -> Iterable[FileScanTask]:
20062110 for data_entry in data_entries
20072111 ]
20082112
def plan_files(self) -> Iterable[FileScanTask]:
    """Plan the files this scan has to read.

    Server-side planning is used when ``_should_use_rest_planning()`` says
    so (a REST catalog with scan planning enabled); in every other case the
    scan is planned locally.

    Returns:
        List of FileScanTasks that contain both data and delete files.
    """
    # Only the selected planner attribute is looked up, then invoked.
    planner = self._plan_files_rest if self._should_use_rest_planning() else self._plan_files_local
    return planner()
2126+
20092127 def to_arrow (self ) -> pa .Table :
20102128 """Read an Arrow table eagerly from this DataScan.
20112129
0 commit comments