Skip to content

Commit 85de0c6

Browse files
committed
fix spec_id passing
1 parent 4184461 commit 85de0c6

File tree

3 files changed

+17
-13
lines changed

3 files changed

+17
-13
lines changed

pyiceberg/catalog/rest/__init__.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717
from collections import deque
18-
from collections.abc import Iterator
1918
from enum import Enum
2019
from typing import (
2120
TYPE_CHECKING,
@@ -460,18 +459,17 @@ def _fetch_scan_tasks(self, identifier: str | Identifier, plan_task: str) -> Sca
460459

461460
return ScanTasks.model_validate_json(response.text)
462461

463-
def plan_scan(self, identifier: str | Identifier, request: PlanTableScanRequest) -> Iterator["FileScanTask"]:
464-
"""Plan a table scan and yield FileScanTasks.
462+
def plan_scan(self, identifier: str | Identifier, request: PlanTableScanRequest) -> list["FileScanTask"]:
463+
"""Plan a table scan and return FileScanTasks.
465464
466465
Handles the full scan planning lifecycle including pagination.
467-
Each response batch is self-contained, so tasks are yielded as received.
468466
469467
Args:
470468
identifier: Table identifier.
471469
request: The scan plan request parameters.
472470
473-
Yields:
474-
FileScanTask objects ready for execution.
471+
Returns:
472+
List of FileScanTask objects ready for execution.
475473
476474
Raises:
477475
RuntimeError: If planning fails, is cancelled, or returns unexpected response.
@@ -492,19 +490,23 @@ def plan_scan(self, identifier: str | Identifier, request: PlanTableScanRequest)
492490
if not isinstance(response, PlanCompleted):
493491
raise RuntimeError(f"Invalid planStatus for response: {type(response).__name__}")
494492

495-
# Yield tasks from initial response
493+
tasks: list[FileScanTask] = []
494+
495+
# Collect tasks from initial response
496496
for task in response.file_scan_tasks:
497-
yield FileScanTask.from_rest_response(task, response.delete_files)
497+
tasks.append(FileScanTask.from_rest_response(task, response.delete_files))
498498

499-
# Fetch and yield from additional batches
499+
# Fetch and collect from additional batches
500500
pending_tasks = deque(response.plan_tasks)
501501
while pending_tasks:
502502
plan_task = pending_tasks.popleft()
503503
batch = self._fetch_scan_tasks(identifier, plan_task)
504504
for task in batch.file_scan_tasks:
505-
yield FileScanTask.from_rest_response(task, batch.delete_files)
505+
tasks.append(FileScanTask.from_rest_response(task, batch.delete_files))
506506
pending_tasks.extend(batch.plan_tasks)
507507

508+
return tasks
509+
508510
def _create_legacy_oauth2_auth_manager(self, session: Session) -> AuthManager:
509511
"""Create the LegacyOAuth2AuthManager by fetching required properties.
510512

pyiceberg/catalog/rest/scan_planning.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ class PlanTableScanRequest(IcebergBaseModel):
198198
start_snapshot_id: int | None = Field(alias="start-snapshot-id", default=None)
199199
end_snapshot_id: int | None = Field(alias="end-snapshot-id", default=None)
200200
stats_fields: list[str] | None = Field(alias="stats-fields", default=None)
201+
min_rows_requested: int | None = Field(alias="min-rows-requested", default=None)
201202

202203
@model_validator(mode="after")
203204
def _validate_snapshot_fields(self) -> PlanTableScanRequest:

pyiceberg/table/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1847,7 +1847,7 @@ def from_rest_response(
18471847
)
18481848

18491849

1850-
def _rest_file_to_data_file(rest_file: RESTContentFile, *, include_stats: bool) -> DataFile:
1850+
def _rest_file_to_data_file(rest_file: RESTContentFile, include_stats: bool) -> DataFile:
18511851
"""Convert a REST content file to a manifest DataFile."""
18521852
from pyiceberg.catalog.rest.scan_planning import CONTENT_TYPE_MAP
18531853

@@ -1856,7 +1856,7 @@ def _rest_file_to_data_file(rest_file: RESTContentFile, *, include_stats: bool)
18561856
null_value_counts = getattr(rest_file, "null_value_counts", None)
18571857
nan_value_counts = getattr(rest_file, "nan_value_counts", None)
18581858

1859-
return DataFile.from_args(
1859+
data_file = DataFile.from_args(
18601860
content=CONTENT_TYPE_MAP[rest_file.content],
18611861
file_path=rest_file.file_path,
18621862
file_format=rest_file.file_format,
@@ -1869,8 +1869,9 @@ def _rest_file_to_data_file(rest_file: RESTContentFile, *, include_stats: bool)
18691869
nan_value_counts=nan_value_counts.to_dict() if include_stats and nan_value_counts else None,
18701870
split_offsets=rest_file.split_offsets,
18711871
sort_order_id=rest_file.sort_order_id,
1872-
spec_id=rest_file.spec_id,
18731872
)
1873+
data_file.spec_id = rest_file.spec_id
1874+
return data_file
18741875

18751876

18761877
def _open_manifest(

0 commit comments

Comments (0)