Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
8bf46c5
handle transient timeout
johnhoran Feb 11, 2026
b7e8703
add test
johnhoran Feb 11, 2026
54e63d3
remove whitespace
johnhoran Feb 11, 2026
1e44d76
format
johnhoran Feb 11, 2026
9467bcd
cleanup test
johnhoran Feb 11, 2026
898fcad
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 11, 2026
9d51eb6
format message
johnhoran Feb 11, 2026
f1d9fb3
revert change
johnhoran Feb 11, 2026
5d99032
move change into triggerer
johnhoran Feb 11, 2026
1579f3d
include terminal states
johnhoran Feb 12, 2026
0d700e6
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 12, 2026
225a78d
update comment
johnhoran Feb 12, 2026
fadd814
add return
johnhoran Feb 12, 2026
a73d9bf
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 13, 2026
9d227e5
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 13, 2026
b8a55d1
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 13, 2026
46f396e
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 16, 2026
f944cec
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 16, 2026
a3040d1
handle transient errors
johnhoran Feb 17, 2026
d14ae3b
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 17, 2026
82445c2
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 17, 2026
e936591
update message
johnhoran Feb 18, 2026
17b2dee
error arrays
johnhoran Feb 18, 2026
4c64a93
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 18, 2026
d95a89c
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 18, 2026
99b56dd
account limit
johnhoran Feb 18, 2026
723ce57
fix test
johnhoran Feb 18, 2026
aca214c
ruff format
johnhoran Feb 18, 2026
57726a3
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 18, 2026
104527d
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 18, 2026
5461eba
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 18, 2026
fa62b31
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 18, 2026
7148a02
Merge branch 'main' into 61775_kpo-timeout-retry
johnhoran Feb 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -946,6 +946,7 @@ def trigger_reentry(self, context: Context, event: dict[str, Any]) -> Any:
"""
self.pod = None
xcom_sidecar_output = None
skip_cleanup = False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this flag can be avoided, and it is better to avoid it to reduce the mental load when reading this code

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure how, the cleanup method is in the finally block, so it will be called even if the task defers again, and I don't want the pod to be deleted. How are you suggesting to remove it?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we skip cleanup only when the task is either deferred or when the status is a timeout, I think we can just check that again instead of adding a flag, it will make the code easier to follow in my opinion, what do you think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would complicate the code. Right now the timeout code will check to see if the pod is running, but also if it's running and then terminated. In the terminated case it will call the invoke_defer_method, which will skip calling the defer code, and just call trigger_reentry again with a completed status. So the pod will get cleaned up etc. Then we hand back to the initial call, and I guess I'd need to make a k8s API call to check the status again in order to avoid killing the pod if it has already been killed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see how it would complicate things, to me it looks like if we just check that the event is a timeout, we do not clean up, otherwise we do

try:
pod_name = event["name"]
pod_namespace = event["namespace"]
Expand All @@ -958,6 +959,14 @@ def trigger_reentry(self, context: Context, event: dict[str, Any]) -> Any:
follow = self.logging_interval is None
last_log_time = event.get("last_log_time")

if event["status"] == "timeout":
pod_phase = self.pod.status.phase if self.pod.status and self.pod.status.phase else None
if pod_phase in {PodPhase.RUNNING, *PodPhase.terminal_states}:
self.log.info("Pod has transitioned from pending state after timeout, deferring again")
self.invoke_defer_method(last_log_time=last_log_time, context=context)
skip_cleanup = True
return

if event["status"] in ("error", "failed", "timeout", "success"):
if self.get_logs:
self._write_logs(self.pod, follow=follow, since_time=last_log_time)
Expand Down Expand Up @@ -988,12 +997,14 @@ def trigger_reentry(self, context: Context, event: dict[str, Any]) -> Any:
message = event.get("stack_trace", event["message"])
raise AirflowException(message)
except TaskDeferred:
skip_cleanup = True
raise
finally:
self._clean(event=event, context=context, result=xcom_sidecar_output)
if not skip_cleanup:
self._clean(event=event, context=context, result=xcom_sidecar_output)

if self.do_xcom_push and xcom_sidecar_output:
context["ti"].xcom_push(XCOM_RETURN_KEY, xcom_sidecar_output)
if self.do_xcom_push and xcom_sidecar_output:
context["ti"].xcom_push(XCOM_RETURN_KEY, xcom_sidecar_output)

def _clean(self, event: dict[str, Any], result: dict | None, context: Context) -> None:
if self.pod is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from airflow.providers.cncf.kubernetes.utils.pod_manager import (
AsyncPodManager,
OnFinishAction,
PodLaunchFailedException,
PodLaunchTimeoutException,
PodPhase,
)
Expand Down Expand Up @@ -183,7 +184,7 @@ async def run(self) -> AsyncIterator[TriggerEvent]:
event = await self._wait_for_container_completion()
yield event
return
except PodLaunchTimeoutException as e:
except (PodLaunchTimeoutException, PodLaunchFailedException) as e:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I understand that in case of rate limits you do not want to fail fast... but would it not be better to handle this case on triggerer and not sending back? Then also you do not need to defer back again.

Copy link
Contributor Author

@johnhoran johnhoran Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about doing that but I don't think it fully fixes the problem. There is always going to be a period of time when you are handing back from the triggerer to the operator and for any of the transient reasons the pod failed, like failing to pull the image or failure to spin up a node in time, the possibility exists that the pod will have started by the time the operator takes over.

The other reason I felt it was better not to add extra logic looking for rate limiting was because

def detect_pod_terminate_early_issues(pod: V1Pod) -> str | None:
"""
Identify issues that justify terminating the pod early.
:param pod: The pod object to check.
:return: An error message if an issue is detected; otherwise, None.
"""
pod_status = pod.status
if pod_status.container_statuses:
for container_status in pod_status.container_statuses:
container_state: V1ContainerState = container_status.state
container_waiting: V1ContainerStateWaiting | None = container_state.waiting
if container_waiting:
if container_waiting.reason in ["ErrImagePull", "ImagePullBackOff", "InvalidImageName"]:
return (
f"Pod docker image cannot be pulled, unable to start: {container_waiting.reason}"
f"\n{container_waiting.message}"
)
return None

is looking for both ErrImagePull and ImagePullBackOff. Depending on how long it takes the triggerer to take over, I might only see one of those states, and it might not give as verbose a reason for it being in ImagePullBackOff. If you feel strongly it's worth doing, maybe I could look at pulling Kubernetes events... or disregarding ImagePullBackOff.

Anyway I marked the PR back to draft because I think I also need to account for the pod starting and terminating while waiting for the operator.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've been trying to think about why I didn't see this issue on the same DAG running in non-deferred mode, but I suspect it's because it's configured to log an init container, and that delays the call to detect_pod_terminate_early_issues until after the init container has completed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there should be some kind of a hard limit for the tries when pulling the image, as what if the image was deleted? we do not want to keep retrying forever, and as of now, it looks like if we have a corrupt image, we will retry indefinitely.
That is, at least, what I see that will happen now.

Copy link
Contributor Author

@johnhoran johnhoran Feb 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, that shouldn't happen. In that scenario what would happen is the triggerer would exit — because of detect_pod_terminate_early_issues it would happen the first time it saw the image pull failure, and before the startup_timeout expires — with a timeout state. The timeout state basically then accounts for the gap in time between the triggerer exiting and the operator starting back up, and does a final check to see if the pod is in a running or terminal state. In this scenario it wouldn't be, so the task fails.

I think there is a case for renaming the timeout state. Basically the triggerer can return one of error, fatal, timeout, and success. Timeout is essentially for situations where the pod didn't start up in time, but if it has started by the time it gets to the operator, I think it's better to let it run rather than fail the task and retry. So if I could think of a pithy name for "fatal unless recovered" then I'd rename it to that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the triggerer should be the source of truth here: the user defined a startup timeout, and when it’s reached, Airflow should behave accordingly. Normally there shouldn’t be a large gap between the triggerer finishing and the worker starting the operator—except in overload situations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Anecdotally, I am using the Kubernetes executor in AWS EKS and I regularly see delays of 2–3 minutes for a node to start up to take over the execution.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The startup_timeout itself is a fairly loose amount of time: it doesn't account for the time taken for the triggerer to adopt the task, and it doesn't account for triggerer restarts. I think it's just for situations where you want an early notification if the pod doesn't start, and not have to wait for a task timeout (or indefinitely if one isn't set). 2–3 minutes in the Kubernetes executor could be enough time for some tasks to complete. I don't understand why killing a running pod would be preferable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand your idea behind this but, from my point of view, the triggerer runs quite stable, which means the current startup timeout is valid. If a user sees that the pod needs more time to start due to flaky infrastructure, they can increase startup_timeout. I don’t think it makes sense to keep the pod running just because it might recover later. If you want that behavior, I suggest creating a new PR that adds an option to disable startup_timeout.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, well clearly I disagree, but I don't want that disagreement to block us from addressing the issue. So I've split that part of the code into a separate PR #62215 and I'll just close this one for now.

message = self._format_exception_description(e)
yield TriggerEvent(
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,19 +188,47 @@ def detect_pod_terminate_early_issues(pod: V1Pod) -> str | None:
"""
Identify issues that justify terminating the pod early.

This method distinguishes between permanent failures (e.g., invalid image names)
and transient errors (e.g., rate limits) that should be retried by Kubernetes.

:param pod: The pod object to check.
:return: An error message if an issue is detected; otherwise, None.
"""
# Indicators in error messages that suggest transient issues
TRANSIENT_ERROR_PATTERNS = [
"pull qps exceeded",
"rate limit",
"too many requests",
"quota exceeded",
"temporarily unavailable",
"timeout",
"account limit",
]

FATAL_STATES = ["InvalidImageName", "ErrImageNeverPull"]
TRANSIENT_STATES = ["ErrImagePull", "ImagePullBackOff"]

pod_status = pod.status
if pod_status.container_statuses:
for container_status in pod_status.container_statuses:
container_state: V1ContainerState = container_status.state
container_waiting: V1ContainerStateWaiting | None = container_state.waiting
if container_waiting:
if container_waiting.reason in ["ErrImagePull", "ImagePullBackOff", "InvalidImageName"]:
if not container_waiting:
continue

if container_waiting.reason in FATAL_STATES:
return (
f"Image cannot be pulled, unable to start: {container_waiting.reason}"
f"\n{container_waiting.message or ''}"
)

if container_waiting.reason in TRANSIENT_STATES:
message_lower = (container_waiting.message or "").lower()
is_transient = any(pattern in message_lower for pattern in TRANSIENT_ERROR_PATTERNS)
if not is_transient:
return (
f"Pod docker image cannot be pulled, unable to start: {container_waiting.reason}"
f"\n{container_waiting.message}"
f"Image cannot be pulled, unable to start: {container_waiting.reason}"
f"\n{container_waiting.message or ''}"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not reuse message_lower here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the original capitalization is better.

)
return None

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2928,6 +2928,40 @@ def test_skip_deferral_on_terminated_pod(
k.execute(context)
mocked_trigger_reentry.assert_called_once()

@patch(HOOK_CLASS)
@patch(KUB_OP_PATH.format("pod_manager"))
@patch(f"{KPO_MODULE}.KubernetesPodOperator._clean")
def test_should_defer_on_running_after_timeout(self, mock_clean, mock_manager, mocked_hook, mocker):
k = KubernetesPodOperator(task_id="task", deferrable=True)
running_state = mock.MagicMock(
**{
"metadata.name": TEST_NAME,
"metadata.namespace": TEST_NAMESPACE,
"status.phase": "Running",
"status.container_statuses": [
k8s.V1ContainerStatus(
name=k.base_container_name,
state=k8s.V1ContainerState(running=k8s.V1ContainerStateRunning()),
image="alpine",
image_id="",
ready=True,
restart_count=0,
)
],
},
)
mocked_hook.return_value.get_pod.return_value = running_state
ti_mock = MagicMock()

event = {"status": "timeout", "message": "timeout", "name": TEST_NAME, "namespace": TEST_NAMESPACE}

mock_file = mock_open(read_data='{"a": "b"}')
mocker.patch("builtins.open", mock_file)

with pytest.raises(TaskDeferred):
k.trigger_reentry({"ti": ti_mock}, event)
mock_clean.assert_not_called()


@pytest.mark.parametrize("do_xcom_push", [True, False])
@patch(KUB_OP_PATH.format("extract_xcom"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,9 @@ async def test_start_pod_raises_fast_error_on_image_error(self, fail_reason):
pod_response.status.container_statuses = [container_statuse]

self.mock_kube_client.read_namespaced_pod.return_value = pod_response
expected_msg = f"Pod docker image cannot be pulled, unable to start: {waiting_state.reason}\n{waiting_state.message}"
expected_msg = (
f"Image cannot be pulled, unable to start: {waiting_state.reason}\n{waiting_state.message}"
)
mock_pod = MagicMock()
with pytest.raises(AirflowException, match=expected_msg):
await self.pod_manager.await_pod_start(
Expand Down Expand Up @@ -1262,7 +1264,9 @@ async def test_start_pod_raises_fast_error_on_image_error(self, fail_reason):
container_status.state.waiting = waiting_state
pod_response.status.container_statuses = [container_status]
self.mock_async_hook.get_pod.return_value = pod_response
expected_msg = f"Pod docker image cannot be pulled, unable to start: {waiting_state.reason}\n{waiting_state.message}"
expected_msg = (
f"Image cannot be pulled, unable to start: {waiting_state.reason}\n{waiting_state.message}"
)
mock_pod = mock.MagicMock()
with pytest.raises(AirflowException, match=expected_msg):
await self.async_pod_manager.await_pod_start(
Expand Down