comet-ml · JetoPistola · May 20, 2026 · May 20, 2026 · baz-reviewer · May 20, 2026
@@ -389,9 +389,19 @@ def _on_inner_step(
     # BEFORE ``apply_fn`` so the per-action ``ok`` record can't carry
     # post-apply counters; a summary record after the cascade is the
     # cleanest additive surface.
+    # Summary status flips to "failed" when any skip counter is non-zero so
+    # a programmatic consumer doing ``jq '.actions[] | select(.status ==
+    # "failed")'`` picks it up. Per-(experiment, reason) ``skip`` records
+    # emitted by the cascade itself carry ``status="skipped"`` and the
+    # affected source ids; this summary aggregates the totals (OPIK-6599).
+    has_skips = (
+        result.experiments_skipped
+        + result.items_skipped_missing_trace
+        + result.items_skipped_missing_item
+    ) > 0
     audit.record(
         type="cascade_experiments_summary",
-        status="ok",
+        status="failed" if has_skips else "ok",
         details={
             "source_dataset_id": action.source_dataset_id,
             "to_dataset": action.dest_name,

@@ -136,6 +136,12 @@ def finish(self, label: str = "done") -> None:
 
 _EXPERIMENT_PAGE_SIZE = 100
 
+# Cap the per-record ``sample_source_ids`` list so a pathological
+# all-items-missing case doesn't bloat the audit JSON. The count is
+# always recorded in full; the sample (at most this many source ids per
+# skip record) only gives an operator breadcrumbs to investigate.
+_SKIP_SAMPLE_LIMIT = 20
+
 # Buffer around the experiment's trace start/end times when bulk-fetching
 # spans via ``search_spans(from_time, to_time)``. Late-arriving spans
 # (the streamer is async; a span tied to a trace can land after the trace's
@@ -236,7 +242,6 @@ def cascade_experiments(
           ``{"id", "name", "reason"}`` entries for the audit log.
     """
     result = ExperimentCascadeResult()
-    del audit  # not currently used (umbrella action wraps via execute_plan_loop)
 
     # Default to an empty remap so callers that haven't picked up the
     # new kwarg (older tests, ad-hoc invocations) behave the same as
@@ -278,6 +283,7 @@ def cascade_experiments(
             item_id_remap=item_id_remap,
             optimization_id_remap=optimization_id_remap,
             result=result,
+            audit=audit,
             inner_progress_callback=inner_progress_callback,
         )
 
@@ -298,6 +304,7 @@ def cascade_one_experiment(
     item_id_remap: Dict[str, str],
     optimization_id_remap: Optional[Dict[str, str]] = None,
     result: ExperimentCascadeResult,
+    audit: Optional[AuditLog] = None,
     inner_progress_callback: Optional[InnerProgressCallback] = None,
 ) -> None:
     """Migrate one source experiment: read items -> copy traces + spans ->
@@ -449,16 +456,96 @@ def cascade_one_experiment(
                 "reason": "recreate_experiment returned False",
             }
         )
+        _record_skip(
+            audit,
+            reason="experiment_recreate_returned_false",
+            experiment_id=experiment_id,
+            experiment_name=source_experiment.name,
+            count=1,
+            sample_source_ids=[experiment_id],
+        )
 
     # Tally per-item skips visible after the recreate call. ``recreate_experiment``
     # prints its own skip counts but doesn't return them; we infer the two
     # mapping-miss totals by comparing source items against the remap entries
     # so the cascade-level audit counters stay accurate.
+    #
+    # Per-(experiment, reason) audit records are emitted at the end with
+    # the affected source ids (capped at ``_SKIP_SAMPLE_LIMIT``) so the
+    # CLI can fail loud with a machine-readable breakdown -- see OPIK-6599.
+    # Cap the per-reason sample lists during collection -- ``_record_skip``
+    # would slice them anyway, but trimming early bounds peak memory in
+    # the pathological case (e.g. 10k items all missing the same remap).
+    # ``count`` comes from the always-fully-incremented counters so the
+    # audit record carries the true total even when the sample is capped.
+    missing_trace_count = 0
+    missing_item_count = 0
+    missing_trace_sample: List[str] = []
+    missing_item_sample: List[str] = []
     for item in items:
         if item.trace_id and item.trace_id not in result.trace_id_remap:
             result.items_skipped_missing_trace += 1
+            missing_trace_count += 1
+            if len(missing_trace_sample) < _SKIP_SAMPLE_LIMIT:
+                missing_trace_sample.append(item.trace_id)
         if item.dataset_item_id and item.dataset_item_id not in item_id_remap:
             result.items_skipped_missing_item += 1
+            missing_item_count += 1
+            if len(missing_item_sample) < _SKIP_SAMPLE_LIMIT:
+                missing_item_sample.append(item.dataset_item_id)
+
+    if missing_trace_count:
+        _record_skip(
+            audit,
+            reason="items_missing_trace_remap",
+            experiment_id=experiment_id,
+            experiment_name=source_experiment.name,
+            count=missing_trace_count,
+            sample_source_ids=missing_trace_sample,
+        )
+    if missing_item_count:
+        _record_skip(
+            audit,
+            reason="items_missing_dataset_item_remap",
+            experiment_id=experiment_id,
+            experiment_name=source_experiment.name,
+            count=missing_item_count,
+            sample_source_ids=missing_item_sample,
+        )
+
+
+def _record_skip(
+    audit: Optional[AuditLog],
+    *,
+    reason: str,
+    experiment_id: str,
+    experiment_name: Optional[str],
+    count: int,
+    sample_source_ids: List[str],
+) -> None:
+    """Append one ``skip`` entry for a single (experiment, reason) pair.
+
+    The entry is written through ``audit.record`` with ``type="skip"`` and
+    ``status="skipped"``. ``count`` is stored unchanged, so it keeps the
+    full population size and consumers can add it up across entries.
+    ``sample_source_ids`` is cut down to its first ``_SKIP_SAMPLE_LIMIT``
+    elements, which bounds the size of the audit JSON when a very large
+    number of ids were skipped.
+
+    When ``audit`` is ``None`` nothing is recorded and the call returns
+    immediately.
+    """
+    if audit is not None:
+        # Slice first: the recorded sample is a capped copy of the input.
+        capped_sample = sample_source_ids[:_SKIP_SAMPLE_LIMIT]
+        skip_details = {
+            "reason": reason,
+            "experiment_id": experiment_id,
+            "experiment_name": experiment_name,
+            "count": count,
+            "sample_source_ids": capped_sample,
+        }
+        audit.record(type="skip", status="skipped", details=skip_details)
 
 
 def _list_source_experiments(

@@ -13,7 +13,7 @@
 import sys
 import time
 from pathlib import Path
-from typing import Any, Iterator, Optional
+from typing import Any, Dict, Iterator, Optional
 
 import click
 from rich.console import Console
@@ -48,6 +48,11 @@
 )
 
 console = Console()
+# Dedicated stderr console for the loud-fail path so the SKIP_SUMMARY
+# line lands on stderr without flipping the default console (OPIK-6599).
+# ``Console(stderr=True)`` writes to ``sys.stderr``; tests assert against
+# this stream and CI gates can grep it without parsing the audit JSON.
+_stderr_console = Console(stderr=True)
 
 MIGRATE_CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}
 
@@ -185,6 +190,91 @@ def _finalize_and_fail(
     sys.exit(1)
 
 
+def _finalize_with_skips_or_ok(
+    audit: AuditLog,
+    audit_path: Path,
+    name: str,
+    target_label: str,
+    target_project: str,
+    elapsed_seconds: float,
+) -> None:
+    """Finalize the audit log, then either fail loud on skips or print the
+    happy-path message.
+
+    Per OPIK-6599: when the cascade emits any ``skip`` audit record, the
+    migrate is "succeeded but lossy" — the destination state has partial
+    data and is **not** rolled back. We finalize the audit to ``failed``,
+    print a SKIP_SUMMARY line to **stderr** so CI pipelines can grep
+    without parsing the JSON, and exit non-zero. Operators rely on the
+    audit log to know what made it across.
+    """
+    skip_records = [a for a in audit.actions if a.get("status") == "skipped"]
+    if not skip_records:
+        audit.finalize("ok")
+        audit.write(audit_path)
+        elapsed = _format_elapsed(elapsed_seconds)
+        console.print(
+            f"[green]Migrated '{name}' into project '{target_project}' as "
+            f"'{target_label}'.[/green] Took {elapsed}. Audit log: {audit_path}"
+        )
+        return
+
+    # Aggregate counts by reason for the SKIP_SUMMARY line. The cascade
+    # summary record carries the totals too, but reading from skip records
+    # directly keeps the message decoupled from the summary record's shape.
+    reason_to_total_key = {
+        "experiment_recreate_returned_false": "experiments_skipped",
+        "items_missing_trace_remap": "items_skipped_missing_trace",
+        "items_missing_dataset_item_remap": "items_skipped_missing_item",
+    }
+    totals: Dict[str, int] = dict.fromkeys(reason_to_total_key.values(), 0)
+    for record in skip_records:
+        # The cascade passes ``reason``/``count`` to ``audit.record`` inside
+        # ``details``; accept them nested there as well as at the top level
+        # (top level wins) so the totals do not silently stay at zero.
+        nested = record.get("details")
+        fields = {**nested, **record} if isinstance(nested, dict) else record
+        total_key = reason_to_total_key.get(fields.get("reason", ""))
+        if total_key is None:
+            continue
+        totals[total_key] += int(fields.get("count", 0))
+
+    total_skipped = sum(totals.values())
+    audit.finalize("failed")
+    audit.write(audit_path)
+
+    elapsed = _format_elapsed(elapsed_seconds)
+    _stderr_console.print(
+        f"[red]opik migrate: {total_skipped} item{'s' if total_skipped != 1 else ''} "
+        f"skipped — destination state was NOT rolled back; see audit log: "
+        f"{audit_path}[/red]"
+    )
+    # Machine-readable line: rich would otherwise word-wrap it at the
+    # console width and may style ``key=value`` pairs, breaking ``grep``.
+    _stderr_console.print(
+        f"SKIP_SUMMARY: "
+        f"experiments_skipped={totals['experiments_skipped']} "
+        f"items_skipped_missing_trace={totals['items_skipped_missing_trace']} "
+        f"items_skipped_missing_item={totals['items_skipped_missing_item']}",
+        markup=False,
+        highlight=False,
+        soft_wrap=True,
+    )
+    # High-level rollback hint, not a step-by-step playbook: the audit log
+    # is the source of truth for what was created. Auto-rollback is tracked
+    # as a follow-up; this PR is the loud-fail mechanic only.
+    _stderr_console.print(
+        f"[yellow]To roll back manually: in project "
+        f"'{target_project}', delete the destination dataset "
+        f"'{target_label}' along with any experiments, optimizations, "
+        f"traces, and spans that were cascaded into it (the audit log "
+        f"lists each created entity id); then rename the source "
+        f"'{name}_v1' back to '{name}'.[/yellow]"
+    )
+    _stderr_console.print(f"[dim](after {elapsed})[/dim]")
+    sys.exit(1)
+
+
 @migrate_group.command(name="dataset")
 @click.argument("name", type=str)
 @click.option(
@@ -289,12 +379,13 @@ def migrate_dataset_command(
             elapsed_seconds=time.monotonic() - started_at,
         )
 
-    audit.finalize("ok")
-    audit.write(audit_path)
-    elapsed = _format_elapsed(time.monotonic() - started_at)
-    console.print(
-        f"[green]Migrated '{name}' into project '{to_project}' as '{plan.target_name}'.[/green] "
-        f"Took {elapsed}. Audit log: {audit_path}"
+    _finalize_with_skips_or_ok(
+        audit,
+        audit_path,
+        name=name,
+        target_label=plan.target_name,
+        target_project=to_project,
+        elapsed_seconds=time.monotonic() - started_at,
     )