XiaomiMiMo · wqymi · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/docs/compose/plans/2026-07-01-actor-models-discovery.md b/docs/compose/plans/2026-07-01-actor-models-discovery.md
diff --git a/docs/compose/plans/2026-07-01-tui-paste-image-vision-fallback.md b/docs/compose/plans/2026-07-01-tui-paste-image-vision-fallback.md
diff --git a/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx b/packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
@@ -1276,6 +1276,11 @@ export function Prompt(props: PromptProps) {
           }
         }
         if (mime.startsWith("image/") || mime === "application/pdf") {
+          if (mime.startsWith("image/") && !activeModelSupportsImage()) {
+            insertFileReference(filepath)
+            toast.show({ message: t("tui.paste.image.fallback_path"), variant: "info", duration: 3000 })
+            return
+          }
           const content = await Filesystem.readArrayBuffer(filepath)
             .then((buffer) => Buffer.from(buffer).toString("base64"))
             .catch(() => {})
@@ -1309,16 +1314,62 @@ export function Prompt(props: PromptProps) {
     }, 0)
   }
 
+  /**
+   * Reports whether the currently selected model is flagged as accepting image input.
+   * Yields `false` when no model is selected, or when the provider, the model entry,
+   * or its image capability flag is missing from the synced provider data.
+   */
+  function activeModelSupportsImage() {
+    const selected = local.model.current()
+    if (!selected) return false
+    const owner = sync.data.provider.find((candidate) => candidate.id === selected.providerID)
+    const imageFlag = owner?.models[selected.modelID]?.capabilities?.input?.image
+    return imageFlag ?? false
+  }
+
+  /**
+   * Inserts an `@<basename>` reference to `filepath` at the cursor, marks it with a
+   * virtual extmark, and records a matching `file` part (mime `text/plain`, `file://`
+   * URL) in the prompt store, linked to the extmark through `extmarkToPartIndex`.
+   */
+  function insertFileReference(filepath: string) {
+    const filename = path.basename(filepath)
+    const label = `@${filename}`
+    // Read the cursor offset before inserting so the extmark covers exactly the label.
+    const start = input.visualCursor.offset
+    const end = start + label.length
+    // The trailing space is typed into the input but stays outside the extmark range.
+    input.insertText(label + " ")
+    const markId = input.extmarks.create({
+      start,
+      end,
+      virtual: true,
+      styleId: fileStyleId,
+      typeId: promptPartTypeId,
+    })
+    setStore(
+      produce((draft) => {
+        // The new part is appended, so its index is the list length before the push.
+        const index = draft.prompt.parts.length
+        const part = {
+          type: "file" as const,
+          mime: "text/plain",
+          filename,
+          url: `file://${filepath}`,
+          source: {
+            type: "file",
+            path: filepath,
+            text: { start, end, value: label },
+          },
+        }
+        draft.prompt.parts.push(part)
+        draft.extmarkToPartIndex.set(markId, index)
+      }),
+    )
+  }
+
   async function pasteFromClipboard() {
     if (props.disabled) return
     const content = await Clipboard.read()
     if (!content) return
     if (content.mime.startsWith("image/")) {
-      await pasteAttachment({
-        filename: "clipboard",
-        mime: content.mime,
-        content: content.data,
-      })
+      if (activeModelSupportsImage()) {
+        await pasteAttachment({
+          filename: "clipboard",
+          mime: content.mime,
+          content: content.data,
+        })
+        return
+      }
+      const filepath = await Clipboard.spillImage(content)
+      insertFileReference(filepath)
+      toast.show({ message: t("tui.paste.image.fallback_path"), variant: "info", duration: 3000 })
       return
     }
     await pastePlainText(content.data.replace(/\r\n/g, "\n").replace(/\r/g, "\n"))

diff --git a/packages/opencode/src/cli/cmd/tui/i18n/en.ts b/packages/opencode/src/cli/cmd/tui/i18n/en.ts
@@ -22,6 +22,7 @@ export const dict: Record<string, string> = {
   "tui.prompt.placeholder.normal": "Type your message... (type / for commands)",
   "tui.prompt.placeholder.shell": 'Run a command... "{{example}}"',
   "tui.prompt.ghost": "{{prediction}}  (Tab to accept)",
+  "tui.paste.image.fallback_path": "Model has no vision support — inserted image path instead",
   "tui.home.placeholder.example.todo": "Fix a TODO in the codebase",
   "tui.home.placeholder.example.stack": "What is the tech stack of this project?",
   "tui.home.placeholder.example.tests": "Fix broken tests",

diff --git a/packages/opencode/src/cli/cmd/tui/i18n/es.ts b/packages/opencode/src/cli/cmd/tui/i18n/es.ts
@@ -26,6 +26,7 @@ export const dict = {
   "tui.prompt.placeholder.normal": 'Pregunta lo que quieras... "{{example}}"',
   "tui.prompt.placeholder.shell": 'Ejecuta un comando... "{{example}}"',
   "tui.prompt.ghost": "{{prediction}}  (Tab para aceptar)",
+  "tui.paste.image.fallback_path": "El modelo no admite imágenes — se insertó la ruta de la imagen en su lugar",
   "tui.home.placeholder.example.todo": "Corregir un TODO en el código",
   "tui.home.placeholder.example.stack": "¿Cuál es el stack técnico del proyecto?",
   "tui.home.placeholder.example.tests": "Arreglar las pruebas fallidas",

diff --git a/packages/opencode/src/cli/cmd/tui/i18n/fr.ts b/packages/opencode/src/cli/cmd/tui/i18n/fr.ts
@@ -26,6 +26,7 @@ export const dict = {
   "tui.prompt.placeholder.normal": 'Posez votre question... "{{example}}"',
   "tui.prompt.placeholder.shell": 'Exécuter une commande... "{{example}}"',
   "tui.prompt.ghost": "{{prediction}}  (Tab pour accepter)",
+  "tui.paste.image.fallback_path": "Le modèle ne prend pas en charge la vision — chemin de l'image inséré à la place",
   "tui.home.placeholder.example.todo": "Corriger un TODO dans le code",
   "tui.home.placeholder.example.stack": "Quelle est la stack technique de ce projet ?",
   "tui.home.placeholder.example.tests": "Réparer les tests cassés",

diff --git a/packages/opencode/src/cli/cmd/tui/i18n/ja.ts b/packages/opencode/src/cli/cmd/tui/i18n/ja.ts
@@ -26,6 +26,7 @@ export const dict = {
   "tui.prompt.placeholder.normal": '何でも聞いてください... "{{example}}"',
   "tui.prompt.placeholder.shell": 'コマンドを実行... "{{example}}"',
   "tui.prompt.ghost": "{{prediction}}  (Tab で確定)",
+  "tui.paste.image.fallback_path": "モデルは画像に対応していないため、代わりに画像パスを挿入しました",
   "tui.home.placeholder.example.todo": "コードベース内の TODO を修正",
   "tui.home.placeholder.example.stack": "このプロジェクトの技術スタックは？",
   "tui.home.placeholder.example.tests": "壊れたテストを修正",

diff --git a/packages/opencode/src/cli/cmd/tui/i18n/ru.ts b/packages/opencode/src/cli/cmd/tui/i18n/ru.ts
@@ -26,6 +26,7 @@ export const dict = {
   "tui.prompt.placeholder.normal": 'Спросите что угодно... "{{example}}"',
   "tui.prompt.placeholder.shell": 'Выполните команду... "{{example}}"',
   "tui.prompt.ghost": "{{prediction}}  (Tab — принять)",
+  "tui.paste.image.fallback_path": "Модель не поддерживает изображения — вместо этого вставлен путь к изображению",
   "tui.home.placeholder.example.todo": "Исправь TODO в кодовой базе",
   "tui.home.placeholder.example.stack": "Какой технологический стек у этого проекта?",
   "tui.home.placeholder.example.tests": "Почини сломанные тесты",

diff --git a/packages/opencode/src/cli/cmd/tui/i18n/zh.ts b/packages/opencode/src/cli/cmd/tui/i18n/zh.ts
@@ -26,6 +26,7 @@ export const dict = {
   "tui.prompt.placeholder.normal": "输入消息...(输入/唤起命令)",
   "tui.prompt.placeholder.shell": '执行命令…… "{{example}}"',
   "tui.prompt.ghost": "{{prediction}}  (按 Tab 采纳)",
+  "tui.paste.image.fallback_path": "当前模型不支持图片，已改为插入图片路径",
   "tui.home.placeholder.example.todo": "修复代码库中的 TODO",
   "tui.home.placeholder.example.stack": "这个项目用了什么技术栈？",
   "tui.home.placeholder.example.tests": "修复失败的测试",

diff --git a/packages/opencode/src/cli/cmd/tui/i18n/zht.ts b/packages/opencode/src/cli/cmd/tui/i18n/zht.ts
@@ -26,6 +26,7 @@ export const dict = {
   "tui.prompt.placeholder.normal": '問點什麼…… "{{example}}"',
   "tui.prompt.placeholder.shell": '執行指令…… "{{example}}"',
   "tui.prompt.ghost": "{{prediction}}  (按 Tab 採納)",
+  "tui.paste.image.fallback_path": "目前模型不支援圖片，已改為插入圖片路徑",
   "tui.home.placeholder.example.todo": "修復程式碼庫中的 TODO",
   "tui.home.placeholder.example.stack": "這個專案用了什麼技術棧？",
   "tui.home.placeholder.example.tests": "修復失敗的測試",

diff --git a/packages/opencode/src/cli/cmd/tui/util/clipboard.ts b/packages/opencode/src/cli/cmd/tui/util/clipboard.ts
@@ -36,6 +36,13 @@ export interface Content {
   mime: string
 }
 
+/**
+ * Writes base64-encoded image data to a uniquely named file in the OS temp directory
+ * and resolves to that file's path.
+ *
+ * The file extension is derived from the MIME type so the spilled file keeps a
+ * recognisable image extension; MIME types missing from the table fall back to `.bin`.
+ * The file is not deleted here.
+ *
+ * @param content Base64 payload (`data`) and its MIME type (`mime`).
+ * @returns Absolute path of the file that was written.
+ */
+export async function spillImage(content: { data: string; mime: string }): Promise<string> {
+  const extensionByMime = new Map<string, string>([
+    ["image/png", "png"],
+    ["image/jpeg", "jpg"],
+    ["image/jpg", "jpg"],
+    ["image/gif", "gif"],
+    ["image/webp", "webp"],
+    ["image/bmp", "bmp"],
+    ["image/tiff", "tiff"],
+  ])
+  // Normalise case and drop any parameters (e.g. `; charset=...`) before the lookup.
+  const mime = (content.mime.split(";")[0] ?? "").trim().toLowerCase()
+  const ext = extensionByMime.get(mime) ?? "bin"
+  // Timestamp plus a short random suffix keeps concurrent pastes from colliding.
+  const file = path.join(tmpdir(), `opencode-paste-${Date.now()}-${Math.random().toString(36).slice(2, 8)}.${ext}`)
+  await Bun.write(file, Buffer.from(content.data, "base64"))
+  return file
+}
+
 // Checks clipboard for images first, then falls back to text.
 //
 // On Windows prompt/ can call this from multiple paste signals because

diff --git a/packages/opencode/src/provider/provider.ts b/packages/opencode/src/provider/provider.ts
@@ -1239,6 +1239,11 @@ const layer: Layer.Layer<
               cachePromptTTL: model.cachePromptTTL ?? existingModel?.cachePromptTTL,
               variants: {},
             }
+            // mimo-auto is a free-tier routing alias absent from models.dev; it routes to a
+            // vision-capable model, so image input is supported.
+            if (providerID === "mimo" && modelID === "mimo-auto") {
+              parsedModel.capabilities.input.image = true
+            }
             const merged = mergeDeep(ProviderTransform.variants(parsedModel), model.variants ?? {})
             parsedModel.variants = mapValues(
               pickBy(merged, (v) => !v.disabled),

diff --git a/packages/opencode/src/provider/transform.ts b/packages/opencode/src/provider/transform.ts
@@ -17,23 +17,6 @@ function mimeToModality(mime: string): Modality | undefined {
   return undefined
 }
 
-// MiMo vision support isn't reflected in models.dev modality data, so the
-// generic capability check would strip images before they reach the model.
-// mimo-auto and mimo-v2.5 accept images; mimo-v2.5-pro is text-only.
-function supportsImageInput(model: Provider.Model): boolean {
-  if (model.providerID === "mimo" || model.providerID === "xiaomi") {
-    const id = model.id.toLowerCase()
-    if (id.includes("v2.5-pro")) return false
-    if (id === "mimo-auto" || id.includes("v2.5")) return true
-  }
-  // Claude and GPT models are all multimodal regardless of catalog data.
-  const id = model.id.toLowerCase()
-  const apiID = model.api.id.toLowerCase()
-  if (id.includes("claude") || apiID.includes("claude") || model.providerID === "anthropic") return true
-  if (id.includes("gpt") || apiID.includes("gpt")) return true
-  return model.capabilities.input.image
-}
-
 export const OUTPUT_TOKEN_MAX = Flag.MIMOCODE_EXPERIMENTAL_OUTPUT_TOKEN_MAX || 32_000
 const MIMO_OUTPUT_TOKEN_MAX = 128_000
 
@@ -388,7 +371,7 @@ function unsupportedParts(msgs: ModelMessage[], model: Provider.Model): ModelMes
       const filename = part.type === "file" ? part.filename : undefined
       const modality = mimeToModality(mime)
       if (!modality) return part
-      const supported = modality === "image" ? supportsImageInput(model) : model.capabilities.input[modality]
+      const supported = model.capabilities.input[modality]
       if (supported) return part
 
       const name = filename ? `"${filename}"` : modality

diff --git a/packages/opencode/src/session/system.ts b/packages/opencode/src/session/system.ts
@@ -14,7 +14,7 @@ import PROMPT_DEEPSEEK from "./prompt/deepseek.txt"
 import PROMPT_GLM from "./prompt/glm.txt"
 import PROMPT_MINIMAX from "./prompt/minimax.txt"
 import PROMPT_TRINITY from "./prompt/trinity.txt"
-import type { Provider } from "@/provider"
+import { Provider } from "@/provider"
 import type { Agent } from "@/agent/agent"
 import { Permission } from "@/permission"
 import { Skill } from "@/skill"
@@ -50,10 +50,25 @@ export const layer = Layer.effect(
   Effect.gen(function* () {
     const skill = yield* Skill.Service
 
+    const provider = yield* Provider.Service
+    const visionModels = yield* provider
+      .list()
+      .pipe(
+        Effect.map((providers) =>
+          Object.values(providers)
+            .flatMap((info) => Object.values(info.models))
+            .filter((m) => m.capabilities.input.image === true)
+            .map((m) => `${m.providerID}/${m.id}`)
+            .sort((a, b) => a.localeCompare(b))
+            .slice(0, 3),
+        ),
+      )
+      .pipe(Effect.orElseSucceed(() => [] as string[]))
+
     return Service.of({
       environment(model, now) {
         const project = Instance.project
-        return [
+        const base = [
           [
             `You are MiMo Code Agent, built by Xiaomi MiMo Team. You are an interactive agent that helps users with software engineering tasks. Use the instructions below and the tools available to you to assist the user.`,
             `You are powered by the model named ${model.api.id}. The exact model ID is ${model.providerID}/${model.api.id}`,
@@ -72,6 +87,19 @@ export const layer = Layer.effect(
           ].join("\n"),
           `IMPORTANT: Your response must ALWAYS strictly follow the same major language as the user.`,
         ]
+        if (!model.capabilities.input.image)
+          base.push(
+            [
+              `<vision-capability>`,
+              `You CANNOT see or interpret image content — this model has no vision support.`,
+              `Never attempt to analyze an image's visual content yourself. If a task needs image understanding, dispatch a vision-capable subagent via the actor tool, passing the image file path so the subagent can Read it.`,
+              visionModels.length
+                ? `Vision-capable models you can pass to --model: ${visionModels.join(", ")}. Run \`actor models --vision\` to see all of them. Example: actor run <type> "<desc>" "analyze the image at <path>" --model ${visionModels[0]}.`
+                : `No vision-capable model is currently configured. Ask the user to configure a vision model, or use an OCR tool to extract text.`,
+              `If instead you need a file's raw binary structure (not its visual content), use a shell tool such as \`hexdump -C <path>\`, NOT the read tool.`,
+            ].join("\n"),
+          )
+        return base
       },
 
       skills: Effect.fn("SystemPrompt.skills")(function* (agent: Agent.Info) {
@@ -91,6 +119,6 @@ export const layer = Layer.effect(
   }),
 )
 
-export const defaultLayer = layer.pipe(Layer.provide(Skill.defaultLayer))
+export const defaultLayer = layer.pipe(Layer.provide(Skill.defaultLayer), Layer.provide(Provider.defaultLayer))
 
 export * as SystemPrompt from "./system"
diff --git a/packages/opencode/src/tool/actor.shell.txt b/packages/opencode/src/tool/actor.shell.txt
@@ -34,6 +34,9 @@ appends them at request time per-agent).
     # --session: target session id (defaults to current); --type: 'text' (default) or 'actor_notification'
     # (--task is NOT valid here — it applies only to run/spawn for tying a subagent to a task_id)
 
+# list available models (optionally vision-only) to pick a --model value:
+    actor models [--vision] [--limit <n>]
+
 Examples:
     actor run explore "Find error recovery" "Scan src/parser.ts for catch blocks. Return file:line."
 
@@ -70,3 +73,4 @@ When to use what:
     run    — single delegation where the result drives your next step
     spawn  — fan-out for parallel work, OR fire-and-forget background tasks
     wait   — pick up a previously-spawned actor's result on demand
+    models — discover which models you can pass to --model (use --vision to find image-capable models for dispatching an image subagent)
diff --git a/packages/opencode/src/tool/actor.ts b/packages/opencode/src/tool/actor.ts
@@ -29,9 +29,9 @@ export interface ActorPromptOps {
 const id = "actor"
 
 const MODEL_PARAM_DESCRIPTION =
-  "(optional) Model for this subagent: a model group name (e.g. ultra/standard/lite) or a literal provider/model (e.g. mimo-v2.5-pro). Overrides the agent's configured model; defaults to the agent's model, else the parent's. If no model_groups are configured, the tier names resolve to the default model."
+  "(optional) Model for this subagent: a model group name (e.g. ultra/standard/lite) or a literal provider/model (e.g. mimo-v2.5-pro). Overrides the agent's configured model; defaults to the agent's model, else the parent's. If no model_groups are configured, the tier names resolve to the default model. To discover valid provider/model values (e.g. a vision-capable model for image tasks), run `actor models` (or `actor models --vision`)."
 
-const KNOWN_ACTOR_VERBS = ["run", "spawn", "status", "wait", "cancel", "send"]
+const KNOWN_ACTOR_VERBS = ["run", "spawn", "status", "wait", "cancel", "send", "models"]
 
 function levenshteinActor(a: string, b: string): number {
   const m = a.length, n = b.length
@@ -65,6 +65,7 @@ type ActorShellArgs =
   | { operation: { action: "wait"; actor_id: string; timeout_ms?: number } }
   | { operation: { action: "cancel"; actor_id: string } }
   | { operation: { action: "send"; to_actor_id: string; content: string; to_session_id?: string; type?: string } }
+  | { operation: { action: "models"; vision?: boolean; limit?: number } }
 
 function actorArityError(verb: string, expected: string, args: string[], line: number) {
   return Effect.fail({
@@ -186,6 +187,20 @@ const mapActorVerb = Effect.fn("mapActorVerb")(function* (verb: string | undefin
         },
       } as ActorShellArgs
     }
+    case "models": {
+      // `--vision` is a bare switch: note its presence and strip it, then hand the
+      // remaining tokens to extractNamedFlags with `limit` as the only named flag.
+      const vision = args.includes("--vision")
+      const remaining = args.filter((token) => token !== "--vision")
+      const { flags, rest } = yield* extractNamedFlags(remaining, ["limit"], line)
+      // Any leftover token is an unexpected argument for this verb.
+      if (rest.length > 0)
+        return yield* actorArityError("models", "[--vision] [--limit <n>]", rest, line)
+      // Optional keys are only set when supplied, keeping the order action, vision, limit.
+      const operation: { action: "models"; vision?: boolean; limit?: number } = { action: "models" }
+      if (vision) operation.vision = true
+      if (flags.limit) operation.limit = Number(flags.limit)
+      return { operation } as ActorShellArgs
+    }
     default: {
       const suggestion = suggestActorVerb(verb ?? "")
       const detail =
@@ -444,6 +459,12 @@ export const ActorTool = Tool.define(
           ),
       })
 
+      // Schema for the `models` action: lists models, optionally restricted to those
+      // that accept image input (`vision`) and capped at `limit` entries.
+      const modelsSchema = z.strictObject({
+        action: z.literal("models"),
+        vision: z.boolean().optional().describe("(optional) If true, list only vision-capable models (models that accept image input)."),
+        limit: z.number().int().positive().optional().describe("(optional) Max number of models to return. Default 50."),
+      })
+
       const parameters = z.strictObject({
         // .meta({ type: "object" }) is REQUIRED — without it the emitted JSON
         // schema's `operation` node has only `anyOf`, no `type`, and some models
@@ -460,6 +481,7 @@ export const ActorTool = Tool.define(
             waitSchema,
             cancelSchema,
             sendSchema,
+            modelsSchema,
           ])
           .meta({ type: "object" }),
       })
@@ -616,6 +638,28 @@ export const ActorTool = Tool.define(
           }
         }
 
+        if (op.action === "models") {
+          // Lists the models of every provider returned by provider.list() as
+          // `provider/model` refs, optionally restricted to vision-capable ones, sorted
+          // by ref and capped at `limit` entries.
+          const providers = yield* provider.list()
+          const all = Object.values(providers).flatMap((info) =>
+            Object.values(info.models).map((m) => ({
+              ref: `${m.providerID}/${m.id}`,
+              vision: m.capabilities.input.image === true,
+            })),
+          )
+          const filtered = op.vision ? all.filter((m) => m.vision) : all
+          // Sort a copy so the unfiltered list is not reordered in place.
+          const sorted = [...filtered].sort((a, b) => a.ref.localeCompare(b.ref))
+          // The shell parser builds `limit` with Number(...), which can yield NaN, 0 or a
+          // negative value. slice() would then silently return nothing or drop trailing
+          // entries, so anything that is not a positive integer uses the default of 50.
+          const limit = typeof op.limit === "number" && Number.isInteger(op.limit) && op.limit > 0 ? op.limit : 50
+          const shown = sorted.slice(0, limit)
+          const lines = shown.map((m) => `${m.ref}${m.vision ? " (vision)" : ""}`)
+          const header = op.vision ? `Vision-capable models` : `Available models`
+          const more = sorted.length > shown.length ? `\n… and ${sorted.length - shown.length} more (raise --limit)` : ""
+          // Report "none configured" only when there really are no matching models.
+          const output = sorted.length === 0
+            ? (op.vision ? "No vision-capable models are configured. Configure a vision model or use an OCR tool." : "No models are configured.")
+            : `${header} (${shown.length} of ${sorted.length}):\n${lines.join("\n")}${more}\nPass any of these to actor --model.`
+          return { title: header, output, metadata: { count: shown.length, total: sorted.length, vision: !!op.vision } as Record<string, any> }
+        }
+
         // op.action ==="run" or "spawn" — schema guarantees
         // description / prompt / subagent_type are present and non-empty.
         if (!ctx.extra?.bypassAgentCheck) {