Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
311 changes: 311 additions & 0 deletions docs/compose/plans/2026-07-01-actor-models-discovery.md

Large diffs are not rendered by default.

530 changes: 530 additions & 0 deletions docs/compose/plans/2026-07-01-tui-paste-image-vision-fallback.md

Large diffs are not rendered by default.

61 changes: 56 additions & 5 deletions packages/opencode/src/cli/cmd/tui/component/prompt/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1276,6 +1276,11 @@ export function Prompt(props: PromptProps) {
}
}
if (mime.startsWith("image/") || mime === "application/pdf") {
if (mime.startsWith("image/") && !activeModelSupportsImage()) {
insertFileReference(filepath)
toast.show({ message: t("tui.paste.image.fallback_path"), variant: "info", duration: 3000 })
return
}
const content = await Filesystem.readArrayBuffer(filepath)
.then((buffer) => Buffer.from(buffer).toString("base64"))
.catch(() => {})
Expand Down Expand Up @@ -1309,16 +1314,62 @@ export function Prompt(props: PromptProps) {
}, 0)
}

/**
 * Reports whether the currently selected model accepts image input.
 *
 * The selected model's provider is looked up in the synced provider list and
 * the model's `capabilities.input.image` flag is read from it.
 *
 * @returns the flag's value, or `false` when no model is selected or when the
 *   provider, the model entry, or the flag itself is missing.
 */
function activeModelSupportsImage() {
  const selected = local.model.current()
  if (!selected) return false

  const owner = sync.data.provider.find((entry) => entry.id === selected.providerID)
  // Optional chaining lets every missing link in the lookup collapse to `false`.
  const imageFlag = owner?.models[selected.modelID]?.capabilities?.input?.image
  return imageFlag ?? false
}

/**
 * Inserts an `@<basename>` reference to `filepath` at the cursor and records it
 * as a file part of the prompt.
 *
 * The label is inserted followed by a single space, a virtual extmark is created
 * over the label only (the trailing space is excluded), and a `file` part with
 * mime `text/plain` and a `file://` URL is appended to the prompt parts. The new
 * extmark id is mapped to the index of that part.
 *
 * @param filepath path of the file to reference
 */
function insertFileReference(filepath: string) {
  const filename = path.basename(filepath)
  const label = `@${filename}`

  // Read the cursor offset before inserting so the range points at the label's start.
  const from = input.visualCursor.offset
  const to = from + label.length

  input.insertText(`${label} `)
  const markId = input.extmarks.create({
    start: from,
    end: to,
    virtual: true,
    styleId: fileStyleId,
    typeId: promptPartTypeId,
  })

  setStore(
    produce((draft) => {
      // The part is appended, so its index is the current length of the list.
      const index = draft.prompt.parts.length
      draft.prompt.parts.push({
        type: "file" as const,
        mime: "text/plain",
        filename,
        url: `file://${filepath}`,
        source: {
          type: "file",
          path: filepath,
          text: { start: from, end: to, value: label },
        },
      })
      draft.extmarkToPartIndex.set(markId, index)
    }),
  )
}

async function pasteFromClipboard() {
if (props.disabled) return
const content = await Clipboard.read()
if (!content) return
if (content.mime.startsWith("image/")) {
await pasteAttachment({
filename: "clipboard",
mime: content.mime,
content: content.data,
})
if (activeModelSupportsImage()) {
await pasteAttachment({
filename: "clipboard",
mime: content.mime,
content: content.data,
})
return
}
const filepath = await Clipboard.spillImage(content)
insertFileReference(filepath)
toast.show({ message: t("tui.paste.image.fallback_path"), variant: "info", duration: 3000 })
return
}
await pastePlainText(content.data.replace(/\r\n/g, "\n").replace(/\r/g, "\n"))
Expand Down
1 change: 1 addition & 0 deletions packages/opencode/src/cli/cmd/tui/i18n/en.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export const dict: Record<string, string> = {
"tui.prompt.placeholder.normal": "Type your message... (type / for commands)",
"tui.prompt.placeholder.shell": 'Run a command... "{{example}}"',
"tui.prompt.ghost": "{{prediction}} (Tab to accept)",
"tui.paste.image.fallback_path": "Model has no vision support — inserted image path instead",
"tui.home.placeholder.example.todo": "Fix a TODO in the codebase",
"tui.home.placeholder.example.stack": "What is the tech stack of this project?",
"tui.home.placeholder.example.tests": "Fix broken tests",
Expand Down
1 change: 1 addition & 0 deletions packages/opencode/src/cli/cmd/tui/i18n/es.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const dict = {
"tui.prompt.placeholder.normal": 'Pregunta lo que quieras... "{{example}}"',
"tui.prompt.placeholder.shell": 'Ejecuta un comando... "{{example}}"',
"tui.prompt.ghost": "{{prediction}} (Tab para aceptar)",
"tui.paste.image.fallback_path": "El modelo no admite imágenes — se insertó la ruta de la imagen en su lugar",
"tui.home.placeholder.example.todo": "Corregir un TODO en el código",
"tui.home.placeholder.example.stack": "¿Cuál es el stack técnico del proyecto?",
"tui.home.placeholder.example.tests": "Arreglar las pruebas fallidas",
Expand Down
1 change: 1 addition & 0 deletions packages/opencode/src/cli/cmd/tui/i18n/fr.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const dict = {
"tui.prompt.placeholder.normal": 'Posez votre question... "{{example}}"',
"tui.prompt.placeholder.shell": 'Exécuter une commande... "{{example}}"',
"tui.prompt.ghost": "{{prediction}} (Tab pour accepter)",
"tui.paste.image.fallback_path": "Le modèle ne prend pas en charge la vision — chemin de l'image inséré à la place",
"tui.home.placeholder.example.todo": "Corriger un TODO dans le code",
"tui.home.placeholder.example.stack": "Quelle est la stack technique de ce projet ?",
"tui.home.placeholder.example.tests": "Réparer les tests cassés",
Expand Down
1 change: 1 addition & 0 deletions packages/opencode/src/cli/cmd/tui/i18n/ja.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const dict = {
"tui.prompt.placeholder.normal": '何でも聞いてください... "{{example}}"',
"tui.prompt.placeholder.shell": 'コマンドを実行... "{{example}}"',
"tui.prompt.ghost": "{{prediction}} (Tab で確定)",
"tui.paste.image.fallback_path": "モデルは画像に対応していないため、代わりに画像パスを挿入しました",
"tui.home.placeholder.example.todo": "コードベース内の TODO を修正",
"tui.home.placeholder.example.stack": "このプロジェクトの技術スタックは?",
"tui.home.placeholder.example.tests": "壊れたテストを修正",
Expand Down
1 change: 1 addition & 0 deletions packages/opencode/src/cli/cmd/tui/i18n/ru.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const dict = {
"tui.prompt.placeholder.normal": 'Спросите что угодно... "{{example}}"',
"tui.prompt.placeholder.shell": 'Выполните команду... "{{example}}"',
"tui.prompt.ghost": "{{prediction}} (Tab — принять)",
"tui.paste.image.fallback_path": "Модель не поддерживает изображения — вместо этого вставлен путь к изображению",
"tui.home.placeholder.example.todo": "Исправь TODO в кодовой базе",
"tui.home.placeholder.example.stack": "Какой технологический стек у этого проекта?",
"tui.home.placeholder.example.tests": "Почини сломанные тесты",
Expand Down
1 change: 1 addition & 0 deletions packages/opencode/src/cli/cmd/tui/i18n/zh.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const dict = {
"tui.prompt.placeholder.normal": "输入消息...(输入/唤起命令)",
"tui.prompt.placeholder.shell": '执行命令…… "{{example}}"',
"tui.prompt.ghost": "{{prediction}} (按 Tab 采纳)",
"tui.paste.image.fallback_path": "当前模型不支持图片,已改为插入图片路径",
"tui.home.placeholder.example.todo": "修复代码库中的 TODO",
"tui.home.placeholder.example.stack": "这个项目用了什么技术栈?",
"tui.home.placeholder.example.tests": "修复失败的测试",
Expand Down
1 change: 1 addition & 0 deletions packages/opencode/src/cli/cmd/tui/i18n/zht.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const dict = {
"tui.prompt.placeholder.normal": '問點什麼…… "{{example}}"',
"tui.prompt.placeholder.shell": '執行指令…… "{{example}}"',
"tui.prompt.ghost": "{{prediction}} (按 Tab 採納)",
"tui.paste.image.fallback_path": "目前模型不支援圖片,已改為插入圖片路徑",
"tui.home.placeholder.example.todo": "修復程式碼庫中的 TODO",
"tui.home.placeholder.example.stack": "這個專案用了什麼技術棧?",
"tui.home.placeholder.example.tests": "修復失敗的測試",
Expand Down
7 changes: 7 additions & 0 deletions packages/opencode/src/cli/cmd/tui/util/clipboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ export interface Content {
mime: string
}

// File extensions for the image media types a clipboard is likely to carry.
// Anything not listed here is written with the generic "bin" extension.
const SPILL_IMAGE_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  ["image/png", "png"],
  ["image/jpeg", "jpg"],
  ["image/jpg", "jpg"],
  ["image/gif", "gif"],
  ["image/webp", "webp"],
  ["image/bmp", "bmp"],
  ["image/tiff", "tiff"],
  ["image/svg+xml", "svg"],
])

/**
 * Writes a base64-encoded image to a uniquely named file in the OS temp
 * directory and returns the path of that file.
 *
 * The file is named `opencode-paste-<timestamp>-<random>.<ext>`. The extension is
 * derived from the media type so that consumers which go by file extension can
 * still tell what kind of image it is; unknown media types fall back to `bin`.
 *
 * @param content `data` is the base64 payload, `mime` its media type
 * @returns absolute path of the file that was written
 */
export async function spillImage(content: { data: string; mime: string }): Promise<string> {
  // Ignore media-type parameters ("image/png; q=1") and letter case when matching.
  const mediaType = (content.mime.split(";")[0] ?? "").trim().toLowerCase()
  const ext = SPILL_IMAGE_EXTENSIONS.get(mediaType) ?? "bin"
  // Timestamp plus a short random suffix keeps rapid consecutive pastes from colliding.
  const file = path.join(tmpdir(), `opencode-paste-${Date.now()}-${Math.random().toString(36).slice(2, 8)}.${ext}`)
  await Bun.write(file, Buffer.from(content.data, "base64"))
  return file
}

// Checks clipboard for images first, then falls back to text.
//
// On Windows prompt/ can call this from multiple paste signals because
Expand Down
5 changes: 5 additions & 0 deletions packages/opencode/src/provider/provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1239,6 +1239,11 @@ const layer: Layer.Layer<
cachePromptTTL: model.cachePromptTTL ?? existingModel?.cachePromptTTL,
variants: {},
}
// mimo-auto is a free-tier routing alias absent from models.dev; it routes to a
// vision-capable model, so image input is supported.
if (providerID === "mimo" && modelID === "mimo-auto") {
parsedModel.capabilities.input.image = true
}
const merged = mergeDeep(ProviderTransform.variants(parsedModel), model.variants ?? {})
parsedModel.variants = mapValues(
pickBy(merged, (v) => !v.disabled),
Expand Down
19 changes: 1 addition & 18 deletions packages/opencode/src/provider/transform.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,6 @@ function mimeToModality(mime: string): Modality | undefined {
return undefined
}

/**
 * Decides whether image parts may be sent to `model`.
 *
 * MiMo vision support isn't reflected in models.dev modality data, so the
 * generic capability check would strip images before they reach the model.
 * The checks run in this order:
 *  1. `mimo` / `xiaomi` providers: ids containing "v2.5-pro" are text-only;
 *     "mimo-auto" and other ids containing "v2.5" accept images.
 *  2. Claude models (by model id, API id, or the `anthropic` provider) and GPT
 *     models (by model id or API id) are treated as multimodal regardless of
 *     catalog data.
 *  3. Otherwise the catalog flag `capabilities.input.image` decides.
 *
 * @param model the model that is about to receive the message parts
 * @returns `true` when image input should be kept for this model
 */
function supportsImageInput(model: Provider.Model): boolean {
  const modelID = model.id.toLowerCase()

  const isMimoFamily = model.providerID === "mimo" || model.providerID === "xiaomi"
  if (isMimoFamily) {
    // The "-pro" test must come first: "v2.5-pro" also contains "v2.5".
    if (modelID.includes("v2.5-pro")) return false
    if (modelID === "mimo-auto" || modelID.includes("v2.5")) return true
  }

  const apiModelID = model.api.id.toLowerCase()
  const mentions = (needle: string): boolean => modelID.includes(needle) || apiModelID.includes(needle)

  if (mentions("claude") || model.providerID === "anthropic") return true
  if (mentions("gpt")) return true
  return model.capabilities.input.image
}

export const OUTPUT_TOKEN_MAX = Flag.MIMOCODE_EXPERIMENTAL_OUTPUT_TOKEN_MAX || 32_000
const MIMO_OUTPUT_TOKEN_MAX = 128_000

Expand Down Expand Up @@ -388,7 +371,7 @@ function unsupportedParts(msgs: ModelMessage[], model: Provider.Model): ModelMes
const filename = part.type === "file" ? part.filename : undefined
const modality = mimeToModality(mime)
if (!modality) return part
const supported = modality === "image" ? supportsImageInput(model) : model.capabilities.input[modality]
const supported = model.capabilities.input[modality]
if (supported) return part

const name = filename ? `"${filename}"` : modality
Expand Down
34 changes: 31 additions & 3 deletions packages/opencode/src/session/system.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import PROMPT_DEEPSEEK from "./prompt/deepseek.txt"
import PROMPT_GLM from "./prompt/glm.txt"
import PROMPT_MINIMAX from "./prompt/minimax.txt"
import PROMPT_TRINITY from "./prompt/trinity.txt"
import type { Provider } from "@/provider"
import { Provider } from "@/provider"
import type { Agent } from "@/agent/agent"
import { Permission } from "@/permission"
import { Skill } from "@/skill"
Expand Down Expand Up @@ -50,10 +50,25 @@ export const layer = Layer.effect(
Effect.gen(function* () {
const skill = yield* Skill.Service

const provider = yield* Provider.Service
const visionModels = yield* provider
.list()
.pipe(
Effect.map((providers) =>
Object.values(providers)
.flatMap((info) => Object.values(info.models))
.filter((m) => m.capabilities.input.image === true)
.map((m) => `${m.providerID}/${m.id}`)
.sort((a, b) => a.localeCompare(b))
.slice(0, 3),
),
)
.pipe(Effect.orElseSucceed(() => [] as string[]))

return Service.of({
environment(model, now) {
const project = Instance.project
return [
const base = [
[
`You are MiMo Code Agent, built by Xiaomi MiMo Team. You are an interactive agent that helps users with software engineering tasks. Use the instructions below and the tools available to you to assist the user.`,
`You are powered by the model named ${model.api.id}. The exact model ID is ${model.providerID}/${model.api.id}`,
Expand All @@ -72,6 +87,19 @@ export const layer = Layer.effect(
].join("\n"),
`IMPORTANT: Your response must ALWAYS strictly follow the same major language as the user.`,
]
if (!model.capabilities.input.image)
base.push(
[
`<vision-capability>`,
`You CANNOT see or interpret image content — this model has no vision support.`,
`Never attempt to analyze an image's visual content yourself. If a task needs image understanding, dispatch a vision-capable subagent via the actor tool, passing the image file path so the subagent can Read it.`,
visionModels.length
? `Vision-capable models you can pass to --model: ${visionModels.join(", ")}. Run \`actor models --vision\` to see all of them. Example: actor run <type> "<desc>" "analyze the image at <path>" --model ${visionModels[0]}.`
: `No vision-capable model is currently configured. Ask the user to configure a vision model, or use an OCR tool to extract text.`,
`If instead you need a file's raw binary structure (not its visual content), use a shell tool such as \`hexdump -C <path>\`, NOT the read tool.`,
].join("\n"),
)
return base
},

skills: Effect.fn("SystemPrompt.skills")(function* (agent: Agent.Info) {
Expand All @@ -91,6 +119,6 @@ export const layer = Layer.effect(
}),
)

export const defaultLayer = layer.pipe(Layer.provide(Skill.defaultLayer))
export const defaultLayer = layer.pipe(Layer.provide(Skill.defaultLayer), Layer.provide(Provider.defaultLayer))

export * as SystemPrompt from "./system"
4 changes: 4 additions & 0 deletions packages/opencode/src/tool/actor.shell.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ appends them at request time per-agent).
# --session: target session id (defaults to current); --type: 'text' (default) or 'actor_notification'
# (--task is NOT valid here — it applies only to run/spawn for tying a subagent to a task_id)

# list available models (optionally vision-only) to pick a --model value:
actor models [--vision] [--limit <n>]

Examples:
actor run explore "Find error recovery" "Scan src/parser.ts for catch blocks. Return file:line."

Expand Down Expand Up @@ -70,3 +73,4 @@ When to use what:
run — single delegation where the result drives your next step
spawn — fan-out for parallel work, OR fire-and-forget background tasks
wait — pick up a previously-spawned actor's result on demand
models — discover which models you can pass to --model (use --vision to find image-capable models for dispatching an image subagent)
48 changes: 46 additions & 2 deletions packages/opencode/src/tool/actor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ export interface ActorPromptOps {
const id = "actor"

const MODEL_PARAM_DESCRIPTION =
"(optional) Model for this subagent: a model group name (e.g. ultra/standard/lite) or a literal provider/model (e.g. mimo-v2.5-pro). Overrides the agent's configured model; defaults to the agent's model, else the parent's. If no model_groups are configured, the tier names resolve to the default model."
"(optional) Model for this subagent: a model group name (e.g. ultra/standard/lite) or a literal provider/model (e.g. mimo-v2.5-pro). Overrides the agent's configured model; defaults to the agent's model, else the parent's. If no model_groups are configured, the tier names resolve to the default model. To discover valid provider/model values (e.g. a vision-capable model for image tasks), run `actor models` (or `actor models --vision`)."

const KNOWN_ACTOR_VERBS = ["run", "spawn", "status", "wait", "cancel", "send"]
const KNOWN_ACTOR_VERBS = ["run", "spawn", "status", "wait", "cancel", "send", "models"]

function levenshteinActor(a: string, b: string): number {
const m = a.length, n = b.length
Expand Down Expand Up @@ -65,6 +65,7 @@ type ActorShellArgs =
| { operation: { action: "wait"; actor_id: string; timeout_ms?: number } }
| { operation: { action: "cancel"; actor_id: string } }
| { operation: { action: "send"; to_actor_id: string; content: string; to_session_id?: string; type?: string } }
| { operation: { action: "models"; vision?: boolean; limit?: number } }

function actorArityError(verb: string, expected: string, args: string[], line: number) {
return Effect.fail({
Expand Down Expand Up @@ -186,6 +187,20 @@ const mapActorVerb = Effect.fn("mapActorVerb")(function* (verb: string | undefin
},
} as ActorShellArgs
}
case "models": {
const vision = args.includes("--vision")
const withoutVision = args.filter((a) => a !== "--vision")
const { flags, rest } = yield* extractNamedFlags(withoutVision, ["limit"], line)
if (rest.length !== 0)
return yield* actorArityError("models", "[--vision] [--limit <n>]", rest, line)
return {
operation: {
action: "models" as const,
...(vision ? { vision: true } : {}),
...(flags.limit ? { limit: Number(flags.limit) } : {}),
},
} as ActorShellArgs
}
default: {
const suggestion = suggestActorVerb(verb ?? "")
const detail =
Expand Down Expand Up @@ -444,6 +459,12 @@ export const ActorTool = Tool.define(
),
})

const modelsSchema = z.strictObject({
action: z.literal("models"),
vision: z.boolean().optional().describe("(optional) If true, list only vision-capable models (models that accept image input)."),
limit: z.number().int().positive().optional().describe("(optional) Max number of models to return. Default 50."),
})

const parameters = z.strictObject({
// .meta({ type: "object" }) is REQUIRED — without it the emitted JSON
// schema's `operation` node has only `anyOf`, no `type`, and some models
Expand All @@ -460,6 +481,7 @@ export const ActorTool = Tool.define(
waitSchema,
cancelSchema,
sendSchema,
modelsSchema,
])
.meta({ type: "object" }),
})
Expand Down Expand Up @@ -616,6 +638,28 @@ export const ActorTool = Tool.define(
}
}

if (op.action === "models") {
const providers = yield* provider.list()
const all = Object.values(providers).flatMap((info) =>
Object.values(info.models).map((m) => ({
ref: `${m.providerID}/${m.id}`,
name: m.name,
vision: m.capabilities.input.image === true,
})),
)
const filtered = op.vision ? all.filter((m) => m.vision) : all
const sorted = filtered.sort((a, b) => a.ref.localeCompare(b.ref))
const limit = op.limit ?? 50
const shown = sorted.slice(0, limit)
const lines = shown.map((m) => `${m.ref}${m.vision ? " (vision)" : ""}`)
const header = op.vision ? `Vision-capable models` : `Available models`
const more = sorted.length > shown.length ? `\n… and ${sorted.length - shown.length} more (raise --limit)` : ""
const output = shown.length === 0
? (op.vision ? "No vision-capable models are configured. Configure a vision model or use an OCR tool." : "No models are configured.")
: `${header} (${shown.length} of ${sorted.length}):\n${lines.join("\n")}${more}\nPass any of these to actor --model.`
return { title: header, output, metadata: { count: shown.length, total: sorted.length, vision: !!op.vision } as Record<string, any> }
}

// op.action ==="run" or "spawn" — schema guarantees
// description / prompt / subagent_type are present and non-empty.
if (!ctx.extra?.bypassAgentCheck) {
Expand Down
Loading
Loading