Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 17 additions & 38 deletions packages/core/src/ai-model/llm-planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,16 @@ import type { IModelConfig } from '@midscene/shared/env';
import { paddingToMatchBlockByBase64 } from '@midscene/shared/img';
import { getDebug } from '@midscene/shared/logger';
import { assert } from '@midscene/shared/utils';
import type {
ChatCompletionContentPart,
ChatCompletionMessageParam,
} from 'openai/resources/index';
import type { ChatCompletionMessageParam } from 'openai/resources/index';
import {
AIActionType,
buildYamlFlowFromPlans,
fillBboxParam,
findAllMidsceneLocatorField,
markupImageForLLM,
warnGPT4oSizeLimit,
} from './common';
import type { ConversationHistory } from './conversation-history';
import { systemPromptToTaskPlanning } from './prompt/llm-planning';
import { describeUserPage } from './prompt/util';
import { callAIWithObjectResponse } from './service-caller/index';

const debug = getDebug('planning');
Expand All @@ -43,10 +38,9 @@ export async function plan(

const { modelName, vlMode } = modelConfig;

const { description: pageDescription, elementById } = await describeUserPage(
context,
{ vlMode },
);
// Planning requires VL mode (validated by ModelConfigManager.getModelConfig)
assert(vlMode, 'Planning requires vlMode to be configured.');
Copy link

Copilot AI Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Align this assertion message with the shared error wording used in ModelConfigManager for consistency (e.g. 'Planning requires a VL mode to be configured.').

Suggested change
assert(vlMode, 'Planning requires vlMode to be configured.');
assert(vlMode, 'Planning requires a VL mode to be configured.');

Copilot uses AI. Check for mistakes.

const systemPrompt = await systemPromptToTaskPlanning({
actionSpace: opts.actionSpace,
vlMode: vlMode,
Expand All @@ -57,21 +51,19 @@ export async function plan(
let imageHeight = size.height;
const rightLimit = imageWidth;
const bottomLimit = imageHeight;

// Process image based on VL mode requirements
if (vlMode === 'qwen-vl') {
const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
imageWidth = paddedResult.width;
imageHeight = paddedResult.height;
imagePayload = paddedResult.imageBase64;
} else if (vlMode === 'qwen3-vl') {
// Reserved for qwen3-vl specific processing
// const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
// imageWidth = paddedResult.width;
// imageHeight = paddedResult.height;
// imagePayload = paddedResult.imageBase64;
Comment on lines +62 to 66
Copy link

Copilot AI Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid keeping commented-out code; either remove it or replace with a TODO reference that links to a task/issue. If you plan to implement qwen3-vl specific processing, leave a concise TODO comment and remove the inactive code.

Suggested change
// Reserved for qwen3-vl specific processing
// const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
// imageWidth = paddedResult.width;
// imageHeight = paddedResult.height;
// imagePayload = paddedResult.imageBase64;
// TODO: Implement qwen3-vl specific image processing. See issue #1234.

Copilot uses AI. Check for mistakes.
} else if (!vlMode) {
imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
width: imageWidth,
height: imageHeight,
});
}

warnGPT4oSizeLimit(size, modelName);
Expand Down Expand Up @@ -120,14 +112,7 @@ export async function plan(
detail: 'high',
},
},
...(vlMode
? []
: ([
{
type: 'text',
text: pageDescription,
},
] as ChatCompletionContentPart[])),
// Planning uses pure vision mode, no DOM description needed
],
},
];
Expand Down Expand Up @@ -173,21 +158,15 @@ export async function plan(
locateFields.forEach((field) => {
const locateResult = action.param[field];
if (locateResult) {
if (vlMode) {
action.param[field] = fillBboxParam(
locateResult,
imageWidth,
imageHeight,
rightLimit,
bottomLimit,
vlMode,
);
} else {
const element = elementById(locateResult);
if (element) {
action.param[field].id = element.id;
}
}
// Always use VL mode to fill bbox parameters
action.param[field] = fillBboxParam(
locateResult,
imageWidth,
imageHeight,
rightLimit,
bottomLimit,
vlMode,
);
}
});
});
Expand Down
25 changes: 23 additions & 2 deletions packages/shared/src/env/model-config-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
import type { GlobalConfigManager } from './global-config-manager';

import type { IModelConfig, TIntent, TModelConfigFn } from './types';
import { VL_MODE_RAW_VALID_VALUES as VL_MODES } from './types';

const ALL_INTENTS: TIntent[] = ['VQA', 'default', 'grounding', 'planning'];

Expand Down Expand Up @@ -101,13 +102,15 @@ export class ModelConfigManager {
* If isolatedMode is false, modelConfigMap can be changed via process.env, so it must be recalculated whenever it is undefined.
*/
getModelConfig(intent: TIntent): IModelConfig {
let config: IModelConfig;

if (this.isolatedMode) {
if (!this.modelConfigMap) {
throw new Error(
'modelConfigMap is not initialized in isolated mode, which should not happen',
);
}
return this.modelConfigMap[intent];
config = this.modelConfigMap[intent];
} else {
if (!this.modelConfigMap) {
if (!this.globalConfigManager) {
Expand All @@ -119,8 +122,26 @@ export class ModelConfigManager {
this.globalConfigManager.getAllEnvConfig(),
);
}
return this.modelConfigMap[intent];
config = this.modelConfigMap[intent];
}

// Validate that the planning intent is configured with a VL mode
if (intent === 'planning' && !config.vlMode) {
throw new Error(
`Planning requires a vision language model (VL model). DOM-based planning is not supported.

Please configure one of the following VL modes:
${VL_MODES.map((mode) => `- ${mode}`).join('\n ')}

Configuration examples:
- Environment variable: MIDSCENE_PLANNING_VL_MODE=qwen-vl
- Or use modelConfig function with planning intent

Learn more: https://midscenejs.com/choose-a-model`,
Comment on lines +128 to +140
Copy link

Copilot AI Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first sentence mixes 'VL model' with 'VL mode', while the rest of the message instructs users to configure VL modes. For consistency and clarity, consider: 'Planning requires a vision-language mode (VL mode). DOM-based planning is not supported.'

Copilot uses AI. Check for mistakes.
);
}

return config;
}

getUploadTestServerUrl(): string | undefined {
Expand Down
15 changes: 15 additions & 0 deletions packages/shared/src/env/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,21 @@ export interface IModelConfigForVQA {
[MIDSCENE_VQA_VL_MODE]?: TVlModeValues;
}

/**
* Model configuration for Planning intent.
*
* IMPORTANT: Planning MUST use a vision-language model, selected by configuring a VL mode.
* DOM-based planning is not supported.
*
* Required: MIDSCENE_PLANNING_VL_MODE must be set to one of:
* - 'qwen-vl'
* - 'qwen3-vl'
* - 'gemini'
* - 'doubao-vision'
* - 'vlm-ui-tars'
* - 'vlm-ui-tars-doubao'
* - 'vlm-ui-tars-doubao-1.5'
*/
export interface IModelConfigForPlanning {
// model name
[MIDSCENE_PLANNING_MODEL_NAME]: string;
Expand Down
160 changes: 157 additions & 3 deletions packages/shared/tests/unit-test/env/modle-config-manager.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
MIDSCENE_PLANNING_MODEL_NAME,
MIDSCENE_PLANNING_OPENAI_API_KEY,
MIDSCENE_PLANNING_OPENAI_BASE_URL,
MIDSCENE_PLANNING_VL_MODE,
MIDSCENE_VQA_MODEL_NAME,
MIDSCENE_VQA_OPENAI_API_KEY,
MIDSCENE_VQA_OPENAI_BASE_URL,
Expand Down Expand Up @@ -48,9 +49,10 @@ describe('ModelConfigManager', () => {
};
case 'planning':
return {
[MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
[MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
[MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key',
[MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
[MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const,
};
case 'grounding':
return {
Expand Down Expand Up @@ -105,9 +107,10 @@ describe('ModelConfigManager', () => {
};
case 'planning':
return {
[MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
[MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
[MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key',
[MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
[MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl',
};
case 'grounding':
return {
Expand All @@ -131,10 +134,11 @@ describe('ModelConfigManager', () => {
expect(vqaConfig.from).toBe('modelConfig');

const planningConfig = manager.getModelConfig('planning');
expect(planningConfig.modelName).toBe('gpt-4');
expect(planningConfig.modelName).toBe('qwen-vl-plus');
expect(planningConfig.openaiApiKey).toBe('test-planning-key');
expect(planningConfig.intent).toBe('planning');
expect(planningConfig.from).toBe('modelConfig');
expect(planningConfig.vlMode).toBe('qwen-vl');

const groundingConfig = manager.getModelConfig('grounding');
expect(groundingConfig.modelName).toBe('gpt-4-vision');
Expand Down Expand Up @@ -263,4 +267,154 @@ describe('ModelConfigManager', () => {
expect(config.openaiBaseURL).toBe('https://isolated.openai.com/v1');
});
});

describe('Planning VL mode validation', () => {
it('should throw error when planning has no vlMode in isolated mode', () => {
const modelConfigFn: TModelConfigFn = ({ intent }) => {
if (intent === 'planning') {
// Missing VL mode for planning
return {
[MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
[MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
[MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
};
}
return {
[MIDSCENE_MODEL_NAME]: 'gpt-4',
[MIDSCENE_OPENAI_API_KEY]: 'test-key',
[MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
};
};

const manager = new ModelConfigManager(modelConfigFn);

expect(() => manager.getModelConfig('planning')).toThrow(
'Planning requires a vision language model (VL model). DOM-based planning is not supported.',
Copy link

Copilot AI Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assertion is brittle against the multi-line error message thrown by ModelConfigManager. Prefer a regex to assert the key part of the message, e.g. toThrow(/Planning requires a vision[- ]?language (model|mode)/), so formatting or additional guidance lines won't break the test.

Suggested change
'Planning requires a vision language model (VL model). DOM-based planning is not supported.',
/Planning requires a vision[- ]?language (model|mode)/,

Copilot uses AI. Check for mistakes.
);
});

it('should succeed when planning has valid vlMode in isolated mode', () => {
const modelConfigFn: TModelConfigFn = ({ intent }) => {
if (intent === 'planning') {
return {
[MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus',
[MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
[MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
[MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const,
};
}
return {
[MIDSCENE_MODEL_NAME]: 'gpt-4',
[MIDSCENE_OPENAI_API_KEY]: 'test-key',
[MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
};
};

const manager = new ModelConfigManager(modelConfigFn);
const config = manager.getModelConfig('planning');

expect(config.vlMode).toBe('qwen-vl');
expect(config.modelName).toBe('qwen-vl-plus');
});

it('should throw error when planning has no vlMode in normal mode', () => {
vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'gpt-4');
vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key');
vi.stubEnv(MIDSCENE_PLANNING_OPENAI_BASE_URL, 'https://api.openai.com/v1');
// Intentionally not setting MIDSCENE_PLANNING_VL_MODE

const manager = new ModelConfigManager();
manager.registerGlobalConfigManager(new GlobalConfigManager());

expect(() => manager.getModelConfig('planning')).toThrow(
'Planning requires a vision language model (VL model). DOM-based planning is not supported.',
);
});

it('should succeed when planning has valid vlMode in normal mode', () => {
vi.stubEnv(MIDSCENE_PLANNING_MODEL_NAME, 'qwen-vl-plus');
vi.stubEnv(MIDSCENE_PLANNING_OPENAI_API_KEY, 'test-key');
vi.stubEnv(MIDSCENE_PLANNING_OPENAI_BASE_URL, 'https://api.openai.com/v1');
vi.stubEnv(MIDSCENE_PLANNING_VL_MODE, 'qwen-vl');

const manager = new ModelConfigManager();
manager.registerGlobalConfigManager(new GlobalConfigManager());

const config = manager.getModelConfig('planning');

expect(config.vlMode).toBe('qwen-vl');
expect(config.modelName).toBe('qwen-vl-plus');
expect(config.intent).toBe('planning');
});

it('should not affect other intents when planning validation fails', () => {
const modelConfigFn: TModelConfigFn = ({ intent }) => {
if (intent === 'planning') {
// Missing VL mode for planning - should fail
return {
[MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4',
[MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
[MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
};
}
// Other intents should work fine
return {
[MIDSCENE_MODEL_NAME]: 'gpt-4',
[MIDSCENE_OPENAI_API_KEY]: 'test-key',
[MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
};
};

const manager = new ModelConfigManager(modelConfigFn);

// Planning should fail
expect(() => manager.getModelConfig('planning')).toThrow(
'Planning requires a vision language model',
Copy link

Copilot AI Oct 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use a regex for this partial-match assertion to avoid accidental false negatives if the error message format changes (e.g. toThrow(/Planning requires a vision[- ]?language/)).

Suggested change
'Planning requires a vision language model',
/Planning requires a vision[- ]?language/i,

Copilot uses AI. Check for mistakes.
);

// Other intents should succeed
expect(() => manager.getModelConfig('default')).not.toThrow();
expect(() => manager.getModelConfig('VQA')).not.toThrow();
expect(() => manager.getModelConfig('grounding')).not.toThrow();
});

it('should accept all valid VL modes for planning', () => {
const vlModeTestCases: Array<{
raw: 'qwen-vl' | 'qwen3-vl' | 'gemini' | 'doubao-vision' | 'vlm-ui-tars' | 'vlm-ui-tars-doubao' | 'vlm-ui-tars-doubao-1.5';
expected: string;
}> = [
{ raw: 'qwen-vl', expected: 'qwen-vl' },
{ raw: 'qwen3-vl', expected: 'qwen3-vl' },
{ raw: 'gemini', expected: 'gemini' },
{ raw: 'doubao-vision', expected: 'doubao-vision' },
// UI-TARS variants all normalize to 'vlm-ui-tars'
{ raw: 'vlm-ui-tars', expected: 'vlm-ui-tars' },
{ raw: 'vlm-ui-tars-doubao', expected: 'vlm-ui-tars' },
{ raw: 'vlm-ui-tars-doubao-1.5', expected: 'vlm-ui-tars' },
];

for (const { raw, expected } of vlModeTestCases) {
const modelConfigFn: TModelConfigFn = ({ intent }) => {
if (intent === 'planning') {
return {
[MIDSCENE_PLANNING_MODEL_NAME]: 'test-model',
[MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key',
[MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
[MIDSCENE_PLANNING_VL_MODE]: raw,
};
}
return {
[MIDSCENE_MODEL_NAME]: 'gpt-4',
[MIDSCENE_OPENAI_API_KEY]: 'test-key',
[MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1',
};
};

const manager = new ModelConfigManager(modelConfigFn);
const config = manager.getModelConfig('planning');

expect(config.vlMode).toBe(expected);
}
});
});
});