-
Couldn't load subscription status.
- Fork 716
feat(core,shared): enforce VL mode requirement for Planning #1332
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -8,21 +8,16 @@ import type { IModelConfig } from '@midscene/shared/env'; | |||||||||||||
| import { paddingToMatchBlockByBase64 } from '@midscene/shared/img'; | ||||||||||||||
| import { getDebug } from '@midscene/shared/logger'; | ||||||||||||||
| import { assert } from '@midscene/shared/utils'; | ||||||||||||||
| import type { | ||||||||||||||
| ChatCompletionContentPart, | ||||||||||||||
| ChatCompletionMessageParam, | ||||||||||||||
| } from 'openai/resources/index'; | ||||||||||||||
| import type { ChatCompletionMessageParam } from 'openai/resources/index'; | ||||||||||||||
| import { | ||||||||||||||
| AIActionType, | ||||||||||||||
| buildYamlFlowFromPlans, | ||||||||||||||
| fillBboxParam, | ||||||||||||||
| findAllMidsceneLocatorField, | ||||||||||||||
| markupImageForLLM, | ||||||||||||||
| warnGPT4oSizeLimit, | ||||||||||||||
| } from './common'; | ||||||||||||||
| import type { ConversationHistory } from './conversation-history'; | ||||||||||||||
| import { systemPromptToTaskPlanning } from './prompt/llm-planning'; | ||||||||||||||
| import { describeUserPage } from './prompt/util'; | ||||||||||||||
| import { callAIWithObjectResponse } from './service-caller/index'; | ||||||||||||||
|
|
||||||||||||||
| const debug = getDebug('planning'); | ||||||||||||||
|
|
@@ -43,10 +38,9 @@ export async function plan( | |||||||||||||
|
|
||||||||||||||
| const { modelName, vlMode } = modelConfig; | ||||||||||||||
|
|
||||||||||||||
| const { description: pageDescription, elementById } = await describeUserPage( | ||||||||||||||
| context, | ||||||||||||||
| { vlMode }, | ||||||||||||||
| ); | ||||||||||||||
| // Planning requires VL mode (validated by ModelConfigManager.getModelConfig) | ||||||||||||||
| assert(vlMode, 'Planning requires vlMode to be configured.'); | ||||||||||||||
|
|
||||||||||||||
| const systemPrompt = await systemPromptToTaskPlanning({ | ||||||||||||||
| actionSpace: opts.actionSpace, | ||||||||||||||
| vlMode: vlMode, | ||||||||||||||
|
|
@@ -57,21 +51,19 @@ export async function plan( | |||||||||||||
| let imageHeight = size.height; | ||||||||||||||
| const rightLimit = imageWidth; | ||||||||||||||
| const bottomLimit = imageHeight; | ||||||||||||||
|
|
||||||||||||||
| // Process image based on VL mode requirements | ||||||||||||||
| if (vlMode === 'qwen-vl') { | ||||||||||||||
| const paddedResult = await paddingToMatchBlockByBase64(imagePayload); | ||||||||||||||
| imageWidth = paddedResult.width; | ||||||||||||||
| imageHeight = paddedResult.height; | ||||||||||||||
| imagePayload = paddedResult.imageBase64; | ||||||||||||||
| } else if (vlMode === 'qwen3-vl') { | ||||||||||||||
| // Reserved for qwen3-vl specific processing | ||||||||||||||
| // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32); | ||||||||||||||
| // imageWidth = paddedResult.width; | ||||||||||||||
| // imageHeight = paddedResult.height; | ||||||||||||||
| // imagePayload = paddedResult.imageBase64; | ||||||||||||||
|
Comment on lines
+62
to
66
|
||||||||||||||
| // Reserved for qwen3-vl specific processing | |
| // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32); | |
| // imageWidth = paddedResult.width; | |
| // imageHeight = paddedResult.height; | |
| // imagePayload = paddedResult.imageBase64; | |
| // TODO: Implement qwen3-vl specific image processing. See issue #1234. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,7 @@ import { | |
| import type { GlobalConfigManager } from './global-config-manager'; | ||
|
|
||
| import type { IModelConfig, TIntent, TModelConfigFn } from './types'; | ||
| import { VL_MODE_RAW_VALID_VALUES as VL_MODES } from './types'; | ||
|
|
||
| const ALL_INTENTS: TIntent[] = ['VQA', 'default', 'grounding', 'planning']; | ||
|
|
||
|
|
@@ -101,13 +102,15 @@ export class ModelConfigManager { | |
| * if isolatedMode is false, modelConfigMap can be changed by process.env so we need to recalculate it when it's undefined | ||
| */ | ||
| getModelConfig(intent: TIntent): IModelConfig { | ||
| let config: IModelConfig; | ||
|
|
||
| if (this.isolatedMode) { | ||
| if (!this.modelConfigMap) { | ||
| throw new Error( | ||
| 'modelConfigMap is not initialized in isolated mode, which should not happen', | ||
| ); | ||
| } | ||
| return this.modelConfigMap[intent]; | ||
| config = this.modelConfigMap[intent]; | ||
| } else { | ||
| if (!this.modelConfigMap) { | ||
| if (!this.globalConfigManager) { | ||
|
|
@@ -119,8 +122,26 @@ export class ModelConfigManager { | |
| this.globalConfigManager.getAllEnvConfig(), | ||
| ); | ||
| } | ||
| return this.modelConfigMap[intent]; | ||
| config = this.modelConfigMap[intent]; | ||
| } | ||
|
|
||
| // Validate Planning must use VL mode | ||
| if (intent === 'planning' && !config.vlMode) { | ||
| throw new Error( | ||
| `Planning requires a vision language model (VL model). DOM-based planning is not supported. | ||
|
|
||
| Please configure one of the following VL modes: | ||
| ${VL_MODES.map((mode) => `- ${mode}`).join('\n ')} | ||
|
|
||
| Configuration examples: | ||
| - Environment variable: MIDSCENE_PLANNING_VL_MODE=qwen-vl | ||
| - Or use modelConfig function with planning intent | ||
|
|
||
| Learn more: https://midscenejs.com/choose-a-model`, | ||
|
Comment on lines
+128
to
+140
|
||
| ); | ||
| } | ||
|
|
||
| return config; | ||
| } | ||
|
|
||
| getUploadTestServerUrl(): string | undefined { | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -13,6 +13,7 @@ import { | |||||
| MIDSCENE_PLANNING_MODEL_NAME, | ||||||
| MIDSCENE_PLANNING_OPENAI_API_KEY, | ||||||
| MIDSCENE_PLANNING_OPENAI_BASE_URL, | ||||||
| MIDSCENE_PLANNING_VL_MODE, | ||||||
| MIDSCENE_VQA_MODEL_NAME, | ||||||
| MIDSCENE_VQA_OPENAI_API_KEY, | ||||||
| MIDSCENE_VQA_OPENAI_BASE_URL, | ||||||
|
|
@@ -48,9 +49,10 @@ describe('ModelConfigManager', () => { | |||||
| }; | ||||||
| case 'planning': | ||||||
| return { | ||||||
| [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4', | ||||||
| [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus', | ||||||
| [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key', | ||||||
| [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1', | ||||||
| [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl' as const, | ||||||
| }; | ||||||
| case 'grounding': | ||||||
| return { | ||||||
|
|
@@ -105,9 +107,10 @@ describe('ModelConfigManager', () => { | |||||
| }; | ||||||
| case 'planning': | ||||||
| return { | ||||||
| [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4', | ||||||
| [MIDSCENE_PLANNING_MODEL_NAME]: 'qwen-vl-plus', | ||||||
| [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-planning-key', | ||||||
| [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1', | ||||||
| [MIDSCENE_PLANNING_VL_MODE]: 'qwen-vl', | ||||||
| }; | ||||||
| case 'grounding': | ||||||
| return { | ||||||
|
|
@@ -131,10 +134,11 @@ describe('ModelConfigManager', () => { | |||||
| expect(vqaConfig.from).toBe('modelConfig'); | ||||||
|
|
||||||
| const planningConfig = manager.getModelConfig('planning'); | ||||||
| expect(planningConfig.modelName).toBe('gpt-4'); | ||||||
| expect(planningConfig.modelName).toBe('qwen-vl-plus'); | ||||||
| expect(planningConfig.openaiApiKey).toBe('test-planning-key'); | ||||||
| expect(planningConfig.intent).toBe('planning'); | ||||||
| expect(planningConfig.from).toBe('modelConfig'); | ||||||
| expect(planningConfig.vlMode).toBe('qwen-vl'); | ||||||
|
|
||||||
| const groundingConfig = manager.getModelConfig('grounding'); | ||||||
| expect(groundingConfig.modelName).toBe('gpt-4-vision'); | ||||||
|
|
@@ -263,4 +267,154 @@ describe('ModelConfigManager', () => { | |||||
| expect(config.openaiBaseURL).toBe('https://isolated.openai.com/v1'); | ||||||
| }); | ||||||
| }); | ||||||
|
|
||||||
| describe('Planning VL mode validation', () => { | ||||||
| it('should throw error when planning has no vlMode in isolated mode', () => { | ||||||
| const modelConfigFn: TModelConfigFn = ({ intent }) => { | ||||||
| if (intent === 'planning') { | ||||||
| // Missing VL mode for planning | ||||||
| return { | ||||||
| [MIDSCENE_PLANNING_MODEL_NAME]: 'gpt-4', | ||||||
| [MIDSCENE_PLANNING_OPENAI_API_KEY]: 'test-key', | ||||||
| [MIDSCENE_PLANNING_OPENAI_BASE_URL]: 'https://api.openai.com/v1', | ||||||
| }; | ||||||
| } | ||||||
| return { | ||||||
| [MIDSCENE_MODEL_NAME]: 'gpt-4', | ||||||
| [MIDSCENE_OPENAI_API_KEY]: 'test-key', | ||||||
| [MIDSCENE_OPENAI_BASE_URL]: 'https://api.openai.com/v1', | ||||||
| }; | ||||||
| }; | ||||||
|
|
||||||
| const manager = new ModelConfigManager(modelConfigFn); | ||||||
|
|
||||||
| expect(() => manager.getModelConfig('planning')).toThrow( | ||||||
| 'Planning requires a vision language model (VL model). DOM-based planning is not supported.', | ||||||
|
||||||
| 'Planning requires a vision language model (VL model). DOM-based planning is not supported.', | |
| /Planning requires a vision[- ]?language (model|mode)/, |
Copilot
AI
Oct 17, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use a regex for this partial-match assertion to avoid accidental false negatives if the error message format changes (e.g. toThrow(/Planning requires a vision[- ]?language/)).
| 'Planning requires a vision language model', | |
| /Planning requires a vision[- ]?language/i, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Align this assertion message with the shared error wording used in ModelConfigManager for consistency (e.g. 'Planning requires a VL mode to be configured.').