
Commit 656beb8

fix(anthropic): token limits for streaming with tool calls (#3084)
* remove for bedrock since they handle on their own
* fix
* fix inference config reference
* add to docs
* make it min between max tokens
1 parent f7c3de0 commit 656beb8

File tree (5 files changed, +41 -21 lines changed):

* .cursor/commands/council.md
* apps/docs/content/docs/en/blocks/agent.mdx
* apps/sim/providers/anthropic/index.ts
* apps/sim/providers/bedrock/index.ts
* apps/sim/providers/models.ts

.cursor/commands/council.md

Lines changed: 7 additions & 0 deletions
```diff
@@ -0,0 +1,7 @@
+Based on the given area of interest, please:
+
+1. Dig around the codebase in terms of that given area of interest, gather general information such as keywords and architecture overview.
+2. Spawn off n=10 (unless specified otherwise) task agents to dig deeper into the codebase in terms of that given area of interest, some of them should be out of the box for variance.
+3. Once the task agents are done, use the information to do what the user wants.
+
+If user is in plan mode, use the information to create the plan.
```

apps/docs/content/docs/en/blocks/agent.mdx

Lines changed: 1 addition & 1 deletion
```diff
@@ -58,7 +58,7 @@ Controls response randomness and creativity:
 
 ### Max Output Tokens
 
-Controls the maximum length of the model's response. For Anthropic models, Sim uses reliable defaults: streaming executions use the model's full capacity (e.g. 64,000 tokens for Claude 4.5), while non-streaming executions default to 8,192 to avoid timeout issues. For long-form content generation via API, explicitly set a higher value.
+Controls the maximum length of the model's response. For Anthropic models, Sim uses reliable defaults: streaming executions use the model's full capacity (e.g. 64,000 tokens for Claude 4.5), while non-streaming executions default to 8,192 to avoid timeout issues. When using tools with Anthropic models, intermediate tool-calling requests use a capped limit of 8,192 tokens to avoid SDK timeout errors, regardless of your configured max tokens—the final streaming response uses your full configured limit. This only affects Anthropic's direct API; AWS Bedrock handles this automatically. For long-form content generation via API, explicitly set a higher value.
 
 ### API Key
```
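To make the documented behavior concrete, here is a minimal TypeScript sketch of the capping rule described in that paragraph. The function name, constants, and signature are illustrative only, not Sim's actual API:

```ts
// Illustrative constants; Sim resolves the real limits per model.
const NON_STREAMING_LIMIT = 8192 // cap for intermediate tool-calling requests
const STREAMING_MAX = 64000 // e.g. Claude 4.5 full streaming capacity

// Hypothetical helper mirroring the documented rule.
function effectiveMaxTokens(configured: number | undefined, isFinalStream: boolean): number {
  if (isFinalStream) {
    // The final streaming response honors the full configured limit.
    return configured ?? STREAMING_MAX
  }
  // Intermediate tool-calling requests are capped at 8,192 tokens,
  // but a lower user-configured value is still respected.
  return configured ? Math.min(configured, NON_STREAMING_LIMIT) : NON_STREAMING_LIMIT
}

console.log(effectiveMaxTokens(64000, false)) // 8192: intermediate call is capped
console.log(effectiveMaxTokens(64000, true)) // 64000: final stream uses the configured limit
console.log(effectiveMaxTokens(4096, false)) // 4096: lower values pass through
```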

apps/sim/providers/anthropic/index.ts

Lines changed: 22 additions & 6 deletions
```diff
@@ -302,13 +302,21 @@ export const anthropicProvider: ProviderConfig = {
     const providerStartTime = Date.now()
     const providerStartTimeISO = new Date(providerStartTime).toISOString()
 
+    // Cap intermediate calls at non-streaming limit to avoid SDK timeout errors,
+    // but allow users to set lower values if desired
+    const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false)
+    const nonStreamingMaxTokens = request.maxTokens
+      ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit)
+      : nonStreamingLimit
+    const intermediatePayload = { ...payload, max_tokens: nonStreamingMaxTokens }
+
     try {
       const initialCallTime = Date.now()
-      const originalToolChoice = payload.tool_choice
+      const originalToolChoice = intermediatePayload.tool_choice
       const forcedTools = preparedTools?.forcedTools || []
       let usedForcedTools: string[] = []
 
-      let currentResponse = await anthropic.messages.create(payload)
+      let currentResponse = await anthropic.messages.create(intermediatePayload)
       const firstResponseTime = Date.now() - initialCallTime
 
       let content = ''
@@ -491,7 +499,7 @@ export const anthropicProvider: ProviderConfig = {
         toolsTime += thisToolsTime
 
         const nextPayload = {
-          ...payload,
+          ...intermediatePayload,
           messages: currentMessages,
         }
 
@@ -674,13 +682,21 @@ export const anthropicProvider: ProviderConfig = {
     const providerStartTime = Date.now()
     const providerStartTimeISO = new Date(providerStartTime).toISOString()
 
+    // Cap intermediate calls at non-streaming limit to avoid SDK timeout errors,
+    // but allow users to set lower values if desired
+    const nonStreamingLimit = getMaxOutputTokensForModel(request.model, false)
+    const toolLoopMaxTokens = request.maxTokens
+      ? Math.min(Number.parseInt(String(request.maxTokens)), nonStreamingLimit)
+      : nonStreamingLimit
+    const toolLoopPayload = { ...payload, max_tokens: toolLoopMaxTokens }
+
     try {
       const initialCallTime = Date.now()
-      const originalToolChoice = payload.tool_choice
+      const originalToolChoice = toolLoopPayload.tool_choice
       const forcedTools = preparedTools?.forcedTools || []
       let usedForcedTools: string[] = []
 
-      let currentResponse = await anthropic.messages.create(payload)
+      let currentResponse = await anthropic.messages.create(toolLoopPayload)
       const firstResponseTime = Date.now() - initialCallTime
 
       let content = ''
@@ -867,7 +883,7 @@ export const anthropicProvider: ProviderConfig = {
         toolsTime += thisToolsTime
 
         const nextPayload = {
-          ...payload,
+          ...toolLoopPayload,
           messages: currentMessages,
         }
```
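Both hunks in this file apply the same pattern: compute the capped limit once, build an intermediate payload, and reuse that payload for the initial request and every follow-up in the tool loop. A self-contained sketch of the pattern, assuming the @anthropic-ai/sdk client and a fixed 8,192-token non-streaming limit (Sim resolves the real limit per model via getMaxOutputTokensForModel):

```ts
import Anthropic from '@anthropic-ai/sdk'

const anthropic = new Anthropic() // reads ANTHROPIC_API_KEY from the environment

// Sketch only: the limit is hard-coded here, while Sim derives it per model.
const NON_STREAMING_LIMIT = 8192

async function createWithCappedTokens(
  payload: Anthropic.MessageCreateParamsNonStreaming,
  requestedMaxTokens?: number
) {
  // Cap intermediate calls at the non-streaming limit to avoid SDK timeout
  // errors, but allow users to set lower values if desired.
  const maxTokens = requestedMaxTokens
    ? Math.min(requestedMaxTokens, NON_STREAMING_LIMIT)
    : NON_STREAMING_LIMIT
  const intermediatePayload = { ...payload, max_tokens: maxTokens }

  // The tool loop reuses intermediatePayload for follow-up requests, e.g.
  // anthropic.messages.create({ ...intermediatePayload, messages: nextMessages }),
  // so the cap applies to every intermediate round trip.
  return anthropic.messages.create(intermediatePayload)
}
```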

apps/sim/providers/bedrock/index.ts

Lines changed: 5 additions & 9 deletions
```diff
@@ -20,11 +20,7 @@ import {
   generateToolUseId,
   getBedrockInferenceProfileId,
 } from '@/providers/bedrock/utils'
-import {
-  getMaxOutputTokensForModel,
-  getProviderDefaultModel,
-  getProviderModels,
-} from '@/providers/models'
+import { getProviderDefaultModel, getProviderModels } from '@/providers/models'
 import type {
   ProviderConfig,
   ProviderRequest,
@@ -261,11 +257,11 @@ export const bedrockProvider: ProviderConfig = {
 
     const systemPromptWithSchema = systemContent
 
-    const inferenceConfig = {
+    const inferenceConfig: { temperature: number; maxTokens?: number } = {
       temperature: Number.parseFloat(String(request.temperature ?? 0.7)),
-      maxTokens:
-        Number.parseInt(String(request.maxTokens)) ||
-        getMaxOutputTokensForModel(request.model, request.stream ?? false),
+    }
+    if (request.maxTokens != null) {
+      inferenceConfig.maxTokens = Number.parseInt(String(request.maxTokens))
     }
 
     const shouldStreamToolCalls = request.streamToolCalls ?? false
```
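The net effect on the Bedrock side: maxTokens is now only sent when the user configured one; otherwise the field is omitted and Bedrock applies the model's own default. A minimal sketch of that construction, with a simplified request shape standing in for Sim's ProviderRequest type:

```ts
// Simplified stand-in for Sim's ProviderRequest; only the relevant fields.
interface ProviderRequestLike {
  temperature?: number | string
  maxTokens?: number | string
}

function buildInferenceConfig(request: ProviderRequestLike) {
  const inferenceConfig: { temperature: number; maxTokens?: number } = {
    temperature: Number.parseFloat(String(request.temperature ?? 0.7)),
  }
  // Omit maxTokens entirely when unset so Bedrock falls back to the
  // model's own default instead of a limit computed on our side.
  if (request.maxTokens != null) {
    inferenceConfig.maxTokens = Number.parseInt(String(request.maxTokens))
  }
  return inferenceConfig
}

console.log(buildInferenceConfig({})) // { temperature: 0.7 } (no maxTokens key)
console.log(buildInferenceConfig({ maxTokens: '4096' })) // { temperature: 0.7, maxTokens: 4096 }
```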

apps/sim/providers/models.ts

Lines changed: 6 additions & 5 deletions
```diff
@@ -34,10 +34,15 @@ export interface ModelCapabilities {
   toolUsageControl?: boolean
   computerUse?: boolean
   nativeStructuredOutputs?: boolean
+  /**
+   * Max output tokens configuration for Anthropic SDK's streaming timeout workaround.
+   * The Anthropic SDK throws an error for non-streaming requests that may take >10 minutes.
+   * This only applies to direct Anthropic API calls, not Bedrock (which uses AWS SDK).
+   */
   maxOutputTokens?: {
     /** Maximum tokens for streaming requests */
     max: number
-    /** Safe default for non-streaming requests (to avoid timeout issues) */
+    /** Safe default for non-streaming requests (to avoid Anthropic SDK timeout errors) */
     default: number
   }
   reasoningEffort?: {
@@ -1709,7 +1714,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
     capabilities: {
       temperature: { min: 0, max: 1 },
       nativeStructuredOutputs: true,
-      maxOutputTokens: { max: 64000, default: 8192 },
     },
     contextWindow: 200000,
   },
@@ -1723,7 +1727,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
     capabilities: {
       temperature: { min: 0, max: 1 },
       nativeStructuredOutputs: true,
-      maxOutputTokens: { max: 64000, default: 8192 },
     },
     contextWindow: 200000,
   },
@@ -1737,7 +1740,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
     capabilities: {
       temperature: { min: 0, max: 1 },
       nativeStructuredOutputs: true,
-      maxOutputTokens: { max: 64000, default: 8192 },
     },
     contextWindow: 200000,
   },
@@ -1751,7 +1753,6 @@ export const PROVIDER_DEFINITIONS: Record<string, ProviderDefinition> = {
     capabilities: {
       temperature: { min: 0, max: 1 },
       nativeStructuredOutputs: true,
-      maxOutputTokens: { max: 64000, default: 8192 },
     },
     contextWindow: 200000,
   },
```
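For reference, a hypothetical re-implementation of the contract getMaxOutputTokensForModel appears to follow, given the maxOutputTokens capability shape defined above. The lookup and the fallback value are assumptions, not Sim's actual code:

```ts
// Mirrors the capability shape from the ModelCapabilities interface above.
interface ModelCapabilitiesLike {
  maxOutputTokens?: { max: number; default: number }
}

// Assumed fallback for models without an explicit maxOutputTokens entry.
const FALLBACK_MAX_TOKENS = 8192

function resolveMaxOutputTokens(capabilities: ModelCapabilitiesLike, streaming: boolean): number {
  const config = capabilities.maxOutputTokens
  if (!config) return FALLBACK_MAX_TOKENS
  // Streaming requests may use the model's full capacity; non-streaming
  // requests use the safe default that stays under the SDK's timeout guard.
  return streaming ? config.max : config.default
}

const claude = { maxOutputTokens: { max: 64000, default: 8192 } }
console.log(resolveMaxOutputTokens(claude, true)) // 64000
console.log(resolveMaxOutputTokens(claude, false)) // 8192
```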
