Commit 835e6c6

Implement the 'max_tool_calls' parameter for the Responses API
- Test max_tool_calls with builtin and mcp tools
- Update input prompt for more consistent tool calling
- Resolve merge conflicts
- Update integration test
- Handle review comments
1 parent 8f4c431 commit 835e6c6

File tree: 9 files changed, +240 −2 lines changed

(Diffs for eight of the nine changed files are expanded below; the remaining file is not expanded in this view.)

client-sdks/stainless/openapi.yml

Lines changed: 15 additions & 0 deletions

@@ -6882,6 +6882,11 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_tool_calls:
+          type: integer
+          description: >-
+            (Optional) Max number of total calls to built-in tools that can be processed
+            in a response
         input:
           type: array
           items:
@@ -7240,6 +7245,11 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_tool_calls:
+          type: integer
+          description: >-
+            (Optional) Max number of total calls to built-in tools that can be processed
+            in a response.
       additionalProperties: false
       required:
         - input
@@ -7321,6 +7331,11 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_tool_calls:
+          type: integer
+          description: >-
+            (Optional) Max number of total calls to built-in tools that can be processed
+            in a response
       additionalProperties: false
       required:
         - created_at

docs/static/llama-stack-spec.yaml

Lines changed: 15 additions & 0 deletions

@@ -6166,6 +6166,11 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_tool_calls:
+          type: integer
+          description: >-
+            (Optional) Max number of total calls to built-in tools that can be processed
+            in a response
         input:
           type: array
           items:
@@ -6524,6 +6529,11 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_tool_calls:
+          type: integer
+          description: >-
+            (Optional) Max number of total calls to built-in tools that can be processed
+            in a response.
       additionalProperties: false
       required:
         - input
@@ -6605,6 +6615,11 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_tool_calls:
+          type: integer
+          description: >-
+            (Optional) Max number of total calls to built-in tools that can be processed
+            in a response
       additionalProperties: false
       required:
         - created_at

docs/static/stainless-llama-stack-spec.yaml

Lines changed: 15 additions & 0 deletions

@@ -6882,6 +6882,11 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_tool_calls:
+          type: integer
+          description: >-
+            (Optional) Max number of total calls to built-in tools that can be processed
+            in a response
         input:
           type: array
           items:
@@ -7240,6 +7245,11 @@ components:
             (Optional) Additional fields to include in the response.
         max_infer_iters:
           type: integer
+        max_tool_calls:
+          type: integer
+          description: >-
+            (Optional) Max number of total calls to built-in tools that can be processed
+            in a response.
       additionalProperties: false
       required:
         - input
@@ -7321,6 +7331,11 @@ components:
           type: string
           description: >-
             (Optional) System message inserted into the model's context
+        max_tool_calls:
+          type: integer
+          description: >-
+            (Optional) Max number of total calls to built-in tools that can be processed
+            in a response
       additionalProperties: false
       required:
         - created_at
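
All three spec files above carry the same schema addition. For reference, a minimal request sketch exercising the new field — a hedged example, not part of this commit: the base URL, route, and model id below are assumptions about a local deployment, and the web_search tool is used only as a representative built-in tool.

```python
import requests

# Assumed local Llama Stack server and OpenAI-compatible Responses route;
# adjust both to your deployment. The model id is a placeholder.
resp = requests.post(
    "http://localhost:8321/v1/responses",
    json={
        "model": "llama3.2:3b",
        "input": "Find today's weather in Paris and in Rome.",
        "tools": [{"type": "web_search"}],  # built-in tool; its calls count toward the cap
        "max_tool_calls": 2,                # at most two built-in/MCP tool calls total
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json().get("max_tool_calls"))  # the cap is echoed back on the response object
```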

src/llama_stack/apis/agents/agents.py

Lines changed: 2 additions & 0 deletions

@@ -87,6 +87,7 @@ async def create_openai_response(
                 "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
             ),
         ] = None,
+        max_tool_calls: int | None = None,
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a model response.

@@ -97,6 +98,7 @@ async def create_openai_response(
         :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
         :param include: (Optional) Additional fields to include in the response.
         :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
+        :param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response.
         :returns: An OpenAIResponseObject.
         """
         ...
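
At the Python API level, the new keyword rides alongside the existing optional parameters. A usage sketch, with hypothetical wiring: `agents_api` stands for any implementation of this protocol, the model id is a placeholder, and the call must run inside an async context.

```python
# Inside an async context; agents_api is any implementation of the protocol above.
response = await agents_api.create_openai_response(
    input="Search for today's weather in Paris and in Rome.",
    model="llama3.2:3b",             # placeholder model id
    tools=[{"type": "web_search"}],  # built-in tool calls count toward the cap
    max_tool_calls=1,                # allow at most one built-in/MCP tool call
)
print(response.max_tool_calls)       # echoed back on OpenAIResponseObject
```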

src/llama_stack/apis/agents/openai_responses.py

Lines changed: 2 additions & 0 deletions

@@ -594,6 +594,7 @@ class OpenAIResponseObject(BaseModel):
     :param truncation: (Optional) Truncation strategy applied to the response
     :param usage: (Optional) Token usage information for the response
     :param instructions: (Optional) System message inserted into the model's context
+    :param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response
     """

     created_at: int
@@ -615,6 +616,7 @@ class OpenAIResponseObject(BaseModel):
     truncation: str | None = None
     usage: OpenAIResponseUsage | None = None
     instructions: str | None = None
+    max_tool_calls: int | None = None


 @json_schema_type

src/llama_stack/providers/inline/agents/meta_reference/agents.py

Lines changed: 2 additions & 0 deletions

@@ -102,6 +102,7 @@ async def create_openai_response(
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[ResponseGuardrail] | None = None,
+        max_tool_calls: int | None = None,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
@@ -119,6 +120,7 @@ async def create_openai_response(
             include,
             max_infer_iters,
             guardrails,
+            max_tool_calls,
         )
         return result  # type: ignore[no-any-return]

src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py

Lines changed: 7 additions & 0 deletions

@@ -255,6 +255,7 @@ async def create_openai_response(
         include: list[str] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
+        max_tool_calls: int | None = None,
     ):
         stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@@ -270,6 +271,9 @@ async def create_openai_response(
             if not conversation.startswith("conv_"):
                 raise InvalidConversationIdError(conversation)

+        if max_tool_calls is not None and max_tool_calls < 1:
+            raise ValueError(f"Invalid {max_tool_calls=}; should be >= 1")
+
         stream_gen = self._create_streaming_response(
             input=input,
             conversation=conversation,
@@ -282,6 +286,7 @@ async def create_openai_response(
             tools=tools,
             max_infer_iters=max_infer_iters,
             guardrail_ids=guardrail_ids,
+            max_tool_calls=max_tool_calls,
         )

         if stream:
@@ -331,6 +336,7 @@ async def _create_streaming_response(
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
         guardrail_ids: list[str] | None = None,
+        max_tool_calls: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
         # but we assert here to help mypy understand the types
@@ -373,6 +379,7 @@ async def _create_streaming_response(
             safety_api=self.safety_api,
             guardrail_ids=guardrail_ids,
             instructions=instructions,
+            max_tool_calls=max_tool_calls,
         )

         # Stream the response
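
The new guard fails fast on non-positive values before any streaming state is set up. A sketch of the expected behavior as a test — hypothetical `impl` fixture and placeholder model id, not part of this commit; assumes pytest-asyncio is installed.

```python
import pytest


@pytest.mark.asyncio
async def test_max_tool_calls_must_be_positive(impl):
    # max_tool_calls=0 (or any value < 1) is rejected with a ValueError.
    with pytest.raises(ValueError, match="should be >= 1"):
        await impl.create_openai_response(
            input="hello",
            model="llama3.2:3b",  # placeholder model id
            max_tool_calls=0,
        )
```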

src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py

Lines changed: 16 additions & 2 deletions

@@ -115,6 +115,7 @@ def __init__(
         safety_api,
         guardrail_ids: list[str] | None = None,
         prompt: OpenAIResponsePrompt | None = None,
+        max_tool_calls: int | None = None,
     ):
         self.inference_api = inference_api
         self.ctx = ctx
@@ -126,6 +127,10 @@ def __init__(
         self.safety_api = safety_api
         self.guardrail_ids = guardrail_ids or []
         self.prompt = prompt
+        # System message that is inserted into the model's context
+        self.instructions = instructions
+        # Max number of total calls to built-in tools that can be processed in a response
+        self.max_tool_calls = max_tool_calls
         self.sequence_number = 0
         # Store MCP tool mapping that gets built during tool processing
         self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = (
@@ -139,8 +144,8 @@ def __init__(
         self.accumulated_usage: OpenAIResponseUsage | None = None
         # Track if we've sent a refusal response
         self.violation_detected = False
-        # system message that is inserted into the model's context
-        self.instructions = instructions
+        # Track total calls made to built-in tools
+        self.accumulated_builtin_tool_calls = 0

     async def _create_refusal_response(self, violation_message: str) -> OpenAIResponseObjectStream:
         """Create a refusal response to replace streaming content."""
@@ -186,6 +191,7 @@ def _snapshot_response(
             usage=self.accumulated_usage,
             instructions=self.instructions,
             prompt=self.prompt,
+            max_tool_calls=self.max_tool_calls,
         )

     async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -894,6 +900,11 @@ async def _coordinate_tool_execution(
         """Coordinate execution of both function and non-function tool calls."""
         # Execute non-function tool calls
         for tool_call in non_function_tool_calls:
+            # Check if total calls made to built-in and mcp tools exceed max_tool_calls
+            if self.max_tool_calls is not None and self.accumulated_builtin_tool_calls >= self.max_tool_calls:
+                logger.info(f"Ignoring built-in and mcp tool call since reached the limit of {self.max_tool_calls=}.")
+                break
+
             # Find the item_id for this tool call
             matching_item_id = None
             for index, item_id in completion_result_data.tool_call_item_ids.items():
@@ -974,6 +985,9 @@ async def _coordinate_tool_execution(
             if tool_response_message:
                 next_turn_messages.append(tool_response_message)

+            # Track number of calls made to built-in and mcp tools
+            self.accumulated_builtin_tool_calls += 1
+
         # Execute function tool calls (client-side)
         for tool_call in function_tool_calls:
             # Find the item_id for this tool call from our tracking dictionary
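
Stripped of the orchestration around it, the cap is a counter checked before each built-in/MCP dispatch; function (client-side) tool calls are deliberately left uncapped. A standalone sketch of that semantics, with illustrative names only:

```python
def run_builtin_calls(tool_calls: list[str], max_tool_calls: int | None = None) -> list[str]:
    """Illustrative reduction of the loop above: once the counter reaches
    the cap, remaining built-in/MCP calls are skipped rather than executed."""
    executed = 0
    results = []
    for call in tool_calls:
        if max_tool_calls is not None and executed >= max_tool_calls:
            break  # mirrors the diff: ignore the rest of the built-in calls
        results.append(f"ran {call}")  # stand-in for real tool dispatch
        executed += 1
    return results


assert run_builtin_calls(["search"] * 3, max_tool_calls=2) == ["ran search", "ran search"]
assert len(run_builtin_calls(["search"] * 3)) == 3  # no cap set: everything runs
```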
