Skip to content

Commit 87abe8b

Browse files
committed
Implement the 'max_tool_calls' parameter for the Responses API
Test max_tool_calls with builtin and MCP tools. Update input prompt for more consistent tool calling. Resolve merge conflicts. Update integration test.
1 parent a6ddbae commit 87abe8b

File tree

10 files changed

+244
-1
lines changed

10 files changed

+244
-1
lines changed

client-sdks/stainless/openapi.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7440,6 +7440,11 @@ components:
74407440
type: string
74417441
description: >-
74427442
(Optional) System message inserted into the model's context
7443+
max_tool_calls:
7444+
type: integer
7445+
description: >-
7446+
(Optional) Max number of total calls to built-in tools that can be processed
7447+
in a response
74437448
input:
74447449
type: array
74457450
items:
@@ -7798,6 +7803,11 @@ components:
77987803
(Optional) Additional fields to include in the response.
77997804
max_infer_iters:
78007805
type: integer
7806+
max_tool_calls:
7807+
type: integer
7808+
description: >-
7809+
(Optional) Max number of total calls to built-in tools that can be processed
7810+
in a response.
78017811
additionalProperties: false
78027812
required:
78037813
- input
@@ -7879,6 +7889,11 @@ components:
78797889
type: string
78807890
description: >-
78817891
(Optional) System message inserted into the model's context
7892+
max_tool_calls:
7893+
type: integer
7894+
description: >-
7895+
(Optional) Max number of total calls to built-in tools that can be processed
7896+
in a response
78827897
additionalProperties: false
78837898
required:
78847899
- created_at

docs/static/llama-stack-spec.html

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8204,6 +8204,10 @@
82048204
"type": "string",
82058205
"description": "(Optional) System message inserted into the model's context"
82068206
},
8207+
"max_tool_calls": {
8208+
"type": "integer",
8209+
"description": "(Optional) Max number of total calls to built-in tools that can be processed in a response"
8210+
},
82078211
"input": {
82088212
"type": "array",
82098213
"items": {
@@ -8702,6 +8706,10 @@
87028706
},
87038707
"max_infer_iters": {
87048708
"type": "integer"
8709+
},
8710+
"max_tool_calls": {
8711+
"type": "integer",
8712+
"description": "(Optional) Max number of total calls to built-in tools that can be processed in a response."
87058713
}
87068714
},
87078715
"additionalProperties": false,
@@ -8790,6 +8798,10 @@
87908798
"instructions": {
87918799
"type": "string",
87928800
"description": "(Optional) System message inserted into the model's context"
8801+
},
8802+
"max_tool_calls": {
8803+
"type": "integer",
8804+
"description": "(Optional) Max number of total calls to built-in tools that can be processed in a response"
87938805
}
87948806
},
87958807
"additionalProperties": false,

docs/static/llama-stack-spec.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6227,6 +6227,11 @@ components:
62276227
type: string
62286228
description: >-
62296229
(Optional) System message inserted into the model's context
6230+
max_tool_calls:
6231+
type: integer
6232+
description: >-
6233+
(Optional) Max number of total calls to built-in tools that can be processed
6234+
in a response
62306235
input:
62316236
type: array
62326237
items:
@@ -6585,6 +6590,11 @@ components:
65856590
(Optional) Additional fields to include in the response.
65866591
max_infer_iters:
65876592
type: integer
6593+
max_tool_calls:
6594+
type: integer
6595+
description: >-
6596+
(Optional) Max number of total calls to built-in tools that can be processed
6597+
in a response.
65886598
additionalProperties: false
65896599
required:
65906600
- input
@@ -6666,6 +6676,11 @@ components:
66666676
type: string
66676677
description: >-
66686678
(Optional) System message inserted into the model's context
6679+
max_tool_calls:
6680+
type: integer
6681+
description: >-
6682+
(Optional) Max number of total calls to built-in tools that can be processed
6683+
in a response
66696684
additionalProperties: false
66706685
required:
66716686
- created_at

docs/static/stainless-llama-stack-spec.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7440,6 +7440,11 @@ components:
74407440
type: string
74417441
description: >-
74427442
(Optional) System message inserted into the model's context
7443+
max_tool_calls:
7444+
type: integer
7445+
description: >-
7446+
(Optional) Max number of total calls to built-in tools that can be processed
7447+
in a response
74437448
input:
74447449
type: array
74457450
items:
@@ -7798,6 +7803,11 @@ components:
77987803
(Optional) Additional fields to include in the response.
77997804
max_infer_iters:
78007805
type: integer
7806+
max_tool_calls:
7807+
type: integer
7808+
description: >-
7809+
(Optional) Max number of total calls to built-in tools that can be processed
7810+
in a response.
78017811
additionalProperties: false
78027812
required:
78037813
- input
@@ -7879,6 +7889,11 @@ components:
78797889
type: string
78807890
description: >-
78817891
(Optional) System message inserted into the model's context
7892+
max_tool_calls:
7893+
type: integer
7894+
description: >-
7895+
(Optional) Max number of total calls to built-in tools that can be processed
7896+
in a response
78827897
additionalProperties: false
78837898
required:
78847899
- created_at

src/llama_stack/apis/agents/agents.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -750,6 +750,7 @@ async def create_openai_response(
750750
"List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
751751
),
752752
] = None,
753+
max_tool_calls: int | None = None,
753754
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
754755
"""Create a model response.
755756
@@ -760,6 +761,7 @@ async def create_openai_response(
760761
:param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
761762
:param include: (Optional) Additional fields to include in the response.
762763
:param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
764+
:param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response.
763765
:returns: An OpenAIResponseObject.
764766
"""
765767
...

src/llama_stack/apis/agents/openai_responses.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,7 @@ class OpenAIResponseObject(BaseModel):
591591
:param truncation: (Optional) Truncation strategy applied to the response
592592
:param usage: (Optional) Token usage information for the response
593593
:param instructions: (Optional) System message inserted into the model's context
594+
:param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response
594595
"""
595596

596597
created_at: int
@@ -612,6 +613,7 @@ class OpenAIResponseObject(BaseModel):
612613
truncation: str | None = None
613614
usage: OpenAIResponseUsage | None = None
614615
instructions: str | None = None
616+
max_tool_calls: int | None = None
615617

616618

617619
@json_schema_type

src/llama_stack/providers/inline/agents/meta_reference/agents.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ async def create_openai_response(
347347
include: list[str] | None = None,
348348
max_infer_iters: int | None = 10,
349349
guardrails: list[ResponseGuardrail] | None = None,
350+
max_tool_calls: int | None = None,
350351
) -> OpenAIResponseObject:
351352
assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
352353
result = await self.openai_responses_impl.create_openai_response(
@@ -364,6 +365,7 @@ async def create_openai_response(
364365
include,
365366
max_infer_iters,
366367
guardrails,
368+
max_tool_calls,
367369
)
368370
return result # type: ignore[no-any-return]
369371

src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ async def create_openai_response(
255255
include: list[str] | None = None,
256256
max_infer_iters: int | None = 10,
257257
guardrails: list[str | ResponseGuardrailSpec] | None = None,
258+
max_tool_calls: int | None = None,
258259
):
259260
stream = bool(stream)
260261
text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@@ -270,6 +271,11 @@ async def create_openai_response(
270271
if not conversation.startswith("conv_"):
271272
raise InvalidConversationIdError(conversation)
272273

274+
if max_tool_calls is not None and max_tool_calls < 1:
275+
raise ValueError(
276+
f"Invalid 'max_tool_calls': integer below minimum value. Expected a value >= 1, but got {max_tool_calls} instead."
277+
)
278+
273279
stream_gen = self._create_streaming_response(
274280
input=input,
275281
conversation=conversation,
@@ -282,6 +288,7 @@ async def create_openai_response(
282288
tools=tools,
283289
max_infer_iters=max_infer_iters,
284290
guardrail_ids=guardrail_ids,
291+
max_tool_calls=max_tool_calls,
285292
)
286293

287294
if stream:
@@ -331,6 +338,7 @@ async def _create_streaming_response(
331338
tools: list[OpenAIResponseInputTool] | None = None,
332339
max_infer_iters: int | None = 10,
333340
guardrail_ids: list[str] | None = None,
341+
max_tool_calls: int | None = None,
334342
) -> AsyncIterator[OpenAIResponseObjectStream]:
335343
# These should never be None when called from create_openai_response (which sets defaults)
336344
# but we assert here to help mypy understand the types
@@ -373,6 +381,7 @@ async def _create_streaming_response(
373381
safety_api=self.safety_api,
374382
guardrail_ids=guardrail_ids,
375383
instructions=instructions,
384+
max_tool_calls=max_tool_calls,
376385
)
377386

378387
# Stream the response

src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ def __init__(
115115
safety_api,
116116
guardrail_ids: list[str] | None = None,
117117
prompt: OpenAIResponsePrompt | None = None,
118+
max_tool_calls: int | None = None,
118119
):
119120
self.inference_api = inference_api
120121
self.ctx = ctx
@@ -141,6 +142,8 @@ def __init__(
141142
self.violation_detected = False
142143
# system message that is inserted into the model's context
143144
self.instructions = instructions
145+
# max number of total calls to built-in tools that can be processed in a response
146+
self.max_tool_calls = max_tool_calls
144147

145148
async def _create_refusal_response(self, violation_message: str) -> OpenAIResponseObjectStream:
146149
"""Create a refusal response to replace streaming content."""
@@ -186,6 +189,7 @@ def _snapshot_response(
186189
usage=self.accumulated_usage,
187190
instructions=self.instructions,
188191
prompt=self.prompt,
192+
max_tool_calls=self.max_tool_calls,
189193
)
190194

191195
async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
@@ -893,7 +897,12 @@ async def _coordinate_tool_execution(
893897
) -> AsyncIterator[OpenAIResponseObjectStream]:
894898
"""Coordinate execution of both function and non-function tool calls."""
895899
# Execute non-function tool calls
896-
for tool_call in non_function_tool_calls:
900+
for idx, tool_call in enumerate(non_function_tool_calls):
901+
# Check if total calls to built-in and mcp tools exceeds max_tool_calls
902+
if self.max_tool_calls is not None and idx >= self.max_tool_calls:
903+
logger.info(f"Ignoring built-in and mcp tool call since {idx + 1} exceeds {self.max_tool_calls}.")
904+
break
905+
897906
# Find the item_id for this tool call
898907
matching_item_id = None
899908
for index, item_id in completion_result_data.tool_call_item_ids.items():

0 commit comments

Comments (0)