|
10 | 10 | import pytest |
11 | 11 | from pytest_mock import MockerFixture |
12 | 12 | from fastapi import HTTPException, Request, status |
| 13 | +from litellm.exceptions import RateLimitError |
13 | 14 |
|
14 | 15 | from llama_stack_client import APIConnectionError |
15 | 16 | from llama_stack_client.types import UserMessage # type: ignore |
@@ -2261,3 +2262,46 @@ async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) - |
2261 | 2262 | stream=False, |
2262 | 2263 | toolgroups=None, |
2263 | 2264 | ) |
| 2265 | + |
| 2266 | + |
@pytest.mark.asyncio
async def test_query_endpoint_quota_exceeded(
    mocker: MockerFixture, dummy_request: Request
) -> None:
    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
    query_request = QueryRequest(
        query="What is OpenStack?",
        provider="openai",
        model="gpt-4-turbo",
    )  # type: ignore

    # Agent whose turn creation fails with a provider-side rate-limit error.
    agent_mock = mocker.AsyncMock()
    agent_mock.create_turn.side_effect = RateLimitError(
        model="gpt-4-turbo", llm_provider="openai", message=""
    )
    client_mock = mocker.AsyncMock()

    # Stub out everything the handler touches before reaching create_turn.
    mocker.patch(
        "app.endpoints.query.get_agent",
        return_value=(agent_mock, "conv-123", "sess-123"),
    )
    mocker.patch(
        "app.endpoints.query.select_model_and_provider_id",
        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
    )
    mocker.patch("app.endpoints.query.validate_model_provider_override")
    mocker.patch(
        "client.AsyncLlamaStackClientHolder.get_client",
        return_value=client_mock,
    )
    mocker.patch(
        "app.endpoints.query.handle_mcp_headers_with_toolgroups", return_value={}
    )

    with pytest.raises(HTTPException) as exc_info:
        await query_endpoint_handler(
            dummy_request, query_request=query_request, auth=MOCK_AUTH
        )

    # The provider rate-limit must surface as HTTP 429 with a structured detail.
    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
    detail = exc_info.value.detail
    assert isinstance(detail, dict)
    assert detail["response"] == "Model quota exceeded"  # type: ignore
    assert "gpt-4-turbo" in detail["cause"]  # type: ignore
0 commit comments