|
2 | 2 | """Unit tests for the /query (v2) REST API endpoint using Responses API.""" |
3 | 3 |
|
4 | 4 | from typing import Any |
| 5 | +from litellm.exceptions import RateLimitError |
5 | 6 | import pytest |
6 | 7 | from pytest_mock import MockerFixture |
7 | 8 | from fastapi import HTTPException, status, Request |
|
18 | 19 | query_endpoint_handler_v2, |
19 | 20 | ) |
20 | 21 |
|
# Shared auth tuple for endpoint tests: (user_id, username, skip_userid_check, token).
# NOTE: the first element must be a well-formed UUID string.
MOCK_AUTH = (
    "00000001-0001-0001-0001-000000000001",
    "mock_username",
    False,
    "mock_token",
)
| 29 | + |
21 | 30 |
|
22 | 31 | @pytest.fixture |
23 | 32 | def dummy_request() -> Request: |
@@ -432,3 +441,43 @@ def _raise(*_args: Any, **_kwargs: Any) -> Exception: |
432 | 441 | assert exc.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR |
433 | 442 | assert "Unable to connect to Llama Stack" in str(exc.value.detail) |
434 | 443 | fail_metric.inc.assert_called_once() |
| 444 | + |
| 445 | + |
@pytest.mark.asyncio
async def test_query_endpoint_quota_exceeded(
    mocker: MockerFixture, dummy_request: Request
) -> None:
    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
    request_payload = QueryRequest(
        query="What is OpenStack?",
        provider="openai",
        model="gpt-4-turbo",
        attachments=[],
    )

    # Llama Stack client whose Responses API immediately reports quota exhaustion.
    llama_client = mocker.AsyncMock()
    llama_client.responses.create.side_effect = RateLimitError(
        model="gpt-4-turbo", llm_provider="openai", message=""
    )

    # Stub model selection, override validation, client lookup and MCP header
    # handling so the handler reaches the responses.create call directly.
    mocker.patch(
        "app.endpoints.query.select_model_and_provider_id",
        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
    )
    mocker.patch("app.endpoints.query.validate_model_provider_override")
    mocker.patch(
        "app.endpoints.query.AsyncLlamaStackClientHolder.get_client",
        return_value=llama_client,
    )
    mocker.patch(
        "app.endpoints.query.handle_mcp_headers_with_toolgroups",
        return_value={},
    )

    with pytest.raises(HTTPException) as exc_info:
        await query_endpoint_handler_v2(
            dummy_request,
            query_request=request_payload,
            auth=MOCK_AUTH,
        )

    # The rate-limit error must surface as HTTP 429 with a structured detail dict.
    raised = exc_info.value
    assert raised.status_code == status.HTTP_429_TOO_MANY_REQUESTS
    assert raised.detail["response"] == "Model quota exceeded"
    assert "gpt-4-turbo" in raised.detail["cause"]
0 commit comments