@@ -1,10 +1,12 @@
 # pylint: disable=redefined-outer-name, import-error
 """Unit tests for the /query (v2) REST API endpoint using Responses API."""
 
+from litellm.exceptions import RateLimitError
 import pytest
 from fastapi import HTTPException, status, Request
 
 from llama_stack_client import APIConnectionError
+from pytest_mock import MockerFixture
 
9 | 11 | from models.requests import QueryRequest, Attachment |
10 | 12 | from models.config import ModelContextProtocolServer |
@@ -16,6 +18,14 @@
     query_endpoint_handler_v2,
 )
 
+# User ID must be a proper UUID
+MOCK_AUTH = (
+    "00000001-0001-0001-0001-000000000001",
+    "mock_username",
+    False,
+    "mock_token",
+)
+
 
 @pytest.fixture
 def dummy_request() -> Request:
@@ -421,3 +431,46 @@ def _raise(*_args, **_kwargs):
     assert exc.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR
     assert "Unable to connect to Llama Stack" in str(exc.value.detail)
     fail_metric.inc.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request) -> None:
+    """Test that the query endpoint raises HTTP 429 when the model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+        attachments=[],
+    )
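+    # Make the mocked Responses API call fail with litellm's RateLimitError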
+    mock_client = mocker.AsyncMock()
+    mock_client.responses.create.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
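+    # Stub model selection, override validation, client lookup, and MCP headers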
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "app.endpoints.query.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client,
+    )
+    mocker.patch(
+        "app.endpoints.query.handle_mcp_headers_with_toolgroups",
+        return_value={},
+    )
+
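+    # The handler should translate the rate-limit error into an HTTP 429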
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler_v2(
+            dummy_request,
+            query_request=query_request,
+            auth=MOCK_AUTH,
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert detail["response"] == "Model quota exceeded"
+    assert "gpt-4-turbo" in detail["cause"]