
Commit 2b65716

Author: Andrej Simurka
Added quota limit exception handling

1 parent 4f39c33

File tree: 8 files changed, +444 −125 lines

docs/openapi.json

Lines changed: 205 additions & 124 deletions
Large diffs are not rendered by default.

src/app/endpoints/query.py

Lines changed: 14 additions & 0 deletions
@@ -8,6 +8,7 @@
 from typing import Annotated, Any, Optional, cast
 
 from fastapi import APIRouter, Depends, HTTPException, Request, status
+from litellm.exceptions import RateLimitError
 from llama_stack_client import (
     APIConnectionError,
     AsyncLlamaStackClient,  # type: ignore
@@ -42,6 +43,7 @@
     ReferencedDocument,
     ToolCall,
     UnauthorizedResponse,
+    QuotaExceededResponse
 )
 from utils.endpoints import (
     check_configuration_loaded,
@@ -86,6 +88,10 @@
         "description": "User is not authorized",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -421,6 +427,14 @@ async def query_endpoint_handler_base(  # pylint: disable=R0914
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {e.model} has been exceeded.",
+            },
+        ) from e
 
 
 @router.post("/query", responses=query_response)
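
Note: the hunk above maps litellm's RateLimitError onto an HTTP 429 before any
response body is returned. A minimal, self-contained sketch of the same pattern
(the app and the call_llm helper are illustrative, not repository code):

from fastapi import FastAPI, HTTPException, status
from litellm.exceptions import RateLimitError

app = FastAPI()

def call_llm(model: str) -> str:
    # Stand-in for the real Llama Stack turn; always raises to show the path.
    raise RateLimitError(message="", llm_provider="openai", model=model)

@app.post("/query")
async def query(model: str = "gpt-4-turbo") -> dict:
    try:
        return {"response": call_llm(model)}
    except RateLimitError as e:
        # Same shape as the handler change above: 429 with structured detail.
        raise HTTPException(
            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
            detail={
                "response": "Model quota exceeded",
                "cause": f"The token quota for model {e.model} has been exceeded.",
            },
        ) from e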

src/app/endpoints/query_v2.py

Lines changed: 5 additions & 0 deletions
@@ -27,6 +27,7 @@
     QueryResponse,
     ReferencedDocument,
     UnauthorizedResponse,
+    QuotaExceededResponse
 )
 from utils.endpoints import (
     get_system_prompt,
@@ -59,6 +60,10 @@
         "description": "User is not authorized",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",

src/app/endpoints/streaming_query.py

Lines changed: 14 additions & 1 deletion
@@ -8,6 +8,7 @@
 from datetime import UTC, datetime
 from typing import Annotated, Any, AsyncGenerator, AsyncIterator, Iterator, cast
 
+from litellm.exceptions import RateLimitError
 from fastapi import APIRouter, Depends, HTTPException, Request, status
 from fastapi.responses import StreamingResponse
 from llama_stack_client import (
@@ -48,7 +49,7 @@
 from models.config import Action
 from models.database.conversations import UserConversation
 from models.requests import QueryRequest
-from models.responses import ForbiddenResponse, UnauthorizedResponse
+from models.responses import ForbiddenResponse, UnauthorizedResponse, QuotaExceededResponse
 from utils.endpoints import (
     check_configuration_loaded,
     create_referenced_documents_with_metadata,
@@ -104,6 +105,10 @@
         "description": "User is not authorized",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -922,6 +927,14 @@ async def response_generator(
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {e.model} has been exceeded.",
+            },
+        ) from e
     except Exception as e:  # pylint: disable=broad-except
         # Handle other errors with OLS-compatible error response
         # This broad exception catch is intentional to ensure all errors
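
Note: from a client's perspective, quota exhaustion on either endpoint now
yields a structured 429 instead of a generic 500. A hedged sketch of
client-side handling (the endpoint path and payload shape are assumptions,
not repository code):

import httpx

def ask(query: str) -> str:
    resp = httpx.post(
        "http://localhost:8080/v1/streaming_query",  # assumed base URL and path
        json={"query": query, "provider": "openai", "model": "gpt-4-turbo"},
    )
    if resp.status_code == 429:
        detail = resp.json()["detail"]
        # e.g. {"response": "Model quota exceeded", "cause": "The token quota ..."}
        raise RuntimeError(f"{detail['response']}: {detail['cause']}")
    resp.raise_for_status()
    return resp.text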

src/models/responses.py

Lines changed: 65 additions & 0 deletions
@@ -1142,6 +1142,71 @@ def __init__(self, user_id: str, resource: str, resource_id: str):
     }
 
 
+class QuotaExceededResponse(AbstractErrorResponse):
+    """429 Too Many Requests - LLM quota exceeded."""
+
+    def __init__(self, user_id: str, model_name: str, limit: int, period: str):
+        """Initialize a QuotaExceededResponse."""
+        super().__init__(
+            detail=DetailModel(
+                response="Quota exceeded",
+                cause=(
+                    f"User {user_id} has exceeded the {limit}-request {period} quota "
+                    f"for {model_name}. Please upgrade your plan or wait until the next cycle."
+                ),
+            )
+        )
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has no available tokens."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has no available tokens."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has no available tokens."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has 5 tokens, but 10 tokens are needed."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has 500 tokens, but 900 tokens are needed."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The model quota has been exceeded",
+                        "cause": "The token quota for model gpt-4-turbo has been exceeded."
+                    }
+                }
+            ]
+        }
+    }
+
+
 class InvalidFeedbackStoragePathResponse(AbstractErrorResponse):
     """500 Internal Error - Invalid feedback storage path."""
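
Note: the constructor builds the detail text from its arguments, while the
json_schema_extra examples document the wider family of quota messages. A
hedged usage sketch (assumes AbstractErrorResponse/DetailModel are pydantic
models exposing a detail attribute, as the diff suggests):

from models.responses import QuotaExceededResponse

resp = QuotaExceededResponse(
    user_id="123",
    model_name="gpt-4-turbo",
    limit=100,
    period="daily",
)
print(resp.detail.cause)
# "User 123 has exceeded the 100-request daily quota for gpt-4-turbo.
#  Please upgrade your plan or wait until the next cycle."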

tests/unit/app/endpoints/test_query.py

Lines changed: 45 additions & 0 deletions
@@ -10,6 +10,7 @@
 import pytest
 from pytest_mock import MockerFixture
 from fastapi import HTTPException, Request, status
+from litellm.exceptions import RateLimitError
 
 from llama_stack_client import APIConnectionError
 from llama_stack_client.types import UserMessage  # type: ignore
@@ -2261,3 +2262,47 @@ async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) -
         stream=False,
         toolgroups=None,
     )
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123")
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai")
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "app.endpoints.query.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client
+    )
+    mocker.patch(
+        "app.endpoints.query.handle_mcp_headers_with_toolgroups",
+        return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler(
+            dummy_request,
+            query_request=query_request,
+            auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert detail["response"] == "Model quota exceeded"
+    assert "gpt-4-turbo" in detail["cause"]

tests/unit/app/endpoints/test_query_v2.py

Lines changed: 50 additions & 0 deletions
@@ -1,10 +1,12 @@
 # pylint: disable=redefined-outer-name, import-error
 """Unit tests for the /query (v2) REST API endpoint using Responses API."""
 
+from litellm.exceptions import RateLimitError
 import pytest
 from fastapi import HTTPException, status, Request
 
 from llama_stack_client import APIConnectionError
+from pytest_mock import MockerFixture
 
 from models.requests import QueryRequest, Attachment
 from models.config import ModelContextProtocolServer
@@ -16,6 +18,14 @@
     query_endpoint_handler_v2,
 )
 
+# User ID must be proper UUID
+MOCK_AUTH = (
+    "00000001-0001-0001-0001-000000000001",
+    "mock_username",
+    False,
+    "mock_token",
+)
+
 
 @pytest.fixture
 def dummy_request() -> Request:
@@ -421,3 +431,43 @@ def _raise(*_args, **_kwargs):
     assert exc.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR
     assert "Unable to connect to Llama Stack" in str(exc.value.detail)
     fail_metric.inc.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+        attachments=[],
+    )
+    mock_client = mocker.AsyncMock()
+    mock_client.responses.create.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai")
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "app.endpoints.query.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client
+    )
+    mocker.patch(
+        "app.endpoints.query.handle_mcp_headers_with_toolgroups",
+        return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler_v2(
+            dummy_request,
+            query_request=query_request,
+            auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert detail["response"] == "Model quota exceeded"
+    assert "gpt-4-turbo" in detail["cause"]

tests/unit/app/endpoints/test_streaming_query.py

Lines changed: 46 additions & 0 deletions
@@ -6,6 +6,7 @@
 
 import json
 
+from litellm.exceptions import RateLimitError
 import pytest
 from pytest_mock import MockerFixture
 
@@ -1789,6 +1790,51 @@ async def test_streaming_query_handles_none_event(mocker: MockerFixture) -> None
     assert isinstance(response, StreamingResponse)
 
 
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture) -> None:
+    """Test that streaming query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )
+    request = Request(scope={"type": "http"})
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123")
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai")
+    )
+    mocker.patch("app.endpoints.streaming_query.validate_model_provider_override")
+    mocker.patch(
+        "app.endpoints.streaming_query.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.handle_mcp_headers_with_toolgroups",
+        return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await streaming_query_endpoint_handler(
+            request,
+            query_request=query_request,
+            auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert detail["response"] == "Model quota exceeded"
+    assert "gpt-4-turbo" in detail["cause"]
+
+
 # ============================================================================
 # OLS Compatibility Tests
 # ============================================================================
