Commit 8b933b4

Author: Andrej Simurka (committed)
Added quota limit exception handling
1 parent 1a5255a commit 8b933b4

File tree: 8 files changed, +443 −125 lines changed

docs/openapi.json

Lines changed: 205 additions & 124 deletions
Large diffs are not rendered by default.

src/app/endpoints/query.py

Lines changed: 14 additions & 0 deletions
@@ -8,6 +8,7 @@
 from typing import Annotated, Any, Optional, cast

 from fastapi import APIRouter, Depends, HTTPException, Request, status
+from litellm.exceptions import RateLimitError
 from llama_stack_client import (
     APIConnectionError,
     AsyncLlamaStackClient,  # type: ignore
@@ -42,6 +43,7 @@
     ReferencedDocument,
     ToolCall,
     UnauthorizedResponse,
+    QuotaExceededResponse
 )
 from utils.endpoints import (
     check_configuration_loaded,
@@ -86,6 +88,10 @@
         "description": "User is not authorized",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -421,6 +427,14 @@ async def query_endpoint_handler_base(  # pylint: disable=R0914
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {e.model} has been exceeded.",
+            },
+        ) from e


 @router.post("/query", responses=query_response)
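For illustration, a minimal standalone sketch of the mapping this hunk introduces: litellm's RateLimitError translated into FastAPI's HTTP 429. Only RateLimitError (with its model attribute), the FastAPI imports, and the response/cause payload shape come from the diff above; the helper name to_http_429 is hypothetical.

from fastapi import HTTPException, status
from litellm.exceptions import RateLimitError


def to_http_429(e: RateLimitError) -> HTTPException:
    # Mirror the handler above: surface the provider's rate-limit
    # error as a 429 with the same response/cause payload shape.
    return HTTPException(
        status_code=status.HTTP_429_TOO_MANY_REQUESTS,
        detail={
            "response": "Model quota exceeded",
            "cause": f"The token quota for model {e.model} has been exceeded.",
        },
    )


# Usage sketch, mirroring the endpoint's `raise ... from e` pattern:
try:
    raise RateLimitError(message="", llm_provider="openai", model="gpt-4-turbo")
except RateLimitError as e:
    assert to_http_429(e).status_code == status.HTTP_429_TOO_MANY_REQUESTS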

src/app/endpoints/query_v2.py

Lines changed: 5 additions & 0 deletions
@@ -27,6 +27,7 @@
     QueryResponse,
     ReferencedDocument,
     UnauthorizedResponse,
+    QuotaExceededResponse
 )
 from utils.endpoints import (
     get_system_prompt,
@@ -59,6 +60,10 @@
         "description": "User is not authorized",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",

src/app/endpoints/streaming_query.py

Lines changed: 14 additions & 1 deletion
@@ -8,6 +8,7 @@
 from datetime import UTC, datetime
 from typing import Annotated, Any, AsyncGenerator, AsyncIterator, Iterator, cast

+from litellm.exceptions import RateLimitError
 from fastapi import APIRouter, Depends, HTTPException, Request, status
 from fastapi.responses import StreamingResponse
 from llama_stack_client import (
@@ -48,7 +49,7 @@
 from models.config import Action
 from models.database.conversations import UserConversation
 from models.requests import QueryRequest
-from models.responses import ForbiddenResponse, UnauthorizedResponse
+from models.responses import ForbiddenResponse, UnauthorizedResponse, QuotaExceededResponse
 from utils.endpoints import (
     check_configuration_loaded,
     create_referenced_documents_with_metadata,
@@ -104,6 +105,10 @@
         "description": "User is not authorized",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -922,6 +927,14 @@ async def response_generator(
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {e.model} has been exceeded.",
+            },
+        ) from e
     except Exception as e:  # pylint: disable=broad-except
         # Handle other errors with OLS-compatible error response
         # This broad exception catch is intentional to ensure all errors

src/models/responses.py

Lines changed: 65 additions & 0 deletions
@@ -1142,6 +1142,71 @@ def __init__(self, user_id: str, resource: str, resource_id: str):
     }


+class QuotaExceededResponse(AbstractErrorResponse):
+    """429 Too Many Requests - LLM quota exceeded."""
+
+    def __init__(self, user_id: str, model_name: str, limit: int, period: str):
+        """Initialize a QuotaExceededResponse."""
+        super().__init__(
+            detail=DetailModel(
+                response="Quota exceeded",
+                cause=(
+                    f"User {user_id} has exceeded the {limit}-request {period} quota "
+                    f"for {model_name}. Please upgrade your plan or wait until the next cycle."
+                ),
+            )
+        )
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has no available tokens."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has no available tokens."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has no available tokens."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has 5 tokens, but 10 tokens are needed."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has 500 tokens, but 900 tokens are needed."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed."
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The model quota has been exceeded",
+                        "cause": "The token quota for model gpt-4-turbo has been exceeded."
+                    }
+                }
+            ]
+        }
+    }
+
+
 class InvalidFeedbackStoragePathResponse(AbstractErrorResponse):
     """500 Internal Error - Invalid feedback storage path."""
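To see what the new model serializes to on the wire, a hedged standalone sketch using simplified stand-ins for DetailModel and AbstractErrorResponse (both defined earlier in src/models/responses.py and not reproduced here); the constructor arguments are illustrative.

from pydantic import BaseModel


class DetailModel(BaseModel):  # simplified stand-in
    response: str
    cause: str


class QuotaExceededResponse(BaseModel):  # simplified stand-in for the class above
    detail: DetailModel


# Illustrative values for (user_id, limit, period, model_name):
resp = QuotaExceededResponse(
    detail=DetailModel(
        response="Quota exceeded",
        cause=(
            "User 123 has exceeded the 10-request daily quota for gpt-4-turbo. "
            "Please upgrade your plan or wait until the next cycle."
        ),
    )
)
print(resp.model_dump_json(indent=2))  # {"detail": {"response": ..., "cause": ...}}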

tests/unit/app/endpoints/test_query.py

Lines changed: 45 additions & 0 deletions
@@ -10,6 +10,7 @@
 import pytest
 from pytest_mock import MockerFixture
 from fastapi import HTTPException, Request, status
+from litellm.exceptions import RateLimitError

 from llama_stack_client import APIConnectionError
 from llama_stack_client.types import UserMessage  # type: ignore
@@ -2261,3 +2262,47 @@ async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) -> None:
         stream=False,
         toolgroups=None,
     )
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123")
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai")
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "app.endpoints.query.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client
+    )
+    mocker.patch(
+        "app.endpoints.query.handle_mcp_headers_with_toolgroups",
+        return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler(
+            dummy_request,
+            query_request=query_request,
+            auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert detail["response"] == "Model quota exceeded"
+    assert "gpt-4-turbo" in detail["cause"]

tests/unit/app/endpoints/test_query_v2.py

Lines changed: 49 additions & 0 deletions
@@ -2,6 +2,7 @@
 """Unit tests for the /query (v2) REST API endpoint using Responses API."""

 from typing import Any
+from litellm.exceptions import RateLimitError
 import pytest
 from pytest_mock import MockerFixture
 from fastapi import HTTPException, status, Request
@@ -18,6 +19,14 @@
     query_endpoint_handler_v2,
 )

+# User ID must be proper UUID
+MOCK_AUTH = (
+    "00000001-0001-0001-0001-000000000001",
+    "mock_username",
+    False,
+    "mock_token",
+)
+

 @pytest.fixture
 def dummy_request() -> Request:
@@ -432,3 +441,43 @@ def _raise(*_args: Any, **_kwargs: Any) -> Exception:
     assert exc.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR
     assert "Unable to connect to Llama Stack" in str(exc.value.detail)
     fail_metric.inc.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+        attachments=[],
+    )
+    mock_client = mocker.AsyncMock()
+    mock_client.responses.create.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai")
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "app.endpoints.query.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client
+    )
+    mocker.patch(
+        "app.endpoints.query.handle_mcp_headers_with_toolgroups",
+        return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler_v2(
+            dummy_request,
+            query_request=query_request,
+            auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert detail["response"] == "Model quota exceeded"
+    assert "gpt-4-turbo" in detail["cause"]

tests/unit/app/endpoints/test_streaming_query.py

Lines changed: 46 additions & 0 deletions
@@ -6,6 +6,7 @@

 import json

+from litellm.exceptions import RateLimitError
 import pytest
 from pytest_mock import MockerFixture

@@ -1795,6 +1796,51 @@ async def test_streaming_query_handles_none_event(mocker: MockerFixture) -> None:
     assert isinstance(response, StreamingResponse)


+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture) -> None:
+    """Test that streaming query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )
+    request = Request(scope={"type": "http"})
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123")
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai")
+    )
+    mocker.patch("app.endpoints.streaming_query.validate_model_provider_override")
+    mocker.patch(
+        "app.endpoints.streaming_query.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.handle_mcp_headers_with_toolgroups",
+        return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await streaming_query_endpoint_handler(
+            request,
+            query_request=query_request,
+            auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert detail["response"] == "Model quota exceeded"
+    assert "gpt-4-turbo" in detail["cause"]
+
+
 # ============================================================================
 # OLS Compatibility Tests
 # ============================================================================
