
Commit 3ed6095

Andrej Simurka committed
Added quota limit exception handling
1 parent 584cd25 commit 3ed6095

10 files changed (+330, -5 lines)


docs/openapi.json

Lines changed: 87 additions & 0 deletions
@@ -375,6 +375,16 @@
             }
           }
         },
+        "429": {
+          "description": "The quota has been exceeded",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/QuotaExceededResponse"
+              }
+            }
+          }
+        },
         "500": {
           "description": "Internal Server Error",
           "detail": {
@@ -461,6 +471,16 @@
             }
           }
         },
+        "429": {
+          "description": "The quota has been exceeded",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/QuotaExceededResponse"
+              }
+            }
+          }
+        },
         "500": {
           "description": "Internal Server Error",
           "detail": {
@@ -1256,6 +1276,16 @@
             }
           }
         },
+        "429": {
+          "description": "The quota has been exceeded",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/QuotaExceededResponse"
+              }
+            }
+          }
+        },
         "500": {
           "description": "Internal Server Error",
           "detail": {
@@ -3577,6 +3607,63 @@
         }
       ]
     },
+    "QuotaExceededResponse": {
+      "properties": {
+        "detail": {
+          "$ref": "#/components/schemas/DetailModel"
+        }
+      },
+      "type": "object",
+      "required": [
+        "detail"
+      ],
+      "title": "QuotaExceededResponse",
+      "description": "429 Too Many Requests - LLM quota exceeded.",
+      "examples": [
+        {
+          "detail": {
+            "cause": "User 123 has no available tokens.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Cluster has no available tokens.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Unknown subject 999 has no available tokens.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "User 123 has 5 tokens, but 10 tokens are needed.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Cluster has 500 tokens, but 900 tokens are needed.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "The token quota for model gpt-4-turbo has been exceeded.",
+            "response": "The model quota has been exceeded"
+          }
+        }
+      ]
+    },
     "QuotaHandlersConfiguration": {
       "properties": {
         "sqlite": {

pyproject.toml

Lines changed: 1 addition & 2 deletions
litellm moves out of the llslibdev development extras and into the runtime dependencies, since the endpoint modules now import litellm.exceptions.RateLimitError unconditionally:

@@ -51,6 +51,7 @@ dependencies = [
     # Used by authorization resolvers
     "jsonpath-ng>=1.6.1",
     "psycopg2-binary>=2.9.10",
+    "litellm>=1.75.5.post1",
 ]


@@ -129,8 +130,6 @@ llslibdev = [
     "langdetect>=1.0.9",
     "emoji>=2.1.0",
     "nltk>=3.8.1",
-    # API inference: remote::gemini
-    "litellm>=1.75.5.post1",
     # API inference: inline::sentence-transformers
     "sentence-transformers>=5.0.0",
     # API vector_io: inline::faiss

src/app/endpoints/query.py

Lines changed: 15 additions & 0 deletions
@@ -8,6 +8,7 @@
 from typing import Annotated, Any, Optional, cast

 from fastapi import APIRouter, Depends, HTTPException, Request, status
+from litellm.exceptions import RateLimitError
 from llama_stack_client import (
     APIConnectionError,
     AsyncLlamaStackClient,  # type: ignore
@@ -42,6 +43,7 @@
     ReferencedDocument,
     ToolCall,
     UnauthorizedResponse,
+    QuotaExceededResponse,
 )
 from utils.endpoints import (
     check_configuration_loaded,
@@ -86,6 +88,10 @@
         "description": "Client does not have permission to access conversation",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -421,6 +427,15 @@ async def query_endpoint_handler_base(  # pylint: disable=R0914
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        used_model = getattr(e, "model", "unknown")
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {used_model} has been exceeded.",
+            },
+        ) from e


 @router.post("/query", responses=query_response)
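
Note that the handler reads the offending model via getattr with an "unknown" fallback rather than e.model directly, so a RateLimitError lacking that attribute still yields a well-formed cause. A minimal sketch of that behavior; the constructor arguments mirror the ones used in the new unit test further below:

from litellm.exceptions import RateLimitError

try:
    # Same constructor arguments the new unit test uses.
    raise RateLimitError(message="", llm_provider="openai", model="gpt-4-turbo")
except RateLimitError as e:
    used_model = getattr(e, "model", "unknown")  # defensive attribute access
    print(f"The token quota for model {used_model} has been exceeded.")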

src/app/endpoints/query_v2.py

Lines changed: 5 additions & 0 deletions
@@ -27,6 +27,7 @@
     QueryResponse,
     ReferencedDocument,
     UnauthorizedResponse,
+    QuotaExceededResponse,
 )
 from utils.endpoints import (
     get_system_prompt,
@@ -59,6 +60,10 @@
         "description": "Client does not have permission to access conversation",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",

src/app/endpoints/streaming_query.py

Lines changed: 19 additions & 1 deletion
@@ -8,6 +8,7 @@
 from datetime import UTC, datetime
 from typing import Annotated, Any, AsyncGenerator, AsyncIterator, Iterator, cast

+from litellm.exceptions import RateLimitError
 from fastapi import APIRouter, Depends, HTTPException, Request, status
 from fastapi.responses import StreamingResponse
 from llama_stack_client import (
@@ -48,7 +49,11 @@
 from models.config import Action
 from models.database.conversations import UserConversation
 from models.requests import QueryRequest
-from models.responses import ForbiddenResponse, UnauthorizedResponse
+from models.responses import (
+    ForbiddenResponse,
+    UnauthorizedResponse,
+    QuotaExceededResponse,
+)
 from utils.endpoints import (
     check_configuration_loaded,
     create_referenced_documents_with_metadata,
@@ -104,6 +109,10 @@
         "description": "Client does not have permission to access conversation",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -922,6 +931,15 @@ async def response_generator(
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        used_model = getattr(e, "model", "unknown")
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {used_model} has been exceeded.",
+            },
+        ) from e
     except Exception as e:  # pylint: disable=broad-except
         # Handle other errors with OLS-compatible error response
         # This broad exception catch is intentional to ensure all errors
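
One subtlety on the streaming path: because the output comes from an async generator, an exception raised before the first chunk is yielded propagates to whatever iterates the generator, just like a regular call. A self-contained illustration of that propagation, using plain asyncio and a stand-in exception (the FastAPI wiring is not reproduced here):

import asyncio

async def stream():
    # Stand-in for turn creation failing with a quota error
    # before any output has been produced.
    raise RuntimeError("quota exceeded")
    yield b"chunk"  # never reached

async def main():
    try:
        async for _ in stream():
            pass
    except RuntimeError as e:
        print("caught before any chunk was streamed:", e)

asyncio.run(main())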

src/models/responses.py

Lines changed: 68 additions & 0 deletions
@@ -1142,6 +1142,74 @@ def __init__(self, user_id: str, resource: str, resource_id: str):
     }


+class QuotaExceededResponse(AbstractErrorResponse):
+    """429 Too Many Requests - LLM quota exceeded."""
+
+    def __init__(
+        self,
+        user_id: str,
+        model_name: str,  # pylint: disable=unused-argument
+        limit: int,  # pylint: disable=unused-argument
+    ):
+        """Initialize a QuotaExceededResponse."""
+        super().__init__(
+            detail=DetailModel(
+                response="The quota has been exceeded",
+                cause=(f"User {user_id} has no available tokens."),
+            )
+        )
+        # TODO(LCORE-837): add factories for custom cause creation
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has 5 tokens, but 10 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has 500 tokens, but 900 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The model quota has been exceeded",
+                        "cause": "The token quota for model gpt-4-turbo has been exceeded.",
+                    }
+                },
+            ]
+        }
+    }


 class InvalidFeedbackStoragePathResponse(AbstractErrorResponse):
     """500 Internal Error - Invalid feedback storage path."""

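For context on serialization, here is a self-contained approximation of what the new model emits. DetailModel and AbstractErrorResponse are defined elsewhere in src/models/responses.py, so the stand-ins below are assumptions that only mirror the two fields used here; note also that model_name and limit are accepted but deliberately unused for now, pending the LCORE-837 factories.

from pydantic import BaseModel


class DetailModel(BaseModel):  # stand-in for the project's DetailModel
    response: str
    cause: str


class QuotaExceededResponse(BaseModel):  # simplified stand-in
    detail: DetailModel


resp = QuotaExceededResponse(
    detail=DetailModel(
        response="The quota has been exceeded",
        cause="User 123 has no available tokens.",
    )
)
print(resp.model_dump_json(indent=2))
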
tests/unit/app/endpoints/test_query.py

Lines changed: 44 additions & 0 deletions
@@ -10,6 +10,7 @@
 import pytest
 from pytest_mock import MockerFixture
 from fastapi import HTTPException, Request, status
+from litellm.exceptions import RateLimitError

 from llama_stack_client import APIConnectionError
 from llama_stack_client.types import UserMessage  # type: ignore
@@ -2261,3 +2262,46 @@ async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) -
         stream=False,
         toolgroups=None,
     )
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request
+) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )  # type: ignore
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123"),
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "client.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client,
+    )
+    mocker.patch(
+        "app.endpoints.query.handle_mcp_headers_with_toolgroups", return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler(
+            dummy_request, query_request=query_request, auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert isinstance(detail, dict)
+    assert detail["response"] == "Model quota exceeded"  # type: ignore
+    assert "gpt-4-turbo" in detail["cause"]  # type: ignore
