
Commit adc06b4

Andrej Simurka committed
Added quota limit exception handling
1 parent 1a5255a commit adc06b4


10 files changed: +448 additions, -125 deletions


docs/openapi.json

Lines changed: 205 additions & 124 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ dependencies = [
     # Used by authorization resolvers
     "jsonpath-ng>=1.6.1",
     "psycopg2-binary>=2.9.10",
+    "litellm",
 ]
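litellm becomes a direct dependency (unpinned) because the endpoint modules below now import litellm.exceptions.RateLimitError to detect provider quota errors.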

src/app/endpoints/query.py

Lines changed: 15 additions & 0 deletions
@@ -8,6 +8,7 @@
 from typing import Annotated, Any, Optional, cast
 
 from fastapi import APIRouter, Depends, HTTPException, Request, status
+from litellm.exceptions import RateLimitError
 from llama_stack_client import (
     APIConnectionError,
     AsyncLlamaStackClient,  # type: ignore
@@ -42,6 +43,7 @@
     ReferencedDocument,
     ToolCall,
     UnauthorizedResponse,
+    QuotaExceededResponse,
 )
 from utils.endpoints import (
     check_configuration_loaded,
@@ -86,6 +88,10 @@
         "description": "User is not authorized",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -421,6 +427,15 @@ async def query_endpoint_handler_base(  # pylint: disable=R0914
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        used_model = getattr(e, "model", "unknown")
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {used_model} has been exceeded.",
+            },
+        ) from e
 
 
 @router.post("/query", responses=query_response)
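Taken in isolation, the new error path is simple: litellm raises RateLimitError when the provider reports an exhausted quota, and the handler re-raises it as a FastAPI HTTPException with status 429. A minimal self-contained sketch of the same pattern — the endpoint and call_llm below are hypothetical stand-ins, not code from this commit:

# Standalone sketch of the pattern this commit adds. `call_llm` is a
# hypothetical stand-in for the real Llama Stack call; the RateLimitError
# constructor arguments mirror the ones used in the unit tests below.
from fastapi import FastAPI, HTTPException, status
from litellm.exceptions import RateLimitError

app = FastAPI()

async def call_llm(prompt: str) -> str:
    # Pretend the provider rejected the request for quota reasons.
    raise RateLimitError(model="gpt-4-turbo", llm_provider="openai", message="")

@app.post("/query")
async def query(prompt: str) -> dict[str, str]:
    try:
        return {"response": await call_llm(prompt)}
    except RateLimitError as e:
        # Same shape as the commit: surface the quota failure as HTTP 429.
        used_model = getattr(e, "model", "unknown")
        raise HTTPException(
            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
            detail={
                "response": "Model quota exceeded",
                "cause": f"The token quota for model {used_model} has been exceeded.",
            },
        ) from e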

src/app/endpoints/query_v2.py

Lines changed: 5 additions & 0 deletions
@@ -27,6 +27,7 @@
     QueryResponse,
     ReferencedDocument,
     UnauthorizedResponse,
+    QuotaExceededResponse,
 )
 from utils.endpoints import (
     get_system_prompt,
@@ -59,6 +60,10 @@
         "description": "User is not authorized",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",

src/app/endpoints/streaming_query.py

Lines changed: 19 additions & 1 deletion
@@ -8,6 +8,7 @@
 from datetime import UTC, datetime
 from typing import Annotated, Any, AsyncGenerator, AsyncIterator, Iterator, cast
 
+from litellm.exceptions import RateLimitError
 from fastapi import APIRouter, Depends, HTTPException, Request, status
 from fastapi.responses import StreamingResponse
 from llama_stack_client import (
@@ -48,7 +49,11 @@
 from models.config import Action
 from models.database.conversations import UserConversation
 from models.requests import QueryRequest
-from models.responses import ForbiddenResponse, UnauthorizedResponse
+from models.responses import (
+    ForbiddenResponse,
+    UnauthorizedResponse,
+    QuotaExceededResponse,
+)
 from utils.endpoints import (
     check_configuration_loaded,
     create_referenced_documents_with_metadata,
@@ -104,6 +109,10 @@
         "description": "User is not authorized",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -922,6 +931,15 @@ async def response_generator(
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        used_model = getattr(e, "model", "unknown")
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {used_model} has been exceeded.",
+            },
+        ) from e
     except Exception as e:  # pylint: disable=broad-except
         # Handle other errors with OLS-compatible error response
         # This broad exception catch is intentional to ensure all errors
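A note for callers: the unit tests below show that when the quota error fires, the streaming endpoint handler raises a plain HTTPException, so the client receives an ordinary 429 JSON error rather than an SSE stream and should check the status code before iterating. A client-side sketch using requests — the base URL and payload shape are assumptions, not taken from this commit:

# Hypothetical client; the endpoint URL and request body are assumptions.
import requests

resp = requests.post(
    "http://localhost:8080/v1/streaming_query",
    json={"query": "What is OpenStack?"},
    stream=True,
)
if resp.status_code == 429:
    # FastAPI wraps the HTTPException detail, e.g.:
    # {"detail": {"response": "Model quota exceeded",
    #             "cause": "The token quota for model gpt-4-turbo has been exceeded."}}
    print(resp.json()["detail"]["cause"])
else:
    # Otherwise consume the stream line by line.
    for line in resp.iter_lines():
        if line:
            print(line.decode())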

src/models/responses.py

Lines changed: 68 additions & 0 deletions
@@ -1142,6 +1142,74 @@ def __init__(self, user_id: str, resource: str, resource_id: str):
     }
 
 
+class QuotaExceededResponse(AbstractErrorResponse):
+    """429 Too Many Requests - LLM quota exceeded."""
+
+    def __init__(
+        self,
+        user_id: str,
+        model_name: str,  # pylint: disable=unused-argument
+        limit: int,  # pylint: disable=unused-argument
+    ):
+        """Initialize a QuotaExceededResponse."""
+        super().__init__(
+            detail=DetailModel(
+                response="The quota has been exceeded",
+                cause=(f"User {user_id} has no available tokens."),
+            )
+        )
+        # TODO(LCORE-837): add factories for custom cause creation
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has 5 tokens, but 10 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has 500 tokens, but 900 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The model quota has been exceeded",
+                        "cause": "The token quota for model gpt-4-turbo has been exceeded.",
+                    }
+                },
+            ]
+        }
+    }
+
+
 class InvalidFeedbackStoragePathResponse(AbstractErrorResponse):
     """500 Internal Error - Invalid feedback storage path."""
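A short sketch of how the new model behaves, assuming the package is importable and AbstractErrorResponse is a pydantic v2 model (suggested by the model_config / json_schema_extra usage above):

# Sketch; assumes models.responses is on the import path. Note that
# model_name and limit are accepted but not yet reflected in the cause
# (see TODO LCORE-837 above).
from models.responses import QuotaExceededResponse

resp = QuotaExceededResponse(user_id="123", model_name="gpt-4-turbo", limit=10)
print(resp.model_dump())
# Expected, per the constructor:
# {'detail': {'response': 'The quota has been exceeded',
#             'cause': 'User 123 has no available tokens.'}}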

tests/unit/app/endpoints/test_query.py

Lines changed: 44 additions & 0 deletions
@@ -10,6 +10,7 @@
 import pytest
 from pytest_mock import MockerFixture
 from fastapi import HTTPException, Request, status
+from litellm.exceptions import RateLimitError
 
 from llama_stack_client import APIConnectionError
 from llama_stack_client.types import UserMessage  # type: ignore
@@ -2261,3 +2262,46 @@ async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) -
         stream=False,
         toolgroups=None,
     )
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request
+) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )  # type: ignore
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123"),
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "client.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client,
+    )
+    mocker.patch(
+        "app.endpoints.query.handle_mcp_headers_with_toolgroups", return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler(
+            dummy_request, query_request=query_request, auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert isinstance(detail, dict)
+    assert detail["response"] == "Model quota exceeded"  # type: ignore
+    assert "gpt-4-turbo" in detail["cause"]  # type: ignore

tests/unit/app/endpoints/test_query_v2.py

Lines changed: 45 additions & 0 deletions
@@ -2,6 +2,7 @@
 """Unit tests for the /query (v2) REST API endpoint using Responses API."""
 
 from typing import Any
+from litellm.exceptions import RateLimitError
 import pytest
 from pytest_mock import MockerFixture
 from fastapi import HTTPException, status, Request
@@ -18,6 +19,14 @@
     query_endpoint_handler_v2,
 )
 
+# User ID must be proper UUID
+MOCK_AUTH = (
+    "00000001-0001-0001-0001-000000000001",
+    "mock_username",
+    False,
+    "mock_token",
+)
+
 
 @pytest.fixture
 def dummy_request() -> Request:
@@ -432,3 +441,39 @@ def _raise(*_args: Any, **_kwargs: Any) -> Exception:
     assert exc.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR
     assert "Unable to connect to Llama Stack" in str(exc.value.detail)
     fail_metric.inc.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request
+) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+        attachments=[],
+    )  # type: ignore
+    mock_client = mocker.AsyncMock()
+    mock_client.responses.create.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "client.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client,
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler_v2(
+            dummy_request, query_request=query_request, auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert isinstance(detail, dict)
+    assert detail["response"] == "Model quota exceeded"  # type: ignore
+    assert "gpt-4-turbo" in detail["cause"]  # type: ignore

tests/unit/app/endpoints/test_streaming_query.py

Lines changed: 44 additions & 0 deletions
@@ -6,6 +6,7 @@
 
 import json
 
+from litellm.exceptions import RateLimitError
 import pytest
 from pytest_mock import MockerFixture
 
@@ -1795,6 +1796,49 @@ async def test_streaming_query_handles_none_event(mocker: MockerFixture) -> None
     assert isinstance(response, StreamingResponse)
 
 
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(mocker: MockerFixture) -> None:
+    """Test that streaming query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )  # type: ignore
+    request = Request(scope={"type": "http"})
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123"),
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
+    )
+    mocker.patch("app.endpoints.streaming_query.validate_model_provider_override")
+    mocker.patch(
+        "client.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client,
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.handle_mcp_headers_with_toolgroups",
+        return_value={},
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await streaming_query_endpoint_handler(
+            request, query_request=query_request, auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert isinstance(detail, dict)
+    assert detail["response"] == "Model quota exceeded"  # type: ignore
+    assert "gpt-4-turbo" in detail["cause"]  # type: ignore
+
+
 # ============================================================================
 # OLS Compatibility Tests
 # ============================================================================

uv.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.
