
Commit 3ed6095

Andrej Simurka committed
Added quota limit exception handling
1 parent 584cd25 commit 3ed6095

10 files changed (+330, -5 lines)


docs/openapi.json

Lines changed: 87 additions & 0 deletions
@@ -375,6 +375,16 @@
             }
           }
         },
+        "429": {
+          "description": "The quota has been exceeded",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/QuotaExceededResponse"
+              }
+            }
+          }
+        },
         "500": {
           "description": "Internal Server Error",
           "detail": {
@@ -461,6 +471,16 @@
             }
           }
         },
+        "429": {
+          "description": "The quota has been exceeded",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/QuotaExceededResponse"
+              }
+            }
+          }
+        },
         "500": {
           "description": "Internal Server Error",
           "detail": {
@@ -1256,6 +1276,16 @@
             }
           }
         },
+        "429": {
+          "description": "The quota has been exceeded",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/QuotaExceededResponse"
+              }
+            }
+          }
+        },
         "500": {
           "description": "Internal Server Error",
           "detail": {
@@ -3577,6 +3607,63 @@
         }
       ]
     },
+    "QuotaExceededResponse": {
+      "properties": {
+        "detail": {
+          "$ref": "#/components/schemas/DetailModel"
+        }
+      },
+      "type": "object",
+      "required": [
+        "detail"
+      ],
+      "title": "QuotaExceededResponse",
+      "description": "429 Too Many Requests - LLM quota exceeded.",
+      "examples": [
+        {
+          "detail": {
+            "cause": "User 123 has no available tokens.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Cluster has no available tokens.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Unknown subject 999 has no available tokens.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "User 123 has 5 tokens, but 10 tokens are needed.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Cluster has 500 tokens, but 900 tokens are needed.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "The token quota for model gpt-4-turbo has been exceeded.",
+            "response": "The model quota has been exceeded"
+          }
+        }
+      ]
+    },
     "QuotaHandlersConfiguration": {
       "properties": {
         "sqlite": {

pyproject.toml

Lines changed: 1 addition & 2 deletions
litellm moves out of the llslibdev development extras and into the runtime dependencies, since the endpoint modules now import litellm.exceptions.RateLimitError unconditionally:

@@ -51,6 +51,7 @@ dependencies = [
     # Used by authorization resolvers
     "jsonpath-ng>=1.6.1",
     "psycopg2-binary>=2.9.10",
+    "litellm>=1.75.5.post1",
 ]


@@ -129,8 +130,6 @@ llslibdev = [
     "langdetect>=1.0.9",
     "emoji>=2.1.0",
     "nltk>=3.8.1",
-    # API inference: remote::gemini
-    "litellm>=1.75.5.post1",
     # API inference: inline::sentence-transformers
     "sentence-transformers>=5.0.0",
     # API vector_io: inline::faiss

src/app/endpoints/query.py

Lines changed: 15 additions & 0 deletions
@@ -8,6 +8,7 @@
 from typing import Annotated, Any, Optional, cast

 from fastapi import APIRouter, Depends, HTTPException, Request, status
+from litellm.exceptions import RateLimitError
 from llama_stack_client import (
     APIConnectionError,
     AsyncLlamaStackClient,  # type: ignore
@@ -42,6 +43,7 @@
     ReferencedDocument,
     ToolCall,
     UnauthorizedResponse,
+    QuotaExceededResponse,
 )
 from utils.endpoints import (
     check_configuration_loaded,
@@ -86,6 +88,10 @@
         "description": "Client does not have permission to access conversation",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -421,6 +427,15 @@ async def query_endpoint_handler_base(  # pylint: disable=R0914
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        used_model = getattr(e, "model", "unknown")
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {used_model} has been exceeded.",
+            },
+        ) from e


 @router.post("/query", responses=query_response)
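
Note that the handler reads the offending model via getattr with an "unknown" fallback rather than e.model directly, so a RateLimitError lacking that attribute still yields a well-formed cause. A minimal sketch of that behavior; the constructor arguments mirror the ones used in the new unit test further below:

from litellm.exceptions import RateLimitError

try:
    # Same constructor arguments the new unit test uses.
    raise RateLimitError(message="", llm_provider="openai", model="gpt-4-turbo")
except RateLimitError as e:
    used_model = getattr(e, "model", "unknown")  # defensive attribute access
    print(f"The token quota for model {used_model} has been exceeded.")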

src/app/endpoints/query_v2.py

Lines changed: 5 additions & 0 deletions
@@ -27,6 +27,7 @@
     QueryResponse,
     ReferencedDocument,
     UnauthorizedResponse,
+    QuotaExceededResponse,
 )
 from utils.endpoints import (
     get_system_prompt,
@@ -59,6 +60,10 @@
         "description": "Client does not have permission to access conversation",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",

src/app/endpoints/streaming_query.py

Lines changed: 19 additions & 1 deletion
@@ -8,6 +8,7 @@
 from datetime import UTC, datetime
 from typing import Annotated, Any, AsyncGenerator, AsyncIterator, Iterator, cast

+from litellm.exceptions import RateLimitError
 from fastapi import APIRouter, Depends, HTTPException, Request, status
 from fastapi.responses import StreamingResponse
 from llama_stack_client import (
@@ -48,7 +49,11 @@
 from models.config import Action
 from models.database.conversations import UserConversation
 from models.requests import QueryRequest
-from models.responses import ForbiddenResponse, UnauthorizedResponse
+from models.responses import (
+    ForbiddenResponse,
+    UnauthorizedResponse,
+    QuotaExceededResponse,
+)
 from utils.endpoints import (
     check_configuration_loaded,
     create_referenced_documents_with_metadata,
@@ -104,6 +109,10 @@
         "description": "Client does not have permission to access conversation",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -922,6 +931,15 @@ async def response_generator(
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        used_model = getattr(e, "model", "unknown")
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {used_model} has been exceeded.",
+            },
+        ) from e
     except Exception as e:  # pylint: disable=broad-except
         # Handle other errors with OLS-compatible error response
         # This broad exception catch is intentional to ensure all errors
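
One subtlety on the streaming path: because the output comes from an async generator, an exception raised before the first chunk is yielded propagates to whatever iterates the generator, just like a regular call. A self-contained illustration of that propagation, using plain asyncio and a stand-in exception (the FastAPI wiring is not reproduced here):

import asyncio

async def stream():
    # Stand-in for turn creation failing with a quota error
    # before any output has been produced.
    raise RuntimeError("quota exceeded")
    yield b"chunk"  # never reached

async def main():
    try:
        async for _ in stream():
            pass
    except RuntimeError as e:
        print("caught before any chunk was streamed:", e)

asyncio.run(main())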

src/models/responses.py

Lines changed: 68 additions & 0 deletions
@@ -1142,6 +1142,74 @@ def __init__(self, user_id: str, resource: str, resource_id: str):
     }


+class QuotaExceededResponse(AbstractErrorResponse):
+    """429 Too Many Requests - LLM quota exceeded."""
+
+    def __init__(
+        self,
+        user_id: str,
+        model_name: str,  # pylint: disable=unused-argument
+        limit: int,  # pylint: disable=unused-argument
+    ):
+        """Initialize a QuotaExceededResponse."""
+        super().__init__(
+            detail=DetailModel(
+                response="The quota has been exceeded",
+                cause=(f"User {user_id} has no available tokens."),
+            )
+        )
+        # TODO(LCORE-837): add factories for custom cause creation
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has 5 tokens, but 10 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has 500 tokens, but 900 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The model quota has been exceeded",
+                        "cause": "The token quota for model gpt-4-turbo has been exceeded.",
+                    }
+                },
+            ]
+        }
+    }


 class InvalidFeedbackStoragePathResponse(AbstractErrorResponse):
     """500 Internal Error - Invalid feedback storage path."""

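For context on serialization, here is a self-contained approximation of what the new model emits. DetailModel and AbstractErrorResponse are defined elsewhere in src/models/responses.py, so the stand-ins below are assumptions that only mirror the two fields used here; note also that model_name and limit are accepted but deliberately unused for now, pending the LCORE-837 factories.

from pydantic import BaseModel


class DetailModel(BaseModel):  # stand-in for the project's DetailModel
    response: str
    cause: str


class QuotaExceededResponse(BaseModel):  # simplified stand-in
    detail: DetailModel


resp = QuotaExceededResponse(
    detail=DetailModel(
        response="The quota has been exceeded",
        cause="User 123 has no available tokens.",
    )
)
print(resp.model_dump_json(indent=2))
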
tests/unit/app/endpoints/test_query.py

Lines changed: 44 additions & 0 deletions
@@ -10,6 +10,7 @@
 import pytest
 from pytest_mock import MockerFixture
 from fastapi import HTTPException, Request, status
+from litellm.exceptions import RateLimitError

 from llama_stack_client import APIConnectionError
 from llama_stack_client.types import UserMessage  # type: ignore
@@ -2261,3 +2262,46 @@ async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) -
         stream=False,
         toolgroups=None,
     )
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request
+) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )  # type: ignore
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123"),
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "client.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client,
+    )
+    mocker.patch(
+        "app.endpoints.query.handle_mcp_headers_with_toolgroups", return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler(
+            dummy_request, query_request=query_request, auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert isinstance(detail, dict)
+    assert detail["response"] == "Model quota exceeded"  # type: ignore
+    assert "gpt-4-turbo" in detail["cause"]  # type: ignore
