192 changes: 192 additions & 0 deletions src/app/endpoints/responses.py
@@ -0,0 +1,192 @@
"""Handler for REST API call to provide OpenAI-compatible responses endpoint."""

import logging
from typing import Annotated, Any

from fastapi import APIRouter, Depends, HTTPException, Request, status
from llama_stack_client import APIConnectionError

import constants
import metrics
from authentication import get_auth_dependency
from authentication.interface import AuthTuple
from authorization.middleware import authorize
from client import AsyncLlamaStackClientHolder
from configuration import configuration
from models.config import Action
from models.requests import CreateResponseRequest
from models.responses import (
OpenAIResponse,
ForbiddenResponse,
UnauthorizedResponse,
QueryResponse,
)
from utils.endpoints import check_configuration_loaded
from utils.openai_mapping import (
map_openai_to_query_request,
map_query_to_openai_response,
)
from app.endpoints.query import retrieve_response

logger = logging.getLogger("app.endpoints.handlers")
router = APIRouter(tags=["responses"])

# Response definitions for OpenAPI documentation
responses_response_definitions: dict[int | str, dict[str, Any]] = {
200: {
"description": "OpenAI-compatible response generated successfully",
"model": OpenAIResponse,
},
400: {
"description": "Missing or invalid credentials provided by client",
"model": UnauthorizedResponse,
},
403: {
"description": "User is not authorized",
"model": ForbiddenResponse,
},
422: {
"description": "Request validation failed",
"content": {
"application/json": {
"example": {
"response": constants.UNABLE_TO_PROCESS_RESPONSE,
"cause": "Invalid input parameters or request format",
}
}
},
},
500: {
"description": "Internal server error",
"content": {
"application/json": {
"example": {
"response": "Unable to connect to Llama Stack",
"cause": "Connection error.",
}
}
},
},
}


@router.post("/responses", responses=responses_response_definitions)
@authorize(Action.RESPONSES)
async def responses_endpoint_handler(
request: Request, # pylint: disable=unused-argument
responses_request: CreateResponseRequest,
auth: Annotated[AuthTuple, Depends(get_auth_dependency())],
) -> OpenAIResponse:
"""
Handle request to the /responses endpoint.

Processes a POST request to the /responses endpoint, providing OpenAI-compatible
API responses while using Lightspeed's internal RAG and LLM integration.
Converts OpenAI request format to internal QueryRequest, processes it through
existing Lightspeed logic, and converts the response back to OpenAI format.

This endpoint maintains full compatibility with the OpenAI Responses API
specification while leveraging all existing Lightspeed functionality including
authentication, authorization, RAG database queries, and LLM integration.

Args:
request: FastAPI Request object containing HTTP request details.
responses_request: OpenAI-compatible request containing model, input, and options.
auth: Authentication tuple containing user information and token.

Returns:
OpenAIResponse: OpenAI-compatible response with generated content and metadata.

Raises:
HTTPException: For connection errors (500) or other processing failures.

Example:
```python
# Request
{
"model": "gpt-4",
"input": "What is Kubernetes?",
"instructions": "You are a helpful DevOps assistant"
}

# Response
{
"id": "resp_67ccd2bed1ec8190b14f964abc0542670bb6a6b452d3795b",
"object": "response",
"created_at": 1640995200,
"status": "completed",
"model": "gpt-4",
"output": [...],
"usage": {...},
"metadata": {"referenced_documents": [...]}
}
```
"""
check_configuration_loaded(configuration)

# Extract authentication details
user_id, _, _skip_userid_check, token = auth # pylint: disable=unused-variable

try:
# Convert OpenAI request to internal QueryRequest format
query_request = map_openai_to_query_request(responses_request)

# Get Llama Stack client and retrieve response using existing logic
client = AsyncLlamaStackClientHolder().get_client()

# For MVP simplicity, use default model/provider selection logic from query.py
# This will be enhanced in Phase 2 to support explicit model mapping
summary, conversation_id, referenced_documents, token_usage = (
await retrieve_response(
Review thread anchored on this call:

[Contributor] note this is not using the responses API from llamastack

[maysunfaisal, Contributor, Nov 3, 2025] Wouldn't this PR require moving Llama Stack to 0.3.x to use the new Llama Stack Responses API (https://llamastack.github.io/docs/api/agents)? The previous Llama Stack Agent APIs are deprecated (https://llamastack.github.io/docs/api-deprecated/agents), i.e., do we need an explicit LCORE /responses endpoint if we switch to Llama Stack 0.3.x?

[Reply to maysunfaisal] Depends on what level of completeness LCORE is willing to live with, @maysunfaisal. The responses API was introduced a few months ago in 0.2.x but was labeled work in progress, with some known bugs and missing pieces, so there could be some staging in play. But charting a roadmap that, after some intermediate stages, ends up having LCORE leverage the Llama Stack OpenAI-API-compatible endpoint, rather than the deprecated agent APIs or some other responses API endpoint, should be the end goal.

client,
responses_request.model, # Pass model directly for now
query_request,
token,
mcp_headers={}, # Empty for MVP
provider_id="", # Will be determined by existing logic
)
)

# Create QueryResponse structure from TurnSummary for mapping

internal_query_response = QueryResponse(
conversation_id=conversation_id,
response=summary.llm_response,
rag_chunks=[], # MVP: use empty list (summary.rag_chunks if available)
tool_calls=None, # MVP: simplified (summary.tool_calls if available)
referenced_documents=referenced_documents,
truncated=False, # MVP: default to False
input_tokens=token_usage.input_tokens,
output_tokens=token_usage.output_tokens,
available_quotas={}, # MVP: empty quotas
)

# Convert internal response to OpenAI format
openai_response = map_query_to_openai_response(
query_response=internal_query_response,
openai_request=responses_request,
)

return openai_response

except APIConnectionError as e:
# Update metrics for the LLM call failure
metrics.llm_calls_failures_total.inc()
logger.error("Unable to connect to Llama Stack: %s", e)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail={
"response": "Unable to connect to Llama Stack",
"cause": str(e),
},
) from e
except (ValueError, AttributeError, TypeError) as e:
# Handle validation and mapping errors
logger.error("Request validation or processing error: %s", e)
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail={
"response": constants.UNABLE_TO_PROCESS_RESPONSE,
"cause": f"Invalid input parameters or request format: {str(e)}",
},
) from e
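
As the review thread above points out, this handler routes the request through retrieve_response() from query.py rather than Llama Stack's own Responses API. For comparison, here is a minimal sketch of what the direct path could look like; the client.responses.create() call, its parameter names, and whether the pinned Llama Stack version exposes it at all are assumptions, not verified against this repository.

```python
# Hedged sketch only, not part of this PR: forwarding the OpenAI-shaped request
# straight to Llama Stack's Responses API. The client.responses.create() method,
# its parameters, and its availability in the pinned llama-stack-client version
# are assumptions.
from client import AsyncLlamaStackClientHolder
from models.requests import CreateResponseRequest


async def create_response_via_llama_stack(responses_request: CreateResponseRequest):
    """Forward the OpenAI-shaped request directly to Llama Stack."""
    client = AsyncLlamaStackClientHolder().get_client()
    return await client.responses.create(  # assumed method, see note above
        model=responses_request.model,
        input=responses_request.input,
        instructions=responses_request.instructions,
        temperature=responses_request.temperature,
        max_output_tokens=responses_request.max_output_tokens,
    )
```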
2 changes: 2 additions & 0 deletions src/app/routers.py
@@ -18,6 +18,7 @@
conversations_v2,
metrics,
tools,
responses,
)


@@ -35,6 +36,7 @@ def include_routers(app: FastAPI) -> None:
app.include_router(providers.router, prefix="/v1")
app.include_router(query.router, prefix="/v1")
app.include_router(streaming_query.router, prefix="/v1")
app.include_router(responses.router, prefix="/v1")
app.include_router(config.router, prefix="/v1")
app.include_router(feedback.router, prefix="/v1")
app.include_router(conversations.router, prefix="/v1")
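
With the router registered under the /v1 prefix above, the new endpoint is served at POST /v1/responses. A minimal client sketch follows; the base URL, port, and bearer-token header are assumptions about a local deployment, not values taken from this PR.

```python
# Minimal client sketch; base URL, port, and auth header are assumed, adjust to
# your deployment. Response fields mirror the example in the handler docstring.
import requests

payload = {
    "model": "gpt-4",
    "input": "What is Kubernetes?",
    "instructions": "You are a helpful DevOps assistant",
}

resp = requests.post(
    "http://localhost:8080/v1/responses",  # assumed local address
    json=payload,
    headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
    timeout=60,
)
resp.raise_for_status()
body = resp.json()
print(body["id"], body["status"])
```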
3 changes: 3 additions & 0 deletions src/models/config.py
@@ -350,6 +350,9 @@ class Action(str, Enum):
# Access the streaming query endpoint
STREAMING_QUERY = "streaming_query"

# Access the responses endpoint
RESPONSES = "responses"

# Access the conversation endpoint
GET_CONVERSATION = "get_conversation"

97 changes: 97 additions & 0 deletions src/models/requests.py
@@ -415,6 +415,103 @@ def get_value(self) -> bool:
return self.status


class CreateResponseRequest(BaseModel):
"""Model representing an OpenAI-compatible request for the Responses API.

This model follows the OpenAI API specification for the /v1/responses endpoint,
allowing clients to send requests in OpenAI format while maintaining internal
compatibility with Lightspeed's existing RAG and LLM integration.

Attributes:
model: The model to use for the response generation.
input: The input text or array of texts to process.
instructions: Optional instructions to guide the response generation.
temperature: Optional temperature for controlling randomness (0.0 to 2.0).
max_output_tokens: Optional maximum number of tokens in the response.

Example:
```python
request = CreateResponseRequest(
model="gpt-4",
input="What is Kubernetes?"
)
```
"""

model: str = Field(
description="The model to use for response generation",
examples=["gpt-4", "gpt-3.5-turbo"],
min_length=1,
)

input: str | list[str] = Field(
description="The input text or array of texts to process",
examples=["What is Kubernetes?", ["Explain containers", "How do they work?"]],
)

instructions: Optional[str] = Field(
None,
description="Optional instructions to guide the response generation",
examples=["You are a helpful DevOps assistant"],
)

temperature: Optional[float] = Field(
None,
description="Temperature for controlling randomness (0.0 to 2.0)",
examples=[0.7, 1.0],
ge=0.0,
le=2.0,
)

max_output_tokens: Optional[int] = Field(
None,
description="Maximum number of tokens in the response",
examples=[1000, 2000],
gt=0,
)

model_config = {
"extra": "forbid",
"json_schema_extra": {
"examples": [
{
"model": "gpt-4",
"input": "What is Kubernetes?",
},
{
"model": "gpt-3.5-turbo",
"input": "Explain Docker containers",
"instructions": "You are a helpful DevOps assistant",
"temperature": 0.7,
"max_output_tokens": 1000,
},
{
"model": "gpt-4",
"input": ["What is Kubernetes?", "How does it work?"],
"temperature": 0.5,
},
]
},
}

@field_validator("input")
@classmethod
def validate_input(cls, value: str | list[str]) -> str | list[str]:
"""Validate that input is not empty."""
if isinstance(value, str):
if not value.strip():
raise ValueError("Input string cannot be empty")
elif isinstance(value, list):
if not value:
raise ValueError("Input array cannot be empty")
for item in value:
if not isinstance(item, str) or not item.strip():
raise ValueError(
"All items in input array must be non-empty strings"
)
return value


class ConversationUpdateRequest(BaseModel):
"""Model representing a request to update a conversation topic summary.

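
A short usage sketch of the new CreateResponseRequest model and its validators, importing it from models.requests as the endpoint code does; only pydantic is needed beyond the model itself.

```python
from pydantic import ValidationError

from models.requests import CreateResponseRequest

# A well-formed request validates cleanly.
ok = CreateResponseRequest(model="gpt-4", input="What is Kubernetes?")
print(ok.input)  # "What is Kubernetes?"

try:
    CreateResponseRequest(model="gpt-4", input="   ")
except ValidationError as exc:
    # The input validator rejects blank strings (and empty arrays / blank items).
    print(exc.errors()[0]["msg"])

try:
    CreateResponseRequest(model="gpt-4", input="Explain Docker containers", unknown_field=1)
except ValidationError as exc:
    # model_config extra="forbid" rejects fields the schema does not define.
    print("extra fields are rejected")
```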