192 changes: 192 additions & 0 deletions src/app/endpoints/responses.py
@@ -0,0 +1,192 @@
"""Handler for REST API call to provide OpenAI-compatible responses endpoint."""

import logging
from typing import Annotated, Any

from fastapi import APIRouter, Depends, HTTPException, Request, status
from llama_stack_client import APIConnectionError

import constants
import metrics
from authentication import get_auth_dependency
from authentication.interface import AuthTuple
from authorization.middleware import authorize
from client import AsyncLlamaStackClientHolder
from configuration import configuration
from models.config import Action
from models.requests import CreateResponseRequest
from models.responses import (
OpenAIResponse,
ForbiddenResponse,
UnauthorizedResponse,
QueryResponse,
)
from utils.endpoints import check_configuration_loaded
from utils.openai_mapping import (
map_openai_to_query_request,
map_query_to_openai_response,
)
from app.endpoints.query import retrieve_response

logger = logging.getLogger("app.endpoints.handlers")
router = APIRouter(tags=["responses"])

# Response definitions for OpenAPI documentation
responses_response_definitions: dict[int | str, dict[str, Any]] = {
200: {
"description": "OpenAI-compatible response generated successfully",
"model": OpenAIResponse,
},
400: {
"description": "Missing or invalid credentials provided by client",
"model": UnauthorizedResponse,
},
403: {
"description": "User is not authorized",
"model": ForbiddenResponse,
},
422: {
"description": "Request validation failed",
"content": {
"application/json": {
"example": {
"response": constants.UNABLE_TO_PROCESS_RESPONSE,
"cause": "Invalid input parameters or request format",
}
}
},
},
500: {
"description": "Internal server error",
"content": {
"application/json": {
"example": {
"response": "Unable to connect to Llama Stack",
"cause": "Connection error.",
}
}
},
},
}


@router.post("/responses", responses=responses_response_definitions)
@authorize(Action.RESPONSES)
async def responses_endpoint_handler(
request: Request, # pylint: disable=unused-argument
responses_request: CreateResponseRequest,
auth: Annotated[AuthTuple, Depends(get_auth_dependency())],
) -> OpenAIResponse:
"""
Handle request to the /responses endpoint.

Processes a POST request to the /responses endpoint, providing OpenAI-compatible
API responses while using Lightspeed's internal RAG and LLM integration.
Converts OpenAI request format to internal QueryRequest, processes it through
existing Lightspeed logic, and converts the response back to OpenAI format.

This endpoint maintains full compatibility with the OpenAI Responses API
specification while leveraging all existing Lightspeed functionality including
authentication, authorization, RAG database queries, and LLM integration.

Args:
request: FastAPI Request object containing HTTP request details.
responses_request: OpenAI-compatible request containing model, input, and options.
auth: Authentication tuple containing user information and token.

Returns:
OpenAIResponse: OpenAI-compatible response with generated content and metadata.

Raises:
HTTPException: For connection errors (500) or other processing failures.

Example:
```python
# Request
{
"model": "gpt-4",
"input": "What is Kubernetes?",
"instructions": "You are a helpful DevOps assistant"
}

# Response
{
"id": "resp_67ccd2bed1ec8190b14f964abc0542670bb6a6b452d3795b",
"object": "response",
"created_at": 1640995200,
"status": "completed",
"model": "gpt-4",
"output": [...],
"usage": {...},
"metadata": {"referenced_documents": [...]}
}
```
"""
check_configuration_loaded(configuration)

# Extract authentication details
user_id, _, _skip_userid_check, token = auth # pylint: disable=unused-variable

try:
# Convert OpenAI request to internal QueryRequest format
query_request = map_openai_to_query_request(responses_request)

# Get Llama Stack client and retrieve response using existing logic
client = AsyncLlamaStackClientHolder().get_client()

# For MVP simplicity, use default model/provider selection logic from query.py
# This will be enhanced in Phase 2 to support explicit model mapping
summary, conversation_id, referenced_documents, token_usage = (
await retrieve_response(
Review thread anchored on this call:

[Contributor] note this is not using the responses API from llamastack

[maysunfaisal, Contributor, Nov 3, 2025] Wouldn't this PR require moving Llama Stack to 0.3.x to use the new Llama Stack Responses API (https://llamastack.github.io/docs/api/agents)? The previous Llama Stack Agent APIs are deprecated (https://llamastack.github.io/docs/api-deprecated/agents), i.e., do we need an explicit LCORE /responses endpoint if we switch to Llama Stack 0.3.x?

[Reply to maysunfaisal] Depends on what level of completeness LCORE is willing to live with, @maysunfaisal. The responses API was introduced a few months ago in 0.2.x but was labeled work in progress, with some known bugs and missing pieces, so there could be some staging in play. But charting a roadmap that, after some intermediate stages, ends up having LCORE leverage the Llama Stack OpenAI-API-compatible endpoint, rather than the deprecated agent APIs or some other responses API endpoint, should be the end goal.

client,
responses_request.model, # Pass model directly for now
query_request,
token,
mcp_headers={}, # Empty for MVP
provider_id="", # Will be determined by existing logic
)
)

# Create QueryResponse structure from TurnSummary for mapping

internal_query_response = QueryResponse(
conversation_id=conversation_id,
response=summary.llm_response,
rag_chunks=[], # MVP: use empty list (summary.rag_chunks if available)
tool_calls=None, # MVP: simplified (summary.tool_calls if available)
referenced_documents=referenced_documents,
truncated=False, # MVP: default to False
input_tokens=token_usage.input_tokens,
output_tokens=token_usage.output_tokens,
available_quotas={}, # MVP: empty quotas
)

# Convert internal response to OpenAI format
openai_response = map_query_to_openai_response(
query_response=internal_query_response,
openai_request=responses_request,
)

return openai_response

except APIConnectionError as e:
# Update metrics for the LLM call failure
metrics.llm_calls_failures_total.inc()
logger.error("Unable to connect to Llama Stack: %s", e)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail={
"response": "Unable to connect to Llama Stack",
"cause": str(e),
},
) from e
except (ValueError, AttributeError, TypeError) as e:
# Handle validation and mapping errors
logger.error("Request validation or processing error: %s", e)
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail={
"response": constants.UNABLE_TO_PROCESS_RESPONSE,
"cause": f"Invalid input parameters or request format: {str(e)}",
},
) from e
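
As the review thread above points out, this handler routes the request through retrieve_response() from query.py rather than Llama Stack's own Responses API. For comparison, here is a minimal sketch of what the direct path could look like; the client.responses.create() call, its parameter names, and whether the pinned Llama Stack version exposes it at all are assumptions, not verified against this repository.

```python
# Hedged sketch only, not part of this PR: forwarding the OpenAI-shaped request
# straight to Llama Stack's Responses API. The client.responses.create() method,
# its parameters, and its availability in the pinned llama-stack-client version
# are assumptions.
from client import AsyncLlamaStackClientHolder
from models.requests import CreateResponseRequest


async def create_response_via_llama_stack(responses_request: CreateResponseRequest):
    """Forward the OpenAI-shaped request directly to Llama Stack."""
    client = AsyncLlamaStackClientHolder().get_client()
    return await client.responses.create(  # assumed method, see note above
        model=responses_request.model,
        input=responses_request.input,
        instructions=responses_request.instructions,
        temperature=responses_request.temperature,
        max_output_tokens=responses_request.max_output_tokens,
    )
```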
2 changes: 2 additions & 0 deletions src/app/routers.py
@@ -18,6 +18,7 @@
conversations_v2,
metrics,
tools,
responses,
)


@@ -35,6 +36,7 @@ def include_routers(app: FastAPI) -> None:
app.include_router(providers.router, prefix="/v1")
app.include_router(query.router, prefix="/v1")
app.include_router(streaming_query.router, prefix="/v1")
app.include_router(responses.router, prefix="/v1")
app.include_router(config.router, prefix="/v1")
app.include_router(feedback.router, prefix="/v1")
app.include_router(conversations.router, prefix="/v1")
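
With the router registered under the /v1 prefix above, the new endpoint is served at POST /v1/responses. A minimal client sketch follows; the base URL, port, and bearer-token header are assumptions about a local deployment, not values taken from this PR.

```python
# Minimal client sketch; base URL, port, and auth header are assumed, adjust to
# your deployment. Response fields mirror the example in the handler docstring.
import requests

payload = {
    "model": "gpt-4",
    "input": "What is Kubernetes?",
    "instructions": "You are a helpful DevOps assistant",
}

resp = requests.post(
    "http://localhost:8080/v1/responses",  # assumed local address
    json=payload,
    headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
    timeout=60,
)
resp.raise_for_status()
body = resp.json()
print(body["id"], body["status"])
```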
3 changes: 3 additions & 0 deletions src/models/config.py
@@ -350,6 +350,9 @@ class Action(str, Enum):
# Access the streaming query endpoint
STREAMING_QUERY = "streaming_query"

# Access the responses endpoint
RESPONSES = "responses"

# Access the conversation endpoint
GET_CONVERSATION = "get_conversation"

97 changes: 97 additions & 0 deletions src/models/requests.py
@@ -415,6 +415,103 @@ def get_value(self) -> bool:
return self.status


class CreateResponseRequest(BaseModel):
"""Model representing an OpenAI-compatible request for the Responses API.

This model follows the OpenAI API specification for the /v1/responses endpoint,
allowing clients to send requests in OpenAI format while maintaining internal
compatibility with Lightspeed's existing RAG and LLM integration.

Attributes:
model: The model to use for the response generation.
input: The input text or array of texts to process.
instructions: Optional instructions to guide the response generation.
temperature: Optional temperature for controlling randomness (0.0 to 2.0).
max_output_tokens: Optional maximum number of tokens in the response.

Example:
```python
request = CreateResponseRequest(
model="gpt-4",
input="What is Kubernetes?"
)
```
"""

model: str = Field(
description="The model to use for response generation",
examples=["gpt-4", "gpt-3.5-turbo"],
min_length=1,
)

input: str | list[str] = Field(
description="The input text or array of texts to process",
examples=["What is Kubernetes?", ["Explain containers", "How do they work?"]],
)

instructions: Optional[str] = Field(
None,
description="Optional instructions to guide the response generation",
examples=["You are a helpful DevOps assistant"],
)

temperature: Optional[float] = Field(
None,
description="Temperature for controlling randomness (0.0 to 2.0)",
examples=[0.7, 1.0],
ge=0.0,
le=2.0,
)

max_output_tokens: Optional[int] = Field(
None,
description="Maximum number of tokens in the response",
examples=[1000, 2000],
gt=0,
)

model_config = {
"extra": "forbid",
"json_schema_extra": {
"examples": [
{
"model": "gpt-4",
"input": "What is Kubernetes?",
},
{
"model": "gpt-3.5-turbo",
"input": "Explain Docker containers",
"instructions": "You are a helpful DevOps assistant",
"temperature": 0.7,
"max_output_tokens": 1000,
},
{
"model": "gpt-4",
"input": ["What is Kubernetes?", "How does it work?"],
"temperature": 0.5,
},
]
},
}

@field_validator("input")
@classmethod
def validate_input(cls, value: str | list[str]) -> str | list[str]:
"""Validate that input is not empty."""
if isinstance(value, str):
if not value.strip():
raise ValueError("Input string cannot be empty")
elif isinstance(value, list):
if not value:
raise ValueError("Input array cannot be empty")
for item in value:
if not isinstance(item, str) or not item.strip():
raise ValueError(
"All items in input array must be non-empty strings"
)
return value


class ConversationUpdateRequest(BaseModel):
"""Model representing a request to update a conversation topic summary.

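
A short usage sketch of the new CreateResponseRequest model and its validators, importing it from models.requests as the endpoint code does; only pydantic is needed beyond the model itself.

```python
from pydantic import ValidationError

from models.requests import CreateResponseRequest

# A well-formed request validates cleanly.
ok = CreateResponseRequest(model="gpt-4", input="What is Kubernetes?")
print(ok.input)  # "What is Kubernetes?"

try:
    CreateResponseRequest(model="gpt-4", input="   ")
except ValidationError as exc:
    # The input validator rejects blank strings (and empty arrays / blank items).
    print(exc.errors()[0]["msg"])

try:
    CreateResponseRequest(model="gpt-4", input="Explain Docker containers", unknown_field=1)
except ValidationError as exc:
    # model_config extra="forbid" rejects fields the schema does not define.
    print("extra fields are rejected")
```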