Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions src/app/diagnostic_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Minimal diagnostic FastAPI app for when configuration fails."""

from fastapi import FastAPI
from app.endpoints import health
import version


def create_diagnostic_app() -> FastAPI:
    """
    Build the stripped-down FastAPI application used in diagnostic mode.

    This fallback application is meant for the situation where loading the
    regular configuration fails: it registers the health router and nothing
    else, so the health endpoints remain reachable for troubleshooting.

    Returns:
        FastAPI: Application instance with only the health router included.
    """
    contact_details = {
        "name": "Red Hat",
        "url": "https://www.redhat.com/",
    }
    license_details = {
        "name": "Apache 2.0",
        "url": "https://www.apache.org/licenses/LICENSE-2.0.html",
    }

    diagnostic = FastAPI(
        title="Lightspeed Stack - Diagnostic Mode",
        summary="Minimal diagnostic server for troubleshooting",
        description="Limited service running in diagnostic mode due to configuration issues",
        version=version.__version__,
        contact=contact_details,
        license_info=license_details,
    )

    # The health router is the only router mounted on this application.
    diagnostic.include_router(health.router)
    return diagnostic


# Module-level diagnostic app instance, built once when this module is
# imported so it can be referenced by name (e.g. as an ASGI application).
diagnostic_app = create_diagnostic_app()
2 changes: 1 addition & 1 deletion src/app/endpoints/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@

logger = logging.getLogger(__name__)
router = APIRouter(tags=["config"])

auth_dependency = get_auth_dependency()



get_config_responses: dict[int | str, dict[str, Any]] = {
200: {
"name": "foo bar baz",
Expand Down
181 changes: 145 additions & 36 deletions src/app/endpoints/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,125 @@
"""

import logging
from typing import Annotated, Any
import re
from typing import Any, Dict, List

from llama_stack.providers.datatypes import HealthStatus

from fastapi import APIRouter, status, Response, Depends
from fastapi import APIRouter, status, Response
from client import AsyncLlamaStackClientHolder
from authentication.interface import AuthTuple
from authentication import get_auth_dependency
from authorization.middleware import authorize
from models.config import Action
from models.responses import (
LivenessResponse,
ReadinessResponse,
ProviderHealthStatus,
)
from configuration import configuration
from app.state import app_state

logger = logging.getLogger("app.endpoints.handlers")
router = APIRouter(tags=["health"])

auth_dependency = get_auth_dependency()
def find_unresolved_template_placeholders(obj: Any, path: str = "") -> List[tuple[str, str]]:
r"""
Recursively search for unresolved template placeholders in configuration.

Detects patterns like:
- ${VARIABLE_NAME} (basic template format)
- ${\{VARIABLE_NAME}} (malformed template)
- ${env.VARIABLE_NAME} (llama-stack format)

Returns list of (path, value) tuples for any unresolved placeholders.
"""
unresolved = []

# Patterns that indicate unresolved template placeholders
template_patterns = [
r'\$\{\\?\{[^}]+\}\\?\}', # Malformed: ${\{VARIABLE}} (check first)
r'\$\{env\.[^}]+\}', # llama-stack env: ${env.VARIABLE}
r'\$\{[^}]+\}', # Basic: ${VARIABLE} (check last)
]

def check_string_for_patterns(value: str, current_path: str):
"""Check if a string contains unresolved template patterns."""
for pattern in template_patterns:
matches = re.findall(pattern, value)
if matches:
unresolved.append((current_path, matches[0]))
break # Stop after first match to avoid duplicates

def walk_object(obj: Any, current_path: str = ""):
"""Recursively walk the configuration object."""
if isinstance(obj, dict):
for key, value in obj.items():
new_path = f"{current_path}.{key}" if current_path else key
walk_object(value, new_path)
elif isinstance(obj, list):
for i, item in enumerate(obj):
new_path = f"{current_path}[{i}]"
walk_object(item, new_path)
elif isinstance(obj, str):
check_string_for_patterns(obj, current_path)

walk_object(obj, path)
return unresolved


def check_comprehensive_readiness() -> tuple[bool, str]:
    """
    Validate configuration and initialization state for the readiness probe.

    Checks run in this order, and the first failure is returned:
    1. Configuration loading (``configuration.is_loaded()``)
    2. Unresolved template placeholders in the loaded configuration
    3. Application initialization state (``app_state.is_fully_initialized``)

    ``app_state.initialization_status`` is read as a mapping with an
    ``'errors'`` list of strings and a ``'checks'`` mapping of check name to
    a truthy/falsy result.

    Returns:
        tuple[bool, str]: ``(is_ready, detailed_reason)``. Never raises: any
        unexpected exception is logged and reported as a not-ready result.
    """

    def error_detail(error: str) -> str:
        """Return the text after the first ':' in *error*, or all of it."""
        _, separator, detail = error.partition(":")
        return detail.strip() if separator else error

    try:
        # Check 1: Configuration loading
        if not configuration.is_loaded():
            # Prefer the detailed error recorded in app_state, if there is one.
            init_status = app_state.initialization_status
            for error in init_status["errors"]:
                if "configuration" in error.lower():
                    return (
                        False,
                        f"Configuration loading failed: {error_detail(error)}",
                    )
            return False, "Configuration not loaded"

        # Check 2: Template placeholders (critical - causes pydantic errors)
        unresolved_placeholders = find_unresolved_template_placeholders(
            configuration.configuration
        )
        if unresolved_placeholders:
            # Report the first placeholder found as the example.
            example_path, example_value = unresolved_placeholders[0]
            count = len(unresolved_placeholders)
            if count == 1:
                return (
                    False,
                    f"Unresolved template placeholder in {example_path}: {example_value}",
                )
            return (
                False,
                f"Found {count} unresolved template placeholders (e.g., {example_path}: {example_value})",
            )

        # Check 3: Application initialization state
        if not app_state.is_fully_initialized:
            init_status = app_state.initialization_status
            failed_checks = [
                name for name, passed in init_status["checks"].items() if not passed
            ]

            # Return the first non-configuration error; configuration errors
            # belong to check 1 above.
            for error in init_status["errors"]:
                if "configuration" not in error.lower():
                    return False, f"Initialization failed: {error_detail(error)}"

            # Fall back to listing the checks that have not passed.
            if failed_checks:
                failed_names = [
                    check.replace("_", " ").title() for check in failed_checks
                ]
                return False, f"Incomplete initialization: {', '.join(failed_names)}"

            return False, "Application initialization not complete"

        return True, "Service ready"

    except Exception as e:  # the probe must always produce an answer
        logger.exception("Readiness check raised an unexpected error")
        return False, f"Readiness check error: {str(e)}"


async def get_providers_health_statuses() -> list[ProviderHealthStatus]:
Expand Down Expand Up @@ -78,40 +177,55 @@ async def get_providers_health_statuses() -> list[ProviderHealthStatus]:


@router.get("/readiness", responses=get_readiness_responses)
async def readiness_probe_get_method(
    response: Response,
) -> ReadinessResponse:
    """
    Enhanced readiness probe that validates complete application readiness.

    This probe performs comprehensive checks including:
    1. Configuration loading and validation (detects unresolved template placeholders)
    2. Application initialization state (startup sequence completion)
    3. LLM provider health status

    The probe helps detect issues like:
    - Configuration loading failures (pydantic validation errors)
    - Unresolved environment variables (${VARIABLE} patterns)
    - Incomplete application startup (llama client, MCP servers, etc.)
    - Provider connectivity problems

    Args:
        response: Outgoing response object; its status code is set to 503
            whenever a check fails.

    Returns:
        ReadinessResponse: ``ready=True`` with HTTP 200 when every check
        passes, otherwise ``ready=False`` with HTTP 503 and a reason
        describing the first failing check. ``providers`` lists the
        unhealthy providers only when the provider check is what failed.
    """
    logger.info("Response to /v1/readiness endpoint")

    # Configuration/initialization issues are critical - return immediately
    # without querying providers.
    config_and_init_ready, reason = check_comprehensive_readiness()
    if not config_and_init_ready:
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
        return ReadinessResponse(ready=False, reason=reason, providers=[])

    # Provider health check (only if configuration/initialization is ready)
    try:
        provider_statuses = await get_providers_health_statuses()
        # Only an explicit ERROR status counts as unhealthy.
        unhealthy_providers = [
            p for p in provider_statuses if p.status == HealthStatus.ERROR.value
        ]
    except Exception as e:  # a failing health query means "not ready"
        logger.exception("Provider health check failed")
        reason = f"Provider health check failed: {str(e)}"
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
        return ReadinessResponse(ready=False, reason=reason, providers=[])

    if unhealthy_providers:
        unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
        reason = f"Unhealthy providers: {', '.join(unhealthy_provider_names)}"
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
        return ReadinessResponse(
            ready=False, reason=reason, providers=unhealthy_providers
        )

    # All checks passed
    return ReadinessResponse(
        ready=True, reason="Application fully initialized and ready", providers=[]
    )


get_liveness_responses: dict[int | str, dict[str, Any]] = {
Expand All @@ -124,18 +238,13 @@ async def readiness_probe_get_method(


@router.get("/liveness", responses=get_liveness_responses)
@authorize(Action.INFO)
async def liveness_probe_get_method(
auth: Annotated[AuthTuple, Depends(auth_dependency)],
) -> LivenessResponse:
async def liveness_probe_get_method() -> LivenessResponse:
"""
Return the liveness status of the service.

Returns:
LivenessResponse: Indicates that the service is alive.
"""
# Used only for authorization
_ = auth

logger.info("Response to /v1/liveness endpoint")

Expand Down
1 change: 0 additions & 1 deletion src/app/endpoints/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

logger = logging.getLogger("app.endpoints.handlers")
router = APIRouter(tags=["info"])

auth_dependency = get_auth_dependency()


Expand Down
2 changes: 1 addition & 1 deletion src/app/endpoints/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
from metrics.utils import setup_model_metrics

router = APIRouter(tags=["metrics"])

auth_dependency = get_auth_dependency()



@router.get("/metrics", response_class=PlainTextResponse)
@authorize(Action.GET_METRICS)
async def metrics_endpoint_handler(
Expand Down
3 changes: 1 addition & 2 deletions src/app/endpoints/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,10 @@

logger = logging.getLogger(__name__)
router = APIRouter(tags=["models"])


auth_dependency = get_auth_dependency()



models_responses: dict[int | str, dict[str, Any]] = {
200: {
"models": [
Expand Down
1 change: 0 additions & 1 deletion src/app/endpoints/root.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

logger = logging.getLogger("app.endpoints.handlers")
router = APIRouter(tags=["root"])

auth_dependency = get_auth_dependency()


Expand Down
Loading
Loading
You are viewing a condensed version of this merge commit. You can view the full changes here.