lightspeed-core · zszabo-rh · Sep 25, 2025
diff --git a/src/app/diagnostic_app.py b/src/app/diagnostic_app.py
@@ -0,0 +1,40 @@
+"""Minimal diagnostic FastAPI app for when configuration fails."""
+
+from fastapi import FastAPI
+from app.endpoints import health
+import version
+
+
+def create_diagnostic_app() -> FastAPI:
+    """
+    Build the fallback FastAPI application that runs in diagnostic mode.
+
+    Intended for use when configuration loading fails: only the health
+    router is mounted, so health reporting remains available.
+
+    Returns:
+        FastAPI: Application exposing nothing but the health endpoints
+    """
+    contact_details = {
+        "name": "Red Hat",
+        "url": "https://www.redhat.com/",
+    }
+    license_details = {
+        "name": "Apache 2.0",
+        "url": "https://www.apache.org/licenses/LICENSE-2.0.html",
+    }
+    diagnostic = FastAPI(
+        title="Lightspeed Stack - Diagnostic Mode",
+        summary="Minimal diagnostic server for troubleshooting",
+        description="Limited service running in diagnostic mode due to configuration issues",
+        version=version.__version__,
+        contact=contact_details,
+        license_info=license_details,
+    )
+    # The health router is the only one mounted - no authentication required
+    diagnostic.include_router(health.router)
+    return diagnostic
+
+
+# Module-level diagnostic app instance, built once when this module is imported
+diagnostic_app = create_diagnostic_app()
diff --git a/src/app/endpoints/config.py b/src/app/endpoints/config.py
@@ -14,10 +14,10 @@
 
 logger = logging.getLogger(__name__)
 router = APIRouter(tags=["config"])
-
 auth_dependency = get_auth_dependency()
 
 
+
 get_config_responses: dict[int | str, dict[str, Any]] = {
     200: {
         "name": "foo bar baz",

diff --git a/src/app/endpoints/health.py b/src/app/endpoints/health.py
@@ -6,26 +6,125 @@
 """
 
 import logging
-from typing import Annotated, Any
+import re
+from typing import Any, Dict, List
 
 from llama_stack.providers.datatypes import HealthStatus
 
-from fastapi import APIRouter, status, Response, Depends
+from fastapi import APIRouter, status, Response
 from client import AsyncLlamaStackClientHolder
-from authentication.interface import AuthTuple
-from authentication import get_auth_dependency
-from authorization.middleware import authorize
-from models.config import Action
 from models.responses import (
     LivenessResponse,
     ReadinessResponse,
     ProviderHealthStatus,
 )
+from configuration import configuration
+from app.state import app_state
 
 logger = logging.getLogger("app.endpoints.handlers")
 router = APIRouter(tags=["health"])
 
-auth_dependency = get_auth_dependency()
+def find_unresolved_template_placeholders(obj: Any, path: str = "") -> List[tuple[str, str]]:
+    r"""
+    Collect configuration strings that still contain template placeholders.
+
+    Dicts and lists are traversed depth-first; every string found is tested
+    against these placeholder shapes, in this order:
+    - ${\{VARIABLE_NAME}} (malformed template)
+    - ${env.VARIABLE_NAME} (llama-stack format)
+    - ${VARIABLE_NAME} (basic template format)
+
+    Returns a list of (path, placeholder) tuples with at most one entry per
+    string: the first hit of the first shape that matches.
+    """
+    # Most specific shape first, the generic ${...} shape last
+    placeholder_regexes = (
+        r'\$\{\\?\{[^}]+\}\\?\}',
+        r'\$\{env\.[^}]+\}',
+        r'\$\{[^}]+\}',
+    )
+
+    def first_placeholder(text: str):
+        """Return the first hit of the first matching shape, or None."""
+        for regex in placeholder_regexes:
+            hits = re.findall(regex, text)
+            if hits:
+                return hits[0]
+        return None
+
+    def iter_strings(node: Any, location: str):
+        """Yield (location, string) for each string reachable from node."""
+        if isinstance(node, str):
+            yield location, node
+        elif isinstance(node, dict):
+            for name, child in node.items():
+                yield from iter_strings(child, f"{location}.{name}" if location else name)
+        elif isinstance(node, list):
+            for index, child in enumerate(node):
+                yield from iter_strings(child, f"{location}[{index}]")
+
+    located_hits = (
+        (location, first_placeholder(text)) for location, text in iter_strings(obj, path)
+    )
+    return [(location, hit) for location, hit in located_hits if hit is not None]
+
+
+def check_comprehensive_readiness() -> tuple[bool, str]:
+    """
+    Validate configuration and initialization state for the readiness probe.
+
+    Checks run in this order, stopping at the first failure:
+    1. Configuration loading
+    2. Template placeholder resolution
+    3. Application initialization state
+
+    Returns:
+        tuple[bool, str]: (is_ready, detailed_reason)
+    """
+    def error_detail(message: str) -> str:
+        """Return the text after the first ':', or the whole message if none."""
+        _, colon, tail = message.partition(':')
+        return tail.strip() if colon else message
+
+    try:
+        # Check 1: Configuration loading
+        if not configuration.is_loaded():
+            for error in app_state.initialization_status['errors']:
+                if 'configuration' in error.lower():
+                    return False, f"Configuration loading failed: {error_detail(error)}"
+            return False, "Configuration not loaded"
+
+        # Check 2: Template placeholders (critical - causes pydantic errors)
+        unresolved_placeholders = find_unresolved_template_placeholders(configuration.configuration)
+        if unresolved_placeholders:
+            example_path, example_value = unresolved_placeholders[0]
+            count = len(unresolved_placeholders)
+            if count == 1:
+                return False, f"Unresolved template placeholder in {example_path}: {example_value}"
+            return False, f"Found {count} unresolved template placeholders (e.g., {example_path}: {example_value})"
+
+        # Check 3: Application initialization state
+        if not app_state.is_fully_initialized:
+            init_status = app_state.initialization_status
+            failed_checks = [k for k, v in init_status['checks'].items() if not v]
+
+            # Report the first error not tied to configuration (check 1 covers those)
+            for error in init_status['errors']:
+                if 'configuration' not in error.lower():
+                    return False, f"Initialization failed: {error_detail(error)}"
+
+            # Fallback to listing failed checks
+            if failed_checks:
+                failed_names = [check.replace('_', ' ').title() for check in failed_checks]
+                return False, f"Incomplete initialization: {', '.join(failed_names)}"
+
+            return False, "Application initialization not complete"
+
+        return True, "Service ready"
+
+    except Exception as e:
+        # Probe boundary: an unexpected failure is reported as "not ready" with its message
+        return False, f"Readiness check error: {str(e)}"
 
 
 async def get_providers_health_statuses() -> list[ProviderHealthStatus]:
@@ -78,40 +177,55 @@ async def get_providers_health_statuses() -> list[ProviderHealthStatus]:
 
 
 @router.get("/readiness", responses=get_readiness_responses)
-@authorize(Action.INFO)
 async def readiness_probe_get_method(
-    auth: Annotated[AuthTuple, Depends(auth_dependency)],
     response: Response,
 ) -> ReadinessResponse:
     """
-    Handle the readiness probe endpoint, returning service readiness.
-
-    If any provider reports an error status, responds with HTTP 503
-    and details of unhealthy providers; otherwise, indicates the
-    service is ready.
+    Report whether the application is ready to serve requests.
+
+    Checks run in this order; the first one that fails ends the probe:
+    1. Configuration and initialization state (check_comprehensive_readiness)
+    2. LLM provider health (get_providers_health_statuses)
+
+    The probe helps detect issues like:
+    - Configuration loading failures (pydantic validation errors)
+    - Unresolved environment variables (${VARIABLE} patterns)
+    - Incomplete application startup (llama client, MCP servers, etc.)
+    - Provider connectivity problems
+
+    Returns 200 when fully ready, 503 when any issues are detected.
+    On failure `reason` names the cause; `providers` lists the unhealthy
+    providers when a provider reports an error status and is empty otherwise.
     """
-    # Used only for authorization
-    _ = auth
-
     logger.info("Response to /v1/readiness endpoint")
 
-    provider_statuses = await get_providers_health_statuses()
+    # Comprehensive configuration and initialization check
+    config_and_init_ready, reason = check_comprehensive_readiness()
+    if not config_and_init_ready:
+        # Configuration/initialization issues are critical - return immediately
+        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
+        return ReadinessResponse(ready=False, reason=reason, providers=[])
 
-    # Check if any provider is unhealthy (not counting not_implemented as unhealthy)
-    unhealthy_providers = [
-        p for p in provider_statuses if p.status == HealthStatus.ERROR.value
-    ]
+    # Provider health check (only if configuration/initialization is ready)
+    try:
+        provider_statuses = await get_providers_health_statuses()
+        unhealthy_providers = [
+            p for p in provider_statuses if p.status == HealthStatus.ERROR.value
+        ]
+
+        if unhealthy_providers:
+            unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
+            reason = f"Unhealthy providers: {', '.join(unhealthy_provider_names)}"
+            response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
+            return ReadinessResponse(ready=False, reason=reason, providers=unhealthy_providers)
 
-    if unhealthy_providers:
-        ready = False
-        unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
-        reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}"
+    except Exception as e:
+        reason = f"Provider health check failed: {str(e)}"
         response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
-    else:
-        ready = True
-        reason = "All providers are healthy"
+        return ReadinessResponse(ready=False, reason=reason, providers=[])
 
-    return ReadinessResponse(ready=ready, reason=reason, providers=unhealthy_providers)
+    # All checks passed
+    return ReadinessResponse(ready=True, reason="Application fully initialized and ready", providers=[])
 
 
 get_liveness_responses: dict[int | str, dict[str, Any]] = {
@@ -124,18 +238,13 @@ async def readiness_probe_get_method(
 
 
 @router.get("/liveness", responses=get_liveness_responses)
-@authorize(Action.INFO)
-async def liveness_probe_get_method(
-    auth: Annotated[AuthTuple, Depends(auth_dependency)],
-) -> LivenessResponse:
+async def liveness_probe_get_method() -> LivenessResponse:
     """
     Return the liveness status of the service.
 
     Returns:
         LivenessResponse: Indicates that the service is alive.
     """
-    # Used only for authorization
-    _ = auth
 
     logger.info("Response to /v1/liveness endpoint")
 

diff --git a/src/app/endpoints/info.py b/src/app/endpoints/info.py
@@ -18,7 +18,6 @@
 
 logger = logging.getLogger("app.endpoints.handlers")
 router = APIRouter(tags=["info"])
-
 auth_dependency = get_auth_dependency()
 
 

diff --git a/src/app/endpoints/metrics.py b/src/app/endpoints/metrics.py
@@ -15,10 +15,10 @@
 from metrics.utils import setup_model_metrics
 
 router = APIRouter(tags=["metrics"])
-
 auth_dependency = get_auth_dependency()
 
 
+
 @router.get("/metrics", response_class=PlainTextResponse)
 @authorize(Action.GET_METRICS)
 async def metrics_endpoint_handler(

diff --git a/src/app/endpoints/models.py b/src/app/endpoints/models.py
@@ -18,11 +18,10 @@
 
 logger = logging.getLogger(__name__)
 router = APIRouter(tags=["models"])
-
-
 auth_dependency = get_auth_dependency()
 
 
+
 models_responses: dict[int | str, dict[str, Any]] = {
     200: {
         "models": [

diff --git a/src/app/endpoints/root.py b/src/app/endpoints/root.py
@@ -13,7 +13,6 @@
 
 logger = logging.getLogger("app.endpoints.handlers")
 router = APIRouter(tags=["root"])
-
 auth_dependency = get_auth_dependency()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -18,7 +18,6 @@

		logger = logging.getLogger("app.endpoints.handlers")
		router = APIRouter(tags=["info"])

		auth_dependency = get_auth_dependency()


Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -13,7 +13,6 @@

		logger = logging.getLogger("app.endpoints.handlers")
		router = APIRouter(tags=["root"])

		auth_dependency = get_auth_dependency()


Expand Down