Delay setup_model_metrics() until /metrics is called

umago · umago · commit 576341d24820 · 2025-08-01T14:35:33.000+01:00
This patch moves the setup_model_metrics() call from service startup to
the first time the /metrics endpoint is called. This speeds up
lightspeed-stack service initialization and also makes lightspeed-stack
more resilient to service initialization order, because it no longer
requires llama-stack to be started first (setup_model_metrics() connects
to llama-stack and fetches the list of models from it).

Signed-off-by: Lucas Alvares Gomes <lucasagomes@gmail.com>
diff --git a/src/app/endpoints/metrics.py b/src/app/endpoints/metrics.py
@@ -7,10 +7,15 @@
     CONTENT_TYPE_LATEST,
 )
 
+from metrics.utils import setup_model_metrics
+
 router = APIRouter(tags=["metrics"])
 
 
 @router.get("/metrics", response_class=PlainTextResponse)
-def metrics_endpoint_handler(_request: Request) -> PlainTextResponse:
+async def metrics_endpoint_handler(_request: Request) -> PlainTextResponse:
     """Handle request to the /metrics endpoint."""
+    # Setup the model metrics if not already done. This is a one-time setup
+    # and will not be run again on subsequent calls to this endpoint
+    await setup_model_metrics()
     return PlainTextResponse(generate_latest(), media_type=CONTENT_TYPE_LATEST)
diff --git a/src/app/main.py b/src/app/main.py
@@ -10,7 +10,6 @@
 from configuration import configuration
 from log import get_logger
 import metrics
-from metrics.utils import setup_model_metrics
 from utils.common import register_mcp_servers_async
 import version
 
@@ -81,6 +80,4 @@ async def startup_event() -> None:
     logger.info("Registering MCP servers")
     await register_mcp_servers_async(logger, configuration.configuration)
     get_logger("app.endpoints.handlers")
-    logger.info("Setting up model metrics")
-    await setup_model_metrics()
     logger.info("App startup complete")
diff --git a/src/metrics/utils.py b/src/metrics/utils.py
@@ -4,12 +4,15 @@
 from client import LlamaStackClientHolder, AsyncLlamaStackClientHolder
 from log import get_logger
 import metrics
+from utils.common import run_once_async
 
 logger = get_logger(__name__)
 
 
+@run_once_async
 async def setup_model_metrics() -> None:
     """Perform setup of all metrics related to LLM model and provider."""
+    logger.info("Setting up model metrics")
     model_list = []
     if configuration.llama_stack_configuration.use_as_library_client:
         model_list = await AsyncLlamaStackClientHolder().get_client().models.list()
@@ -48,3 +51,4 @@ async def setup_model_metrics() -> None:
                 model_name,
                 default_model_value,
             )
+    logger.info("Model metrics setup complete")
diff --git a/src/utils/common.py b/src/utils/common.py
@@ -1,10 +1,11 @@
 """Common utilities for the project."""
 
-from typing import Any, List, cast
+import asyncio
+from functools import wraps
 from logging import Logger
+from typing import Any, List, cast, Callable
 
 from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
-
 from llama_stack.distribution.library_client import (
     AsyncLlamaStackAsLibraryClient,
 )
@@ -103,3 +104,18 @@ def _register_mcp_toolgroups_sync(
 
             client.toolgroups.register(**registration_params)
             logger.debug("MCP server %s registered successfully", mcp.name)
+
+
+def run_once_async(func: Callable) -> Callable:
+    """Decorate an async function to run only once."""
+    task = None
+
+    @wraps(func)
+    async def wrapper(*args: Any, **kwargs: Any) -> Any:
+        nonlocal task
+        if task is None:
+            loop = asyncio.get_running_loop()
+            task = loop.create_task(func(*args, **kwargs))
+        return await task
+
+    return wrapper
diff --git a/tests/unit/app/endpoints/test_metrics.py b/tests/unit/app/endpoints/test_metrics.py
@@ -3,15 +3,20 @@
 from app.endpoints.metrics import metrics_endpoint_handler
 
 
-def test_metrics_endpoint():
+async def test_metrics_endpoint(mocker):
     """Test the metrics endpoint handler."""
-    response = metrics_endpoint_handler(None)
+    mock_setup_metrics = mocker.patch(
+        "app.endpoints.metrics.setup_model_metrics", return_value=None
+    )
+    response = await metrics_endpoint_handler(None)
     assert response is not None
     assert response.status_code == 200
     assert "text/plain" in response.headers["Content-Type"]
 
     response_body = response.body.decode()
 
+    # Assert metrics were set up
+    mock_setup_metrics.assert_called_once()
     # Check if the response contains Prometheus metrics format
     assert "# TYPE ls_rest_api_calls_total counter" in response_body
     assert "# TYPE ls_response_duration_seconds histogram" in response_body