Commit 9a91944

feat: Convert lightspeed-core to async architecture
- Migrate endpoints from sync to async handlers
- Remove legacy sync client infrastructure
- Update unit tests for async compatibility

This resolves blocking behavior in all endpoints except streaming_query, which was already async, enabling proper concurrent request handling.

Signed-off-by: Eran Cohen <[email protected]>
1 parent a3b530d commit 9a91944

File tree: 18 files changed (+313, -381 lines)
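
For context, the concurrency gain described in the commit message comes from how FastAPI schedules handlers: a plain `def` handler runs in a limited threadpool, while an `async def` handler runs on the event loop and yields whenever it awaits I/O. A minimal, self-contained sketch of that difference (not code from this repository; both routes are invented for the demo):

    # Illustration only: the sync-vs-async handler difference the commit relies on.
    import asyncio
    import time

    from fastapi import FastAPI

    app = FastAPI()


    @app.get("/blocking")
    def blocking_handler() -> dict[str, str]:
        # A plain `def` handler is run in FastAPI's threadpool; a slow call here
        # ties up one of a limited number of worker threads.
        time.sleep(1)
        return {"status": "done"}


    @app.get("/non-blocking")
    async def non_blocking_handler() -> dict[str, str]:
        # An `async def` handler runs on the event loop and yields control while
        # awaiting, so many requests can be in flight concurrently.
        await asyncio.sleep(1)
        return {"status": "done"}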

scripts/generate_openapi_schema.py

Lines changed: 4 additions & 2 deletions

@@ -10,13 +10,15 @@
 # it is needed to read proper configuration in order to start the app to generate schema
 from configuration import configuration
 
-from client import LlamaStackClientHolder
+from client import AsyncLlamaStackClientHolder
 
 cfg_file = "lightspeed-stack.yaml"
 configuration.load_configuration(cfg_file)
 
 # Llama Stack client needs to be loaded before REST API is fully initialized
-LlamaStackClientHolder().load(configuration.configuration.llama_stack)
+import asyncio  # noqa: E402
+
+asyncio.run(AsyncLlamaStackClientHolder().load(configuration.configuration.llama_stack))
 
 from app.main import app  # noqa: E402 pylint: disable=C0413
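
The schema generator stays a synchronous, module-level script, so the async holder's load() coroutine has to be driven to completion with asyncio.run() before the app import. A small sketch of that pattern, with prepare_client standing in for the real AsyncLlamaStackClientHolder().load(...) call:

    # Sketch: driving an async initialiser from synchronous module-level code.
    # `prepare_client` is a made-up stand-in, not the project's API.
    import asyncio


    async def prepare_client(url: str) -> str:
        # Pretend to do async setup and return a handle.
        await asyncio.sleep(0)
        return f"client-for-{url}"


    # asyncio.run() creates an event loop, runs the coroutine to completion,
    # and closes the loop; after this line the handle is usable from sync code.
    client_handle = asyncio.run(prepare_client("http://localhost:8321"))
    print(client_handle)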

src/app/endpoints/authorized.py

Lines changed: 5 additions & 4 deletions

@@ -1,10 +1,9 @@
 """Handler for REST API call to authorized endpoint."""
 
-import asyncio
 import logging
 from typing import Any
 
-from fastapi import APIRouter, Request
+from fastapi import APIRouter, Depends
 
 from auth import get_auth_dependency
 from models.responses import AuthorizedResponse, UnauthorizedResponse, ForbiddenResponse
@@ -31,8 +30,10 @@
 
 
 @router.post("/authorized", responses=authorized_responses)
-def authorized_endpoint_handler(_request: Request) -> AuthorizedResponse:
+async def authorized_endpoint_handler(
+    auth: Any = Depends(auth_dependency),
+) -> AuthorizedResponse:
     """Handle request to the /authorized endpoint."""
     # Ignore the user token, we should not return it in the response
-    user_id, user_name, _ = asyncio.run(auth_dependency(_request))
+    user_id, user_name, _ = auth
     return AuthorizedResponse(user_id=user_id, username=user_name)
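
The handler no longer calls asyncio.run() on the auth dependency inside the request path; FastAPI awaits the dependency itself via Depends and hands the result to the async handler. A self-contained sketch of that shape (fake_auth_dependency is invented for illustration; the project builds its dependency with get_auth_dependency):

    from typing import Any

    from fastapi import Depends, FastAPI

    app = FastAPI()


    async def fake_auth_dependency() -> tuple[str, str, str]:
        # An async dependency may await I/O (token introspection, access review, ...).
        return ("user-123", "alice", "secret-token")


    @app.post("/authorized")
    async def authorized_endpoint_handler(
        auth: Any = Depends(fake_auth_dependency),
    ) -> dict[str, str]:
        # FastAPI awaits the dependency before calling the handler, so no
        # asyncio.run() is needed inside the request.
        user_id, user_name, _ = auth
        return {"user_id": user_id, "username": user_name}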

src/app/endpoints/conversations.py

Lines changed: 9 additions & 7 deletions

@@ -7,7 +7,7 @@
 
 from fastapi import APIRouter, HTTPException, status, Depends
 
-from client import LlamaStackClientHolder
+from client import AsyncLlamaStackClientHolder
 from configuration import configuration
 from models.responses import ConversationResponse, ConversationDeleteResponse
 from auth import get_auth_dependency
@@ -110,7 +110,7 @@ def simplify_session_data(session_data: dict) -> list[dict[str, Any]]:
 
 
 @router.get("/conversations/{conversation_id}", responses=conversation_responses)
-def get_conversation_endpoint_handler(
+async def get_conversation_endpoint_handler(
     conversation_id: str,
     _auth: Any = Depends(auth_dependency),
 ) -> ConversationResponse:
@@ -132,9 +132,9 @@ def get_conversation_endpoint_handler(
     logger.info("Retrieving conversation %s", conversation_id)
 
     try:
-        client = LlamaStackClientHolder().get_client()
+        client = AsyncLlamaStackClientHolder().get_client()
 
-        session_data = client.agents.session.list(agent_id=agent_id).data[0]
+        session_data = (await client.agents.session.list(agent_id=agent_id)).data[0]
 
         logger.info("Successfully retrieved conversation %s", conversation_id)
 
@@ -179,7 +179,7 @@ def get_conversation_endpoint_handler(
 @router.delete(
     "/conversations/{conversation_id}", responses=conversation_delete_responses
 )
-def delete_conversation_endpoint_handler(
+async def delete_conversation_endpoint_handler(
     conversation_id: str,
     _auth: Any = Depends(auth_dependency),
 ) -> ConversationDeleteResponse:
@@ -201,10 +201,12 @@ def delete_conversation_endpoint_handler(
 
     try:
         # Get Llama Stack client
-        client = LlamaStackClientHolder().get_client()
+        client = AsyncLlamaStackClientHolder().get_client()
         # Delete session using the conversation_id as session_id
         # In this implementation, conversation_id and session_id are the same
-        client.agents.session.delete(agent_id=agent_id, session_id=conversation_id)
+        await client.agents.session.delete(
+            agent_id=agent_id, session_id=conversation_id
+        )
 
         logger.info("Successfully deleted conversation %s", conversation_id)
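
Both conversation handlers now await the async client. A generic sketch of the await-plus-error-mapping shape (the except block below is illustrative, not copied from this file, whose own error handling lies outside the hunks shown):

    from fastapi import HTTPException, status
    from llama_stack_client import APIConnectionError, AsyncLlamaStackClient


    async def delete_session(
        client: AsyncLlamaStackClient, agent_id: str, session_id: str
    ) -> None:
        try:
            # The async client returns a coroutine; without `await` nothing is sent.
            await client.agents.session.delete(agent_id=agent_id, session_id=session_id)
        except APIConnectionError as e:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Unable to connect to Llama Stack",
            ) from e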

src/app/endpoints/models.py

Lines changed: 4 additions & 4 deletions

@@ -6,7 +6,7 @@
 from llama_stack_client import APIConnectionError
 from fastapi import APIRouter, HTTPException, Request, status
 
-from client import LlamaStackClientHolder
+from client import AsyncLlamaStackClientHolder
 from configuration import configuration
 from models.responses import ModelsResponse
 from utils.endpoints import check_configuration_loaded
@@ -43,7 +43,7 @@
 
 
 @router.get("/models", responses=models_responses)
-def models_endpoint_handler(_request: Request) -> ModelsResponse:
+async def models_endpoint_handler(_request: Request) -> ModelsResponse:
     """Handle requests to the /models endpoint."""
     check_configuration_loaded(configuration)
 
@@ -52,9 +52,9 @@ def models_endpoint_handler(_request: Request) -> ModelsResponse:
 
     try:
         # try to get Llama Stack client
-        client = LlamaStackClientHolder().get_client()
+        client = AsyncLlamaStackClientHolder().get_client()
         # retrieve models
-        models = client.models.list()
+        models = await client.models.list()
         m = [dict(m) for m in models]
         return ModelsResponse(models=m)

src/app/endpoints/query.py

Lines changed: 29 additions & 25 deletions

@@ -8,9 +8,9 @@
 from pathlib import Path
 from typing import Any
 
-from llama_stack_client.lib.agents.agent import Agent
+from llama_stack_client.lib.agents.agent import AsyncAgent
 from llama_stack_client import APIConnectionError
-from llama_stack_client import LlamaStackClient  # type: ignore
+from llama_stack_client import AsyncLlamaStackClient  # type: ignore
 from llama_stack_client.types import UserMessage, Shield  # type: ignore
 from llama_stack_client.types.agents.turn_create_params import (
     ToolgroupAgentToolGroupWithArgs,
@@ -20,7 +20,7 @@
 
 from fastapi import APIRouter, HTTPException, status, Depends
 
-from client import LlamaStackClientHolder
+from client import AsyncLlamaStackClientHolder
 from configuration import configuration
 import metrics
 from models.responses import QueryResponse, UnauthorizedResponse, ForbiddenResponse
@@ -68,27 +68,27 @@ def is_transcripts_enabled() -> bool:
     return configuration.user_data_collection_configuration.transcripts_enabled
 
 
-def get_agent(  # pylint: disable=too-many-arguments,too-many-positional-arguments
-    client: LlamaStackClient,
+async def get_agent(  # pylint: disable=too-many-arguments,too-many-positional-arguments
+    client: AsyncLlamaStackClient,
     model_id: str,
     system_prompt: str,
     available_input_shields: list[str],
     available_output_shields: list[str],
     conversation_id: str | None,
     no_tools: bool = False,
-) -> tuple[Agent, str, str]:
+) -> tuple[AsyncAgent, str, str]:
     """Get existing agent or create a new one with session persistence."""
     existing_agent_id = None
     if conversation_id:
         with suppress(ValueError):
-            existing_agent_id = client.agents.retrieve(
-                agent_id=conversation_id
+            existing_agent_id = (
+                await client.agents.retrieve(agent_id=conversation_id)
             ).agent_id
 
     logger.debug("Creating new agent")
     # TODO(lucasagomes): move to ReActAgent
-    agent = Agent(
-        client,
+    agent = AsyncAgent(
+        client,  # type: ignore[arg-type]
         model=model_id,
         instructions=system_prompt,
         input_shields=available_input_shields if available_input_shields else [],
@@ -98,20 +98,20 @@ def get_agent(  # pylint: disable=too-many-arguments,too-many-positional-argumen
     )
     if existing_agent_id and conversation_id:
         orphan_agent_id = agent.agent_id
-        agent.agent_id = conversation_id
-        client.agents.delete(agent_id=orphan_agent_id)
-        sessions_response = client.agents.session.list(agent_id=conversation_id)
+        agent.agent_id = conversation_id  # type: ignore[misc]
+        await client.agents.delete(agent_id=orphan_agent_id)
+        sessions_response = await client.agents.session.list(agent_id=conversation_id)
         logger.info("session response: %s", sessions_response)
         session_id = str(sessions_response.data[0]["session_id"])
     else:
         conversation_id = agent.agent_id
-        session_id = agent.create_session(get_suid())
+        session_id = await agent.create_session(get_suid())
 
     return agent, conversation_id, session_id
 
 
 @router.post("/query", responses=query_response)
-def query_endpoint_handler(
+async def query_endpoint_handler(
     query_request: QueryRequest,
     auth: Any = Depends(auth_dependency),
     mcp_headers: dict[str, dict[str, str]] = Depends(mcp_headers_dependency),
@@ -126,11 +126,11 @@ def query_endpoint_handler(
 
     try:
         # try to get Llama Stack client
-        client = LlamaStackClientHolder().get_client()
+        client = AsyncLlamaStackClientHolder().get_client()
         model_id, provider_id = select_model_and_provider_id(
-            client.models.list(), query_request
+            await client.models.list(), query_request
         )
-        response, conversation_id = retrieve_response(
+        response, conversation_id = await retrieve_response(
             client,
             model_id,
             query_request,
@@ -250,19 +250,21 @@ def is_input_shield(shield: Shield) -> bool:
     return _is_inout_shield(shield) or not is_output_shield(shield)
 
 
-def retrieve_response(  # pylint: disable=too-many-locals
-    client: LlamaStackClient,
+async def retrieve_response(  # pylint: disable=too-many-locals
+    client: AsyncLlamaStackClient,
     model_id: str,
     query_request: QueryRequest,
     token: str,
     mcp_headers: dict[str, dict[str, str]] | None = None,
 ) -> tuple[str, str]:
     """Retrieve response from LLMs and agents."""
     available_input_shields = [
-        shield.identifier for shield in filter(is_input_shield, client.shields.list())
+        shield.identifier
+        for shield in filter(is_input_shield, await client.shields.list())
     ]
     available_output_shields = [
-        shield.identifier for shield in filter(is_output_shield, client.shields.list())
+        shield.identifier
+        for shield in filter(is_output_shield, await client.shields.list())
     ]
     if not available_input_shields and not available_output_shields:
         logger.info("No available shields. Disabling safety")
@@ -281,7 +283,7 @@ def retrieve_response(  # pylint: disable=too-many-locals
     if query_request.attachments:
         validate_attachments_metadata(query_request.attachments)
 
-    agent, conversation_id, session_id = get_agent(
+    agent, conversation_id, session_id = await get_agent(
         client,
         model_id,
         system_prompt,
@@ -315,15 +317,17 @@ def retrieve_response(  # pylint: disable=too-many-locals
         ),
     }
 
-    vector_db_ids = [vector_db.identifier for vector_db in client.vector_dbs.list()]
+    vector_db_ids = [
+        vector_db.identifier for vector_db in await client.vector_dbs.list()
+    ]
     toolgroups = (get_rag_toolgroups(vector_db_ids) or []) + [
         mcp_server.name for mcp_server in configuration.mcp_servers
     ]
     # Convert empty list to None for consistency with existing behavior
     if not toolgroups:
         toolgroups = None
 
-    response = agent.create_turn(
+    response = await agent.create_turn(
         messages=[UserMessage(role="user", content=query_request.query)],
         session_id=session_id,
         documents=query_request.get_documents(),
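
get_agent() and retrieve_response() now drive AsyncAgent end to end: retrieve or create the agent, create a session, then create a turn, awaiting each step. A condensed, hypothetical walk-through of that flow; the model id, prompt, session name, and stream flag are placeholders, and shields, toolgroups, and error handling are omitted:

    from llama_stack_client import AsyncLlamaStackClient
    from llama_stack_client.lib.agents.agent import AsyncAgent
    from llama_stack_client.types import UserMessage


    async def one_turn(client: AsyncLlamaStackClient, question: str):
        agent = AsyncAgent(
            client,  # type: ignore[arg-type]
            model="my-model-id",  # placeholder model identifier
            instructions="You are a helpful assistant.",  # placeholder prompt
        )
        # Every AsyncAgent method that talks to the server must be awaited.
        session_id = await agent.create_session("example-session")
        response = await agent.create_turn(
            messages=[UserMessage(role="user", content=question)],
            session_id=session_id,
            stream=False,  # assumed here so a single, complete turn is returned
        )
        return response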

src/client.py

Lines changed: 1 addition & 38 deletions

@@ -6,52 +6,15 @@
 
 from llama_stack.distribution.library_client import (
     AsyncLlamaStackAsLibraryClient,  # type: ignore
-    LlamaStackAsLibraryClient,  # type: ignore
 )
-from llama_stack_client import AsyncLlamaStackClient, LlamaStackClient  # type: ignore
+from llama_stack_client import AsyncLlamaStackClient  # type: ignore
 from models.config import LlamaStackConfiguration
 from utils.types import Singleton
 
 
 logger = logging.getLogger(__name__)
 
 
-class LlamaStackClientHolder(metaclass=Singleton):
-    """Container for an initialised LlamaStackClient."""
-
-    _lsc: Optional[LlamaStackClient] = None
-
-    def load(self, llama_stack_config: LlamaStackConfiguration) -> None:
-        """Retrieve Llama stack client according to configuration."""
-        if llama_stack_config.use_as_library_client is True:
-            if llama_stack_config.library_client_config_path is not None:
-                logger.info("Using Llama stack as library client")
-                client = LlamaStackAsLibraryClient(
-                    llama_stack_config.library_client_config_path
-                )
-                client.initialize()
-                self._lsc = client
-            else:
-                msg = "Configuration problem: library_client_config_path option is not set"
-                logger.error(msg)
-                # tisnik: use custom exception there - with cause etc.
-                raise ValueError(msg)
-
-        else:
-            logger.info("Using Llama stack running as a service")
-            self._lsc = LlamaStackClient(
-                base_url=llama_stack_config.url, api_key=llama_stack_config.api_key
-            )
-
-    def get_client(self) -> LlamaStackClient:
-        """Return an initialised LlamaStackClient."""
-        if not self._lsc:
-            raise RuntimeError(
-                "LlamaStackClient has not been initialised. Ensure 'load(..)' has been called."
-            )
-        return self._lsc
-
-
 class AsyncLlamaStackClientHolder(metaclass=Singleton):
     """Container for an initialised AsyncLlamaStackClient."""
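
Only the async holder remains. A self-contained illustration of the holder pattern it uses, a Singleton metaclass plus a load()/get_client() pair; the metaclass here is assumed to mirror utils.types.Singleton, and the "client" is a plain string so the sketch runs anywhere:

    import asyncio
    from typing import Any, Optional


    class Singleton(type):
        """Return one shared instance per class (assumed stand-in for utils.types.Singleton)."""

        _instances: dict[type, Any] = {}

        def __call__(cls, *args: Any, **kwargs: Any) -> Any:
            if cls not in cls._instances:
                cls._instances[cls] = super().__call__(*args, **kwargs)
            return cls._instances[cls]


    class AsyncClientHolder(metaclass=Singleton):
        """Container for a lazily initialised async client (stand-in for AsyncLlamaStackClientHolder)."""

        _client: Optional[str] = None

        async def load(self, base_url: str) -> None:
            await asyncio.sleep(0)  # pretend to do async setup
            self._client = f"async-client({base_url})"

        def get_client(self) -> str:
            if not self._client:
                raise RuntimeError("Client has not been initialised. Call 'load(...)' first.")
            return self._client


    async def main() -> None:
        await AsyncClientHolder().load("http://localhost:8321")
        # The same instance is returned everywhere, so get_client() sees the load.
        print(AsyncClientHolder().get_client())


    asyncio.run(main())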

src/lightspeed_stack.py

Lines changed: 1 addition & 3 deletions

@@ -12,7 +12,7 @@
 from runners.uvicorn import start_uvicorn
 from runners.data_collector import start_data_collector
 from configuration import configuration
-from client import LlamaStackClientHolder, AsyncLlamaStackClientHolder
+from client import AsyncLlamaStackClientHolder
 
 FORMAT = "%(message)s"
 logging.basicConfig(
@@ -69,8 +69,6 @@ def main() -> None:
     logger.info(
         "Llama stack configuration: %s", configuration.llama_stack_configuration
     )
-    logger.info("Creating LlamaStackClient")
-    LlamaStackClientHolder().load(configuration.configuration.llama_stack)
     logger.info("Creating AsyncLlamaStackClient")
     asyncio.run(
         AsyncLlamaStackClientHolder().load(configuration.configuration.llama_stack)

src/metrics/utils.py

Lines changed: 2 additions & 6 deletions

@@ -1,7 +1,7 @@
 """Utility functions for metrics handling."""
 
 from configuration import configuration
-from client import LlamaStackClientHolder, AsyncLlamaStackClientHolder
+from client import AsyncLlamaStackClientHolder
 from log import get_logger
 import metrics
 from utils.common import run_once_async
@@ -13,11 +13,7 @@
 async def setup_model_metrics() -> None:
     """Perform setup of all metrics related to LLM model and provider."""
     logger.info("Setting up model metrics")
-    model_list = []
-    if configuration.llama_stack_configuration.use_as_library_client:
-        model_list = await AsyncLlamaStackClientHolder().get_client().models.list()
-    else:
-        model_list = LlamaStackClientHolder().get_client().models.list()
+    model_list = await AsyncLlamaStackClientHolder().get_client().models.list()
 
     models = [
         model
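
setup_model_metrics() keeps its run_once_async import from utils.common. Purely as illustration, a guess at what such a once-only guard for a coroutine function can look like; the repository's actual helper may differ:

    import asyncio
    from collections.abc import Awaitable, Callable
    from functools import wraps
    from typing import Any


    def run_once_async(func: Callable[..., Awaitable[Any]]) -> Callable[..., Awaitable[Any]]:
        """Run the wrapped coroutine function on the first call only; later calls are no-ops."""
        has_run = False

        @wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> Any:
            nonlocal has_run
            if has_run:
                return None
            has_run = True
            return await func(*args, **kwargs)

        return wrapper


    @run_once_async
    async def setup() -> None:
        print("expensive setup runs once")


    async def main() -> None:
        await setup()
        await setup()  # second call is skipped


    asyncio.run(main())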

src/models/config.py

Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ def check_llama_stack_model(self) -> Self:
         if self.library_client_config_path is None:
             # pylint: disable=line-too-long
             raise ValueError(
-                "LLama stack library client mode is enabled but a configuration file path is not specified"  # noqa: C0301
+                "LLama stack library client mode is enabled but a configuration file path is not specified"  # noqa: E501
             )
         # the configuration file must exists and be regular readable file
         checks.file_check(
