Skip to content

Commit ee9f08f

Browse files
authored
Merge pull request #643 from maysunfaisal/rag-caching-2
[RHDHPAI-1143] Implement referenced_documents caching
2 parents d8e41a2 + 54c5c5a commit ee9f08f

File tree

17 files changed: 580 additions and 106 deletions

docs/openapi.json

Lines changed: 108 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1787,6 +1787,9 @@
17871787
},
17881788
"type": "array",
17891789
"title": "Byok Rag"
1790+
},
1791+
"quota_handlers": {
1792+
"$ref": "#/components/schemas/QuotaHandlersConfiguration"
17901793
}
17911794
},
17921795
"additionalProperties": false,
@@ -3590,6 +3593,103 @@
35903593
}
35913594
]
35923595
},
3596+
"QuotaHandlersConfiguration": {
3597+
"properties": {
3598+
"sqlite": {
3599+
"anyOf": [
3600+
{
3601+
"$ref": "#/components/schemas/SQLiteDatabaseConfiguration"
3602+
},
3603+
{
3604+
"type": "null"
3605+
}
3606+
]
3607+
},
3608+
"postgres": {
3609+
"anyOf": [
3610+
{
3611+
"$ref": "#/components/schemas/PostgreSQLDatabaseConfiguration"
3612+
},
3613+
{
3614+
"type": "null"
3615+
}
3616+
]
3617+
},
3618+
"limiters": {
3619+
"items": {
3620+
"$ref": "#/components/schemas/QuotaLimiterConfiguration"
3621+
},
3622+
"type": "array",
3623+
"title": "Limiters"
3624+
},
3625+
"scheduler": {
3626+
"$ref": "#/components/schemas/QuotaSchedulerConfiguration"
3627+
},
3628+
"enable_token_history": {
3629+
"type": "boolean",
3630+
"title": "Enable Token History",
3631+
"default": false
3632+
}
3633+
},
3634+
"additionalProperties": false,
3635+
"type": "object",
3636+
"title": "QuotaHandlersConfiguration",
3637+
"description": "Quota limiter configuration."
3638+
},
3639+
"QuotaLimiterConfiguration": {
3640+
"properties": {
3641+
"type": {
3642+
"type": "string",
3643+
"enum": [
3644+
"user_limiter",
3645+
"cluster_limiter"
3646+
],
3647+
"title": "Type"
3648+
},
3649+
"name": {
3650+
"type": "string",
3651+
"title": "Name"
3652+
},
3653+
"initial_quota": {
3654+
"type": "integer",
3655+
"minimum": 0.0,
3656+
"title": "Initial Quota"
3657+
},
3658+
"quota_increase": {
3659+
"type": "integer",
3660+
"minimum": 0.0,
3661+
"title": "Quota Increase"
3662+
},
3663+
"period": {
3664+
"type": "string",
3665+
"title": "Period"
3666+
}
3667+
},
3668+
"additionalProperties": false,
3669+
"type": "object",
3670+
"required": [
3671+
"type",
3672+
"name",
3673+
"initial_quota",
3674+
"quota_increase",
3675+
"period"
3676+
],
3677+
"title": "QuotaLimiterConfiguration",
3678+
"description": "Configuration for one quota limiter."
3679+
},
3680+
"QuotaSchedulerConfiguration": {
3681+
"properties": {
3682+
"period": {
3683+
"type": "integer",
3684+
"exclusiveMinimum": 0.0,
3685+
"title": "Period",
3686+
"default": 1
3687+
}
3688+
},
3689+
"type": "object",
3690+
"title": "QuotaSchedulerConfiguration",
3691+
"description": "Quota scheduler configuration."
3692+
},
35933693
"RAGChunk": {
35943694
"properties": {
35953695
"content": {
@@ -3691,15 +3791,19 @@
36913791
"description": "URL of the referenced document"
36923792
},
36933793
"doc_title": {
3694-
"type": "string",
3794+
"anyOf": [
3795+
{
3796+
"type": "string"
3797+
},
3798+
{
3799+
"type": "null"
3800+
}
3801+
],
36953802
"title": "Doc Title",
36963803
"description": "Title of the referenced document"
36973804
}
36983805
},
36993806
"type": "object",
3700-
"required": [
3701-
"doc_title"
3702-
],
37033807
"title": "ReferencedDocument",
37043808
"description": "Model representing a document referenced in generating a response.\n\nAttributes:\n doc_url: Url to the referenced doc.\n doc_title: Title of the referenced doc."
37053809
},

src/app/endpoints/conversations_v2.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -314,13 +314,19 @@ def check_conversation_existence(user_id: str, conversation_id: str) -> None:
314314

315315
def transform_chat_message(entry: CacheEntry) -> dict[str, Any]:
316316
"""Transform the message read from cache into format used by response payload."""
317+
user_message = {"content": entry.query, "type": "user"}
318+
assistant_message: dict[str, Any] = {"content": entry.response, "type": "assistant"}
319+
320+
# If referenced_documents exist on the entry, add them to the assistant message
321+
if entry.referenced_documents is not None:
322+
assistant_message["referenced_documents"] = [
323+
doc.model_dump(mode="json") for doc in entry.referenced_documents
324+
]
325+
317326
return {
318327
"provider": entry.provider,
319328
"model": entry.model,
320-
"messages": [
321-
{"content": entry.query, "type": "user"},
322-
{"content": entry.response, "type": "assistant"},
323-
],
329+
"messages": [user_message, assistant_message],
324330
"started_at": entry.started_at,
325331
"completed_at": entry.completed_at,
326332
}

src/app/endpoints/query.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@
3131
from authorization.middleware import authorize
3232
from client import AsyncLlamaStackClientHolder
3333
from configuration import configuration
34+
from models.cache_entry import CacheEntry
3435
from models.config import Action
3536
from models.database.conversations import UserConversation
3637
from models.requests import Attachment, QueryRequest
@@ -331,16 +332,22 @@ async def query_endpoint_handler( # pylint: disable=R0914
331332
)
332333

333334
completed_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
335+
336+
cache_entry = CacheEntry(
337+
query=query_request.query,
338+
response=summary.llm_response,
339+
provider=provider_id,
340+
model=model_id,
341+
started_at=started_at,
342+
completed_at=completed_at,
343+
referenced_documents=referenced_documents if referenced_documents else None,
344+
)
345+
334346
store_conversation_into_cache(
335347
configuration,
336348
user_id,
337349
conversation_id,
338-
provider_id,
339-
model_id,
340-
query_request.query,
341-
summary.llm_response,
342-
started_at,
343-
completed_at,
350+
cache_entry,
344351
_skip_userid_check,
345352
topic_summary,
346353
)

src/app/endpoints/streaming_query.py

Lines changed: 20 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -43,12 +43,14 @@
4343
from constants import DEFAULT_RAG_TOOL, MEDIA_TYPE_JSON, MEDIA_TYPE_TEXT
4444
import metrics
4545
from metrics.utils import update_llm_token_count_from_turn
46+
from models.cache_entry import CacheEntry
4647
from models.config import Action
4748
from models.database.conversations import UserConversation
4849
from models.requests import QueryRequest
4950
from models.responses import ForbiddenResponse, UnauthorizedResponse
5051
from utils.endpoints import (
5152
check_configuration_loaded,
53+
create_referenced_documents_with_metadata,
5254
create_rag_chunks_dict,
5355
get_agent,
5456
get_system_prompt,
@@ -863,16 +865,28 @@ async def response_generator(
863865
)
864866

865867
completed_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
868+
869+
referenced_documents = create_referenced_documents_with_metadata(
870+
summary, metadata_map
871+
)
872+
873+
cache_entry = CacheEntry(
874+
query=query_request.query,
875+
response=summary.llm_response,
876+
provider=provider_id,
877+
model=model_id,
878+
started_at=started_at,
879+
completed_at=completed_at,
880+
referenced_documents=(
881+
referenced_documents if referenced_documents else None
882+
),
883+
)
884+
866885
store_conversation_into_cache(
867886
configuration,
868887
user_id,
869888
conversation_id,
870-
provider_id,
871-
model_id,
872-
query_request.query,
873-
summary.llm_response,
874-
started_at,
875-
completed_at,
889+
cache_entry,
876890
_skip_userid_check,
877891
topic_summary,
878892
)

src/cache/cache.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,8 @@
22

33
from abc import ABC, abstractmethod
44

5-
from models.cache_entry import CacheEntry, ConversationData
5+
from models.cache_entry import CacheEntry
6+
from models.responses import ConversationData
67
from utils.suid import check_suid
78

89

src/cache/in_memory_cache.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
"""In-memory cache implementation."""
22

33
from cache.cache import Cache
4-
from models.cache_entry import CacheEntry, ConversationData
4+
from models.cache_entry import CacheEntry
55
from models.config import InMemoryCacheConfig
6+
from models.responses import ConversationData
67
from log import get_logger
78
from utils.connection_decorator import connection
89

src/cache/noop_cache.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
"""No-operation cache implementation."""
22

33
from cache.cache import Cache
4-
from models.cache_entry import CacheEntry, ConversationData
4+
from models.cache_entry import CacheEntry
5+
from models.responses import ConversationData
56
from log import get_logger
67
from utils.connection_decorator import connection
78

0 commit comments

Comments
 (0)