-
Notifications
You must be signed in to change notification settings - Fork 52
update store transcript function #447
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,29 +5,28 @@ | |
| """ | ||
|
|
||
| from datetime import UTC, datetime | ||
| import fcntl | ||
| import json | ||
| import logging | ||
| import os | ||
| from pathlib import Path | ||
|
|
||
| from configuration import configuration | ||
| from models.requests import Attachment, QueryRequest | ||
| from utils.suid import get_suid | ||
| from utils.types import TurnSummary | ||
|
|
||
| logger = logging.getLogger("utils.transcripts") | ||
|
|
||
|
|
||
| def construct_transcripts_path(user_id: str, conversation_id: str) -> Path: | ||
| def construct_transcripts_path(user_id: str) -> Path: | ||
| """Construct path to transcripts.""" | ||
| # these two normalizations are required by Snyk as it detects | ||
| # this Path sanitization pattern | ||
| uid = os.path.normpath("/" + user_id).lstrip("/") | ||
| cid = os.path.normpath("/" + conversation_id).lstrip("/") | ||
| file_path = ( | ||
| configuration.user_data_collection_configuration.transcripts_storage or "" | ||
| ) | ||
| return Path(file_path, uid, cid) | ||
| return Path(file_path, uid) | ||
|
|
||
|
|
||
| def store_transcript( # pylint: disable=too-many-arguments,too-many-positional-arguments,too-many-locals | ||
|
|
@@ -45,9 +44,14 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- | |
| ) -> None: | ||
| """Store transcript in the local filesystem. | ||
|
|
||
| All turns for a single conversation are stored in the same file, | ||
| named after the conversation_id. | ||
|
|
||
| Args: | ||
| user_id: The user ID (UUID). | ||
| conversation_id: The conversation ID (UUID). | ||
| model_id: The model ID. | ||
| provider_id: The provider ID. | ||
| query_is_valid: The result of the query validation. | ||
| query: The query (without attachments). | ||
| query_request: The request containing a query. | ||
|
|
@@ -56,17 +60,18 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- | |
| truncated: The flag indicating if the history was truncated. | ||
| attachments: The list of `Attachment` objects. | ||
| """ | ||
| transcripts_path = construct_transcripts_path(user_id, conversation_id) | ||
| transcripts_path = construct_transcripts_path(user_id) | ||
| transcripts_path.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| data_to_store = { | ||
| # Use conversation_id as filename instead of random UUID | ||
| transcript_file_path = transcripts_path / f"{conversation_id}.json" | ||
| # Prepare turn data | ||
| turn_data = { | ||
| "metadata": { | ||
| "provider": provider_id, | ||
| "model": model_id, | ||
| "query_provider": query_request.provider, | ||
| "query_model": query_request.model, | ||
| "user_id": user_id, | ||
| "conversation_id": conversation_id, | ||
| "timestamp": datetime.now(UTC).isoformat(), | ||
| }, | ||
| "redacted_query": query, | ||
|
|
@@ -78,9 +83,39 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- | |
| "tool_calls": [tc.model_dump() for tc in summary.tool_calls], | ||
| } | ||
|
|
||
| # stores feedback in a file under unique uuid | ||
| transcript_file_path = transcripts_path / f"{get_suid()}.json" | ||
| with open(transcript_file_path, "w", encoding="utf-8") as transcript_file: | ||
| json.dump(data_to_store, transcript_file) | ||
| # Use file locking to handle concurrent writes safely | ||
| with open(transcript_file_path, "a+", encoding="utf-8") as transcript_file: | ||
| fcntl.flock(transcript_file.fileno(), fcntl.LOCK_EX) | ||
| try: | ||
| # Move to beginning to read existing content | ||
| transcript_file.seek(0) | ||
| file_content = transcript_file.read() | ||
| if file_content.strip(): | ||
| # File has existing content, load it | ||
| transcript_file.seek(0) | ||
| conversation_data = json.load(transcript_file) | ||
| else: | ||
| # First turn for this conversation | ||
| conversation_data = { | ||
| "conversation_metadata": { | ||
| "conversation_id": conversation_id, | ||
| "user_id": user_id, | ||
| "created_at": datetime.now(UTC).isoformat(), | ||
| "last_updated": datetime.now(UTC).isoformat(), | ||
| }, | ||
| "turns": [], | ||
| } | ||
| # Add new turn | ||
| conversation_data["turns"].append(turn_data) | ||
| conversation_data["conversation_metadata"]["last_updated"] = datetime.now( | ||
| UTC | ||
| ).isoformat() | ||
|
|
||
| # Write updated data back to file | ||
| transcript_file.seek(0) | ||
| transcript_file.truncate() | ||
| json.dump(conversation_data, transcript_file, indent=2) | ||
| finally: | ||
| fcntl.flock(transcript_file.fileno(), fcntl.LOCK_UN) | ||
|
|
||
| logger.info("Transcript successfully stored at: %s", transcript_file_path) | ||
| logger.info("Transcript turn successfully stored at: %s", transcript_file_path) | ||
|
Comment on lines
+86
to
+121
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion: Harden against corrupted/partial JSON and improve crash-safety.
Apply minimal recovery + atomic write: @@
- with open(transcript_file_path, "a+", encoding="utf-8") as transcript_file:
+ with open(transcript_file_path, "a+", encoding="utf-8") as transcript_file:
fcntl.flock(transcript_file.fileno(), fcntl.LOCK_EX)
try:
# Move to beginning to read existing content
transcript_file.seek(0)
file_content = transcript_file.read()
- if file_content.strip():
- # File has existing content, load it
- transcript_file.seek(0)
- conversation_data = json.load(transcript_file)
+ if file_content.strip():
+ # File has existing content, load it (recover if corrupted)
+ transcript_file.seek(0)
+ try:
+ conversation_data = json.load(transcript_file)
+ except json.JSONDecodeError:
+ logger.warning("Corrupted transcript detected at %s; backing up and reinitializing.", transcript_file_path)
+ backup_path = transcript_file_path.with_suffix(
+ transcript_file_path.suffix + f".corrupt-{datetime.now(UTC).strftime('%Y%m%dT%H%M%S%fZ')}"
+ )
+ try:
+ # Use the content we already read for backup
+ with open(backup_path, "w", encoding="utf-8") as backup:
+ backup.write(file_content)
+ except Exception: # best-effort
+ logger.exception("Failed to write backup for corrupted transcript: %s", backup_path)
+ conversation_data = {
+ "conversation_metadata": {
+ "conversation_id": conversation_id,
+ "user_id": user_id,
+ "created_at": datetime.now(UTC).isoformat(),
+ "last_updated": datetime.now(UTC).isoformat(),
+ },
+ "turns": [],
+ }
else:
# First turn for this conversation
conversation_data = {
@@
- # Write updated data back to file
- transcript_file.seek(0)
- transcript_file.truncate()
- json.dump(conversation_data, transcript_file, indent=2)
+ # Write updated data back to file atomically
+ import tempfile
+ with tempfile.NamedTemporaryFile(
+ "w", dir=transcripts_path, prefix=f".{conversation_id}.", suffix=".tmp", delete=False, encoding="utf-8"
+ ) as tmp:
+ json.dump(conversation_data, tmp, indent=2, ensure_ascii=False)
+ tmp.flush()
+ os.fsync(tmp.fileno())
+ tmp_path = Path(tmp.name)
+ os.replace(tmp_path, transcript_file_path)
finally:
fcntl.flock(transcript_file.fileno(), fcntl.LOCK_UN)

Note: We keep the lock on the original file descriptor during the prepare/replace window; if readers exist, consider enforcing shared locks for reads, or a separate lock file for stricter semantics. If you don't want atomic replace, at minimum keep the JSONDecodeError recovery.
🤖 Prompt for AI Agents |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Path traversal via conversation_id filename — sanitize or validate before using
conversation_id is interpolated directly into the filename. Inputs like "../x" or "foo/../../bar" would escape the user directory and write elsewhere. Sanitize like user_id or enforce a strict format (UUID/SUID).
Apply:
Optionally, if a SUID/UUID utility exists, prefer a strict validator instead of path heuristics.
🤖 Prompt for AI Agents