Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ nltk
chroma
container.db
.next-build
.cursor
.cursor
venv/
44 changes: 40 additions & 4 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,43 @@ RUN apt-get update && apt-get install -y \
curl \
libreoffice \
fontconfig \
imagemagick
imagemagick \
ca-certificates \
fonts-liberation \
libasound2 \
libatk-bridge2.0-0 \
libatk1.0-0 \
libc6 \
libcairo2 \
libcups2 \
libdbus-1-3 \
libexpat1 \
libfontconfig1 \
libgbm1 \
libgcc1 \
libglib2.0-0 \
libgtk-3-0 \
libnspr4 \
libnss3 \
libpango-1.0-0 \
libpangocairo-1.0-0 \
libstdc++6 \
libx11-6 \
libx11-xcb1 \
libxcb1 \
libxcomposite1 \
libxcursor1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxi6 \
libxrandr2 \
libxrender1 \
libxss1 \
libxtst6 \
lsb-release \
wget \
xdg-utils

RUN sed -i 's/rights="none" pattern="PDF"/rights="read|write" pattern="PDF"/' /etc/ImageMagick-6/policy.xml

Expand All @@ -31,10 +67,10 @@ RUN curl -fsSL http://ollama.com/install.sh | sh

# Install dependencies for FastAPI
RUN pip install aiohttp aiomysql aiosqlite asyncpg fastapi[standard] \
pathvalidate pdfplumber chromadb sqlmodel \
pathvalidate pdfplumber chromadb sqlmodel pgvector \
anthropic google-genai openai fastmcp \
python-jose[cryptography] passlib
RUN pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
python-jose[cryptography] passlib numpy onnxruntime transformers
RUN pip install docling --find-links https://download.pytorch.org/whl/cpu

# Install dependencies for Next.js
WORKDIR /node_dependencies
Expand Down
23 changes: 21 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,10 @@ services:
- TOOL_CALLS=${TOOL_CALLS}
- DISABLE_THINKING=${DISABLE_THINKING}
- WEB_GROUNDING=${WEB_GROUNDING}
- DATABASE_URL=${DATABASE_URL}
- DATABASE_URL=postgresql://postgres:postgres@postgres:5432/presenton
- DISABLE_ANONYMOUS_TRACKING=${DISABLE_ANONYMOUS_TRACKING}
depends_on:
- postgres

development-gpu:
build:
Expand Down Expand Up @@ -136,5 +138,22 @@ services:
- TOOL_CALLS=${TOOL_CALLS}
- DISABLE_THINKING=${DISABLE_THINKING}
- WEB_GROUNDING=${WEB_GROUNDING}
- DATABASE_URL=${DATABASE_URL}
- DATABASE_URL=postgresql://postgres:postgres@postgres:5432/presenton
- DISABLE_ANONYMOUS_TRACKING=${DISABLE_ANONYMOUS_TRACKING}
depends_on:
- postgres

postgres:
image: pgvector/pgvector:pg15
ports:
- "5431:5432"
volumes:
- postgres_data:/var/lib/postgresql/data
environment:
- POSTGRES_DB=presenton
- POSTGRES_USER=postgres
- POSTGRES_PASSWORD=postgres

volumes:
postgres_data:
driver: local
Binary file not shown.
75 changes: 75 additions & 0 deletions servers/fastapi/api/v1/ppt/endpoints/documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from typing import Annotated, List
from fastapi import APIRouter, Depends, UploadFile, HTTPException, Form
from sqlalchemy.ext.asyncio import AsyncSession
from dependencies.auth import get_current_user_id
from services.database import get_async_session
from services.docling_service import DoclingService
from services.score_based_chunker import ScoreBasedChunker
from services.llm_client import LLMClient
from services import TEMP_FILE_SERVICE
from models.sql.document_chunk import DocumentChunk
from utils.randomizers import get_random_uuid
from sqlmodel import select

DOCUMENTS_ROUTER = APIRouter(prefix="/documents", tags=["Documents"])


@DOCUMENTS_ROUTER.get("", response_model=List[str])
async def list_documents(
    sql_session: Annotated[AsyncSession, Depends(get_async_session)],
    user_id: Annotated[str, Depends(get_current_user_id)],
):
    """Return the distinct document IDs owned by the current user."""
    # Scope to the caller's tenant; DISTINCT collapses the per-chunk rows
    # down to one entry per uploaded document.
    doc_id_query = (
        select(DocumentChunk.doc_id)
        .distinct()
        .where(DocumentChunk.tenant_id == user_id)
    )
    query_result = await sql_session.execute(doc_id_query)
    return query_result.scalars().all()


@DOCUMENTS_ROUTER.post("/upload")
async def upload_document(
    files: List[UploadFile],
    tags: Annotated[str, Form()],
    sql_session: Annotated[AsyncSession, Depends(get_async_session)],
    user_id: Annotated[str, Depends(get_current_user_id)],
):
    """Parse uploaded files to markdown, chunk them, embed each chunk, and
    persist the chunks (scoped to the current user) for later retrieval.

    Args:
        files: Uploaded documents to ingest.
        tags: Comma-separated tag string applied to every stored chunk.

    Returns:
        A summary message with the number of files processed.
    """
    temp_dir = TEMP_FILE_SERVICE.create_temp_dir()
    docling_service = DoclingService()
    chunker = ScoreBasedChunker()
    # One client reused for every file: it is loop-invariant, so there is no
    # reason to construct a fresh LLMClient per uploaded file.
    llm_client = LLMClient()
    # Drop empty entries so inputs like "a,,b" or a trailing comma do not
    # attach "" tags to the stored chunks.
    tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()]

    for file in files:
        doc_uuid = get_random_uuid()
        file_path = TEMP_FILE_SERVICE.save_file(
            temp_dir, file.file, file.filename or get_random_uuid()
        )
        markdown_content = docling_service.parse_to_markdown(file_path)

        # Score-based chunking over at most 10 heading-delimited sections.
        temporary_chunks = await chunker.get_n_chunks(markdown_content, 10)
        chunk_contents = [chunk.content for chunk in temporary_chunks]

        # Nothing chunkable (e.g. an empty document) — skip rather than
        # request embeddings for an empty batch.
        if not chunk_contents:
            continue

        embeddings = await llm_client.generate_embeddings(chunk_contents)

        db_chunks = [
            DocumentChunk(
                content=chunk_content,
                tenant_id=user_id,
                doc_id=doc_uuid,
                tags=tag_list,
                embedding=embedding,
            )
            for chunk_content, embedding in zip(chunk_contents, embeddings)
        ]
        sql_session.add_all(db_chunks)
        # Commit per file so one failing document does not roll back the
        # chunks already persisted for earlier files.
        await sql_session.commit()

        print(f"Processed and saved {len(db_chunks)} chunks for document {doc_uuid} with tags {tag_list} for user {user_id}")

    # NOTE(review): temp_dir is never removed here — confirm whether
    # TEMP_FILE_SERVICE cleans up its temp dirs, or add explicit cleanup.
    return {"message": f"{len(files)} documents processed and queued for embedding."}


5 changes: 4 additions & 1 deletion servers/fastapi/api/v1/ppt/endpoints/outlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,12 @@ async def inner():

presentation_outlines_text += chunk

print(f"LLM Raw Output: {presentation_outlines_text}") # Added print statement

try:
presentation_outlines_json = json.loads(presentation_outlines_text)
except Exception as e:
print(f"JSON parsing error: {e}") # Added print statement
raise HTTPException(
status_code=400,
detail="Failed to generate presentation outlines. Please try again.",
Expand All @@ -91,7 +94,7 @@ async def inner():
.content[:50]
.replace("#", "")
.replace("/", "")
.replace("\\", "")
.replace("\\\\", "")
.replace("\n", "")
)

Expand Down
Loading