Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ nltk
chroma
container.db
.next-build
.cursor
.cursor
venv/
44 changes: 40 additions & 4 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,43 @@ RUN apt-get update && apt-get install -y \
curl \
libreoffice \
fontconfig \
imagemagick
imagemagick \
ca-certificates \
fonts-liberation \
libasound2 \
libatk-bridge2.0-0 \
libatk1.0-0 \
libc6 \
libcairo2 \
libcups2 \
libdbus-1-3 \
libexpat1 \
libfontconfig1 \
libgbm1 \
libgcc1 \
libglib2.0-0 \
libgtk-3-0 \
libnspr4 \
libnss3 \
libpango-1.0-0 \
libpangocairo-1.0-0 \
libstdc++6 \
libx11-6 \
libx11-xcb1 \
libxcb1 \
libxcomposite1 \
libxcursor1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxi6 \
libxrandr2 \
libxrender1 \
libxss1 \
libxtst6 \
lsb-release \
wget \
xdg-utils

RUN sed -i 's/rights="none" pattern="PDF"/rights="read|write" pattern="PDF"/' /etc/ImageMagick-6/policy.xml

Expand All @@ -31,10 +67,10 @@ RUN curl -fsSL http://ollama.com/install.sh | sh

# Install dependencies for FastAPI
RUN pip install aiohttp aiomysql aiosqlite asyncpg fastapi[standard] \
pathvalidate pdfplumber chromadb sqlmodel \
pathvalidate pdfplumber chromadb sqlmodel pgvector \
anthropic google-genai openai fastmcp \
python-jose[cryptography] passlib
RUN pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
python-jose[cryptography] passlib numpy onnxruntime transformers
RUN pip install docling --find-links https://download.pytorch.org/whl/cpu

# Install dependencies for Next.js
WORKDIR /node_dependencies
Expand Down
23 changes: 21 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,10 @@ services:
- TOOL_CALLS=${TOOL_CALLS}
- DISABLE_THINKING=${DISABLE_THINKING}
- WEB_GROUNDING=${WEB_GROUNDING}
- DATABASE_URL=${DATABASE_URL}
- DATABASE_URL=postgresql://postgres:postgres@postgres:5432/presenton
- DISABLE_ANONYMOUS_TRACKING=${DISABLE_ANONYMOUS_TRACKING}
depends_on:
- postgres

development-gpu:
build:
Expand Down Expand Up @@ -136,5 +138,22 @@ services:
- TOOL_CALLS=${TOOL_CALLS}
- DISABLE_THINKING=${DISABLE_THINKING}
- WEB_GROUNDING=${WEB_GROUNDING}
- DATABASE_URL=${DATABASE_URL}
- DATABASE_URL=postgresql://postgres:postgres@postgres:5432/presenton
- DISABLE_ANONYMOUS_TRACKING=${DISABLE_ANONYMOUS_TRACKING}
depends_on:
- postgres

postgres:
image: pgvector/pgvector:pg15
ports:
- "5431:5432"
volumes:
- postgres_data:/var/lib/postgresql/data
environment:
- POSTGRES_DB=presenton
- POSTGRES_USER=postgres
- POSTGRES_PASSWORD=postgres

volumes:
postgres_data:
driver: local
Binary file not shown.
75 changes: 75 additions & 0 deletions servers/fastapi/api/v1/ppt/endpoints/documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from typing import Annotated, List
from fastapi import APIRouter, Depends, UploadFile, HTTPException, Form
from sqlalchemy.ext.asyncio import AsyncSession
from dependencies.auth import get_current_user_id
from services.database import get_async_session
from services.docling_service import DoclingService
from services.score_based_chunker import ScoreBasedChunker
from services.llm_client import LLMClient
from services import TEMP_FILE_SERVICE
from models.sql.document_chunk import DocumentChunk
from utils.randomizers import get_random_uuid
from sqlmodel import select

DOCUMENTS_ROUTER = APIRouter(prefix="/documents", tags=["Documents"])


@DOCUMENTS_ROUTER.get("", response_model=List[str])
async def list_documents(
    sql_session: Annotated[AsyncSession, Depends(get_async_session)],
    user_id: Annotated[str, Depends(get_current_user_id)],
):
    """Return the distinct document IDs owned by the current user."""
    # Scope to the caller's tenant; DISTINCT collapses the per-chunk rows
    # down to one entry per uploaded document.
    doc_id_query = (
        select(DocumentChunk.doc_id)
        .distinct()
        .where(DocumentChunk.tenant_id == user_id)
    )
    query_result = await sql_session.execute(doc_id_query)
    return query_result.scalars().all()


@DOCUMENTS_ROUTER.post("/upload")
async def upload_document(
    files: List[UploadFile],
    tags: Annotated[str, Form()],
    sql_session: Annotated[AsyncSession, Depends(get_async_session)],
    user_id: Annotated[str, Depends(get_current_user_id)],
):
    """Parse uploaded files to markdown, chunk them, embed each chunk, and
    persist the chunks (scoped to the current user) for later retrieval.

    Args:
        files: Uploaded documents to ingest.
        tags: Comma-separated tag string applied to every stored chunk.

    Returns:
        A summary message with the number of files processed.
    """
    temp_dir = TEMP_FILE_SERVICE.create_temp_dir()
    docling_service = DoclingService()
    chunker = ScoreBasedChunker()
    # One client reused for every file: it is loop-invariant, so there is no
    # reason to construct a fresh LLMClient per uploaded file.
    llm_client = LLMClient()
    # Drop empty entries so inputs like "a,,b" or a trailing comma do not
    # attach "" tags to the stored chunks.
    tag_list = [tag.strip() for tag in tags.split(",") if tag.strip()]

    for file in files:
        doc_uuid = get_random_uuid()
        file_path = TEMP_FILE_SERVICE.save_file(
            temp_dir, file.file, file.filename or get_random_uuid()
        )
        markdown_content = docling_service.parse_to_markdown(file_path)

        # Score-based chunking over at most 10 heading-delimited sections.
        temporary_chunks = await chunker.get_n_chunks(markdown_content, 10)
        chunk_contents = [chunk.content for chunk in temporary_chunks]

        # Nothing chunkable (e.g. an empty document) — skip rather than
        # request embeddings for an empty batch.
        if not chunk_contents:
            continue

        embeddings = await llm_client.generate_embeddings(chunk_contents)

        db_chunks = [
            DocumentChunk(
                content=chunk_content,
                tenant_id=user_id,
                doc_id=doc_uuid,
                tags=tag_list,
                embedding=embedding,
            )
            for chunk_content, embedding in zip(chunk_contents, embeddings)
        ]
        sql_session.add_all(db_chunks)
        # Commit per file so one failing document does not roll back the
        # chunks already persisted for earlier files.
        await sql_session.commit()

        print(f"Processed and saved {len(db_chunks)} chunks for document {doc_uuid} with tags {tag_list} for user {user_id}")

    # NOTE(review): temp_dir is never removed here — confirm whether
    # TEMP_FILE_SERVICE cleans up its temp dirs, or add explicit cleanup.
    return {"message": f"{len(files)} documents processed and queued for embedding."}


5 changes: 4 additions & 1 deletion servers/fastapi/api/v1/ppt/endpoints/outlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,12 @@ async def inner():

presentation_outlines_text += chunk

print(f"LLM Raw Output: {presentation_outlines_text}") # Added print statement

try:
presentation_outlines_json = json.loads(presentation_outlines_text)
except Exception as e:
print(f"JSON parsing error: {e}") # Added print statement
raise HTTPException(
status_code=400,
detail="Failed to generate presentation outlines. Please try again.",
Expand All @@ -91,7 +94,7 @@ async def inner():
.content[:50]
.replace("#", "")
.replace("/", "")
.replace("\\", "")
.replace("\\\\", "")
.replace("\n", "")
)

Expand Down
Loading