
Commit cacccff

Merge pull request #27 from run-llama/clelia/windows-patch
fix: Windows-specific issues related to across-threads IO operations
2 parents 48d6992 + 22d0cd0 commit cacccff
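The pitfall behind these fixes: on Windows, a file created with tempfile.NamedTemporaryFile stays exclusively locked while its handle is open, so the Unix habit of handing fl.name to another reader, or calling os.remove on it, while the handle is still live fails with PermissionError. A minimal sketch of the pattern the patch adopts throughout (function and variable names here are illustrative, standard library only):

import os
import tempfile

def stage_bytes(data: bytes) -> str:
    # delete=False lets the file outlive its handle; leaving the `with`
    # block closes the handle and releases the Windows lock.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as fl:
        fl.write(data)
        fl.flush()  # push the bytes to disk before handing the path off
        path = fl.name
    return path  # callers may now open the path freely, then os.remove(path)

Cleanup still has to tolerate OSError, since scanners and indexers on Windows can briefly hold a file even after it is closed; the run_workflow diff below retries the removal once and then gives up.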

File tree: 6 files changed, +88 −64 lines

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "notebookllama"
-version = "0.2.3"
+version = "0.3.0"
 description = "An OSS and LlamaCloud-backed alternative to NotebookLM"
 readme = "README.md"
 requires-python = ">=3.13"

src/notebookllama/Home.py

Lines changed: 57 additions & 21 deletions
@@ -4,6 +4,7 @@
 import asyncio
 import tempfile as temp
 from dotenv import load_dotenv
+import sys
 import time
 import streamlit.components.v1 as components
 
@@ -20,7 +21,7 @@
 load_dotenv()
 
 # define a custom span exporter
-span_exporter = OTLPSpanExporter("http://0.0.0.0:4318/v1/traces")
+span_exporter = OTLPSpanExporter("http://localhost:4318/v1/traces")
 
 # initialize the instrumentation object
 instrumentor = LlamaIndexOpenTelemetry(
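A note on the exporter change above: 0.0.0.0 is a wildcard address for binding servers, not a destination for clients. Linux happens to route client connections to 0.0.0.0 back to the local host, so the old URL worked there by accident; Windows rejects such connections outright (typically WSAEADDRNOTAVAIL), so the exporter must dial localhost. A small sketch of the asymmetry, standard library only:

import socket

# A server binds the wildcard address to listen on every interface;
# port 0 asks the OS for any free port.
srv = socket.create_server(("0.0.0.0", 0))
port = srv.getsockname()[1]

# A client must dial a concrete host; this works on every platform.
conn = socket.create_connection(("localhost", port))
conn.close()
srv.close()

# create_connection(("0.0.0.0", port)) happens to succeed on Linux
# but fails on Windows, which is why the OTLP endpoint changed.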
@@ -44,29 +45,64 @@ def read_html_file(file_path: str) -> str:
 
 
 async def run_workflow(file: io.BytesIO) -> Tuple[str, str, str, str, str]:
-    fl = temp.NamedTemporaryFile(suffix=".pdf", delete=False, delete_on_close=False)
-    content = file.getvalue()
-    with open(fl.name, "wb") as f:
-        f.write(content)
-    st_time = int(time.time() * 1000000)
-    ev = FileInputEvent(file=fl.name)
-    result: NotebookOutputEvent = await WF.run(start_event=ev)
-    q_and_a = ""
-    for q, a in zip(result.questions, result.answers):
-        q_and_a += f"**{q}**\n\n{a}\n\n"
-    bullet_points = "## Bullet Points\n\n- " + "\n- ".join(result.highlights)
-    os.remove(fl.name)
-    mind_map = result.mind_map
-    if Path(mind_map).is_file():
-        mind_map = read_html_file(mind_map)
-        os.remove(result.mind_map)
-    end_time = int(time.time() * 1000000)
-    sql_engine.to_sql_database(start_time=st_time, end_time=end_time)
-    return result.md_content, result.summary, q_and_a, bullet_points, mind_map
+    # Create temp file with proper Windows handling
+    with temp.NamedTemporaryFile(suffix=".pdf", delete=False) as fl:
+        content = file.getvalue()
+        fl.write(content)
+        fl.flush()  # Ensure data is written
+        temp_path = fl.name
+
+    try:
+        st_time = int(time.time() * 1000000)
+        ev = FileInputEvent(file=temp_path)
+        result: NotebookOutputEvent = await WF.run(start_event=ev)
+
+        q_and_a = ""
+        for q, a in zip(result.questions, result.answers):
+            q_and_a += f"**{q}**\n\n{a}\n\n"
+        bullet_points = "## Bullet Points\n\n- " + "\n- ".join(result.highlights)
+
+        mind_map = result.mind_map
+        if Path(mind_map).is_file():
+            mind_map = read_html_file(mind_map)
+            try:
+                os.remove(result.mind_map)
+            except OSError:
+                pass  # File might be locked on Windows
+
+        end_time = int(time.time() * 1000000)
+        sql_engine.to_sql_database(start_time=st_time, end_time=end_time)
+        return result.md_content, result.summary, q_and_a, bullet_points, mind_map
+
+    finally:
+        try:
+            os.remove(temp_path)
+        except OSError:
+            await asyncio.sleep(0.1)
+            try:
+                os.remove(temp_path)
+            except OSError:
+                pass  # Give up if still locked
 
 
 def sync_run_workflow(file: io.BytesIO):
-    return asyncio.run(run_workflow(file=file))
+    try:
+        # Try to use existing event loop
+        loop = asyncio.get_event_loop()
+        if loop.is_running():
+            # If loop is already running, schedule the coroutine
+            import concurrent.futures
+
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(asyncio.run, run_workflow(file))
+                return future.result()
+        else:
+            return loop.run_until_complete(run_workflow(file))
+    except RuntimeError:
+        # No event loop exists, create one
+        if sys.platform == "win32":
+            asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+        return asyncio.run(run_workflow(file))
 
 
 async def create_podcast(file_content: str):
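sync_run_workflow now has to work whether or not the calling thread already has an event loop: Streamlit may leave one running in the script thread, and asyncio.run() raises RuntimeError when called inside a running loop, so in that case the coroutine is handed to a fresh loop on a worker thread. The explicit WindowsProactorEventLoopPolicy is defensive; the proactor loop has been the Windows default since Python 3.8, but it is the variant that supports subprocess and pipe IO, so forcing it guards against a selector loop having been installed elsewhere. A self-contained sketch of the same dispatch, with work() standing in for run_workflow:

import asyncio
import concurrent.futures
import sys

async def work() -> str:
    await asyncio.sleep(0)  # stand-in for the real workflow
    return "done"

def run_sync() -> str:
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No loop in this thread yet: let asyncio.run() create one.
        if sys.platform == "win32":
            asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
        return asyncio.run(work())
    if loop.is_running():
        # asyncio.run() would raise here, so run the coroutine on a
        # fresh loop owned by a worker thread and block on the result.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, work()).result()
    return loop.run_until_complete(work())

print(run_sync())  # -> done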

src/notebookllama/instrumentation.py

Lines changed: 2 additions & 25 deletions
@@ -1,9 +1,6 @@
 import requests
 import time
-import csv
 import pandas as pd
-import tempfile as temp
-import os
 
 from sqlalchemy import Engine, create_engine, Connection, Result
 from typing import Optional, Dict, Any, List, Literal, Union, cast
@@ -50,6 +47,7 @@ def _export(
 
     def _to_pandas(self, data: Dict[str, Any]) -> pd.DataFrame:
         rows: List[Dict[str, Any]] = []
+
         # Loop over each trace
         for trace in data.get("data", []):
             trace_id = trace.get("traceID")
@@ -90,28 +88,7 @@ def _to_pandas(self, data: Dict[str, Any]) -> pd.DataFrame:
                 }
             )
 
-        # Define the CSV header
-        fieldnames = [
-            "trace_id",
-            "span_id",
-            "parent_span_id",
-            "operation_name",
-            "start_time",
-            "duration",
-            "status_code",
-            "service_name",
-        ]
-
-        fl = temp.NamedTemporaryFile(suffix=".csv", delete=False, delete_on_close=False)
-        # Write to CSV
-        with open(fl.name, "w", newline="") as csvfile:
-            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-            writer.writeheader()
-            writer.writerows(rows)
-
-        df = pd.read_csv(fl)
-        os.remove(fl.name)
-        return df
+        return pd.DataFrame(rows)
 
     def _to_sql(
         self,
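The _to_pandas rewrite is the same Windows problem in miniature: the old code wrote the collected span rows to a temporary CSV only to read them straight back with pd.read_csv, leaving a locked file to clean up. A list of flat dicts is already the shape pandas accepts, so a single constructor call replaces the whole round-trip. For illustration, with made-up span values (the column names are the ones the deleted fieldnames list declared):

import pandas as pd

rows = [
    {"trace_id": "t1", "span_id": "s1", "parent_span_id": None,
     "operation_name": "parse_file", "start_time": 1721000000,
     "duration": 5300, "status_code": "OK", "service_name": "notebookllama"},
    {"trace_id": "t1", "span_id": "s2", "parent_span_id": "s1",
     "operation_name": "extract_tables", "start_time": 1721000001,
     "duration": 1200, "status_code": "OK", "service_name": "notebookllama"},
]

df = pd.DataFrame(rows)  # no CSV, no temp file, no os.remove
print(df[["operation_name", "duration"]])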

src/notebookllama/utils.py

Lines changed: 17 additions & 8 deletions
@@ -4,10 +4,10 @@
 import os
 import uuid
 import warnings
-import tempfile as tmp
 from datetime import datetime
 
 from mrkdwn_analysis import MarkdownAnalyzer
+from mrkdwn_analysis.markdown_analyzer import InlineParser, MarkdownParser
 from pydantic import BaseModel, Field, model_validator
 from llama_index.core.llms import ChatMessage
 from llama_cloud_services import LlamaExtract, LlamaParse
@@ -17,13 +17,28 @@
 from llama_index.core.base.response.schema import Response
 from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
 from llama_index.llms.openai import OpenAIResponses
+from typing_extensions import override
 from typing import List, Tuple, Union, Optional, Dict, cast
 from typing_extensions import Self
 from pyvis.network import Network
 
 load_dotenv()
 
 
+class MarkdownTextAnalyzer(MarkdownAnalyzer):
+    @override
+    def __init__(self, text: str):
+        self.text = text
+        parser = MarkdownParser(self.text)
+        self.tokens = parser.parse()
+        self.references = parser.references
+        self.footnotes = parser.footnotes
+        self.inline_parser = InlineParser(
+            references=self.references, footnotes=self.footnotes
+        )
+        self._parse_inline_tokens()
+
+
 class Node(BaseModel):
     id: str
     content: str
@@ -187,12 +202,7 @@ async def parse_file(
     images = rename_and_remove_current_images(imgs)
     if with_tables:
         if text is not None:
-            tmp_file = tmp.NamedTemporaryFile(
-                suffix=".md", delete=False, delete_on_close=False
-            )
-            with open(tmp_file.name, "w") as f:
-                f.write(text)
-            analyzer = MarkdownAnalyzer(tmp_file.name)
+            analyzer = MarkdownTextAnalyzer(text)
             md_tables = analyzer.identify_tables()["Table"]
             tables = []
             for md_table in md_tables:
@@ -204,7 +214,6 @@ async def parse_file(
                     f"data/extracted_tables/table_{datetime.now().strftime('%Y_%d_%m_%H_%M_%S_%f')[:-3]}.csv",
                     index=False,
                 )
-            os.remove(tmp_file.name)
     return text, images, tables
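With the subclass in place, table extraction never touches the filesystem: MarkdownTextAnalyzer drives mrkdwn_analysis's MarkdownParser and InlineParser over an in-memory string, where the stock MarkdownAnalyzer expects a file path. A usage sketch; the table content is made up, the import path is inferred from the tests below, and identify_tables()["Table"] is the same access pattern parse_file uses:

from src.notebookllama.utils import MarkdownTextAnalyzer

md = """| name | stars |
| ---- | ----- |
| notebookllama | 1000 |
"""

analyzer = MarkdownTextAnalyzer(md)           # no .md temp file needed
tables = analyzer.identify_tables()["Table"]  # same call as in parse_file
print(len(tables))  # -> 1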

tests/test_utils.py

Lines changed: 10 additions & 8 deletions
@@ -3,7 +3,6 @@
 import pandas as pd
 from pathlib import Path
 from dotenv import load_dotenv
-from mrkdwn_analysis import MarkdownAnalyzer
 
 from typing import Callable
 from pydantic import ValidationError
@@ -13,6 +12,7 @@
     md_table_to_pd_dataframe,
     rename_and_remove_current_images,
     rename_and_remove_past_images,
+    MarkdownTextAnalyzer,
 )
 from src.notebookllama.models import Notebook
 
@@ -80,16 +80,16 @@ def dataframe_from_tables() -> pd.DataFrame:
 
 
 @pytest.fixture()
-def file_exists_fn() -> Callable[[os.PathLike[str]], bool]:
-    def file_exists(file_path: os.PathLike[str]) -> bool:
+def file_exists_fn() -> Callable[[str], bool]:
+    def file_exists(file_path: str) -> bool:
         return Path(file_path).exists()
 
     return file_exists
 
 
 @pytest.fixture()
-def is_not_empty_fn() -> Callable[[os.PathLike[str]], bool]:
-    def is_not_empty(file_path: os.PathLike[str]) -> bool:
+def is_not_empty_fn() -> Callable[[str], bool]:
+    def is_not_empty(file_path: str) -> bool:
         return Path(file_path).stat().st_size > 0
 
     return is_not_empty
@@ -131,8 +131,8 @@ def notebook_to_process() -> Notebook:
 @pytest.mark.asyncio
 async def test_mind_map_creation(
     notebook_to_process: Notebook,
-    file_exists_fn: Callable[[os.PathLike[str]], bool],
-    is_not_empty_fn: Callable[[os.PathLike[str]], bool],
+    file_exists_fn: Callable[[str], bool],
+    is_not_empty_fn: Callable[[str], bool],
 ):
     test_mindmap = await get_mind_map(
         summary=notebook_to_process.summary, highlights=notebook_to_process.highlights
@@ -163,7 +163,9 @@ async def test_file_processing(input_file: str) -> None:
 def test_table_to_dataframe(
     markdown_file: str, dataframe_from_tables: pd.DataFrame
 ) -> None:
-    analyzer = MarkdownAnalyzer(markdown_file)
+    with open(markdown_file, "r") as f:
+        text = f.read()
+    analyzer = MarkdownTextAnalyzer(text)
     md_tables = analyzer.identify_tables()["Table"]
     assert len(md_tables) == 2
     for md_table in md_tables:

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
