
Commit cacccff

Merge pull request #27 from run-llama/clelia/windows-patch
fix: Windows-specific issues related to across-threads IO operations
2 parents 48d6992 + 22d0cd0 commit cacccff
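The pitfall behind these fixes: on Windows, a file created with tempfile.NamedTemporaryFile stays exclusively locked while its handle is open, so the Unix habit of handing fl.name to another reader, or calling os.remove on it, while the handle is still live fails with PermissionError. A minimal sketch of the pattern the patch adopts throughout (function and variable names here are illustrative, standard library only):

import os
import tempfile

def stage_bytes(data: bytes) -> str:
    # delete=False lets the file outlive its handle; leaving the `with`
    # block closes the handle and releases the Windows lock.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as fl:
        fl.write(data)
        fl.flush()  # push the bytes to disk before handing the path off
        path = fl.name
    return path  # callers may now open the path freely, then os.remove(path)

Cleanup still has to tolerate OSError, since scanners and indexers on Windows can briefly hold a file even after it is closed; the run_workflow diff below retries the removal once and then gives up.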

File tree: 6 files changed, +88 −64 lines

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "notebookllama"
-version = "0.2.3"
+version = "0.3.0"
 description = "An OSS and LlamaCloud-backed alternative to NotebookLM"
 readme = "README.md"
 requires-python = ">=3.13"

src/notebookllama/Home.py

Lines changed: 57 additions & 21 deletions
@@ -4,6 +4,7 @@
 import asyncio
 import tempfile as temp
 from dotenv import load_dotenv
+import sys
 import time
 import streamlit.components.v1 as components
 
@@ -20,7 +21,7 @@
 load_dotenv()
 
 # define a custom span exporter
-span_exporter = OTLPSpanExporter("http://0.0.0.0:4318/v1/traces")
+span_exporter = OTLPSpanExporter("http://localhost:4318/v1/traces")
 
 # initialize the instrumentation object
 instrumentor = LlamaIndexOpenTelemetry(
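A note on the exporter change above: 0.0.0.0 is a wildcard address for binding servers, not a destination for clients. Linux happens to route client connections to 0.0.0.0 back to the local host, so the old URL worked there by accident; Windows rejects such connections outright (typically WSAEADDRNOTAVAIL), so the exporter must dial localhost. A small sketch of the asymmetry, standard library only:

import socket

# A server binds the wildcard address to listen on every interface;
# port 0 asks the OS for any free port.
srv = socket.create_server(("0.0.0.0", 0))
port = srv.getsockname()[1]

# A client must dial a concrete host; this works on every platform.
conn = socket.create_connection(("localhost", port))
conn.close()
srv.close()

# create_connection(("0.0.0.0", port)) happens to succeed on Linux
# but fails on Windows, which is why the OTLP endpoint changed.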
@@ -44,29 +45,64 @@ def read_html_file(file_path: str) -> str:
 
 
 async def run_workflow(file: io.BytesIO) -> Tuple[str, str, str, str, str]:
-    fl = temp.NamedTemporaryFile(suffix=".pdf", delete=False, delete_on_close=False)
-    content = file.getvalue()
-    with open(fl.name, "wb") as f:
-        f.write(content)
-    st_time = int(time.time() * 1000000)
-    ev = FileInputEvent(file=fl.name)
-    result: NotebookOutputEvent = await WF.run(start_event=ev)
-    q_and_a = ""
-    for q, a in zip(result.questions, result.answers):
-        q_and_a += f"**{q}**\n\n{a}\n\n"
-    bullet_points = "## Bullet Points\n\n- " + "\n- ".join(result.highlights)
-    os.remove(fl.name)
-    mind_map = result.mind_map
-    if Path(mind_map).is_file():
-        mind_map = read_html_file(mind_map)
-        os.remove(result.mind_map)
-    end_time = int(time.time() * 1000000)
-    sql_engine.to_sql_database(start_time=st_time, end_time=end_time)
-    return result.md_content, result.summary, q_and_a, bullet_points, mind_map
+    # Create temp file with proper Windows handling
+    with temp.NamedTemporaryFile(suffix=".pdf", delete=False) as fl:
+        content = file.getvalue()
+        fl.write(content)
+        fl.flush()  # Ensure data is written
+        temp_path = fl.name
+
+    try:
+        st_time = int(time.time() * 1000000)
+        ev = FileInputEvent(file=temp_path)
+        result: NotebookOutputEvent = await WF.run(start_event=ev)
+
+        q_and_a = ""
+        for q, a in zip(result.questions, result.answers):
+            q_and_a += f"**{q}**\n\n{a}\n\n"
+        bullet_points = "## Bullet Points\n\n- " + "\n- ".join(result.highlights)
+
+        mind_map = result.mind_map
+        if Path(mind_map).is_file():
+            mind_map = read_html_file(mind_map)
+            try:
+                os.remove(result.mind_map)
+            except OSError:
+                pass  # File might be locked on Windows
+
+        end_time = int(time.time() * 1000000)
+        sql_engine.to_sql_database(start_time=st_time, end_time=end_time)
+        return result.md_content, result.summary, q_and_a, bullet_points, mind_map
+
+    finally:
+        try:
+            os.remove(temp_path)
+        except OSError:
+            await asyncio.sleep(0.1)
+            try:
+                os.remove(temp_path)
+            except OSError:
+                pass  # Give up if still locked
 
 
 def sync_run_workflow(file: io.BytesIO):
-    return asyncio.run(run_workflow(file=file))
+    try:
+        # Try to use existing event loop
+        loop = asyncio.get_event_loop()
+        if loop.is_running():
+            # If loop is already running, schedule the coroutine
+            import concurrent.futures
+
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(asyncio.run, run_workflow(file))
+                return future.result()
+        else:
+            return loop.run_until_complete(run_workflow(file))
+    except RuntimeError:
+        # No event loop exists, create one
+        if sys.platform == "win32":
+            asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+        return asyncio.run(run_workflow(file))
 
 
 async def create_podcast(file_content: str):
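sync_run_workflow now has to work whether or not the calling thread already has an event loop: Streamlit may leave one running in the script thread, and asyncio.run() raises RuntimeError when called inside a running loop, so in that case the coroutine is handed to a fresh loop on a worker thread. The explicit WindowsProactorEventLoopPolicy is defensive; the proactor loop has been the Windows default since Python 3.8, but it is the variant that supports subprocess and pipe IO, so forcing it guards against a selector loop having been installed elsewhere. A self-contained sketch of the same dispatch, with work() standing in for run_workflow:

import asyncio
import concurrent.futures
import sys

async def work() -> str:
    await asyncio.sleep(0)  # stand-in for the real workflow
    return "done"

def run_sync() -> str:
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No loop in this thread yet: let asyncio.run() create one.
        if sys.platform == "win32":
            asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
        return asyncio.run(work())
    if loop.is_running():
        # asyncio.run() would raise here, so run the coroutine on a
        # fresh loop owned by a worker thread and block on the result.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, work()).result()
    return loop.run_until_complete(work())

print(run_sync())  # -> done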

src/notebookllama/instrumentation.py

Lines changed: 2 additions & 25 deletions
@@ -1,9 +1,6 @@
 import requests
 import time
-import csv
 import pandas as pd
-import tempfile as temp
-import os
 
 from sqlalchemy import Engine, create_engine, Connection, Result
 from typing import Optional, Dict, Any, List, Literal, Union, cast
@@ -50,6 +47,7 @@ def _export(
 
     def _to_pandas(self, data: Dict[str, Any]) -> pd.DataFrame:
         rows: List[Dict[str, Any]] = []
+
         # Loop over each trace
         for trace in data.get("data", []):
             trace_id = trace.get("traceID")
@@ -90,28 +88,7 @@ def _to_pandas(self, data: Dict[str, Any]) -> pd.DataFrame:
                 }
             )
 
-        # Define the CSV header
-        fieldnames = [
-            "trace_id",
-            "span_id",
-            "parent_span_id",
-            "operation_name",
-            "start_time",
-            "duration",
-            "status_code",
-            "service_name",
-        ]
-
-        fl = temp.NamedTemporaryFile(suffix=".csv", delete=False, delete_on_close=False)
-        # Write to CSV
-        with open(fl.name, "w", newline="") as csvfile:
-            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-            writer.writeheader()
-            writer.writerows(rows)
-
-        df = pd.read_csv(fl)
-        os.remove(fl.name)
-        return df
+        return pd.DataFrame(rows)
 
     def _to_sql(
         self,
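The _to_pandas rewrite is the same Windows problem in miniature: the old code wrote the collected span rows to a temporary CSV only to read them straight back with pd.read_csv, leaving a locked file to clean up. A list of flat dicts is already the shape pandas accepts, so a single constructor call replaces the whole round-trip. For illustration, with made-up span values (the column names are the ones the deleted fieldnames list declared):

import pandas as pd

rows = [
    {"trace_id": "t1", "span_id": "s1", "parent_span_id": None,
     "operation_name": "parse_file", "start_time": 1721000000,
     "duration": 5300, "status_code": "OK", "service_name": "notebookllama"},
    {"trace_id": "t1", "span_id": "s2", "parent_span_id": "s1",
     "operation_name": "extract_tables", "start_time": 1721000001,
     "duration": 1200, "status_code": "OK", "service_name": "notebookllama"},
]

df = pd.DataFrame(rows)  # no CSV, no temp file, no os.remove
print(df[["operation_name", "duration"]])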

src/notebookllama/utils.py

Lines changed: 17 additions & 8 deletions
@@ -4,10 +4,10 @@
 import os
 import uuid
 import warnings
-import tempfile as tmp
 from datetime import datetime
 
 from mrkdwn_analysis import MarkdownAnalyzer
+from mrkdwn_analysis.markdown_analyzer import InlineParser, MarkdownParser
 from pydantic import BaseModel, Field, model_validator
 from llama_index.core.llms import ChatMessage
 from llama_cloud_services import LlamaExtract, LlamaParse
@@ -17,13 +17,28 @@
 from llama_index.core.base.response.schema import Response
 from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
 from llama_index.llms.openai import OpenAIResponses
+from typing_extensions import override
 from typing import List, Tuple, Union, Optional, Dict, cast
 from typing_extensions import Self
 from pyvis.network import Network
 
 load_dotenv()
 
 
+class MarkdownTextAnalyzer(MarkdownAnalyzer):
+    @override
+    def __init__(self, text: str):
+        self.text = text
+        parser = MarkdownParser(self.text)
+        self.tokens = parser.parse()
+        self.references = parser.references
+        self.footnotes = parser.footnotes
+        self.inline_parser = InlineParser(
+            references=self.references, footnotes=self.footnotes
+        )
+        self._parse_inline_tokens()
+
+
 class Node(BaseModel):
     id: str
     content: str
@@ -187,12 +202,7 @@ async def parse_file(
     images = rename_and_remove_current_images(imgs)
     if with_tables:
         if text is not None:
-            tmp_file = tmp.NamedTemporaryFile(
-                suffix=".md", delete=False, delete_on_close=False
-            )
-            with open(tmp_file.name, "w") as f:
-                f.write(text)
-            analyzer = MarkdownAnalyzer(tmp_file.name)
+            analyzer = MarkdownTextAnalyzer(text)
             md_tables = analyzer.identify_tables()["Table"]
             tables = []
             for md_table in md_tables:
@@ -204,7 +214,6 @@ async def parse_file(
                     f"data/extracted_tables/table_{datetime.now().strftime('%Y_%d_%m_%H_%M_%S_%f')[:-3]}.csv",
                     index=False,
                 )
-            os.remove(tmp_file.name)
     return text, images, tables
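With the subclass in place, table extraction never touches the filesystem: MarkdownTextAnalyzer drives mrkdwn_analysis's MarkdownParser and InlineParser over an in-memory string, where the stock MarkdownAnalyzer expects a file path. A usage sketch; the table content is made up, the import path is inferred from the tests below, and identify_tables()["Table"] is the same access pattern parse_file uses:

from src.notebookllama.utils import MarkdownTextAnalyzer

md = """| name | stars |
| ---- | ----- |
| notebookllama | 1000 |
"""

analyzer = MarkdownTextAnalyzer(md)           # no .md temp file needed
tables = analyzer.identify_tables()["Table"]  # same call as in parse_file
print(len(tables))  # -> 1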

tests/test_utils.py

Lines changed: 10 additions & 8 deletions
@@ -3,7 +3,6 @@
 import pandas as pd
 from pathlib import Path
 from dotenv import load_dotenv
-from mrkdwn_analysis import MarkdownAnalyzer
 
 from typing import Callable
 from pydantic import ValidationError
@@ -13,6 +12,7 @@
     md_table_to_pd_dataframe,
     rename_and_remove_current_images,
     rename_and_remove_past_images,
+    MarkdownTextAnalyzer,
 )
 from src.notebookllama.models import Notebook
 
@@ -80,16 +80,16 @@ def dataframe_from_tables() -> pd.DataFrame:
 
 
 @pytest.fixture()
-def file_exists_fn() -> Callable[[os.PathLike[str]], bool]:
-    def file_exists(file_path: os.PathLike[str]) -> bool:
+def file_exists_fn() -> Callable[[str], bool]:
+    def file_exists(file_path: str) -> bool:
         return Path(file_path).exists()
 
     return file_exists
 
 
 @pytest.fixture()
-def is_not_empty_fn() -> Callable[[os.PathLike[str]], bool]:
-    def is_not_empty(file_path: os.PathLike[str]) -> bool:
+def is_not_empty_fn() -> Callable[[str], bool]:
+    def is_not_empty(file_path: str) -> bool:
         return Path(file_path).stat().st_size > 0
 
     return is_not_empty
@@ -131,8 +131,8 @@ def notebook_to_process() -> Notebook:
 @pytest.mark.asyncio
 async def test_mind_map_creation(
     notebook_to_process: Notebook,
-    file_exists_fn: Callable[[os.PathLike[str]], bool],
-    is_not_empty_fn: Callable[[os.PathLike[str]], bool],
+    file_exists_fn: Callable[[str], bool],
+    is_not_empty_fn: Callable[[str], bool],
 ):
     test_mindmap = await get_mind_map(
         summary=notebook_to_process.summary, highlights=notebook_to_process.highlights
@@ -163,7 +163,9 @@ async def test_file_processing(input_file: str) -> None:
 def test_table_to_dataframe(
     markdown_file: str, dataframe_from_tables: pd.DataFrame
 ) -> None:
-    analyzer = MarkdownAnalyzer(markdown_file)
+    with open(markdown_file, "r") as f:
+        text = f.read()
+    analyzer = MarkdownTextAnalyzer(text)
     md_tables = analyzer.identify_tables()["Table"]
     assert len(md_tables) == 2
     for md_table in md_tables:

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
