Merge pull request #27 from run-llama/clelia/windows-patch

fix: Windows-specific issues related to cross-thread I/O operations
This commit is contained in:
Clelia (Astra) Bertelli
2025-07-11 21:52:47 +02:00
committed by GitHub
6 changed files with 88 additions and 64 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
[project]
name = "notebookllama"
version = "0.2.3"
version = "0.3.0"
description = "An OSS and LlamaCloud-backed alternative to NotebookLM"
readme = "README.md"
requires-python = ">=3.13"
+57 -21
View File
@@ -4,6 +4,7 @@ import os
import asyncio
import tempfile as temp
from dotenv import load_dotenv
import sys
import time
import streamlit.components.v1 as components
@@ -20,7 +21,7 @@ from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
load_dotenv()
# define a custom span exporter
span_exporter = OTLPSpanExporter("http://0.0.0.0:4318/v1/traces")
span_exporter = OTLPSpanExporter("http://localhost:4318/v1/traces")
# initialize the instrumentation object
instrumentor = LlamaIndexOpenTelemetry(
@@ -44,29 +45,64 @@ def read_html_file(file_path: str) -> str:
async def _remove_file_with_retry(path: str, delay: float = 0.1) -> None:
    """Delete *path*, retrying once after *delay* seconds if the first try fails.

    Cleanup is best-effort: if the file still cannot be removed on the second
    attempt (for example because it is still locked on Windows), the error is
    swallowed and the file is left behind.
    """
    try:
        os.remove(path)
        return
    except OSError:
        # Yield to the event loop so a pending handle has a chance to close.
        await asyncio.sleep(delay)
    try:
        os.remove(path)
    except OSError:
        pass  # Give up if still locked


async def run_workflow(file: io.BytesIO) -> Tuple[str, str, str, str, str]:
    """Run the notebook workflow on an uploaded PDF and format its output.

    The uploaded bytes are written to a temporary ``.pdf`` file whose path is
    passed to the workflow. The temporary file is always removed afterwards,
    even when the workflow raises.

    Args:
        file: In-memory buffer holding the uploaded PDF.

    Returns:
        A tuple ``(md_content, summary, q_and_a, bullet_points, mind_map)``.
        ``mind_map`` is the HTML content when the workflow produced a file at
        that path, otherwise the raw value returned by the workflow.
    """
    # The file is closed (and therefore flushed) when the ``with`` block exits,
    # before the workflow reopens it by name; ``delete=False`` keeps it on disk.
    with temp.NamedTemporaryFile(suffix=".pdf", delete=False) as fl:
        fl.write(file.getvalue())
        temp_path = fl.name

    try:
        st_time = int(time.time() * 1_000_000)
        ev = FileInputEvent(file=temp_path)
        result: NotebookOutputEvent = await WF.run(start_event=ev)

        q_and_a = "".join(
            f"**{q}**\n\n{a}\n\n" for q, a in zip(result.questions, result.answers)
        )
        bullet_points = "## Bullet Points\n\n- " + "\n- ".join(result.highlights)

        mind_map = result.mind_map
        if Path(mind_map).is_file():
            try:
                mind_map = read_html_file(mind_map)
            finally:
                # Remove the generated HTML even if reading it failed.
                try:
                    os.remove(result.mind_map)
                except OSError:
                    pass  # File might be locked on Windows

        end_time = int(time.time() * 1_000_000)
        sql_engine.to_sql_database(start_time=st_time, end_time=end_time)
        return result.md_content, result.summary, q_and_a, bullet_points, mind_map
    finally:
        await _remove_file_with_retry(temp_path)
def sync_run_workflow(file: io.BytesIO):
    """Run ``run_workflow`` to completion from synchronous code.

    Args:
        file: In-memory buffer holding the uploaded PDF.

    Returns:
        Whatever ``run_workflow`` returns.

    Exceptions raised by the workflow propagate to the caller; the workflow is
    never executed a second time as a side effect of error handling.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run() is safe.
        # Only the loop lookup sits inside the ``try`` so that a RuntimeError
        # raised by the workflow itself is not mistaken for "no loop".
        if sys.platform == "win32":
            asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
        return asyncio.run(run_workflow(file))

    # A loop is already running in this thread and asyncio.run() cannot be
    # nested inside it, so execute the coroutine on a worker thread that owns
    # its own event loop and block until it finishes.
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(lambda: asyncio.run(run_workflow(file)))
        return future.result()
async def create_podcast(file_content: str):
+2 -25
View File
@@ -1,9 +1,6 @@
import requests
import time
import csv
import pandas as pd
import tempfile as temp
import os
from sqlalchemy import Engine, create_engine, Connection, Result
from typing import Optional, Dict, Any, List, Literal, Union, cast
@@ -50,6 +47,7 @@ class OtelTracesSqlEngine:
def _to_pandas(self, data: Dict[str, Any]) -> pd.DataFrame:
rows: List[Dict[str, Any]] = []
# Loop over each trace
for trace in data.get("data", []):
trace_id = trace.get("traceID")
@@ -90,28 +88,7 @@ class OtelTracesSqlEngine:
}
)
# Define the CSV header
fieldnames = [
"trace_id",
"span_id",
"parent_span_id",
"operation_name",
"start_time",
"duration",
"status_code",
"service_name",
]
fl = temp.NamedTemporaryFile(suffix=".csv", delete=False, delete_on_close=False)
# Write to CSV
with open(fl.name, "w", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
df = pd.read_csv(fl)
os.remove(fl.name)
return df
return pd.DataFrame(rows)
def _to_sql(
self,
+17 -8
View File
@@ -4,10 +4,10 @@ import json
import os
import uuid
import warnings
import tempfile as tmp
from datetime import datetime
from mrkdwn_analysis import MarkdownAnalyzer
from mrkdwn_analysis.markdown_analyzer import InlineParser, MarkdownParser
from pydantic import BaseModel, Field, model_validator
from llama_index.core.llms import ChatMessage
from llama_cloud_services import LlamaExtract, LlamaParse
@@ -17,6 +17,7 @@ from llama_index.core.query_engine import CitationQueryEngine
from llama_index.core.base.response.schema import Response
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
from llama_index.llms.openai import OpenAIResponses
from typing_extensions import override
from typing import List, Tuple, Union, Optional, Dict, cast
from typing_extensions import Self
from pyvis.network import Network
@@ -24,6 +25,20 @@ from pyvis.network import Network
load_dotenv()
class MarkdownTextAnalyzer(MarkdownAnalyzer):
    """``MarkdownAnalyzer`` that is initialised from a markdown string.

    The constructor does not call the parent constructor; it sets the
    ``text``, ``tokens``, ``references``, ``footnotes`` and ``inline_parser``
    attributes itself and then parses the inline tokens.
    """

    @override
    def __init__(self, text: str):
        """Parse *text* and populate the analyzer's state.

        Args:
            text: Markdown content to analyse.
        """
        self.text = text

        md_parser = MarkdownParser(text)
        self.tokens = md_parser.parse()
        self.references = md_parser.references
        self.footnotes = md_parser.footnotes

        inline = InlineParser(references=self.references, footnotes=self.footnotes)
        self.inline_parser = inline
        self._parse_inline_tokens()
class Node(BaseModel):
id: str
content: str
@@ -187,12 +202,7 @@ async def parse_file(
images = rename_and_remove_current_images(imgs)
if with_tables:
if text is not None:
tmp_file = tmp.NamedTemporaryFile(
suffix=".md", delete=False, delete_on_close=False
)
with open(tmp_file.name, "w") as f:
f.write(text)
analyzer = MarkdownAnalyzer(tmp_file.name)
analyzer = MarkdownTextAnalyzer(text)
md_tables = analyzer.identify_tables()["Table"]
tables = []
for md_table in md_tables:
@@ -204,7 +214,6 @@ async def parse_file(
f"data/extracted_tables/table_{datetime.now().strftime('%Y_%d_%m_%H_%M_%S_%f')[:-3]}.csv",
index=False,
)
os.remove(tmp_file.name)
return text, images, tables
+10 -8
View File
@@ -3,7 +3,6 @@ import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from mrkdwn_analysis import MarkdownAnalyzer
from typing import Callable
from pydantic import ValidationError
@@ -13,6 +12,7 @@ from src.notebookllama.utils import (
md_table_to_pd_dataframe,
rename_and_remove_current_images,
rename_and_remove_past_images,
MarkdownTextAnalyzer,
)
from src.notebookllama.models import Notebook
@@ -80,16 +80,16 @@ def dataframe_from_tables() -> pd.DataFrame:
@pytest.fixture()
def file_exists_fn() -> Callable[[str], bool]:
    """Provide a predicate telling whether a path exists on disk."""

    def _path_exists(file_path: str) -> bool:
        target = Path(file_path)
        return target.exists()

    return _path_exists
@pytest.fixture()
def is_not_empty_fn() -> Callable[[str], bool]:
    """Provide a predicate telling whether the file at a path has a non-zero size."""

    def _has_content(file_path: str) -> bool:
        size_in_bytes = Path(file_path).stat().st_size
        return size_in_bytes > 0

    return _has_content
@@ -131,8 +131,8 @@ def notebook_to_process() -> Notebook:
@pytest.mark.asyncio
async def test_mind_map_creation(
notebook_to_process: Notebook,
file_exists_fn: Callable[[os.PathLike[str]], bool],
is_not_empty_fn: Callable[[os.PathLike[str]], bool],
file_exists_fn: Callable[[str], bool],
is_not_empty_fn: Callable[[str], bool],
):
test_mindmap = await get_mind_map(
summary=notebook_to_process.summary, highlights=notebook_to_process.highlights
@@ -163,7 +163,9 @@ async def test_file_processing(input_file: str) -> None:
def test_table_to_dataframe(
markdown_file: str, dataframe_from_tables: pd.DataFrame
) -> None:
analyzer = MarkdownAnalyzer(markdown_file)
with open(markdown_file, "r") as f:
text = f.read()
analyzer = MarkdownTextAnalyzer(text)
md_tables = analyzer.identify_tables()["Table"]
assert len(md_tables) == 2
for md_table in md_tables:
Generated
+1 -1
View File
@@ -1744,7 +1744,7 @@ wheels = [
[[package]]
name = "notebookllama"
version = "0.2.3"
version = "0.3.0"
source = { virtual = "." }
dependencies = [
{ name = "audioop-lts" },