mirror of
https://github.com/run-llama/notebookllama.git
synced 2026-07-01 22:14:04 -04:00
Merge pull request #27 from run-llama/clelia/windows-patch
fix: Windows-specific issues related to cross-thread I/O operations
This commit is contained in:
+1
-1
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "notebookllama"
|
||||
version = "0.2.3"
|
||||
version = "0.3.0"
|
||||
description = "An OSS and LlamaCloud-backed alternative to NotebookLM"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.13"
|
||||
|
||||
+57
-21
@@ -4,6 +4,7 @@ import os
|
||||
import asyncio
|
||||
import tempfile as temp
|
||||
from dotenv import load_dotenv
|
||||
import sys
|
||||
import time
|
||||
import streamlit.components.v1 as components
|
||||
|
||||
@@ -20,7 +21,7 @@ from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
|
||||
load_dotenv()
|
||||
|
||||
# define a custom span exporter
|
||||
span_exporter = OTLPSpanExporter("http://0.0.0.0:4318/v1/traces")
|
||||
span_exporter = OTLPSpanExporter("http://localhost:4318/v1/traces")
|
||||
|
||||
# initialize the instrumentation object
|
||||
instrumentor = LlamaIndexOpenTelemetry(
|
||||
@@ -44,29 +45,64 @@ def read_html_file(file_path: str) -> str:
|
||||
|
||||
|
||||
async def run_workflow(file: io.BytesIO) -> Tuple[str, str, str, str, str]:
    """Run the notebook workflow on an uploaded PDF and collect its outputs.

    The upload is written to a temporary ``.pdf`` file whose handle is closed
    before the workflow receives the path, so the file is not held open while
    other code reads it (the situation that caused trouble on Windows).

    Args:
        file: In-memory buffer holding the PDF bytes.

    Returns:
        A tuple ``(md_content, summary, q_and_a, bullet_points, mind_map)``.
        ``mind_map`` is the HTML read from the file the workflow produced when
        that path exists; otherwise it is the raw value the workflow returned.
    """
    # Persist the upload, then leave the ``with`` so the handle is released.
    with temp.NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
        pdf_bytes = file.getvalue()
        pdf_file.write(pdf_bytes)
        pdf_file.flush()  # Ensure data is written
        temp_path = pdf_file.name

    try:
        # Timestamps are microseconds since the epoch.
        st_time = int(time.time() * 1000000)
        start_event = FileInputEvent(file=temp_path)
        result: NotebookOutputEvent = await WF.run(start_event=start_event)

        q_and_a = "".join(
            f"**{q}**\n\n{a}\n\n" for q, a in zip(result.questions, result.answers)
        )
        bullet_points = "## Bullet Points\n\n- " + "\n- ".join(result.highlights)

        mind_map = result.mind_map
        if Path(mind_map).is_file():
            mind_map = read_html_file(mind_map)
            try:
                os.remove(result.mind_map)
            except OSError:
                pass  # File might be locked on Windows

        end_time = int(time.time() * 1000000)
        sql_engine.to_sql_database(start_time=st_time, end_time=end_time)
        return result.md_content, result.summary, q_and_a, bullet_points, mind_map
    finally:
        # Best-effort cleanup: one retry after a short pause, then give up
        # silently if the file is still locked.
        for attempt in range(2):
            try:
                os.remove(temp_path)
                break
            except OSError:
                if attempt == 0:
                    await asyncio.sleep(0.1)
|
||||
|
||||
|
||||
def sync_run_workflow(file: io.BytesIO):
    """Run :func:`run_workflow` to completion from synchronous code.

    Works both when the calling thread already has a running event loop and
    when it has none.

    Args:
        file: In-memory buffer holding the PDF bytes, forwarded unchanged to
            ``run_workflow``.

    Returns:
        Whatever ``run_workflow`` returns.

    Raises:
        Any exception raised by ``run_workflow`` propagates to the caller.
    """
    # Probe for a running loop with a narrow ``try``. Wrapping the workflow
    # itself in ``except RuntimeError`` would misread a RuntimeError raised
    # *inside* the workflow as "no event loop" and run the workflow twice.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: start a fresh one.
        if sys.platform == "win32":
            asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
        return asyncio.run(run_workflow(file))

    # A loop is already running here, so ``asyncio.run`` cannot be called in
    # this thread. Run the coroutine on its own loop in a worker thread and
    # block until it finishes.
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(asyncio.run, run_workflow(file))
        return future.result()
|
||||
|
||||
|
||||
async def create_podcast(file_content: str):
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
import requests
|
||||
import time
|
||||
import csv
|
||||
import pandas as pd
|
||||
import tempfile as temp
|
||||
import os
|
||||
|
||||
from sqlalchemy import Engine, create_engine, Connection, Result
|
||||
from typing import Optional, Dict, Any, List, Literal, Union, cast
|
||||
@@ -50,6 +47,7 @@ class OtelTracesSqlEngine:
|
||||
|
||||
def _to_pandas(self, data: Dict[str, Any]) -> pd.DataFrame:
|
||||
rows: List[Dict[str, Any]] = []
|
||||
|
||||
# Loop over each trace
|
||||
for trace in data.get("data", []):
|
||||
trace_id = trace.get("traceID")
|
||||
@@ -90,28 +88,7 @@ class OtelTracesSqlEngine:
|
||||
}
|
||||
)
|
||||
|
||||
# Define the CSV header
|
||||
fieldnames = [
|
||||
"trace_id",
|
||||
"span_id",
|
||||
"parent_span_id",
|
||||
"operation_name",
|
||||
"start_time",
|
||||
"duration",
|
||||
"status_code",
|
||||
"service_name",
|
||||
]
|
||||
|
||||
fl = temp.NamedTemporaryFile(suffix=".csv", delete=False, delete_on_close=False)
|
||||
# Write to CSV
|
||||
with open(fl.name, "w", newline="") as csvfile:
|
||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
writer.writerows(rows)
|
||||
|
||||
df = pd.read_csv(fl)
|
||||
os.remove(fl.name)
|
||||
return df
|
||||
return pd.DataFrame(rows)
|
||||
|
||||
def _to_sql(
|
||||
self,
|
||||
|
||||
@@ -4,10 +4,10 @@ import json
|
||||
import os
|
||||
import uuid
|
||||
import warnings
|
||||
import tempfile as tmp
|
||||
from datetime import datetime
|
||||
|
||||
from mrkdwn_analysis import MarkdownAnalyzer
|
||||
from mrkdwn_analysis.markdown_analyzer import InlineParser, MarkdownParser
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
from llama_index.core.llms import ChatMessage
|
||||
from llama_cloud_services import LlamaExtract, LlamaParse
|
||||
@@ -17,6 +17,7 @@ from llama_index.core.query_engine import CitationQueryEngine
|
||||
from llama_index.core.base.response.schema import Response
|
||||
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
|
||||
from llama_index.llms.openai import OpenAIResponses
|
||||
from typing_extensions import override
|
||||
from typing import List, Tuple, Union, Optional, Dict, cast
|
||||
from typing_extensions import Self
|
||||
from pyvis.network import Network
|
||||
@@ -24,6 +25,20 @@ from pyvis.network import Network
|
||||
load_dotenv()
|
||||
|
||||
|
||||
class MarkdownTextAnalyzer(MarkdownAnalyzer):
    """``MarkdownAnalyzer`` variant constructed from a markdown string.

    The parent initializer is deliberately not called; this ``__init__`` sets
    the analyzer attributes itself from the given text.
    NOTE(review): presumably the parent initializer expects a file path
    (callers previously passed a temporary file name) -- confirm against the
    ``mrkdwn_analysis`` package.
    """

    @override
    def __init__(self, text: str):
        """Parse ``text`` and populate tokens, references and footnotes.

        Args:
            text: Markdown source to analyze.
        """
        self.text = text

        md_parser = MarkdownParser(self.text)
        self.tokens = md_parser.parse()
        self.references = md_parser.references
        self.footnotes = md_parser.footnotes

        self.inline_parser = InlineParser(
            references=self.references,
            footnotes=self.footnotes,
        )
        self._parse_inline_tokens()
|
||||
|
||||
|
||||
class Node(BaseModel):
|
||||
id: str
|
||||
content: str
|
||||
@@ -187,12 +202,7 @@ async def parse_file(
|
||||
images = rename_and_remove_current_images(imgs)
|
||||
if with_tables:
|
||||
if text is not None:
|
||||
tmp_file = tmp.NamedTemporaryFile(
|
||||
suffix=".md", delete=False, delete_on_close=False
|
||||
)
|
||||
with open(tmp_file.name, "w") as f:
|
||||
f.write(text)
|
||||
analyzer = MarkdownAnalyzer(tmp_file.name)
|
||||
analyzer = MarkdownTextAnalyzer(text)
|
||||
md_tables = analyzer.identify_tables()["Table"]
|
||||
tables = []
|
||||
for md_table in md_tables:
|
||||
@@ -204,7 +214,6 @@ async def parse_file(
|
||||
f"data/extracted_tables/table_{datetime.now().strftime('%Y_%d_%m_%H_%M_%S_%f')[:-3]}.csv",
|
||||
index=False,
|
||||
)
|
||||
os.remove(tmp_file.name)
|
||||
return text, images, tables
|
||||
|
||||
|
||||
|
||||
+10
-8
@@ -3,7 +3,6 @@ import os
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
from mrkdwn_analysis import MarkdownAnalyzer
|
||||
|
||||
from typing import Callable
|
||||
from pydantic import ValidationError
|
||||
@@ -13,6 +12,7 @@ from src.notebookllama.utils import (
|
||||
md_table_to_pd_dataframe,
|
||||
rename_and_remove_current_images,
|
||||
rename_and_remove_past_images,
|
||||
MarkdownTextAnalyzer,
|
||||
)
|
||||
from src.notebookllama.models import Notebook
|
||||
|
||||
@@ -80,16 +80,16 @@ def dataframe_from_tables() -> pd.DataFrame:
|
||||
|
||||
|
||||
@pytest.fixture()
def file_exists_fn() -> Callable[[str], bool]:
    """Provide a predicate telling whether a filesystem path exists."""

    def file_exists(file_path: str) -> bool:
        # True when ``file_path`` points at an existing filesystem entry.
        candidate = Path(file_path)
        return candidate.exists()

    return file_exists
|
||||
|
||||
|
||||
@pytest.fixture()
def is_not_empty_fn() -> Callable[[str], bool]:
    """Provide a predicate telling whether a file has a non-zero size."""

    def is_not_empty(file_path: str) -> bool:
        # ``stat`` raises if the path does not exist; that is left to propagate.
        size_in_bytes = Path(file_path).stat().st_size
        return size_in_bytes > 0

    return is_not_empty
|
||||
@@ -131,8 +131,8 @@ def notebook_to_process() -> Notebook:
|
||||
@pytest.mark.asyncio
|
||||
async def test_mind_map_creation(
|
||||
notebook_to_process: Notebook,
|
||||
file_exists_fn: Callable[[os.PathLike[str]], bool],
|
||||
is_not_empty_fn: Callable[[os.PathLike[str]], bool],
|
||||
file_exists_fn: Callable[[str], bool],
|
||||
is_not_empty_fn: Callable[[str], bool],
|
||||
):
|
||||
test_mindmap = await get_mind_map(
|
||||
summary=notebook_to_process.summary, highlights=notebook_to_process.highlights
|
||||
@@ -163,7 +163,9 @@ async def test_file_processing(input_file: str) -> None:
|
||||
def test_table_to_dataframe(
|
||||
markdown_file: str, dataframe_from_tables: pd.DataFrame
|
||||
) -> None:
|
||||
analyzer = MarkdownAnalyzer(markdown_file)
|
||||
with open(markdown_file, "r") as f:
|
||||
text = f.read()
|
||||
analyzer = MarkdownTextAnalyzer(text)
|
||||
md_tables = analyzer.identify_tables()["Table"]
|
||||
assert len(md_tables) == 2
|
||||
for md_table in md_tables:
|
||||
|
||||
Reference in New Issue
Block a user