feat: add a first (still not very good) implementation of table and image extraction

This commit is contained in:
Clelia (Astra) Bertelli
2025-07-06 18:53:33 +02:00
parent 08da1b195f
commit c8d751b262
7 changed files with 436 additions and 4 deletions
+3
View File
@@ -18,3 +18,6 @@ wheels/
# audio files
*.mp3
# auto-generated images
static/
+51
View File
@@ -0,0 +1,51 @@
# Project Status Report
This document provides an overview of our current project status and key metrics.
## Introduction
Our development team has been working on multiple initiatives this quarter. The following sections outline our progress and upcoming milestones.
## Team Performance Metrics
The table below shows our team's performance across different projects:
| Project Name | Status | Completion % | Assigned Developer | Due Date |
| ------------------ | ----------- | ------------ | ------------------ | ---------- |
| User Dashboard | In Progress | 75% | Alice Johnson | 2025-07-15 |
| API Integration | Completed | 100% | Bob Smith | 2025-06-30 |
| Mobile App | Planning | 25% | Carol Davis | 2025-08-20 |
| Database Migration | In Progress | 60% | David Wilson | 2025-07-10 |
| Security Audit | Not Started | 0% | Eve Brown | 2025-08-01 |
| Project Name | Status | Completion % | Assigned Developer | Due Date |
| ------------------ | ----------- | ------------ | ------------------ | ---------- |
| User Dashboard | In Progress | 75% | Alice Johnson | 2025-07-15 |
| API Integration | Completed | 100% | Bob Smith | 2025-06-30 |
| Mobile App | Planning | 25% | Carol Davis | 2025-08-20 |
| Database Migration | In Progress | 60% | David Wilson | 2025-07-10 |
| Security Audit | Not Started | 0% | Eve Brown | 2025-08-01 |
## Key Observations
Based on the data above, we can see that:
- The API Integration project was completed on schedule
- The User Dashboard is progressing well and should meet its deadline
- The Database Migration needs attention to stay on track
- We need to begin the Security Audit soon to meet the August deadline
## Next Steps
1. **Prioritize Database Migration** - Assign additional resources if needed
2. **Begin Security Audit planning** - Schedule kickoff meeting with Eve
3. **Continue monitoring** User Dashboard progress
4. **Celebrate** the successful API Integration completion
## Conclusion
Overall, our team is performing well with most projects on track. The key focus areas for the coming weeks are the database migration and security audit preparation.
---
_Last updated: July 6, 2025_
+1
View File
@@ -18,6 +18,7 @@ dependencies = [
"llama-index-observability-otel>=0.1.1",
"llama-index-tools-mcp>=0.2.5",
"llama-index-workflows>=1.0.1",
"markdown-analysis>=0.1.5",
"mypy>=1.16.1",
"opentelemetry-exporter-otlp-proto-http>=1.34.1",
"plotly>=6.2.0",
@@ -0,0 +1,216 @@
import sys
import os
import io
import tempfile as tmp
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
import asyncio
from utils import get_plots_and_tables
import streamlit as st
from PIL import Image
def get_plots_and_tables_sync(file: io.BytesIO):
    """Run the async ``get_plots_and_tables`` helper from synchronous code.

    The PDF bytes are written to a temporary ``.pdf`` file because the helper
    is called with a file path. That temporary file is removed once extraction
    has finished, whether it succeeded or raised.

    Args:
        file: In-memory buffer holding the PDF content.

    Returns:
        Whatever ``get_plots_and_tables`` returns for the temporary file; the
        page code in this module unpacks it as ``(image_paths, dataframes)``.
    """
    # delete=False keeps the file on disk after the handle is closed, so the
    # extraction helper can reopen it by name; it is removed in ``finally``.
    with tmp.NamedTemporaryFile(suffix=".pdf", delete=False) as fl:
        fl.write(file.getvalue())
        pdf_path = fl.name

    try:
        # Try to get existing event loop, if not create a new one
        try:
            loop = asyncio.get_event_loop()
            if loop.is_closed():
                raise RuntimeError("Event loop is closed")
        except RuntimeError:
            # No event loop exists or it's closed, create a new one
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        if loop.is_running():
            # A running loop cannot be re-entered with run_until_complete, so
            # the coroutine is driven by asyncio.run on a worker thread.
            import concurrent.futures

            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(
                    asyncio.run, get_plots_and_tables(file_path=pdf_path)
                )
                return future.result()

        # If loop is not running, we can use it directly
        return loop.run_until_complete(get_plots_and_tables(file_path=pdf_path))
    finally:
        # Best-effort cleanup: failing to delete the temporary PDF must not
        # mask the extraction result or the original exception.
        try:
            os.remove(pdf_path)
        except OSError:
            pass
def cleanup_temp_files(image_paths):
    """Delete each path in ``image_paths`` that currently exists on disk.

    Paths that do not exist are skipped silently. Any error raised while
    checking or removing a path is reported with ``st.warning`` and the loop
    moves on to the next path.
    """
    for path in image_paths:
        try:
            if not os.path.exists(path):
                continue
            os.remove(path)
        except Exception as err:
            st.warning(f"Could not remove {path}: {str(err)}")
def create_download_zip(dataframes, image_paths):
    """Pack all extracted tables and images into one in-memory ZIP archive.

    Each DataFrame is stored as ``table_<n>.csv`` (no index column). Each image
    path that exists on disk is stored as ``image_<n><ext>``, where ``<n>`` is
    its 1-based position in ``image_paths``; missing files are skipped.

    Returns:
        The bytes of the finished ZIP archive.
    """
    import zipfile

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        # Tables first, serialised to CSV text
        for number, frame in enumerate(dataframes, start=1):
            archive.writestr(f"table_{number}.csv", frame.to_csv(index=False))

        # Then the image files, keeping each one's original extension
        for number, path in enumerate(image_paths, start=1):
            if not os.path.exists(path):
                continue
            _, extension = os.path.splitext(path)
            archive.write(path, f"image_{number}{extension}")

    return buffer.getvalue()
# Direct Streamlit execution
# Images and Tables page: upload a PDF, then browse and download what was extracted.
st.set_page_config(page_title="NotebookLlaMa - Images and Tables", page_icon="📊")
st.sidebar.header("Images and Tables📊")
st.sidebar.info("To switch to the Home page, select it from above!🔺")
st.markdown("---")
st.markdown("## NotebookLlaMa - Images and Tables📊")
st.markdown("### Upload a PDF file to extract plots and tables")

# File uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Process the file
    with st.spinner("Processing PDF... This may take a moment."):
        try:
            pdf_bytes = uploaded_file.getvalue()

            # Streamlit re-runs this script on every widget interaction
            # (including each download click). Keep the extraction result in
            # session state, keyed by the upload, so the PDF is only processed
            # again when a different file is uploaded.
            upload_key = (uploaded_file.name, len(pdf_bytes), hash(pdf_bytes))
            cached = st.session_state.get("extraction_cache")
            if cached is None or cached[0] != upload_key:
                image_paths, dataframes = get_plots_and_tables_sync(
                    io.BytesIO(pdf_bytes)
                )
                # Either value may be None (e.g. no text was parsed); use
                # empty lists so the len()/iteration below cannot fail.
                cached = (upload_key, image_paths or [], dataframes or [])
                st.session_state["extraction_cache"] = cached
            _, image_paths, dataframes = cached

            # Display results summary
            st.success("✅ Processing complete!")
            st.info(f"Found {len(image_paths)} images and {len(dataframes)} tables")

            # Create tabs for better organization
            tab1, tab2 = st.tabs(["📊 Tables", "📈 Plots/Images"])

            with tab1:
                st.header("Extracted Tables")
                if dataframes:
                    for i, df in enumerate(dataframes):
                        st.subheader(f"Table {i + 1}")

                        # Display table with options
                        col1, col2 = st.columns([3, 1])
                        with col1:
                            st.dataframe(df, use_container_width=True)
                        with col2:
                            st.write("**Table Info:**")
                            st.write(f"Rows: {len(df)}")
                            st.write(f"Columns: {len(df.columns)}")

                            # Download button for CSV
                            csv = df.to_csv(index=False)
                            st.download_button(
                                label="Download CSV",
                                data=csv,
                                file_name=f"table_{i + 1}.csv",
                                mime="text/csv",
                                key=f"download_table_{i}",
                            )

                        # Show additional table statistics
                        with st.expander(f"Table {i + 1} Details"):
                            st.write("**Column Information:**")
                            st.write(df.dtypes)
                            st.write("**First few rows:**")
                            st.write(df.head())
                            if len(df) > 5:
                                st.write("**Summary statistics:**")
                                st.write(df.describe(include="all"))
                        st.divider()
                else:
                    st.warning("No tables found in the PDF")

            with tab2:
                st.header("Extracted Plots and Images")
                if image_paths:
                    for i, img_path in enumerate(image_paths):
                        st.subheader(f"Image {i + 1}")
                        try:
                            # Display image
                            if os.path.exists(img_path):
                                # Context manager so the underlying file handle
                                # is released as soon as the image is rendered.
                                with Image.open(img_path) as image:
                                    # Create columns for image and info
                                    col1, col2 = st.columns([3, 1])
                                    with col1:
                                        st.image(
                                            image,
                                            caption=f"Image {i + 1}",
                                            use_container_width=True,
                                        )
                                    with col2:
                                        st.write("**Image Info:**")
                                        st.write(f"Size: {image.size}")
                                        st.write(f"Format: {image.format}")
                                        st.write(f"Mode: {image.mode}")

                                        # Download button for image
                                        with open(img_path, "rb") as file:
                                            st.download_button(
                                                label="Download Image",
                                                data=file.read(),
                                                file_name=f"image_{i + 1}.{image.format.lower()}",
                                                mime=f"image/{image.format.lower()}",
                                                key=f"download_image_{i}",
                                            )
                                st.divider()
                            else:
                                st.error(f"Image file not found: {img_path}")
                        except Exception as e:
                            st.error(f"Error loading image {i + 1}: {str(e)}")
                else:
                    st.warning("No images found in the PDF")

            # Cleanup section
            st.header("📂 File Management")
            col1, col2 = st.columns(2)
            with col1:
                if st.button("🗑️ Clean up temporary files"):
                    cleanup_temp_files(image_paths)
                    st.success("Temporary files cleaned up!")
            with col2:
                # A download button nested under ``if st.button(...)`` vanishes
                # on the next rerun, because st.button is only True for the run
                # triggered by its own click. Offer the ZIP directly instead.
                st.download_button(
                    label="📦 Download All Data",
                    data=create_download_zip(dataframes, image_paths),
                    file_name="extracted_data.zip",
                    mime="application/zip",
                    key="download_all_zip",
                )
        except Exception as e:
            st.error(f"Error processing file: {str(e)}")
            st.exception(e)  # Show full traceback in development
+55 -3
View File
@@ -1,9 +1,12 @@
from dotenv import load_dotenv
import pandas as pd
import json
import os
import uuid
import warnings
import tempfile as tmp
from mrkdwn_analysis import MarkdownAnalyzer
from pydantic import BaseModel, Field, model_validator
from llama_index.core.llms import ChatMessage
from llama_cloud_services import LlamaExtract, LlamaParse
@@ -11,7 +14,7 @@ from llama_cloud_services.extract import SourceText
from llama_cloud.client import AsyncLlamaCloud
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
from llama_index.llms.openai import OpenAIResponses
from typing import List, Tuple, Union
from typing import List, Tuple, Union, Optional, Dict
from typing_extensions import Self
from pyvis.network import Network
@@ -94,6 +97,47 @@ if (
LLM_STRUCT = LLM.as_structured_llm(MindMap)
def md_table_to_pd_dataframe(md_table: Dict[str, list]) -> Optional[pd.DataFrame]:
    """Convert one parsed markdown table into a pandas DataFrame.

    Args:
        md_table: Mapping with a ``"header"`` list of column names and a
            ``"rows"`` list of rows, each row being a list of cell values.

    Returns:
        A DataFrame with one column per header entry, or ``None`` (after
        emitting a warning) when the table cannot be converted, e.g. a key is
        missing or a row has fewer cells than the header. Cells beyond the
        header width are ignored. Repeated header names are kept as separate
        columns by suffixing later occurrences with ``.1``, ``.2``, ...
        instead of letting them overwrite the earlier column.
    """
    try:
        header = md_table["header"]
        rows = md_table["rows"]
        columns: Dict[str, list] = {}
        for position, name in enumerate(header):
            # Indexing (rather than zip) so a short row raises IndexError and
            # the whole table is skipped, as before.
            cells = [row[position] for row in rows]
            unique_name, suffix = name, 0
            while unique_name in columns:
                suffix += 1
                unique_name = f"{name}.{suffix}"
            columns[unique_name] = cells
        return pd.DataFrame(columns)
    except Exception as e:
        # Deliberately best-effort: a malformed table is skipped, not fatal.
        warnings.warn(f"Skipping table as an error occurred: {e}")
        return None
async def parse_file(
    file_path: str, with_images: bool = False, with_tables: bool = False
) -> Union[Tuple[Optional[str], Optional[List[str]], Optional[List[pd.DataFrame]]]]:
    """Parse a document and optionally extract its images and tables.

    Args:
        file_path: Path of the document handed to ``PARSER.aparse``.
        with_images: When true, save all images of the parsed document under
            ``./static/``.
        with_tables: When true, convert the markdown tables found in the
            parsed text into DataFrames.

    Returns:
        A ``(text, images, tables)`` tuple:

        * ``text`` is the markdown of all parsed documents joined with
          ``---`` separators, or ``None`` when no markdown was produced.
        * ``images`` is the result of ``asave_all_images`` (annotated as a
          list of paths), or ``None`` when ``with_images`` is false.
        * ``tables`` is the list of successfully converted DataFrames, or
          ``None`` when ``with_tables`` is false or there is no text.
    """
    images: Optional[List[str]] = None
    text: Optional[str] = None
    tables: Optional[List[pd.DataFrame]] = None

    document = await PARSER.aparse(file_path=file_path)
    md_content = await document.aget_markdown_documents()
    if md_content:
        text = "\n\n---\n\n".join([doc.text for doc in md_content])

    if with_images:
        images = await document.asave_all_images("./static/")

    if with_tables and text is not None:
        # MarkdownAnalyzer is given a path, so the text goes through a
        # temporary file. delete=False lets it be reopened by name after the
        # handle is closed; it is removed explicitly once analysis is done.
        # NOTE(review): written as UTF-8 on the assumption that
        # MarkdownAnalyzer reads UTF-8 by default - confirm against the library.
        with tmp.NamedTemporaryFile(
            mode="w", suffix=".md", delete=False, encoding="utf-8"
        ) as tmp_file:
            tmp_file.write(text)
            md_path = tmp_file.name
        try:
            analyzer = MarkdownAnalyzer(md_path)
            md_tables = analyzer.identify_tables().get("Table", [])
        finally:
            # Best-effort cleanup so repeated parses do not pile up temp files.
            try:
                os.remove(md_path)
            except OSError:
                pass

        tables = []
        for md_table in md_tables:
            table = md_table_to_pd_dataframe(md_table=md_table)
            if table is not None:
                tables.append(table)

    return text, images, tables
async def process_file(
filename: str,
) -> Union[Tuple[str, None], Tuple[None, None], Tuple[str, str]]:
@@ -103,8 +147,7 @@ async def process_file(
await CLIENT.pipelines.add_files_to_pipeline_api(
pipeline_id=PIPELINE_ID, request=files
)
document = await PARSER.aparse(file_path=filename)
md_content = await document.aget_markdown_documents()
md_content, _, _ = await parse_file(file_path=filename)
if len(md_content) == 0:
return None, None
text = "\n\n---\n\n".join([md.text for md in md_content])
@@ -163,3 +206,12 @@ async def query_index(question: str) -> Union[str, None]:
+ "\n\n## Sources\n\n- "
+ "\n- ".join(sources)
)
async def get_plots_and_tables(
    file_path: str,
) -> Union[Tuple[Optional[List[str]], Optional[List[pd.DataFrame]]]]:
    """Return only the images and tables that ``parse_file`` extracts.

    Calls ``parse_file`` with both image and table extraction enabled and
    discards the text element of its result.
    """
    parsed = await parse_file(
        file_path=file_path, with_images=True, with_tables=True
    )
    # parse_file yields (text, images, tables); drop the leading text element.
    extracted_images, extracted_tables = parsed[1:]
    return extracted_images, extracted_tables
+58 -1
View File
@@ -1,11 +1,13 @@
import pytest
import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from mrkdwn_analysis import MarkdownAnalyzer
from typing import Callable
from pydantic import ValidationError
from src.notebookllama.utils import process_file, get_mind_map
from src.notebookllama.utils import process_file, get_mind_map, md_table_to_pd_dataframe
from src.notebookllama.models import Notebook
load_dotenv()
@@ -23,6 +25,49 @@ def input_file() -> str:
return "data/test/brain_for_kids.pdf"
@pytest.fixture()
def markdown_file() -> str:
    """Path of the markdown sample used by the table-conversion test."""
    sample_path = "data/test/md_sample.md"
    return sample_path
@pytest.fixture()
def dataframe_from_tables() -> pd.DataFrame:
    """Expected DataFrame for the project-status table, built row by row."""
    column_names = [
        "Project Name",
        "Status",
        "Completion %",
        "Assigned Developer",
        "Due Date",
    ]
    records = [
        ("User Dashboard", "In Progress", "75%", "Alice Johnson", "2025-07-15"),
        ("API Integration", "Completed", "100%", "Bob Smith", "2025-06-30"),
        ("Mobile App", "Planning", "25%", "Carol Davis", "2025-08-20"),
        ("Database Migration", "In Progress", "60%", "David Wilson", "2025-07-10"),
        ("Security Audit", "Not Started", "0%", "Eve Brown", "2025-08-01"),
    ]
    return pd.DataFrame(records, columns=column_names)
@pytest.fixture()
def file_exists_fn() -> Callable[[os.PathLike[str]], bool]:
def file_exists(file_path: os.PathLike[str]) -> bool:
@@ -102,3 +147,15 @@ async def test_file_processing(input_file: str) -> None:
except ValidationError:
notebook_model = None
assert isinstance(notebook_model, Notebook)
def test_table_to_dataframe(
    markdown_file: str, dataframe_from_tables: pd.DataFrame
) -> None:
    """Both tables in the markdown sample must convert to the expected frame."""
    parsed_tables = MarkdownAnalyzer(markdown_file).identify_tables()["Table"]
    assert len(parsed_tables) == 2
    for parsed_table in parsed_tables:
        frame = md_table_to_pd_dataframe(parsed_table)
        assert frame is not None
        assert frame.equals(dataframe_from_tables)
Generated
+52
View File
@@ -193,6 +193,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/fe/27/a30c24a74cc4f3969f3e0d184da149fa6327620c7c72333ccc3a8e3e1095/banks-2.1.3-py3-none-any.whl", hash = "sha256:9e1217dc977e6dd1ce42c5ff48e9bcaf238d788c81b42deb6a555615ffcffbab", size = 28133, upload-time = "2025-06-27T07:12:05.986Z" },
]
[[package]]
name = "beautifulsoup4"
version = "4.13.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "soupsieve" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/d8/e4/0c4c39e18fd76d6a628d4dd8da40543d136ce2d1752bd6eeeab0791f4d6b/beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195", size = 621067, upload-time = "2025-04-15T17:05:13.836Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload-time = "2025-04-15T17:05:12.221Z" },
]
[[package]]
name = "blinker"
version = "1.9.0"
@@ -979,6 +992,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/57/cf/2980bcdbb58c24afd5798ef0cc9403e092beae528c6002bfd4fc03a62674/llama_index_workflows-1.0.1-py3-none-any.whl", hash = "sha256:765844e143fac7fa1f25749be2479a9f75bf553d779a2fbec4f2caeeaa4ff1dd", size = 36943, upload-time = "2025-06-26T04:20:03.858Z" },
]
[[package]]
name = "markdown-analysis"
version = "0.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "markdownify" },
{ name = "requests" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/8e/c0/cbe75dfbc93ea0b2aa8d3819b69a2b5ee25aeba0ca937770109de120074b/markdown_analysis-0.1.5.tar.gz", hash = "sha256:5a2091686011b38b9dd5617fae8e928b768288869fcf11d5f6d61186dc219657", size = 25175, upload-time = "2025-04-05T18:16:56.766Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d3/94/50890298581013aa7db6c494308f459d47dfbb9555fe087e44e4b5e729ba/markdown_analysis-0.1.5-py3-none-any.whl", hash = "sha256:0b1058bf44d65f1d508bc53e46113631a98219fa040e6a3221d46d776b7728db", size = 25776, upload-time = "2025-04-05T18:16:55.843Z" },
]
[[package]]
name = "markdown-it-py"
version = "3.0.0"
@@ -991,6 +1019,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528, upload-time = "2023-06-03T06:41:11.019Z" },
]
[[package]]
name = "markdownify"
version = "1.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "six" },
]
sdist = { url = "https://files.pythonhosted.org/packages/2f/78/c48fed23c7aebc2c16049062e72de1da3220c274de59d28c942acdc9ffb2/markdownify-1.1.0.tar.gz", hash = "sha256:449c0bbbf1401c5112379619524f33b63490a8fa479456d41de9dc9e37560ebd", size = 17127, upload-time = "2025-03-05T11:54:40.574Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/64/11/b751af7ad41b254a802cf52f7bc1fca7cabe2388132f2ce60a1a6b9b9622/markdownify-1.1.0-py3-none-any.whl", hash = "sha256:32a5a08e9af02c8a6528942224c91b933b4bd2c7d078f9012943776fc313eeef", size = 13901, upload-time = "2025-03-05T11:54:39.454Z" },
]
[[package]]
name = "markupsafe"
version = "3.0.2"
@@ -1215,6 +1256,7 @@ dependencies = [
{ name = "llama-index-observability-otel" },
{ name = "llama-index-tools-mcp" },
{ name = "llama-index-workflows" },
{ name = "markdown-analysis" },
{ name = "mypy" },
{ name = "opentelemetry-exporter-otlp-proto-http" },
{ name = "plotly" },
@@ -1243,6 +1285,7 @@ requires-dist = [
{ name = "llama-index-observability-otel", specifier = ">=0.1.1" },
{ name = "llama-index-tools-mcp", specifier = ">=0.2.5" },
{ name = "llama-index-workflows", specifier = ">=1.0.1" },
{ name = "markdown-analysis", specifier = ">=0.1.5" },
{ name = "mypy", specifier = ">=1.16.1" },
{ name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.34.1" },
{ name = "plotly", specifier = ">=6.2.0" },
@@ -2011,6 +2054,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
]
[[package]]
name = "soupsieve"
version = "2.7"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/3f/f4/4a80cd6ef364b2e8b65b15816a843c0980f7a5a2b4dc701fc574952aa19f/soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a", size = 103418, upload-time = "2025-04-20T18:50:08.518Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload-time = "2025-04-20T18:50:07.196Z" },
]
[[package]]
name = "sqlalchemy"
version = "2.0.41"