feat: add a first (still not very good) implementation of table and image extraction

2026-07-01 22:14:04 -04:00 · 2025-07-06 18:53:33 +02:00
parent 08da1b195f
commit c8d751b262
7 changed files with 436 additions and 4 deletions
@@ -18,3 +18,6 @@ wheels/

 # audio files
 *.mp3
+
+# auto-generated images
+static/
@@ -0,0 +1,51 @@
+# Project Status Report
+
+This document provides an overview of our current project status and key metrics.
+
+## Introduction
+
+Our development team has been working on multiple initiatives this quarter. The following sections outline our progress and upcoming milestones.
+
+## Team Performance Metrics
+
+The table below shows our team's performance across different projects:
+
+| Project Name       | Status      | Completion % | Assigned Developer | Due Date   |
+| ------------------ | ----------- | ------------ | ------------------ | ---------- |
+| User Dashboard     | In Progress | 75%          | Alice Johnson      | 2025-07-15 |
+| API Integration    | Completed   | 100%         | Bob Smith          | 2025-06-30 |
+| Mobile App         | Planning    | 25%          | Carol Davis        | 2025-08-20 |
+| Database Migration | In Progress | 60%          | David Wilson       | 2025-07-10 |
+| Security Audit     | Not Started | 0%           | Eve Brown          | 2025-08-01 |
+
+| Project Name       | Status      | Completion % | Assigned Developer | Due Date   |
+| ------------------ | ----------- | ------------ | ------------------ | ---------- |
+| User Dashboard     | In Progress | 75%          | Alice Johnson      | 2025-07-15 |
+| API Integration    | Completed   | 100%         | Bob Smith          | 2025-06-30 |
+| Mobile App         | Planning    | 25%          | Carol Davis        | 2025-08-20 |
+| Database Migration | In Progress | 60%          | David Wilson       | 2025-07-10 |
+| Security Audit     | Not Started | 0%           | Eve Brown          | 2025-08-01 |
+
+## Key Observations
+
+Based on the data above, we can see that:
+
+- The API Integration project was completed on schedule
+- The User Dashboard is progressing well and should meet its deadline
+- The Database Migration needs attention to stay on track
+- We need to begin the Security Audit soon to meet the August deadline
+
+## Next Steps
+
+1. **Prioritize Database Migration** - Assign additional resources if needed
+2. **Begin Security Audit planning** - Schedule kickoff meeting with Eve
+3. **Continue monitoring** User Dashboard progress
+4. **Celebrate** the successful API Integration completion
+
+## Conclusion
+
+Overall, our team is performing well with most projects on track. The key focus areas for the coming weeks are the database migration and security audit preparation.
+
+---
+
+_Last updated: July 6, 2025_
@@ -18,6 +18,7 @@ dependencies = [
  "llama-index-observability-otel>=0.1.1",
  "llama-index-tools-mcp>=0.2.5",
  "llama-index-workflows>=1.0.1",
+  "markdown-analysis>=0.1.5",
  "mypy>=1.16.1",
  "opentelemetry-exporter-otlp-proto-http>=1.34.1",
  "plotly>=6.2.0",
@@ -0,0 +1,216 @@
+import sys
+import os
+import io
+import tempfile as tmp
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+import asyncio
+from utils import get_plots_and_tables
+import streamlit as st
+from PIL import Image
+
+
+def get_plots_and_tables_sync(file: io.BytesIO):
+    """Extract images and tables from an in-memory PDF, blocking until done.
+
+    The bytes are spooled to a temporary ``.pdf`` file because
+    ``get_plots_and_tables`` is called with a file path. The temporary file
+    is removed again once extraction has finished (or failed).
+
+    Args:
+        file: Buffer holding the raw PDF bytes.
+
+    Returns:
+        Whatever ``get_plots_and_tables`` returns; the caller in this module
+        unpacks it as ``(image_paths, dataframes)``.
+    """
+    # Write through the handle itself and close it immediately, so no open
+    # descriptor is left behind. ``delete=False`` keeps the file on disk for
+    # the parser; it is removed explicitly in the ``finally`` below.
+    with tmp.NamedTemporaryFile(suffix=".pdf", delete=False) as fl:
+        fl.write(file.getvalue())
+    pdf_path = fl.name
+
+    try:
+        # Try to get existing event loop, if not create a new one
+        try:
+            loop = asyncio.get_event_loop()
+            if loop.is_closed():
+                raise RuntimeError("Event loop is closed")
+        except RuntimeError:
+            # No event loop exists or it's closed, create a new one
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        if loop.is_running():
+            # A running loop cannot be re-entered from synchronous code, so
+            # the coroutine is driven by a fresh loop in a worker thread.
+            import concurrent.futures
+
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(
+                    asyncio.run, get_plots_and_tables(file_path=pdf_path)
+                )
+                return future.result()
+
+        # If loop is not running, we can use it directly
+        return loop.run_until_complete(get_plots_and_tables(file_path=pdf_path))
+    finally:
+        # Best-effort removal of the spooled PDF; a failure here must not
+        # mask the extraction result or the original exception.
+        try:
+            os.remove(pdf_path)
+        except OSError:
+            pass
+
+
+def cleanup_temp_files(image_paths):
+    """Delete each extracted image that is still present on disk.
+
+    Paths that do not exist are skipped. Any error raised while handling a
+    path is shown as a Streamlit warning and the loop moves on to the next
+    path.
+    """
+    for img_path in image_paths:
+        try:
+            if not os.path.exists(img_path):
+                continue
+            os.remove(img_path)
+        except Exception as e:
+            st.warning(f"Could not remove {img_path}: {str(e)}")
+
+
+def create_download_zip(dataframes, image_paths):
+    """Bundle all tables (as CSV) and all existing images into one ZIP.
+
+    Tables become ``table_<n>.csv`` and images become ``image_<n><ext>``,
+    both numbered from 1 by their position in the input sequence. Image
+    paths that do not exist on disk are left out, but still consume their
+    number.
+
+    Returns:
+        The bytes of the deflate-compressed archive, built in memory.
+    """
+    import zipfile
+
+    buffer = io.BytesIO()
+    archive = zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED)
+
+    with archive:
+        # Tables are serialised without the DataFrame index column
+        for number, frame in enumerate(dataframes, start=1):
+            archive.writestr(f"table_{number}.csv", frame.to_csv(index=False))
+
+        # Images keep the extension of the file they were saved under
+        for number, path in enumerate(image_paths, start=1):
+            if not os.path.exists(path):
+                continue
+            _, extension = os.path.splitext(path)
+            archive.write(path, f"image_{number}{extension}")
+
+    return buffer.getvalue()
+
+
+# Direct Streamlit execution
+# Page for extracting images and tables from an uploaded PDF.
+# NOTE(review): Streamlit re-executes this script on every widget
+# interaction, so each button click below re-runs the extraction for the
+# uploaded file - consider caching the result (e.g. in st.session_state).
+st.set_page_config(page_title="NotebookLlaMa - Images and Tables", page_icon="📊")
+
+st.sidebar.header("Images and Tables📊")
+st.sidebar.info("To switch to the Home page, select it from above!🔺")
+st.markdown("---")
+st.markdown("## NotebookLlaMa - Images and Tables📊")
+st.markdown("### Upload a PDF file to extract plots and tables")
+
+# File uploader
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+
+if uploaded_file is not None:
+    # Process the file
+    # The spinner block also wraps all rendering below, not just extraction.
+    with st.spinner("Processing PDF... This may take a moment."):
+        try:
+            # Convert uploaded file to BytesIO
+            file_bytes = io.BytesIO(uploaded_file.getvalue())
+
+            # Extract plots and tables
+            image_paths, dataframes = get_plots_and_tables_sync(file_bytes)
+
+            # Display results summary
+            st.success("✅ Processing complete!")
+            st.info(f"Found {len(image_paths)} images and {len(dataframes)} tables")
+
+            # Create tabs for better organization
+            tab1, tab2 = st.tabs(["📊 Tables", "📈 Plots/Images"])
+
+            with tab1:
+                st.header("Extracted Tables")
+                if dataframes:
+                    for i, df in enumerate(dataframes):
+                        st.subheader(f"Table {i + 1}")
+
+                        # Display table with options
+                        # Wide column for the data, narrow one for metadata
+                        col1, col2 = st.columns([3, 1])
+
+                        with col1:
+                            st.dataframe(df, use_container_width=True)
+
+                        with col2:
+                            st.write("**Table Info:**")
+                            st.write(f"Rows: {len(df)}")
+                            st.write(f"Columns: {len(df.columns)}")
+
+                            # Download button for CSV
+                            csv = df.to_csv(index=False)
+                            # The per-table key keeps widget IDs unique
+                            # across loop iterations.
+                            st.download_button(
+                                label="Download CSV",
+                                data=csv,
+                                file_name=f"table_{i + 1}.csv",
+                                mime="text/csv",
+                                key=f"download_table_{i}",
+                            )
+
+                        # Show additional table statistics
+                        with st.expander(f"Table {i + 1} Details"):
+                            st.write("**Column Information:**")
+                            st.write(df.dtypes)
+                            st.write("**First few rows:**")
+                            st.write(df.head())
+                            # Summary statistics only for tables longer
+                            # than five rows
+                            if len(df) > 5:
+                                st.write("**Summary statistics:**")
+                                st.write(df.describe(include="all"))
+
+                        st.divider()
+                else:
+                    st.warning("No tables found in the PDF")
+
+            with tab2:
+                st.header("Extracted Plots and Images")
+                if image_paths:
+                    for i, img_path in enumerate(image_paths):
+                        st.subheader(f"Image {i + 1}")
+
+                        # A failure on one image is reported inline and does
+                        # not stop the remaining images from rendering.
+                        try:
+                            # Display image
+                            if os.path.exists(img_path):
+                                # NOTE(review): the Image object is never
+                                # closed explicitly - confirm this does not
+                                # keep file handles open across reruns.
+                                image = Image.open(img_path)
+
+                                # Create columns for image and info
+                                col1, col2 = st.columns([3, 1])
+
+                                with col1:
+                                    st.image(
+                                        image,
+                                        caption=f"Image {i + 1}",
+                                        use_container_width=True,
+                                    )
+
+                                with col2:
+                                    st.write("**Image Info:**")
+                                    st.write(f"Size: {image.size}")
+                                    st.write(f"Format: {image.format}")
+                                    st.write(f"Mode: {image.mode}")
+
+                                    # Download button for image
+                                    # File name and MIME type are derived
+                                    # from the format reported by PIL.
+                                    with open(img_path, "rb") as file:
+                                        st.download_button(
+                                            label="Download Image",
+                                            data=file.read(),
+                                            file_name=f"image_{i + 1}.{image.format.lower()}",
+                                            mime=f"image/{image.format.lower()}",
+                                            key=f"download_image_{i}",
+                                        )
+
+                                st.divider()
+                            else:
+                                st.error(f"Image file not found: {img_path}")
+
+                        except Exception as e:
+                            st.error(f"Error loading image {i + 1}: {str(e)}")
+                else:
+                    st.warning("No images found in the PDF")
+
+            # Cleanup section
+            st.header("📂 File Management")
+            col1, col2 = st.columns(2)
+
+            with col1:
+                # Removes the extracted image files listed in image_paths
+                if st.button("🗑️ Clean up temporary files"):
+                    cleanup_temp_files(image_paths)
+                    st.success("Temporary files cleaned up!")
+
+            with col2:
+                # Option to download all data as ZIP
+                # NOTE(review): the download button only exists during the
+                # rerun triggered by the outer button click - verify the
+                # two-step flow behaves as intended.
+                if st.button("📦 Download All Data"):
+                    zip_data = create_download_zip(dataframes, image_paths)
+                    st.download_button(
+                        label="Download ZIP",
+                        data=zip_data,
+                        file_name="extracted_data.zip",
+                        mime="application/zip",
+                    )
+
+        except Exception as e:
+            st.error(f"Error processing file: {str(e)}")
+            st.exception(e)  # Show full traceback in development
@@ -1,9 +1,12 @@
 from dotenv import load_dotenv
+import pandas as pd
 import json
 import os
 import uuid
 import warnings
+import tempfile as tmp

+from mrkdwn_analysis import MarkdownAnalyzer
 from pydantic import BaseModel, Field, model_validator
 from llama_index.core.llms import ChatMessage
 from llama_cloud_services import LlamaExtract, LlamaParse
@@ -11,7 +14,7 @@ from llama_cloud_services.extract import SourceText
 from llama_cloud.client import AsyncLlamaCloud
 from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
 from llama_index.llms.openai import OpenAIResponses
-from typing import List, Tuple, Union
+from typing import List, Tuple, Union, Optional, Dict
 from typing_extensions import Self
 from pyvis.network import Network

@@ -94,6 +97,47 @@ if (
    LLM_STRUCT = LLM.as_structured_llm(MindMap)


+def md_table_to_pd_dataframe(md_table: Dict[str, list]) -> Optional[pd.DataFrame]:
+    """Convert one parsed markdown table into a DataFrame.
+
+    ``md_table`` is read through its ``"header"`` and ``"rows"`` keys: the
+    i-th cell of every row fills the column named by the i-th header entry.
+
+    Returns:
+        The assembled DataFrame, or ``None`` (after emitting a warning) if
+        anything goes wrong while building it, e.g. a row shorter than the
+        header.
+    """
+    try:
+        frame = pd.DataFrame()
+        for position, column_name in enumerate(md_table["header"]):
+            frame[column_name] = [row[position] for row in md_table["rows"]]
+    except Exception as e:
+        warnings.warn(f"Skipping table as an error occurred: {e}")
+        return None
+    return frame
+
+
+async def parse_file(
+    file_path: str, with_images: bool = False, with_tables: bool = False
+) -> Tuple[Optional[str], Optional[List[str]], Optional[List[pd.DataFrame]]]:
+    """Parse a document and optionally extract its images and tables.
+
+    Args:
+        file_path: Path of the document handed to ``PARSER.aparse``.
+        with_images: When true, save all images of the parsed document
+            under ``./static/``.
+        with_tables: When true, convert the markdown tables found in the
+            parsed text into DataFrames.
+
+    Returns:
+        A ``(text, images, tables)`` tuple. ``text`` is the markdown of all
+        parsed documents joined by a ``---`` separator, or ``None`` when the
+        parser produced no documents. ``images`` is the result of
+        ``asave_all_images`` or ``None`` when not requested. ``tables`` is a
+        list of DataFrames, or ``None`` when not requested or when there is
+        no text to analyse.
+    """
+    images: Optional[List[str]] = None
+    text: Optional[str] = None
+    tables: Optional[List[pd.DataFrame]] = None
+    document = await PARSER.aparse(file_path=file_path)
+    md_content = await document.aget_markdown_documents()
+    if len(md_content) != 0:
+        text = "\n\n---\n\n".join([doc.text for doc in md_content])
+    if with_images:
+        images = await document.asave_all_images("./static/")
+    if with_tables and text is not None:
+        # MarkdownAnalyzer is given a path here, so the text is spooled to a
+        # temporary file. Write through the handle and close it before the
+        # analyzer opens the file; use an explicit encoding so the write does
+        # not depend on the platform's locale.
+        # NOTE(review): assumes MarkdownAnalyzer reads the file as UTF-8 -
+        # confirm against the markdown-analysis documentation.
+        with tmp.NamedTemporaryFile(
+            mode="w", suffix=".md", delete=False, encoding="utf-8"
+        ) as tmp_file:
+            tmp_file.write(text)
+        try:
+            analyzer = MarkdownAnalyzer(tmp_file.name)
+            md_tables = analyzer.identify_tables()["Table"]
+        finally:
+            # The file is only a hand-off to the analyzer; never leave it
+            # behind, and never let a failed removal mask a parsing error.
+            try:
+                os.remove(tmp_file.name)
+            except OSError as e:
+                warnings.warn(f"Could not remove temporary file: {e}")
+        converted = (md_table_to_pd_dataframe(md_table=t) for t in md_tables)
+        # Tables that failed to convert come back as None and are dropped
+        tables = [table for table in converted if table is not None]
+    return text, images, tables
+
+
 async def process_file(
    filename: str,
 ) -> Union[Tuple[str, None], Tuple[None, None], Tuple[str, str]]:
@@ -103,8 +147,7 @@ async def process_file(
    await CLIENT.pipelines.add_files_to_pipeline_api(
        pipeline_id=PIPELINE_ID, request=files
    )
-    document = await PARSER.aparse(file_path=filename)
-    md_content = await document.aget_markdown_documents()
+    md_content, _, _ = await parse_file(file_path=filename)
    if len(md_content) == 0:
        return None, None
    text = "\n\n---\n\n".join([md.text for md in md_content])
@@ -163,3 +206,12 @@ async def query_index(question: str) -> Union[str, None]:
        + "\n\n## Sources\n\n- "
        + "\n- ".join(sources)
    )
+
+
+async def get_plots_and_tables(
+    file_path: str,
+) -> Union[Tuple[Optional[List[str]], Optional[List[pd.DataFrame]]]]:
+    """Parse ``file_path`` with image and table extraction both enabled.
+
+    Returns:
+        The ``(images, tables)`` part of ``parse_file``'s result; the parsed
+        text is discarded.
+    """
+    parsed = await parse_file(file_path=file_path, with_images=True, with_tables=True)
+    _text, extracted_images, extracted_tables = parsed
+    return extracted_images, extracted_tables
@@ -1,11 +1,13 @@
 import pytest
 import os
+import pandas as pd
 from pathlib import Path
 from dotenv import load_dotenv
+from mrkdwn_analysis import MarkdownAnalyzer

 from typing import Callable
 from pydantic import ValidationError
-from src.notebookllama.utils import process_file, get_mind_map
+from src.notebookllama.utils import process_file, get_mind_map, md_table_to_pd_dataframe
 from src.notebookllama.models import Notebook

 load_dotenv()
@@ -23,6 +25,49 @@ def input_file() -> str:
    return "data/test/brain_for_kids.pdf"


+@pytest.fixture()
+def markdown_file() -> str:
+    """Path of the markdown sample used by the table-conversion test."""
+    sample_path = "data/test/md_sample.md"
+    return sample_path
+
+
+@pytest.fixture()
+def dataframe_from_tables() -> pd.DataFrame:
+    """Expected DataFrame for each table extracted in the conversion test.
+
+    All cells are kept as strings, including percentages and dates.
+    """
+    columns = [
+        "Project Name",
+        "Status",
+        "Completion %",
+        "Assigned Developer",
+        "Due Date",
+    ]
+    # One tuple per table row, in the same order as ``columns``
+    records = [
+        ("User Dashboard", "In Progress", "75%", "Alice Johnson", "2025-07-15"),
+        ("API Integration", "Completed", "100%", "Bob Smith", "2025-06-30"),
+        ("Mobile App", "Planning", "25%", "Carol Davis", "2025-08-20"),
+        ("Database Migration", "In Progress", "60%", "David Wilson", "2025-07-10"),
+        ("Security Audit", "Not Started", "0%", "Eve Brown", "2025-08-01"),
+    ]
+    return pd.DataFrame(records, columns=columns)
+
+
@pytest.fixture()
 def file_exists_fn() -> Callable[[os.PathLike[str]], bool]:
    def file_exists(file_path: os.PathLike[str]) -> bool:
@@ -102,3 +147,15 @@ async def test_file_processing(input_file: str) -> None:
    except ValidationError:
        notebook_model = None
    assert isinstance(notebook_model, Notebook)
+
+
+def test_table_to_dataframe(
+    markdown_file: str, dataframe_from_tables: pd.DataFrame
+) -> None:
+    """Every table found in the sample must convert to the expected frame."""
+    parsed_tables = MarkdownAnalyzer(markdown_file).identify_tables()["Table"]
+    # The sample is expected to yield exactly two tables
+    assert len(parsed_tables) == 2
+    for parsed_table in parsed_tables:
+        frame = md_table_to_pd_dataframe(parsed_table)
+        assert frame is not None
+        assert frame.equals(dataframe_from_tables)
@@ -193,6 +193,19 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/fe/27/a30c24a74cc4f3969f3e0d184da149fa6327620c7c72333ccc3a8e3e1095/banks-2.1.3-py3-none-any.whl", hash = "sha256:9e1217dc977e6dd1ce42c5ff48e9bcaf238d788c81b42deb6a555615ffcffbab", size = 28133, upload-time = "2025-06-27T07:12:05.986Z" },
 ]

+[[package]]
+name = "beautifulsoup4"
+version = "4.13.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "soupsieve" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d8/e4/0c4c39e18fd76d6a628d4dd8da40543d136ce2d1752bd6eeeab0791f4d6b/beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195", size = 621067, upload-time = "2025-04-15T17:05:13.836Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload-time = "2025-04-15T17:05:12.221Z" },
+]
+
 [[package]]
 name = "blinker"
 version = "1.9.0"
@@ -979,6 +992,21 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/57/cf/2980bcdbb58c24afd5798ef0cc9403e092beae528c6002bfd4fc03a62674/llama_index_workflows-1.0.1-py3-none-any.whl", hash = "sha256:765844e143fac7fa1f25749be2479a9f75bf553d779a2fbec4f2caeeaa4ff1dd", size = 36943, upload-time = "2025-06-26T04:20:03.858Z" },
 ]

+[[package]]
+name = "markdown-analysis"
+version = "0.1.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "beautifulsoup4" },
+    { name = "markdownify" },
+    { name = "requests" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8e/c0/cbe75dfbc93ea0b2aa8d3819b69a2b5ee25aeba0ca937770109de120074b/markdown_analysis-0.1.5.tar.gz", hash = "sha256:5a2091686011b38b9dd5617fae8e928b768288869fcf11d5f6d61186dc219657", size = 25175, upload-time = "2025-04-05T18:16:56.766Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d3/94/50890298581013aa7db6c494308f459d47dfbb9555fe087e44e4b5e729ba/markdown_analysis-0.1.5-py3-none-any.whl", hash = "sha256:0b1058bf44d65f1d508bc53e46113631a98219fa040e6a3221d46d776b7728db", size = 25776, upload-time = "2025-04-05T18:16:55.843Z" },
+]
+
 [[package]]
 name = "markdown-it-py"
 version = "3.0.0"
@@ -991,6 +1019,19 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528, upload-time = "2023-06-03T06:41:11.019Z" },
 ]

+[[package]]
+name = "markdownify"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "beautifulsoup4" },
+    { name = "six" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2f/78/c48fed23c7aebc2c16049062e72de1da3220c274de59d28c942acdc9ffb2/markdownify-1.1.0.tar.gz", hash = "sha256:449c0bbbf1401c5112379619524f33b63490a8fa479456d41de9dc9e37560ebd", size = 17127, upload-time = "2025-03-05T11:54:40.574Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/64/11/b751af7ad41b254a802cf52f7bc1fca7cabe2388132f2ce60a1a6b9b9622/markdownify-1.1.0-py3-none-any.whl", hash = "sha256:32a5a08e9af02c8a6528942224c91b933b4bd2c7d078f9012943776fc313eeef", size = 13901, upload-time = "2025-03-05T11:54:39.454Z" },
+]
+
 [[package]]
 name = "markupsafe"
 version = "3.0.2"
@@ -1215,6 +1256,7 @@ dependencies = [
    { name = "llama-index-observability-otel" },
    { name = "llama-index-tools-mcp" },
    { name = "llama-index-workflows" },
+    { name = "markdown-analysis" },
    { name = "mypy" },
    { name = "opentelemetry-exporter-otlp-proto-http" },
    { name = "plotly" },
@@ -1243,6 +1285,7 @@ requires-dist = [
    { name = "llama-index-observability-otel", specifier = ">=0.1.1" },
    { name = "llama-index-tools-mcp", specifier = ">=0.2.5" },
    { name = "llama-index-workflows", specifier = ">=1.0.1" },
+    { name = "markdown-analysis", specifier = ">=0.1.5" },
    { name = "mypy", specifier = ">=1.16.1" },
    { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.34.1" },
    { name = "plotly", specifier = ">=6.2.0" },
@@ -2011,6 +2054,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
 ]

+[[package]]
+name = "soupsieve"
+version = "2.7"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/3f/f4/4a80cd6ef364b2e8b65b15816a843c0980f7a5a2b4dc701fc574952aa19f/soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a", size = 103418, upload-time = "2025-04-20T18:50:08.518Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload-time = "2025-04-20T18:50:07.196Z" },
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.41"