first commit

2026-06-30 22:17:57 -04:00 · 2025-06-27 22:47:48 +02:00
commit 3d86713cb3
19 changed files with 1798 additions and 0 deletions
@@ -0,0 +1,20 @@
+name: Linting
+
+on:
+  pull_request:
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+
+      - name: Set up Python
+        run: uv python install 3.12
+
+      - name: Run linter
+        shell: bash
+        run: uv run -- pre-commit run -a
@@ -0,0 +1,21 @@
+name: CI Tests
+
+on:
+  pull_request:
+
+jobs:
+  testing:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+
+      - name: Set up Python
+        run: uv python install
+
+      - name: Run Tests
+        run: uv run -- pytest tests/test_*.py
@@ -0,0 +1,22 @@
+name: Typecheck
+
+on:
+  pull_request:
+
+jobs:
+  core-typecheck:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+
+      - name: Set up Python
+        run: uv python install
+
+      - name: Run Mypy
+        working-directory: src
+        run: uv run -- mypy notebooklm_clone
@@ -0,0 +1,16 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+
+# caches
+.*_cache/
+
+# env files
+.env
@@ -0,0 +1,84 @@
+---
+default_language_version:
+  python: python3
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: check-merge-conflict
+      - id: check-symlinks
+      - id: check-toml
+      - id: check-yaml
+      - id: detect-private-key
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+      - id: trailing-whitespace
+
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.11.8
+    hooks:
+      - id: ruff
+        args: [--exit-non-zero-on-fix, --fix]
+      - id: ruff-format
+        exclude: ".*poetry.lock|.*_static|.*uv.lock|.*ipynb|.*docs.*"
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.0.1
+    hooks:
+      - id: mypy
+        # Stub packages installed into the hook's isolated environment.
+        additional_dependencies:
+          [
+            "types-requests",
+            "types-Deprecated",
+            "types-redis",
+            "types-setuptools",
+            "types-PyYAML",
+            "types-protobuf==4.24.0.4",
+          ]
+        # NOTE(review): --python-version=3.9 disagrees with the project's
+        # `requires-python = ">=3.13"` declared in pyproject.toml — confirm.
+        args:
+          [
+            --namespace-packages,
+            --explicit-package-bases,
+            --disallow-untyped-defs,
+            --ignore-missing-imports,
+            --python-version=3.9,
+          ]
+        # NOTE(review): `entry` replaces the command this hook executes. As
+        # written it appears to only export MYPYPATH inside a short-lived bash
+        # process, so mypy itself is presumably never invoked by this hook.
+        # The path also names a different project (emoji_searcher_app) rather
+        # than this repository's package — confirm and fix.
+        entry: bash -c "export MYPYPATH=src/emoji_searcher_app"
+
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 23.10.1
+    hooks:
+      - id: black-jupyter
+        name: black-docs-py
+        alias: black
+        files: ^(README.md|CONTRIBUTING.md)
+        # Using PEP 8's line length in docs prevents excess left/right scrolling
+        args: [--line-length=79]
+
+  - repo: https://github.com/adamchainz/blacken-docs
+    rev: 1.16.0
+    hooks:
+      - id: blacken-docs
+        name: black-docs-text
+        alias: black
+        types_or: [rst, markdown, tex]
+        additional_dependencies: [black==23.10.1]
+        # Using PEP 8's line length in docs prevents excess left/right scrolling
+        args: [--line-length=79]
+
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v3.0.3
+    hooks:
+      - id: prettier
+
+  - repo: https://github.com/srstevenson/nb-clean
+    rev: 3.1.0
+    hooks:
+      - id: nb-clean
+        args: [--preserve-cell-outputs, --remove-empty-cells]
+
+  - repo: https://github.com/pappasam/toml-sort
+    rev: v0.23.1
+    hooks:
+      - id: toml-sort-fix
@@ -0,0 +1 @@
+3.13
@@ -0,0 +1,48 @@
+# Contributing to `notebooklm-clone`
+
+Do you want to contribute to this project? Make sure to read these guidelines first :)
+
+## Issue
+
+**When to do it**:
+
+- You found bugs but you don't know how to solve them or don't have the time/will to fix them
+- You want new features but you don't know how to implement them or don't have the time/will to implement them
+
+> ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_
+
+**How to do it**:
+
+- Open an issue
+- Give the issue a meaningful title (short but effective problem/feature request description)
+- Describe the problem/feature request
+
+## Traditional contribution
+
+**When to do it**:
+
+- You found bugs and corrected them
+- You optimized/improved the code
+- You added new features that you think could be useful to others
+
+**How to do it**:
+
+1. Fork this repository
+2. Install `pre-commit` and make sure to have it within the Git Hooks for your fork:
+
+```bash
+pip install pre-commit
+pre-commit install
+```
+
+3. Change the things you want, and make sure tests still pass or add new ones:
+
+```bash
+pytest tests/test_*.py
+```
+
+4. Commit your changes
+5. Make sure your changes pass the pre-commit linting/type checking; if not, modify them so that they pass
+6. Submit a pull request (make sure to provide a thorough description of the changes)
+
+### Thanks for contributing!
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) Clelia Astra Bertelli
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
@@ -0,0 +1,3 @@
+# NotebookLM clone
+
+This project is aimed at producing a fully open-source, LlamaCloud-backed alternative to NotebookLM.
@@ -0,0 +1,24 @@
+[project]
+name = "notebooklm-clone"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+  "llama-cloud>=0.1.29",
+  "llama-cloud-services>=0.6.38",
+  "llama-index-core>=0.12.44",
+  "llama-index-embeddings-openai>=0.3.1",
+  "llama-index-indices-managed-llama-cloud>=0.6.11",
+  "llama-index-llms-openai>=0.4.7",
+  "mypy>=1.16.1",
+  "pre-commit>=4.2.0",
+  "pytest>=8.4.1",
+  "python-dotenv>=1.1.1"
+]
+
+[tool.mypy]
+disable_error_code = ["import-not-found", "import-untyped"]
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]
@@ -0,0 +1,63 @@
+from pydantic import BaseModel, Field, model_validator
+from typing import List, Dict, Literal, Tuple
+from typing_extensions import Self
+
+
+# Structured "notebook" built from a document: a summary, a bounded list of
+# highlights and a bounded list of question/answer pairs.
+# NOTE: documented with comments rather than a class docstring because pydantic
+# includes a model's docstring as the description in its generated JSON schema,
+# which would change the schema this model produces.
+class Notebook(BaseModel):
+    # Free-text summary; no length constraint is declared.
+    summary: str = Field(
+        description="Summary of the document.",
+    )
+    # Between 3 and 10 list items (min_length/max_length bound the list size).
+    highlights: List[str] = Field(
+        description="Highlights of the documents: 3 to 10 bullet points that represent the crucial knots of the documents.",
+        min_length=3,
+        max_length=10,
+    )
+    # Between 5 and 15 dictionaries whose keys are limited to "question" and
+    # "answer".
+    # NOTE(review): the Dict[Literal[...], str] type restricts which keys may
+    # appear but does not by itself require both keys to be present — confirm
+    # whether that is intended.
+    questions_and_answers: List[Dict[Literal["question", "answer"], str]] = Field(
+        description="5 to 15 questions and answers about a given topic. This field should be organized as a list of dictionaries, each containing a 'question' and an 'answer' fields.",
+        examples=[
+            [
+                {"question": "What is the capital of Spain?", "answer": "Madrid"},
+                {"question": "What is the capital of France?", "answer": "Paris"},
+                {"question": "What is the capital of Italy?", "answer": "Rome"},
+                {"question": "What is the capital of Portugal?", "answer": "Lisbon"},
+                {"question": "What is the capital of Germany?", "answer": "Berlin"},
+            ]
+        ],
+        min_length=5,
+        max_length=15,
+    )
+
+
+class MindMap(BaseModel):
+    nodes: List[Tuple[str, str]] = Field(
+        description="List of nodes of the mind map, with their ID as first element and their content as second. Content should never exceed 5 words.",
+        examples=[
+            [
+                ("A", "Fall of the Roman Empire"),
+                ("B", "476 AD"),
+                ("C", "Barbarian invasions"),
+            ],
+            [
+                ("A", "Auxin is released"),
+                ("B", "Travels to the roots"),
+                ("C", "Root cells grow in dimensions"),
+            ],
+        ],
+    )
+    edges: List[Tuple[str, str]] = Field(
+        description="The edges connecting the nodes of the mind map, as a list of tuples containing the IDs of the two connected edges.",
+        examples=[
+            [("A", "B"), ("A", "C"), ("B", "C")],
+            [("C", "A"), ("B", "C"), ("A", "B")],
+        ],
+    )
+
+    @model_validator(mode="after")
+    def validate_mind_map(self) -> Self:
+        all_nodes = [el[0] for el in self.nodes]
+        all_edges = [el[0] for el in self.edges] + [el[1] for el in self.edges]
+        if set(all_nodes).issubset(set(all_edges)) and set(all_nodes) != set(all_edges):
+            raise ValueError(
+                "There are non-existing nodes listed as source or target in the edges"
+            )
+        return self
@@ -0,0 +1,28 @@
+from dotenv import load_dotenv
+import os
+
+from llama_cloud_services import LlamaExtract
+from llama_cloud.client import AsyncLlamaCloud
+from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
+from llama_index.llms.openai import OpenAI
+
+
+load_dotenv()
+
+
+load_dotenv()
+if (
+    os.getenv("LLAMACLOUD_API_KEY", None)
+    and os.getenv("EXTRACT_AGENT_ID", None)
+    and os.getenv("LLAMACLOUD_PIPELINE_ID", None)
+    and os.getenv("OPENAI_API_KEY", None)
+):
+    LLM = OpenAI(model="gpt-4.1", api_key=os.getenv("OPENAI_API_KEY"))
+    CLIENT = AsyncLlamaCloud(token=os.getenv("LLAMACLOUD_API_KEY"))
+    EXTRACT_AGENT = LlamaExtract(api_key=os.getenv("LLAMACLOUD_API_KEY")).get_agent(
+        id=os.getenv("EXTRACT_AGENT_ID")
+    )
+    PIPELINE_ID = os.getenv("LLAMACLOUD_PIPELINE_ID")
+    QE = LlamaCloudIndex(
+        api_key=os.getenv("LLAMACLOUD_API_KEY"), pipeline_id=PIPELINE_ID
+    ).as_query_engine(llm=LLM)
@@ -0,0 +1,68 @@
+import pytest
+
+from src.notebooklm_clone.models import Notebook, MindMap
+from pydantic import ValidationError
+
+
+def test_notebook() -> None:
+    n1 = Notebook(
+        summary="This is a summary",
+        questions_and_answers=[
+            {"question": "What is the capital of Spain?", "answer": "Madrid"},
+            {"question": "What is the capital of France?", "answer": "Paris"},
+            {"question": "What is the capital of Italy?", "answer": "Rome"},
+            {"question": "What is the capital of Portugal?", "answer": "Lisbon"},
+            {"question": "What is the capital of Germany?", "answer": "Berlin"},
+        ],
+        highlights=["This", "is", "a", "summary"],
+    )
+    assert n1.summary == "This is a summary"
+    assert n1.questions_and_answers[0]["question"] == "What is the capital of Spain?"
+    assert n1.questions_and_answers[0]["answer"] == "Madrid"
+    assert n1.highlights[0] == "This"
+    with pytest.raises(ValidationError):
+        Notebook(
+            summary="This is a summary",
+            questions_and_answers=[
+                {"question": "What is the capital of France?", "answer": "Paris"},
+                {"question": "What is the capital of Italy?", "answer": "Rome"},
+                {"question": "What is the capital of Portugal?", "answer": "Lisbon"},
+                {"question": "What is the capital of Germany?", "answer": "Berlin"},
+            ],
+            highlights=["This", "is", "a", "summary"],
+        )
+    with pytest.raises(ValidationError):
+        Notebook(
+            summary="This is a summary",
+            questions_and_answers=[
+                {"question": "What is the capital of Spain?", "answer": "Madrid"},
+                {"question": "What is the capital of France?", "answer": "Paris"},
+                {"question": "What is the capital of Italy?", "answer": "Rome"},
+                {"question": "What is the capital of Portugal?", "answer": "Lisbon"},
+                {"question": "What is the capital of Germany?", "answer": "Berlin"},
+            ],
+            highlights=["This", "is"],
+        )
+
+
+def test_mind_map() -> None:
+    m1 = MindMap(
+        nodes=[
+            ("A", "Auxin is released"),
+            ("B", "Travels to the roots"),
+            ("C", "Root cells grow in dimensions"),
+        ],
+        edges=[("A", "B"), ("A", "C"), ("B", "C")],
+    )
+    assert m1.nodes[0][0] == "A"
+    assert m1.nodes[0][1] == "Auxin is released"
+    assert m1.edges[0] == ("A", "B")
+    with pytest.raises(ValidationError):
+        MindMap(
+            nodes=[
+                ("A", "Auxin is released"),
+                ("B", "Travels to the roots"),
+                ("C", "Root cells grow in dimensions"),
+            ],
+            edges=[("A", "B"), ("A", "D"), ("B", "C")],
+        )
@@ -0,0 +1,59 @@
+import os
+from dotenv import load_dotenv
+
+from llama_cloud import (
+    PipelineCreateEmbeddingConfig_OpenaiEmbedding,
+    PipelineTransformConfig_Advanced,
+    AdvancedModeTransformConfigChunkingConfig_Sentence,
+    AdvancedModeTransformConfigSegmentationConfig_Page,
+    PipelineCreate,
+)
+from llama_cloud.client import LlamaCloud
+from llama_index.embeddings.openai import OpenAIEmbedding
+
+
+def main():
+    load_dotenv()
+
+    embed_model = OpenAIEmbedding(
+        model="text-embedding-3-small", api_key=os.getenv("OPENAI_API_KEY")
+    )
+
+    client = LlamaCloud(token=os.getenv("LLAMACLOUD_API_KEY"))
+
+    embedding_config = PipelineCreateEmbeddingConfig_OpenaiEmbedding(
+        type="OPENAI_EMBEDDING",
+        component=embed_model,
+    )
+
+    segm_config = AdvancedModeTransformConfigSegmentationConfig_Page(mode="page")
+    chunk_config = AdvancedModeTransformConfigChunkingConfig_Sentence(
+        chunk_size=1024,
+        chunk_overlap=200,
+        separator="<whitespace>",
+        paragraph_separator="\n\n\n",
+        mode="sentence",
+    )
+
+    transform_config = PipelineTransformConfig_Advanced(
+        segmentation_config=segm_config,
+        chunking_config=chunk_config,
+        mode="advanced",
+    )
+
+    pipeline_request = PipelineCreate(
+        name="notebooklm_pipeline",
+        embedding_config=embedding_config,
+        transform_config=transform_config,
+    )
+
+    pipeline = client.pipelines.upsert_pipeline(request=pipeline_request)
+
+    with open(".env", "a") as f:
+        f.write(f'\nLLAMACLOUD_PIPELINE_ID="{pipeline.id}"')
+
+    return 0
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,23 @@
+import sys
+import os
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+from llama_cloud_services import LlamaExtract
+from src.notebooklm_clone.models import Notebook
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def main() -> int:
+    conn = LlamaExtract(api_key=os.getenv("LLAMACLOUD_API_KEY"))
+    agent = conn.create_agent(name="q_and_a_agent", data_schema=Notebook)
+    _id = agent.id
+    with open(".env", "a") as f:
+        f.write(f'\nEXTRACT_AGENT_ID="{_id}"')
+    return 0
+
+
+if __name__ == "__main__":
+    main()