first commit

This commit is contained in:
Clelia (Astra) Bertelli
2025-06-27 22:47:48 +02:00
commit 3d86713cb3
19 changed files with 1798 additions and 0 deletions
View File
+20
View File
@@ -0,0 +1,20 @@
name: Linting

on:
  pull_request:

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v6

      # No explicit version here: the project pins Python 3.13 and requires
      # >=3.13, so installing 3.12 would not match the interpreter the project
      # needs. A bare install also matches the other workflows.
      - name: Set up Python
        run: uv python install

      - name: Run linter
        shell: bash
        run: uv run -- pre-commit run -a
+21
View File
@@ -0,0 +1,21 @@
name: CI Tests
on:
pull_request:
jobs:
testing:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Set up Python
run: uv python install
- name: Run Tests
run: uv run -- pytest tests/test_*.py
+22
View File
@@ -0,0 +1,22 @@
name: Typecheck
on:
pull_request:
jobs:
core-typecheck:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Set up Python
run: uv python install
- name: Run Mypy
working-directory: src
run: uv run -- mypy notebooklm_clone
+16
View File
@@ -0,0 +1,16 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
# Virtual environments
.venv
# caches
.*_cache/
# env files
.env
+84
View File
@@ -0,0 +1,84 @@
---
default_language_version:
python: python3
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: check-toml
- id: check-yaml
- id: detect-private-key
- id: end-of-file-fixer
- id: mixed-line-ending
- id: trailing-whitespace
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.11.8
hooks:
- id: ruff
args: [--exit-non-zero-on-fix, --fix]
- id: ruff-format
exclude: ".*poetry.lock|.*_static|.*uv.lock|.*ipynb|.*docs.*"
# Static type checking hook (mypy mirror for pre-commit).
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.0.1
hooks:
- id: mypy
# Stub packages installed into the hook's isolated environment.
additional_dependencies:
[
"types-requests",
"types-Deprecated",
"types-redis",
"types-setuptools",
"types-PyYAML",
"types-protobuf==4.24.0.4",
]
args:
[
--namespace-packages,
--explicit-package-bases,
--disallow-untyped-defs,
--ignore-missing-imports,
# NOTE(review): the project declares requires-python >=3.13 in
# pyproject.toml, so targeting 3.9 here looks stale - confirm.
--python-version=3.9,
]
# NOTE(review): `entry` replaces the command the hook executes. As written it
# appears to run only `export MYPYPATH=...` in a throwaway bash process, so
# mypy itself would never be invoked by this hook - confirm against the
# pre-commit documentation. The path also names `emoji_searcher_app`, whereas
# this repository's package is `notebooklm_clone`.
entry: bash -c "export MYPYPATH=src/emoji_searcher_app"
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.10.1
hooks:
- id: black-jupyter
name: black-docs-py
alias: black
files: ^(README.md|CONTRIBUTING.md)
# Using PEP 8's line length in docs prevents excess left/right scrolling
args: [--line-length=79]
- repo: https://github.com/adamchainz/blacken-docs
rev: 1.16.0
hooks:
- id: blacken-docs
name: black-docs-text
alias: black
types_or: [rst, markdown, tex]
additional_dependencies: [black==23.10.1]
# Using PEP 8's line length in docs prevents excess left/right scrolling
args: [--line-length=79]
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.0.3
hooks:
- id: prettier
- repo: https://github.com/srstevenson/nb-clean
rev: 3.1.0
hooks:
- id: nb-clean
args: [--preserve-cell-outputs, --remove-empty-cells]
- repo: https://github.com/pappasam/toml-sort
rev: v0.23.1
hooks:
- id: toml-sort-fix
+1
View File
@@ -0,0 +1 @@
3.13
+48
View File
@@ -0,0 +1,48 @@
# Contributing to `notebooklm-clone`
Do you want to contribute to this project? Make sure to read these guidelines first :)
## Issue
**When to do it**:
- You found bugs but you don't know how to solve them or don't have time/will to solve them
- You want new features but you don't know how to implement them or don't have time/will to do the implementation
> ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_
**How to do it**:
- Open an issue
- Give the issue a meaningful title (short but effective problem/feature request description)
- Describe the problem/feature request
## Traditional contribution
**When to do it**:
- You found bugs and corrected them
- You optimized/improved the code
- You added new features that you think could be useful to others
**How to do it**:
1. Fork this repository
2. Install `pre-commit` and make sure to have it within the Git Hooks for your fork:
```bash
pip install pre-commit
pre-commit install
```
3. Change the things you want, and make sure tests still pass or add new ones:
```bash
pytest tests/test_*.py
```
4. Commit your changes
5. Make sure your changes pass the pre-commit linting/type checking; if not, modify them so that they pass
6. Submit a pull request (make sure to provide a thorough description of the changes)
### Thanks for contributing!
+21
View File
@@ -0,0 +1,21 @@
The MIT License
Copyright (c) Clelia Astra Bertelli
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
+3
View File
@@ -0,0 +1,3 @@
# NotebookLM clone
This project is aimed at producing a fully open-source, LlamaCloud-backed alternative to NotebookLM.
+24
View File
@@ -0,0 +1,24 @@
[project]
name = "notebooklm-clone"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"llama-cloud>=0.1.29",
"llama-cloud-services>=0.6.38",
"llama-index-core>=0.12.44",
"llama-index-embeddings-openai>=0.3.1",
"llama-index-indices-managed-llama-cloud>=0.6.11",
"llama-index-llms-openai>=0.4.7",
"mypy>=1.16.1",
"pre-commit>=4.2.0",
"pytest>=8.4.1",
"python-dotenv>=1.1.1"
]
[tool.mypy]
disable_error_code = ["import-not-found", "import-untyped"]
[tool.pytest.ini_options]
pythonpath = ["src"]
View File
+63
View File
@@ -0,0 +1,63 @@
from pydantic import BaseModel, Field, model_validator
from typing import List, Dict, Literal, Tuple
from typing_extensions import Self
# Sample question/answer payload advertised through the schema `examples` of
# `Notebook.questions_and_answers`.
_QA_EXAMPLE = [
    {"question": "What is the capital of Spain?", "answer": "Madrid"},
    {"question": "What is the capital of France?", "answer": "Paris"},
    {"question": "What is the capital of Italy?", "answer": "Rome"},
    {"question": "What is the capital of Portugal?", "answer": "Lisbon"},
    {"question": "What is the capital of Germany?", "answer": "Berlin"},
]


# Structured notes about a document: a free-text summary, 3 to 10 highlights
# and 5 to 15 question/answer pairs (the bounds are enforced through
# `min_length` / `max_length` on the list fields).
# Documented with `#` comments on purpose: a class docstring would be picked
# up as the model description and alter the generated schema.
class Notebook(BaseModel):
    summary: str = Field(description="Summary of the document.")

    # Between 3 and 10 entries, otherwise validation fails.
    highlights: List[str] = Field(
        description="Highlights of the documents: 3 to 10 bullet points that represent the crucial knots of the documents.",
        min_length=3,
        max_length=10,
    )

    # Between 5 and 15 dictionaries whose keys are limited to "question" and
    # "answer".
    questions_and_answers: List[Dict[Literal["question", "answer"], str]] = Field(
        description="5 to 15 questions and answers about a given topic. This field should be organized as a list of dictionaries, each containing a 'question' and an 'answer' fields.",
        examples=[_QA_EXAMPLE],
        min_length=5,
        max_length=15,
    )
# Mind map expressed as a graph: `nodes` holds (id, content) pairs and `edges`
# holds (source id, target id) pairs. A model-level validator rejects edges
# whose endpoints are not declared in `nodes`; nodes without any edge are
# accepted.
# Documented with `#` comments on purpose: a class docstring would be picked
# up as the model description and alter the generated schema.
class MindMap(BaseModel):
    nodes: List[Tuple[str, str]] = Field(
        description="List of nodes of the mind map, with their ID as first element and their content as second. Content should never exceed 5 words.",
        examples=[
            [
                ("A", "Fall of the Roman Empire"),
                ("B", "476 AD"),
                ("C", "Barbarian invasions"),
            ],
            [
                ("A", "Auxin is released"),
                ("B", "Travels to the roots"),
                ("C", "Root cells grow in dimensions"),
            ],
        ],
    )
    edges: List[Tuple[str, str]] = Field(
        description="The edges connecting the nodes of the mind map, as a list of tuples containing the IDs of the two connected nodes.",
        examples=[
            [("A", "B"), ("A", "C"), ("B", "C")],
            [("C", "A"), ("B", "C"), ("A", "B")],
        ],
    )

    @model_validator(mode="after")
    def validate_mind_map(self) -> Self:
        """Ensure every edge endpoint refers to a declared node ID.

        Returns:
            The validated model instance.

        Raises:
            ValueError: If any edge names a source or target ID that does not
                appear in ``nodes``.
        """
        node_ids = {node_id for node_id, _ in self.nodes}
        edge_endpoints = {endpoint for edge in self.edges for endpoint in edge}
        # Check the endpoints against the nodes (not the other way round), so
        # an unknown endpoint is caught even when some node has no edge.
        if not edge_endpoints.issubset(node_ids):
            raise ValueError(
                "There are non-existing nodes listed as source or target in the edges"
            )
        return self
+28
View File
@@ -0,0 +1,28 @@
from dotenv import load_dotenv
import os
from llama_cloud_services import LlamaExtract
from llama_cloud.client import AsyncLlamaCloud
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
from llama_index.llms.openai import OpenAI
# Populate the process environment from a local .env file, if one is found.
load_dotenv()

# Read each required setting once so the check below and the client
# constructors are guaranteed to see the same values.
_LLAMACLOUD_API_KEY = os.getenv("LLAMACLOUD_API_KEY")
_EXTRACT_AGENT_ID = os.getenv("EXTRACT_AGENT_ID")
_LLAMACLOUD_PIPELINE_ID = os.getenv("LLAMACLOUD_PIPELINE_ID")
_OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# The shared clients are created only when all four settings are non-empty.
# When any of them is missing, none of the names below is defined by this
# module.
if (
    _LLAMACLOUD_API_KEY
    and _EXTRACT_AGENT_ID
    and _LLAMACLOUD_PIPELINE_ID
    and _OPENAI_API_KEY
):
    LLM = OpenAI(model="gpt-4.1", api_key=_OPENAI_API_KEY)
    CLIENT = AsyncLlamaCloud(token=_LLAMACLOUD_API_KEY)
    EXTRACT_AGENT = LlamaExtract(api_key=_LLAMACLOUD_API_KEY).get_agent(
        id=_EXTRACT_AGENT_ID
    )
    PIPELINE_ID = _LLAMACLOUD_PIPELINE_ID
    QE = LlamaCloudIndex(
        api_key=_LLAMACLOUD_API_KEY, pipeline_id=PIPELINE_ID
    ).as_query_engine(llm=LLM)
View File
+68
View File
@@ -0,0 +1,68 @@
import pytest
from src.notebooklm_clone.models import Notebook, MindMap
from pydantic import ValidationError
def test_notebook() -> None:
    """Check that Notebook accepts a valid payload and enforces list lengths."""
    summary_text = "This is a summary"
    four_highlights = ["This", "is", "a", "summary"]
    five_pairs = [
        {"question": "What is the capital of Spain?", "answer": "Madrid"},
        {"question": "What is the capital of France?", "answer": "Paris"},
        {"question": "What is the capital of Italy?", "answer": "Rome"},
        {"question": "What is the capital of Portugal?", "answer": "Lisbon"},
        {"question": "What is the capital of Germany?", "answer": "Berlin"},
    ]

    # Valid payload: 5 question/answer pairs and 4 highlights.
    notebook = Notebook(
        summary=summary_text,
        questions_and_answers=five_pairs,
        highlights=four_highlights,
    )
    first_pair = notebook.questions_and_answers[0]
    assert notebook.summary == "This is a summary"
    assert first_pair["question"] == "What is the capital of Spain?"
    assert first_pair["answer"] == "Madrid"
    assert notebook.highlights[0] == "This"

    # Only 4 question/answer pairs: below the minimum of 5.
    with pytest.raises(ValidationError):
        Notebook(
            summary=summary_text,
            questions_and_answers=five_pairs[1:],
            highlights=four_highlights,
        )

    # Only 2 highlights: below the minimum of 3.
    with pytest.raises(ValidationError):
        Notebook(
            summary=summary_text,
            questions_and_answers=five_pairs,
            highlights=four_highlights[:2],
        )
def test_mind_map() -> None:
    """Check that MindMap accepts consistent graphs and rejects unknown edge IDs."""
    three_nodes = [
        ("A", "Auxin is released"),
        ("B", "Travels to the roots"),
        ("C", "Root cells grow in dimensions"),
    ]

    # Every edge endpoint is one of the declared node IDs.
    mind_map = MindMap(
        nodes=three_nodes,
        edges=[("A", "B"), ("A", "C"), ("B", "C")],
    )
    first_id, first_content = mind_map.nodes[0]
    assert first_id == "A"
    assert first_content == "Auxin is released"
    assert mind_map.edges[0] == ("A", "B")

    # "D" is used as an edge target but is not a declared node.
    with pytest.raises(ValidationError):
        MindMap(
            nodes=three_nodes,
            edges=[("A", "B"), ("A", "D"), ("B", "C")],
        )
+59
View File
@@ -0,0 +1,59 @@
import os
from dotenv import load_dotenv
from llama_cloud import (
PipelineCreateEmbeddingConfig_OpenaiEmbedding,
PipelineTransformConfig_Advanced,
AdvancedModeTransformConfigChunkingConfig_Sentence,
AdvancedModeTransformConfigSegmentationConfig_Page,
PipelineCreate,
)
from llama_cloud.client import LlamaCloud
from llama_index.embeddings.openai import OpenAIEmbedding
def main() -> int:
    """Upsert the LlamaCloud pipeline and record its ID in the local ``.env``.

    Loads environment variables via ``load_dotenv()``, builds an OpenAI
    embedding configuration and an advanced transform configuration (page
    segmentation plus sentence chunking), upserts a pipeline named
    ``notebooklm_pipeline`` and appends a ``LLAMACLOUD_PIPELINE_ID`` entry to
    ``.env`` in the current working directory.

    Returns:
        0 once the pipeline ID has been written.
    """
    load_dotenv()
    embed_model = OpenAIEmbedding(
        model="text-embedding-3-small", api_key=os.getenv("OPENAI_API_KEY")
    )
    client = LlamaCloud(token=os.getenv("LLAMACLOUD_API_KEY"))

    embedding_config = PipelineCreateEmbeddingConfig_OpenaiEmbedding(
        type="OPENAI_EMBEDDING",
        component=embed_model,
    )

    segm_config = AdvancedModeTransformConfigSegmentationConfig_Page(mode="page")
    chunk_config = AdvancedModeTransformConfigChunkingConfig_Sentence(
        chunk_size=1024,
        chunk_overlap=200,
        separator="<whitespace>",
        paragraph_separator="\n\n\n",
        mode="sentence",
    )
    transform_config = PipelineTransformConfig_Advanced(
        segmentation_config=segm_config,
        chunking_config=chunk_config,
        mode="advanced",
    )

    pipeline_request = PipelineCreate(
        name="notebooklm_pipeline",
        embedding_config=embedding_config,
        transform_config=transform_config,
    )
    pipeline = client.pipelines.upsert_pipeline(request=pipeline_request)

    # Append rather than overwrite so existing entries in .env are preserved.
    # The encoding is explicit so the write does not depend on the platform's
    # locale default.
    with open(".env", "a", encoding="utf-8") as f:
        f.write(f'\nLLAMACLOUD_PIPELINE_ID="{pipeline.id}"')
    return 0


if __name__ == "__main__":
    main()
+23
View File
@@ -0,0 +1,23 @@
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from llama_cloud_services import LlamaExtract
from src.notebooklm_clone.models import Notebook
from dotenv import load_dotenv
load_dotenv()
def main() -> int:
    """Create the extraction agent and record its ID in the local ``.env``.

    Creates an agent named ``q_and_a_agent`` that uses the ``Notebook`` model
    as its data schema, then appends an ``EXTRACT_AGENT_ID`` entry to ``.env``
    in the current working directory.

    Returns:
        0 once the agent ID has been written.
    """
    extract_client = LlamaExtract(api_key=os.getenv("LLAMACLOUD_API_KEY"))
    agent_id = extract_client.create_agent(
        name="q_and_a_agent", data_schema=Notebook
    ).id
    env_entry = f'\nEXTRACT_AGENT_ID="{agent_id}"'
    # Append so that existing entries in .env are left untouched.
    with open(".env", "a") as env_file:
        env_file.write(env_entry)
    return 0


if __name__ == "__main__":
    main()
Generated
+1297
View File
File diff suppressed because it is too large Load Diff