mirror of
https://github.com/run-llama/notebookllama.git
synced 2026-06-30 22:17:57 -04:00
first commit
This commit is contained in:
@@ -0,0 +1,20 @@
|
||||
name: Linting
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v6
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install 3.12
|
||||
|
||||
- name: Run linter
|
||||
shell: bash
|
||||
run: uv run -- pre-commit run -a
|
||||
@@ -0,0 +1,21 @@
|
||||
name: CI Tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
testing:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v6
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install
|
||||
|
||||
- name: Run Tests
|
||||
run: uv run -- pytest tests/test_*.py
|
||||
@@ -0,0 +1,22 @@
|
||||
name: Typecheck
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
core-typecheck:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v6
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install
|
||||
|
||||
- name: Run Mypy
|
||||
working-directory: src
|
||||
run: uv run -- mypy notebooklm_clone
|
||||
+16
@@ -0,0 +1,16 @@
|
||||
# Python-generated files
|
||||
__pycache__/
|
||||
*.py[oc]
|
||||
build/
|
||||
dist/
|
||||
wheels/
|
||||
*.egg-info
|
||||
|
||||
# Virtual environments
|
||||
.venv
|
||||
|
||||
# caches
|
||||
.*_cache/
|
||||
|
||||
# env files
|
||||
.env
|
||||
@@ -0,0 +1,84 @@
|
||||
---
|
||||
default_language_version:
|
||||
python: python3
|
||||
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.5.0
|
||||
hooks:
|
||||
- id: check-merge-conflict
|
||||
- id: check-symlinks
|
||||
- id: check-toml
|
||||
- id: check-yaml
|
||||
- id: detect-private-key
|
||||
- id: end-of-file-fixer
|
||||
- id: mixed-line-ending
|
||||
- id: trailing-whitespace
|
||||
|
||||
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
||||
rev: v0.11.8
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--exit-non-zero-on-fix, --fix]
|
||||
- id: ruff-format
|
||||
exclude: ".*poetry.lock|.*_static|.*uv.lock|.*ipynb|.*docs.*"
|
||||
|
||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||
rev: v1.0.1
|
||||
hooks:
|
||||
- id: mypy
|
||||
additional_dependencies:
|
||||
[
|
||||
"types-requests",
|
||||
"types-Deprecated",
|
||||
"types-redis",
|
||||
"types-setuptools",
|
||||
"types-PyYAML",
|
||||
"types-protobuf==4.24.0.4",
|
||||
]
|
||||
args:
|
||||
[
|
||||
--namespace-packages,
|
||||
--explicit-package-bases,
|
||||
--disallow-untyped-defs,
|
||||
--ignore-missing-imports,
|
||||
--python-version=3.9,
|
||||
]
|
||||
entry: bash -c "export MYPYPATH=src/emoji_searcher_app"
|
||||
|
||||
- repo: https://github.com/psf/black-pre-commit-mirror
|
||||
rev: 23.10.1
|
||||
hooks:
|
||||
- id: black-jupyter
|
||||
name: black-docs-py
|
||||
alias: black
|
||||
files: ^(README.md|CONTRIBUTING.md)
|
||||
# Using PEP 8's line length in docs prevents excess left/right scrolling
|
||||
args: [--line-length=79]
|
||||
|
||||
- repo: https://github.com/adamchainz/blacken-docs
|
||||
rev: 1.16.0
|
||||
hooks:
|
||||
- id: blacken-docs
|
||||
name: black-docs-text
|
||||
alias: black
|
||||
types_or: [rst, markdown, tex]
|
||||
additional_dependencies: [black==23.10.1]
|
||||
# Using PEP 8's line length in docs prevents excess left/right scrolling
|
||||
args: [--line-length=79]
|
||||
|
||||
- repo: https://github.com/pre-commit/mirrors-prettier
|
||||
rev: v3.0.3
|
||||
hooks:
|
||||
- id: prettier
|
||||
|
||||
- repo: https://github.com/srstevenson/nb-clean
|
||||
rev: 3.1.0
|
||||
hooks:
|
||||
- id: nb-clean
|
||||
args: [--preserve-cell-outputs, --remove-empty-cells]
|
||||
|
||||
- repo: https://github.com/pappasam/toml-sort
|
||||
rev: v0.23.1
|
||||
hooks:
|
||||
- id: toml-sort-fix
|
||||
@@ -0,0 +1 @@
|
||||
3.13
|
||||
@@ -0,0 +1,48 @@
|
||||
# Contributing to `notebooklm-clone`
|
||||
|
||||
Do you want to contribute to this project? Make sure to read these guidelines first :)
|
||||
|
||||
## Issue
|
||||
|
||||
**When to do it**:
|
||||
|
||||
- You found bugs but you don't know how to solve them or don't have the time/will to solve them
|
||||
- You want new features but you don't know how to implement them or don't have the time/will to implement them
|
||||
|
||||
> ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_
|
||||
|
||||
**How to do it**:
|
||||
|
||||
- Open an issue
|
||||
- Give the issue a meaningful title (short but effective problem/feature request description)
|
||||
- Describe the problem/feature request
|
||||
|
||||
## Traditional contribution
|
||||
|
||||
**When to do it**:
|
||||
|
||||
- You found bugs and corrected them
|
||||
- You optimized/improved the code
|
||||
- You added new features that you think could be useful to others
|
||||
|
||||
**How to do it**:
|
||||
|
||||
1. Fork this repository
|
||||
2. Install `pre-commit` and make sure to have it within the Git Hooks for your fork:
|
||||
|
||||
```bash
|
||||
pip install pre-commit
|
||||
pre-commit install
|
||||
```
|
||||
|
||||
3. Change the things you want, and make sure tests still pass or add new ones:
|
||||
|
||||
```bash
|
||||
pytest tests/test_*.py
|
||||
```
|
||||
|
||||
4. Commit your changes
|
||||
5. Make sure your changes pass the pre-commit linting/type checking; if not, modify them so that they pass
|
||||
6. Submit a pull request (make sure to provide a thorough description of the changes)
|
||||
|
||||
### Thanks for contributing!
|
||||
@@ -0,0 +1,21 @@
|
||||
The MIT License
|
||||
|
||||
Copyright (c) Clelia Astra Bertelli
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
@@ -0,0 +1,3 @@
|
||||
# NotebookLM clone
|
||||
|
||||
This project is aimed at producing a fully open-source, LlamaCloud-backed alternative to NotebookLM.
|
||||
@@ -0,0 +1,24 @@
|
||||
[project]
|
||||
name = "notebooklm-clone"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"llama-cloud>=0.1.29",
|
||||
"llama-cloud-services>=0.6.38",
|
||||
"llama-index-core>=0.12.44",
|
||||
"llama-index-embeddings-openai>=0.3.1",
|
||||
"llama-index-indices-managed-llama-cloud>=0.6.11",
|
||||
"llama-index-llms-openai>=0.4.7",
|
||||
"mypy>=1.16.1",
|
||||
"pre-commit>=4.2.0",
|
||||
"pytest>=8.4.1",
|
||||
"python-dotenv>=1.1.1"
|
||||
]
|
||||
|
||||
[tool.mypy]
|
||||
disable_error_code = ["import-not-found", "import-untyped"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
pythonpath = ["src"]
|
||||
@@ -0,0 +1,63 @@
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
from typing import List, Dict, Literal, Tuple
|
||||
from typing_extensions import Self
|
||||
|
||||
|
||||
class Notebook(BaseModel):
    # Structured notes for a document: a summary, a bounded list of
    # highlights and a bounded list of question/answer pairs.
    # Documented with comments (not a class docstring) so the model
    # definition itself stays exactly as it was.

    # Free-text summary; required, no length constraint.
    summary: str = Field(description="Summary of the document.")

    # Between 3 and 10 highlight strings (enforced via min/max length).
    highlights: List[str] = Field(
        description=(
            "Highlights of the documents: 3 to 10 bullet points that "
            "represent the crucial knots of the documents."
        ),
        min_length=3,
        max_length=10,
    )

    # Between 5 and 15 dictionaries whose keys are limited to
    # "question" and "answer" and whose values are strings.
    questions_and_answers: List[Dict[Literal["question", "answer"], str]] = Field(
        description=(
            "5 to 15 questions and answers about a given topic. "
            "This field should be organized as a list of dictionaries, "
            "each containing a 'question' and an 'answer' fields."
        ),
        examples=[
            [
                {"question": "What is the capital of Spain?", "answer": "Madrid"},
                {"question": "What is the capital of France?", "answer": "Paris"},
                {"question": "What is the capital of Italy?", "answer": "Rome"},
                {"question": "What is the capital of Portugal?", "answer": "Lisbon"},
                {"question": "What is the capital of Germany?", "answer": "Berlin"},
            ]
        ],
        min_length=5,
        max_length=15,
    )
|
||||
|
||||
|
||||
class MindMap(BaseModel):
    # Mind map described as (id, content) nodes plus (id, id) edges.
    # A model-level validator guarantees every edge endpoint refers to a
    # declared node ID.

    # Each node is a 2-tuple: (node ID, node content).
    nodes: List[Tuple[str, str]] = Field(
        description="List of nodes of the mind map, with their ID as first element and their content as second. Content should never exceed 5 words.",
        examples=[
            [
                ("A", "Fall of the Roman Empire"),
                ("B", "476 AD"),
                ("C", "Barbarian invasions"),
            ],
            [
                ("A", "Auxin is released"),
                ("B", "Travels to the roots"),
                ("C", "Root cells grow in dimensions"),
            ],
        ],
    )

    # Each edge is a 2-tuple of node IDs.
    edges: List[Tuple[str, str]] = Field(
        description="The edges connecting the nodes of the mind map, as a list of tuples containing the IDs of the two connected edges.",
        examples=[
            [("A", "B"), ("A", "C"), ("B", "C")],
            [("C", "A"), ("B", "C"), ("A", "B")],
        ],
    )

    @model_validator(mode="after")
    def validate_mind_map(self) -> Self:
        """Reject mind maps whose edges reference undeclared node IDs.

        Returns:
            The validated model instance.

        Raises:
            ValueError: If any edge endpoint is not the ID of a node in
                ``nodes``.
        """
        declared_ids = {node[0] for node in self.nodes}
        referenced_ids = {endpoint for edge in self.edges for endpoint in edge}
        # Every endpoint must be a declared node. Nodes without any edge are
        # allowed, so the check is one-directional (edges within nodes); the
        # previous "nodes are a proper subset of endpoints" test missed
        # unknown endpoints whenever some declared node had no edge.
        if not referenced_ids.issubset(declared_ids):
            raise ValueError(
                "There are non-existing nodes listed as source or target in the edges"
            )
        return self
|
||||
@@ -0,0 +1,28 @@
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
|
||||
from llama_cloud_services import LlamaExtract
|
||||
from llama_cloud.client import AsyncLlamaCloud
|
||||
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex
|
||||
from llama_index.llms.openai import OpenAI
|
||||
|
||||
|
||||
# Load variables from a .env file into the process environment. A single call
# is sufficient; the original invoked it twice in a row.
load_dotenv()

# The clients below are created only when all four settings are present.
# When any of them is missing, none of LLM, CLIENT, EXTRACT_AGENT,
# PIPELINE_ID or QE is defined by this module.
if (
    os.getenv("LLAMACLOUD_API_KEY")
    and os.getenv("EXTRACT_AGENT_ID")
    and os.getenv("LLAMACLOUD_PIPELINE_ID")
    and os.getenv("OPENAI_API_KEY")
):
    LLM = OpenAI(model="gpt-4.1", api_key=os.getenv("OPENAI_API_KEY"))
    CLIENT = AsyncLlamaCloud(token=os.getenv("LLAMACLOUD_API_KEY"))
    # Look up the extraction agent by the ID stored in the environment.
    EXTRACT_AGENT = LlamaExtract(api_key=os.getenv("LLAMACLOUD_API_KEY")).get_agent(
        id=os.getenv("EXTRACT_AGENT_ID")
    )
    PIPELINE_ID = os.getenv("LLAMACLOUD_PIPELINE_ID")
    # Query engine over the LlamaCloud index of the configured pipeline,
    # answering with the LLM defined above.
    QE = LlamaCloudIndex(
        api_key=os.getenv("LLAMACLOUD_API_KEY"), pipeline_id=PIPELINE_ID
    ).as_query_engine(llm=LLM)
|
||||
@@ -0,0 +1,68 @@
|
||||
import pytest
|
||||
|
||||
from src.notebooklm_clone.models import Notebook, MindMap
|
||||
from pydantic import ValidationError
|
||||
|
||||
|
||||
def test_notebook() -> None:
    """Check that Notebook accepts valid data and rejects too-short lists."""
    qa_pairs = [
        {"question": "What is the capital of Spain?", "answer": "Madrid"},
        {"question": "What is the capital of France?", "answer": "Paris"},
        {"question": "What is the capital of Italy?", "answer": "Rome"},
        {"question": "What is the capital of Portugal?", "answer": "Lisbon"},
        {"question": "What is the capital of Germany?", "answer": "Berlin"},
    ]
    valid_highlights = ["This", "is", "a", "summary"]

    # Valid input: five Q&A pairs and four highlights.
    notebook = Notebook(
        summary="This is a summary",
        questions_and_answers=qa_pairs,
        highlights=valid_highlights,
    )
    first_pair = notebook.questions_and_answers[0]
    assert notebook.summary == "This is a summary"
    assert first_pair["question"] == "What is the capital of Spain?"
    assert first_pair["answer"] == "Madrid"
    assert notebook.highlights[0] == "This"

    # Only four Q&A pairs (the first one dropped) must be rejected.
    with pytest.raises(ValidationError):
        Notebook(
            summary="This is a summary",
            questions_and_answers=qa_pairs[1:],
            highlights=valid_highlights,
        )

    # Only two highlights must be rejected.
    with pytest.raises(ValidationError):
        Notebook(
            summary="This is a summary",
            questions_and_answers=qa_pairs,
            highlights=["This", "is"],
        )
|
||||
|
||||
|
||||
def test_mind_map() -> None:
    """Check that MindMap accepts consistent edges and rejects unknown IDs."""
    sample_nodes = [
        ("A", "Auxin is released"),
        ("B", "Travels to the roots"),
        ("C", "Root cells grow in dimensions"),
    ]

    # Every edge endpoint is one of the declared node IDs.
    mind_map = MindMap(
        nodes=sample_nodes,
        edges=[("A", "B"), ("A", "C"), ("B", "C")],
    )
    first_node_id, first_node_content = mind_map.nodes[0]
    assert first_node_id == "A"
    assert first_node_content == "Auxin is released"
    assert mind_map.edges[0] == ("A", "B")

    # "D" is not a declared node, so validation must fail.
    with pytest.raises(ValidationError):
        MindMap(
            nodes=sample_nodes,
            edges=[("A", "B"), ("A", "D"), ("B", "C")],
        )
|
||||
@@ -0,0 +1,59 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from llama_cloud import (
|
||||
PipelineCreateEmbeddingConfig_OpenaiEmbedding,
|
||||
PipelineTransformConfig_Advanced,
|
||||
AdvancedModeTransformConfigChunkingConfig_Sentence,
|
||||
AdvancedModeTransformConfigSegmentationConfig_Page,
|
||||
PipelineCreate,
|
||||
)
|
||||
from llama_cloud.client import LlamaCloud
|
||||
from llama_index.embeddings.openai import OpenAIEmbedding
|
||||
|
||||
|
||||
def main() -> int:
    """Create (or update) the LlamaCloud pipeline and record its ID in .env.

    Reads OPENAI_API_KEY and LLAMACLOUD_API_KEY from the environment (after
    loading a .env file), upserts a pipeline named "notebooklm_pipeline" and
    appends a LLAMACLOUD_PIPELINE_ID entry to the .env file in the current
    working directory.

    Returns:
        0 once the pipeline ID has been written.
    """
    load_dotenv()

    embed_model = OpenAIEmbedding(
        model="text-embedding-3-small", api_key=os.getenv("OPENAI_API_KEY")
    )

    client = LlamaCloud(token=os.getenv("LLAMACLOUD_API_KEY"))

    embedding_config = PipelineCreateEmbeddingConfig_OpenaiEmbedding(
        type="OPENAI_EMBEDDING",
        component=embed_model,
    )

    # Page-level segmentation followed by sentence-based chunking.
    segm_config = AdvancedModeTransformConfigSegmentationConfig_Page(mode="page")
    chunk_config = AdvancedModeTransformConfigChunkingConfig_Sentence(
        chunk_size=1024,
        chunk_overlap=200,
        separator="<whitespace>",
        paragraph_separator="\n\n\n",
        mode="sentence",
    )

    transform_config = PipelineTransformConfig_Advanced(
        segmentation_config=segm_config,
        chunking_config=chunk_config,
        mode="advanced",
    )

    pipeline_request = PipelineCreate(
        name="notebooklm_pipeline",
        embedding_config=embedding_config,
        transform_config=transform_config,
    )

    pipeline = client.pipelines.upsert_pipeline(request=pipeline_request)

    # Append rather than overwrite so existing .env entries are preserved.
    with open(".env", "a", encoding="utf-8") as f:
        f.write(f'\nLLAMACLOUD_PIPELINE_ID="{pipeline.id}"')

    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
@@ -0,0 +1,23 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
||||
|
||||
from llama_cloud_services import LlamaExtract
|
||||
from src.notebooklm_clone.models import Notebook
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def main() -> int:
    """Create the extraction agent and record its ID in .env.

    Creates a LlamaExtract agent named "q_and_a_agent" that uses the
    ``Notebook`` model as its data schema, then appends an EXTRACT_AGENT_ID
    entry to the .env file in the current working directory.

    Returns:
        0 once the agent ID has been written.
    """
    conn = LlamaExtract(api_key=os.getenv("LLAMACLOUD_API_KEY"))
    agent = conn.create_agent(name="q_and_a_agent", data_schema=Notebook)
    # Append rather than overwrite so existing .env entries are preserved.
    with open(".env", "a", encoding="utf-8") as f:
        f.write(f'\nEXTRACT_AGENT_ID="{agent.id}"')
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user