Compare commits
335 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f385e96ab8 | |||
| c3e4696b5f | |||
| 1e40c9cf94 | |||
| 802bc2a9f8 | |||
| 5ea758b853 | |||
| 208b6f2fa5 | |||
| e1b9143f79 | |||
| 232c55bd6a | |||
| ab6f2f8da5 | |||
| 66c2639ec8 | |||
| da1916c69f | |||
| 345e272573 | |||
| d70fbac1ce | |||
| 2358df10c6 | |||
| 829628cc86 | |||
| 42b7bbd1ae | |||
| 38da9a52d7 | |||
| 1e7ec40ee7 | |||
| dd83c1a9d0 | |||
| 7cb83f5cd3 | |||
| b05266be6d | |||
| eab4798165 | |||
| b174fa8fab | |||
| b12ffef916 | |||
| 07ec282257 | |||
| 013b689812 | |||
| 3040951cb8 | |||
| 9239498945 | |||
| 19cbb25631 | |||
| 812e2f7d72 | |||
| d7864afe3f | |||
| ade8d027a5 | |||
| 997bcc8531 | |||
| 8be554c234 | |||
| f777cab0c5 | |||
| b9b83c953d | |||
| 3ec7024626 | |||
| d5b18a03fa | |||
| 18dd04b6de | |||
| 685a5e6ccc | |||
| 576c3d9076 | |||
| c8321d2bc5 | |||
| 131bbed7aa | |||
| 41c8ac2348 | |||
| 32c53cdf96 | |||
| 71db318fc2 | |||
| dac0f79e51 | |||
| 32487763d5 | |||
| 06c3c556e6 | |||
| e5dcaa83df | |||
| 1b7198dc62 | |||
| 9cfe074206 | |||
| ae30990ada | |||
| 8f1c359abc | |||
| 0a110de9c7 | |||
| d705b16923 | |||
| ca781132c8 | |||
| 7a68b0fb68 | |||
| 87dec5433d | |||
| 99f4eba8d0 | |||
| 54561e2dd2 | |||
| bfaec79a8f | |||
| 3e0e522a6b | |||
| f70b6d87ec | |||
| 693b5b83b1 | |||
| ad38ef5cd7 | |||
| 4c4c6e6575 | |||
| 740b47d9dc | |||
| f3233deb2e | |||
| fd45127678 | |||
| 0506c88735 | |||
| 4bc9eb6c0d | |||
| 5a3dac655c | |||
| 519254efbe | |||
| 6ab56b79f3 | |||
| e020e3e2b1 | |||
| f293547910 | |||
| 662bc37462 | |||
| 9f1ef4ef1f | |||
| 1243573924 | |||
| 407292b177 | |||
| a7df7c0912 | |||
| c758144bfe | |||
| fee516dd19 | |||
| 032fbd5768 | |||
| 970e864514 | |||
| d0649ece6e | |||
| 5d4cabd843 | |||
| 9070a6ac16 | |||
| 4f24f537f6 | |||
| 8859a203e2 | |||
| b091364054 | |||
| 43b1a013ca | |||
| f81532e7f2 | |||
| 986d3987d3 | |||
| 1bf522311f | |||
| 24166dcfc8 | |||
| bfb7f3973f | |||
| 979f643c77 | |||
| aefd89cf1b | |||
| 8ea2b2c64e | |||
| 4a9a2a21d8 | |||
| e6a7939206 | |||
| 104a03e829 | |||
| 6e0f2f4ca0 | |||
| 0708d11f8a | |||
| be19185503 | |||
| 7571b0d6c4 | |||
| ad6734bf80 | |||
| 9ec2a8322e | |||
| 51011b9f30 | |||
| 09805f9e15 | |||
| 8ced6f6eab | |||
| 081ddeca34 | |||
| 2460908789 | |||
| c226d6a54c | |||
| 5d4c682eb2 | |||
| f72d3535c8 | |||
| 1ea09a366e | |||
| d4bbeb6389 | |||
| d028397603 | |||
| 35ea8476db | |||
| 3e5f7c4f1e | |||
| 9d9b816644 | |||
| 83555f76e6 | |||
| 5edf5f914a | |||
| 22e4975cb2 | |||
| bc2f04379b | |||
| f9f951d5d8 | |||
| 355129fea5 | |||
| d9aed80ded | |||
| c07d2d70a8 | |||
| ed6937a5a9 | |||
| 34c15932a3 | |||
| b18ea96d11 | |||
| 196ab827f5 | |||
| ba4cb4d5e9 | |||
| 58d883b825 | |||
| 5fc5ebfc6c | |||
| fe3e20fd53 | |||
| e7e59459ab | |||
| f4d7c84e19 | |||
| 9050a346e4 | |||
| 9690ccf4ea | |||
| 97745f0f1c | |||
| 61a696b9db | |||
| 3e01adaf0e | |||
| 37393b7e98 | |||
| ecd859a67c | |||
| decca8e671 | |||
| 5ea0815187 | |||
| cf149650f5 | |||
| 4c6c231ea4 | |||
| 5955b26509 | |||
| 31f54bca55 | |||
| b1ae7bb736 | |||
| 31fe12e0da | |||
| 90b0c5e295 | |||
| 79fe1930cf | |||
| ab225c3eab | |||
| 6f1de75909 | |||
| 230ed64e41 | |||
| ef126c3a93 | |||
| 51a7534733 | |||
| 4f5d2bde13 | |||
| 3d05fe5d77 | |||
| c16ca673af | |||
| 6619034bce | |||
| c56fb5d8f7 | |||
| b407a5edb5 | |||
| e6a27d17fb | |||
| 34077fd479 | |||
| 7a68ad5a7f | |||
| 74a1b6c2f2 | |||
| 9a90ae5264 | |||
| 310c1bc105 | |||
| cd20b29299 | |||
| 0cb7aeb81c | |||
| 98db5eeeae | |||
| c21cb34ff6 | |||
| e28c7b9d92 | |||
| ee4e565604 | |||
| 6dbb089f4c | |||
| c4b694db8d | |||
| 97f428ad06 | |||
| ef92ee5408 | |||
| d094668d03 | |||
| 5bb5fc1625 | |||
| 1d57e0071d | |||
| 2a344c4f5c | |||
| ce02559b8d | |||
| e42746e372 | |||
| 3149dfd03a | |||
| e499fdbdab | |||
| e57df39248 | |||
| 09b192b98b | |||
| 13f01a0621 | |||
| cf879a1a58 | |||
| fcdf2ab63e | |||
| 083d8109c2 | |||
| 89cfc8b25f | |||
| c46e157f92 | |||
| 05d6026d37 | |||
| 8e98d5c146 | |||
| 3f311c0669 | |||
| b1a2f9d42b | |||
| 142f55c94c | |||
| 230a110e52 | |||
| 83e2b031cd | |||
| 4844e26e5c | |||
| 70a049af3c | |||
| dc11776c86 | |||
| 2448a42b90 | |||
| c75a900174 | |||
| 2fb7adfe0e | |||
| dc82270724 | |||
| d880a48dd0 | |||
| 7567e8b45e | |||
| 0d59a90151 | |||
| 98ad550b1a | |||
| b58f43ce9f | |||
| acf6adcd91 | |||
| daf6576c3c | |||
| 8caa4defa6 | |||
| 26918b8de4 | |||
| 6fb5ebe2f9 | |||
| c0aa67995b | |||
| 9f841f8328 | |||
| 99c75eece9 | |||
| 57d2586ee3 | |||
| 4280a43ec8 | |||
| 7f1082bbb2 | |||
| 57cfc45804 | |||
| 30e8913875 | |||
| 0ce6d4d7a4 | |||
| 584ba8d48e | |||
| 925805ee11 | |||
| 76fb73c971 | |||
| 6d19ea9ac0 | |||
| 90431090e9 | |||
| 6dff35b204 | |||
| e634c7978d | |||
| 7a9e99bba2 | |||
| efcdd4405b | |||
| bf3614690f | |||
| 7463e00da3 | |||
| cbe9de0c57 | |||
| a023507d42 | |||
| e48f544ddc | |||
| 4aa7ad5642 | |||
| c39cdbcd01 | |||
| 71eaa8bcc6 | |||
| 1e1cbdfc79 | |||
| cc8af4a43a | |||
| 43fbd48ab8 | |||
| 5ec66e9452 | |||
| 211521c82e | |||
| 4ddaab1efb | |||
| 53e5ce2e83 | |||
| 9f4bd1cb64 | |||
| 456863752b | |||
| c2dc34bbd6 | |||
| fcabb04baf | |||
| 8e7c32d3d6 | |||
| 7e3013d914 | |||
| 4a664c33d2 | |||
| 6d049ee2e4 | |||
| fa73e73664 | |||
| bf67ee6056 | |||
| a1abef2ee9 | |||
| a753e01d3c | |||
| 9b15065b24 | |||
| 6e4150537c | |||
| 233d715a14 | |||
| 77ac385dfe | |||
| 53b78fcd7d | |||
| 16f81bd7ee | |||
| 0ee049fd11 | |||
| 7dba17e5bc | |||
| eeb678b937 | |||
| fe4eb664fd | |||
| 257720e443 | |||
| e7afaedf3e | |||
| b66b47a708 | |||
| fe485ff62e | |||
| 1ebe1cee67 | |||
| e9252eb48a | |||
| dad7728135 | |||
| c5111e3335 | |||
| bbbdb98362 | |||
| 60cdc2af84 | |||
| 344c20f331 | |||
| 2b0496e947 | |||
| 6c63dba6fb | |||
| 734c021a2e | |||
| eeb034896f | |||
| 4c977e8384 | |||
| c6137713c7 | |||
| fd4b1893f1 | |||
| e542e6136b | |||
| 393451e304 | |||
| 5084ba27ab | |||
| c82771f841 | |||
| dc6860535a | |||
| c872617b4e | |||
| 47c8682761 | |||
| 683400788b | |||
| 05065a8329 | |||
| 1ae4d2bbc7 | |||
| ae38f406fd | |||
| 4897d01cb0 | |||
| bd7b563463 | |||
| 530241dd0b | |||
| 6338641107 | |||
| 6d62fb89c3 | |||
| 7d4df3b6e5 | |||
| bc28db5b92 | |||
| f78186c0f7 | |||
| e3292f5566 | |||
| 58f980f411 | |||
| 4740d0611d | |||
| 3651a10e80 | |||
| 483b51c51c | |||
| cdbddef86d | |||
| 3690109abf | |||
| 2e322b4fc8 | |||
| 735e5f3ddc | |||
| e4cb4c75e5 | |||
| 1693deff72 | |||
| 3270f1228d | |||
| eeabf48d29 | |||
| 89348aa8e5 | |||
| 3ab2ce27b5 | |||
| 265261862f | |||
| 66cf052b8c |
@@ -0,0 +1,8 @@
|
||||
# Changesets
|
||||
|
||||
Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works
|
||||
with multi-package repos, or single-package repos to help you version and publish your code. You can
|
||||
find the full documentation for it [in our repository](https://github.com/changesets/changesets).
|
||||
|
||||
We have a quick list of common questions to get you started engaging with this project in
|
||||
[our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md)
|
||||
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"$schema": "https://unpkg.com/@changesets/config@3.1.1/schema.json",
|
||||
"changelog": "@changesets/cli/changelog",
|
||||
"commit": false,
|
||||
"fixed": [],
|
||||
"linked": [],
|
||||
"access": "restricted",
|
||||
"baseBranch": "main",
|
||||
"updateInternalDependencies": "patch",
|
||||
"ignore": []
|
||||
}
|
||||
@@ -7,8 +7,6 @@ assignees: ''
|
||||
|
||||
---
|
||||
|
||||
_Note: We're aware of some missing content in the output and of layout issues with tables. Please refrain from opening new issues on this topic unless you think yours is different from what has already been reported._
|
||||
|
||||
**Describe the bug**
|
||||
Write a concise description of what the bug is.
|
||||
|
||||
@@ -19,19 +17,15 @@ If possible, please provide the PDF file causing the issue.
|
||||
If you have it, please provide the ID of the job you ran.
|
||||
You can find it here: https://cloud.llamaindex.ai/parse in the "History" tab.
|
||||
|
||||
**Screenshots**
|
||||
Feel free to also provide screenshots if relevant.
|
||||
|
||||
**Client:**
|
||||
Please remove untested options:
|
||||
- Frontend (cloud.llamaindex.ai)
|
||||
- Python Library
|
||||
- API
|
||||
- Frontend (cloud.llamaindex.ai)
|
||||
- Typescript Library
|
||||
- Notebook
|
||||
- API
|
||||
|
||||
**Options**
|
||||
What options did you use? Multimodal, fast mode, parsing instructions, etc.
|
||||
|
||||
**Additional context**
|
||||
Add any additional context about the problem here.
|
||||
What options did you use? Premium mode, multimodal, fast mode, parsing instructions, etc.
|
||||
Screenshots, code snippets, etc.
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
# Please see the documentation for all configuration options:
|
||||
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
|
||||
# and
|
||||
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
|
||||
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
@@ -1,48 +0,0 @@
|
||||
name: Build Package
|
||||
|
||||
# Build package on its own without additional pip install
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
POETRY_VERSION: "1.6.1"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
# You can use PyPy versions in python-version.
|
||||
# For example, pypy-2.7 and pypy-3.8
|
||||
matrix:
|
||||
os: [ubuntu-latest, windows-latest]
|
||||
python-version: ["3.9"]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
with:
|
||||
version: ${{ env.POETRY_VERSION }}
|
||||
- name: Install deps
|
||||
shell: bash
|
||||
run: poetry install
|
||||
- name: Ensure lock works
|
||||
shell: bash
|
||||
run: poetry lock
|
||||
- name: Build
|
||||
shell: bash
|
||||
run: poetry build
|
||||
- name: Test installing built package
|
||||
shell: bash
|
||||
run: python -m pip install .
|
||||
- name: Test import
|
||||
shell: bash
|
||||
working-directory: ${{ vars.RUNNER_TEMP }}
|
||||
run: python -c "import llama_parse"
|
||||
@@ -0,0 +1,53 @@
|
||||
name: Build Package - Python
|
||||
|
||||
# Build package on its own without additional pip install
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "py/**"
|
||||
pull_request:
|
||||
paths:
|
||||
- "py/**"
|
||||
env:
|
||||
UV_VERSION: "0.7.20"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
# You can use PyPy versions in python-version.
|
||||
# For example, pypy-2.7 and pypy-3.8
|
||||
matrix:
|
||||
os: [ubuntu-latest, windows-latest]
|
||||
python-version: ["3.9"]
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install
|
||||
|
||||
- name: Display Python version
|
||||
run: python --version
|
||||
|
||||
- name: Build
|
||||
working-directory: py
|
||||
run: uv build
|
||||
|
||||
- name: Test installing built package
|
||||
shell: bash
|
||||
working-directory: py
|
||||
run: |
|
||||
uv venv
|
||||
uv pip install dist/*.whl
|
||||
|
||||
- name: Test import
|
||||
working-directory: py
|
||||
run: uv run -- python -c "import llama_cloud_services"
|
||||
@@ -0,0 +1,34 @@
|
||||
name: Build Package - TypeScript
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "ts/**"
|
||||
pull_request:
|
||||
paths:
|
||||
- "ts/**"
|
||||
|
||||
jobs:
|
||||
pre_release:
|
||||
name: Pre Release
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@v5
|
||||
|
||||
- uses: pnpm/action-setup@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v5
|
||||
with:
|
||||
node-version-file: "ts/llama_cloud_services/.nvmrc"
|
||||
|
||||
- name: Install dependencies
|
||||
working-directory: ts/llama_cloud_services/
|
||||
run: pnpm install --no-frozen-lockfile
|
||||
|
||||
- name: Build
|
||||
working-directory: ts/llama_cloud_services/
|
||||
run: pnpm run build
|
||||
@@ -0,0 +1,95 @@
|
||||
name: Claude Code
|
||||
|
||||
on:
|
||||
issue_comment:
|
||||
types: [created]
|
||||
pull_request_review_comment:
|
||||
types: [created]
|
||||
issues:
|
||||
types: [opened, assigned]
|
||||
pull_request_review:
|
||||
types: [submitted]
|
||||
|
||||
jobs:
|
||||
claude:
|
||||
if: |
|
||||
(github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
|
||||
(github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
|
||||
(github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
|
||||
(github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
issues: read
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Check repository access
|
||||
id: check-access
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
# Get the user who triggered the event
|
||||
case "${{ github.event_name }}" in
|
||||
"issue_comment")
|
||||
USER="${{ github.event.comment.user.login }}"
|
||||
;;
|
||||
"pull_request_review_comment")
|
||||
USER="${{ github.event.comment.user.login }}"
|
||||
;;
|
||||
"pull_request_review")
|
||||
USER="${{ github.event.review.user.login }}"
|
||||
;;
|
||||
"issues")
|
||||
USER="${{ github.event.issue.user.login }}"
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "Checking repository access for user: $USER"
|
||||
|
||||
# Check if user has write access to the repository
|
||||
REPO="${{ github.repository }}"
|
||||
if gh api repos/$REPO/collaborators/$USER/permission --jq '.permission' | grep -E "(admin|write)" > /dev/null 2>&1; then
|
||||
echo "User $USER has write access to the repository"
|
||||
echo "authorized=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "User $USER does not have write access to the repository"
|
||||
echo "authorized=false" >> $GITHUB_OUTPUT
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Checkout repository
|
||||
if: steps.check-access.outputs.authorized == 'true'
|
||||
uses: actions/checkout@v5
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Run Claude Code
|
||||
if: steps.check-access.outputs.authorized == 'true'
|
||||
id: claude
|
||||
uses: anthropics/claude-code-action@beta
|
||||
with:
|
||||
anthropic_api_key: ${{ secrets.ANTHROPIC_GITHUB_API_KEY }}
|
||||
|
||||
# Optional: Specify model (defaults to Claude Sonnet 4, uncomment for Claude Opus 4)
|
||||
# model: "claude-opus-4-20250514"
|
||||
|
||||
# Optional: Customize the trigger phrase (default: @claude)
|
||||
# trigger_phrase: "/claude"
|
||||
|
||||
# Optional: Trigger when specific user is assigned to an issue
|
||||
# assignee_trigger: "claude-bot"
|
||||
|
||||
# Optional: Allow Claude to run specific commands
|
||||
# Allow bash commands to be run, for things like running tests, linting, etc.
|
||||
allowed_tools: "Bash(rg:*),Bash(find:*),Bash(grep:*),Bash(pnpm:*),Bash(npm:*),Bash(uv:*),Bash(pip:*),Bash(pipx:*),Bash(make:*),Bash(cd:*),WebFetch"
|
||||
|
||||
# Optional: Add custom instructions for Claude to customize its behavior for your project
|
||||
# custom_instructions: |
|
||||
# Follow our coding standards
|
||||
# Ensure all new code has tests
|
||||
# Use TypeScript for new files
|
||||
|
||||
# Optional: Custom environment variables for Claude
|
||||
# claude_env: |
|
||||
# NODE_ENV: test
|
||||
@@ -1,14 +1,3 @@
|
||||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
@@ -28,54 +17,25 @@ jobs:
|
||||
# - https://gh.io/supported-runners-and-hardware-resources
|
||||
# - https://gh.io/using-larger-runners
|
||||
# Consider using larger runners for possible analysis time improvements.
|
||||
runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
|
||||
timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
|
||||
runs-on: "ubuntu-latest"
|
||||
timeout-minutes: 360
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: ["python"]
|
||||
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby', 'swift' ]
|
||||
# Use only 'java' to analyze code written in Java, Kotlin or both
|
||||
# Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
|
||||
# Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v5
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v2
|
||||
uses: github/codeql-action/init@v4
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
|
||||
# For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
|
||||
# queries: security-extended,security-and-quality
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v2
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
|
||||
|
||||
# If the Autobuild fails above, remove it and uncomment the following three lines.
|
||||
# Modify them (or add more) to build your code; if your project needs it, please refer to the EXAMPLE below for guidance.
|
||||
|
||||
# - run: |
|
||||
# echo "Run, Build Application using script"
|
||||
# ./location_of_script_within_repo/buildscript.sh
|
||||
languages: python
|
||||
dependency-caching: true
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v2
|
||||
uses: github/codeql-action/analyze@v4
|
||||
with:
|
||||
category: "/language:${{matrix.language}}"
|
||||
category: "/language:python"
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
name: Extract E2E Tests (every 4 hours)
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 */4 * * *"
|
||||
workflow_dispatch:
|
||||
# Allows manual triggering
|
||||
inputs:
|
||||
environment:
|
||||
description: "Environment to run the tests in"
|
||||
required: false
|
||||
default: staging
|
||||
type: choice
|
||||
options:
|
||||
- staging
|
||||
- production
|
||||
notify_slack:
|
||||
description: "Notify Slack"
|
||||
required: false
|
||||
default: false
|
||||
type: boolean
|
||||
workflow_call:
|
||||
|
||||
env:
|
||||
UV_VERSION: "0.7.20"
|
||||
PYTHON_VERSION: "3.12"
|
||||
SLACK_CHANNEL_ID: C078PHNTF44 # Extract channel ID
|
||||
API_E2E_LOG_PATH: ${{ github.workspace }}/extract-e2e.log
|
||||
|
||||
jobs:
|
||||
extract-e2e:
|
||||
name: "Extract E2E Tests (${{ matrix.environment }})"
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.environment }}
|
||||
cancel-in-progress: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
environment: ${{ github.event_name == 'schedule' && fromJson('["staging", "production"]') || fromJson(format('["{0}"]', github.event.inputs.environment || 'staging')) }}
|
||||
steps:
|
||||
- name: Set runtime inputs
|
||||
id: runtime
|
||||
run: |
|
||||
environment=${{ matrix.environment }}
|
||||
notify_slack=${{ github.event.inputs.notify_slack || github.event_name == 'schedule' }}
|
||||
echo "environment=${environment}" >> $GITHUB_OUTPUT
|
||||
echo "notify_slack=${notify_slack}" >> $GITHUB_OUTPUT
|
||||
|
||||
if [ "${environment}" = "production" ]; then
|
||||
echo "LLAMA_CLOUD_BASE_URL=https://api.cloud.llamaindex.ai" >> $GITHUB_ENV
|
||||
api_key_secret="${{ secrets.LLAMA_CLOUD_API_KEY }}"
|
||||
project_id_secret="${{ secrets.LLAMA_CLOUD_PROJECT_ID }}"
|
||||
else
|
||||
echo "LLAMA_CLOUD_BASE_URL=https://api.staging.llamaindex.ai" >> $GITHUB_ENV
|
||||
api_key_secret="${{ secrets.LLAMA_CLOUD_API_KEY_STAGING }}"
|
||||
project_id_secret="${{ secrets.LLAMA_CLOUD_PROJECT_ID_STAGING }}"
|
||||
fi
|
||||
|
||||
if [ -n "$api_key_secret" ]; then
|
||||
echo "LLAMA_CLOUD_API_KEY=$api_key_secret" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
if [ -n "$project_id_secret" ]; then
|
||||
echo "LLAMA_CLOUD_PROJECT_ID=$project_id_secret" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- uses: actions/checkout@v5
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install ${{ env.PYTHON_VERSION }} && uv python pin ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Run Extract E2E tests
|
||||
id: extract-tests
|
||||
continue-on-error: true
|
||||
working-directory: py
|
||||
run: |
|
||||
set -o pipefail
|
||||
rm -f "$API_E2E_LOG_PATH"
|
||||
uv run pytest -v -n 8 --timeout=300 --session-timeout=1740 tests/extract/ 2>&1 | tee "$API_E2E_LOG_PATH"
|
||||
|
||||
- name: Extract pytest failure summary
|
||||
id: failed-tests
|
||||
if: steps.extract-tests.outcome == 'failure' || cancelled()
|
||||
run: |
|
||||
summary="$(python3 - <<'PY'
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
log_path = Path(os.environ["API_E2E_LOG_PATH"])
|
||||
if not log_path.exists():
|
||||
print("Test log not found.")
|
||||
raise SystemExit(0)
|
||||
|
||||
lines = log_path.read_text(errors="ignore").splitlines()
|
||||
|
||||
# Find the "short test summary info" section
|
||||
start = None
|
||||
for i, line in enumerate(lines):
|
||||
if line.startswith("=") and "short test summary info" in line:
|
||||
start = i + 1
|
||||
break
|
||||
|
||||
if start is None:
|
||||
print("No test summary found.")
|
||||
raise SystemExit(0)
|
||||
|
||||
# Extract just the FAILED/ERROR lines (test name + short reason)
|
||||
failed_tests = []
|
||||
for line in lines[start:]:
|
||||
if line.startswith("="):
|
||||
break # End of section
|
||||
if line.startswith("FAILED ") or line.startswith("ERROR "):
|
||||
# Extract test name and truncate the error message
|
||||
match = re.match(r"(FAILED|ERROR) ([\w/:.\[\]_-]+)", line)
|
||||
if match:
|
||||
failed_tests.append(f"{match.group(1)}: {match.group(2)}")
|
||||
|
||||
if failed_tests:
|
||||
print("\n".join(failed_tests[:20])) # Limit to 20 tests max
|
||||
else:
|
||||
print("No failed tests found in summary.")
|
||||
PY
|
||||
)"
|
||||
if [ -z "$summary" ]; then
|
||||
summary="Failed test summary not available. Review the full run logs."
|
||||
fi
|
||||
{
|
||||
printf 'summary<<EOF\n%s\nEOF\n' "$summary"
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Check test results
|
||||
if: always()
|
||||
run: |
|
||||
if [ "${{ steps.extract-tests.outcome }}" == "failure" ]; then
|
||||
echo "Extract E2E tests failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Post to Extract Slack channel
|
||||
id: slack
|
||||
if: (failure() || cancelled()) && steps.runtime.outputs.notify_slack == 'true'
|
||||
uses: slackapi/slack-github-action@v2.1.1
|
||||
with:
|
||||
channel-id: ${{ env.SLACK_CHANNEL_ID }}
|
||||
slack-message: |
|
||||
:red_circle: *Extract E2E Failed* (${{ steps.runtime.outputs.environment }})
|
||||
```
|
||||
${{ steps.failed-tests.outputs.summary }}
|
||||
```
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Run>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
@@ -1,4 +1,4 @@
|
||||
name: Linting
|
||||
name: Lint
|
||||
|
||||
on:
|
||||
push:
|
||||
@@ -7,7 +7,7 @@ on:
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
POETRY_VERSION: "1.6.1"
|
||||
UV_VERSION: "0.7.20"
|
||||
|
||||
jobs:
|
||||
build:
|
||||
@@ -18,20 +18,29 @@ jobs:
|
||||
matrix:
|
||||
python-version: ["3.9"]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v5
|
||||
with:
|
||||
fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }}
|
||||
- name: Set up python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install ${{ matrix.python-version }}
|
||||
|
||||
- uses: pnpm/action-setup@v4
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v5
|
||||
with:
|
||||
version: ${{ env.POETRY_VERSION }}
|
||||
- name: Install pre-commit
|
||||
shell: bash
|
||||
run: poetry run pip install pre-commit
|
||||
node-version-file: "ts/llama_cloud_services/.nvmrc"
|
||||
- name: Install dependencies
|
||||
run: pnpm install --no-frozen-lockfile
|
||||
|
||||
- name: Run linter
|
||||
shell: bash
|
||||
run: poetry run make lint
|
||||
working-directory: py
|
||||
run: uv run -- pre-commit run -a
|
||||
# The JS checks normally run indirectly through lint-staged, which `-a` does not trigger, so run them directly.
|
||||
- run: pnpm -w --filter llama-cloud-services run lint
|
||||
- run: pnpm -w --filter llama-cloud-services run format:check
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
name: Publish llama-parse to PyPI / GitHub
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v*"
|
||||
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
POETRY_VERSION: "1.6.1"
|
||||
PYTHON_VERSION: "3.9"
|
||||
|
||||
jobs:
|
||||
build-n-publish:
|
||||
name: Build and publish to PyPI
|
||||
if: github.repository == 'run-llama/llama_parse'
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up python ${{ env.PYTHON_VERSION }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
with:
|
||||
version: ${{ env.POETRY_VERSION }}
|
||||
- name: Install deps
|
||||
shell: bash
|
||||
run: pip install -e .
|
||||
- name: Build and publish to pypi
|
||||
uses: JRubics/poetry-publish@v1.17
|
||||
with:
|
||||
pypi_token: ${{ secrets.LLAMA_PARSE_PYPI_TOKEN }}
|
||||
ignore_dev_requirements: "yes"
|
||||
|
||||
- name: Create GitHub Release
|
||||
id: create_release
|
||||
uses: actions/create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
|
||||
with:
|
||||
tag_name: ${{ github.ref }}
|
||||
release_name: ${{ github.ref }}
|
||||
draft: false
|
||||
prerelease: false
|
||||
|
||||
- name: Get Asset name
|
||||
run: |
|
||||
export PKG=$(ls dist/ | grep tar)
|
||||
set -- $PKG
|
||||
echo "name=$1" >> $GITHUB_ENV
|
||||
- name: Upload Release Asset (sdist) to GitHub
|
||||
id: upload-release-asset
|
||||
uses: actions/upload-release-asset@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
upload_url: ${{ steps.create_release.outputs.upload_url }}
|
||||
asset_path: dist/${{ env.name }}
|
||||
asset_name: ${{ env.name }}
|
||||
asset_content_type: application/zip
|
||||
@@ -0,0 +1,39 @@
|
||||
name: Test end-to-end - Python
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- "py/**"
|
||||
|
||||
env:
|
||||
UV_VERSION: "0.7.20"
|
||||
LLAMA_CLOUD_API_KEY: ${{ secrets.LLAMA_CLOUD_API_KEY }}
|
||||
|
||||
jobs:
|
||||
test_e2e:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
# You can use PyPy versions in python-version.
|
||||
# For example, pypy-2.7 and pypy-3.8
|
||||
matrix:
|
||||
python-version: ["3.12"]
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install ${{ matrix.python-version }} && uv python pin ${{ matrix.python-version }}
|
||||
|
||||
- name: Run Tests
|
||||
working-directory: py
|
||||
run: make e2e
|
||||
|
||||
- name: Remove virtual environment
|
||||
working-directory: py
|
||||
run: rm -rf .venv/
|
||||
@@ -0,0 +1,42 @@
|
||||
name: Test - Python
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "py/**"
|
||||
pull_request:
|
||||
paths:
|
||||
- "py/**"
|
||||
|
||||
env:
|
||||
UV_VERSION: "0.7.20"
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
# You can use PyPy versions in python-version.
|
||||
# For example, pypy-2.7 and pypy-3.8
|
||||
matrix:
|
||||
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
version: ${{ env.UV_VERSION }}
|
||||
|
||||
- name: Set up Python
|
||||
run: uv python install ${{ matrix.python-version }} && uv python pin ${{ matrix.python-version }}
|
||||
|
||||
- name: Run Tests
|
||||
working-directory: py
|
||||
run: uv run pytest unit_tests/ -v
|
||||
|
||||
- name: Remove virtual environment
|
||||
working-directory: py
|
||||
run: rm -rf .venv/
|
||||
@@ -0,0 +1,39 @@
|
||||
name: Test - TypeScript
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "ts/**"
|
||||
pull_request:
|
||||
paths:
|
||||
- "ts/**"
|
||||
|
||||
env:
|
||||
TURBO_TOKEN: ${{ secrets.TURBO_TOKEN }}
|
||||
TURBO_TEAM: ${{ vars.TURBO_TEAM }}
|
||||
TURBO_REMOTE_ONLY: true
|
||||
LLAMA_CLOUD_API_KEY: ${{ secrets.LLAMA_CLOUD_API_KEY }}
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: Test - TypeScript
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: pnpm/action-setup@v4
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v5
|
||||
with:
|
||||
node-version-file: "ts/llama_cloud_services/.nvmrc"
|
||||
- name: Install dependencies
|
||||
run: pnpm -r install --no-frozen-lockfile
|
||||
- name: Build package
|
||||
run: pnpm --filter llama-cloud-services build
|
||||
- name: Run Tests
|
||||
working-directory: ts/llama_cloud_services/
|
||||
run: pnpm test
|
||||
- name: Run e2e tests
|
||||
working-directory: ts/e2e-tests/
|
||||
run: pnpm test
|
||||
@@ -1,40 +0,0 @@
|
||||
name: Unit Testing
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
POETRY_VERSION: "1.6.1"
|
||||
LLAMA_CLOUD_API_KEY: ${{ secrets.LLAMA_CLOUD_API_KEY }}
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
# You can use PyPy versions in python-version.
|
||||
# For example, pypy-2.7 and pypy-3.8
|
||||
matrix:
|
||||
python-version: ["3.8", "3.10", "3.11"]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
with:
|
||||
version: ${{ env.POETRY_VERSION }}
|
||||
- name: Install deps
|
||||
shell: bash
|
||||
run: poetry install --with dev
|
||||
- name: Run testing
|
||||
env:
|
||||
CI: true
|
||||
shell: bash
|
||||
run: poetry run pytest tests
|
||||
@@ -0,0 +1,61 @@
|
||||
name: Version Bump and Release
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
concurrency: ${{ github.workflow }}-${{ github.ref }}
|
||||
|
||||
jobs:
|
||||
release:
|
||||
name: Release
|
||||
runs-on: ubuntu-latest
|
||||
# Only run on main branch pushes
|
||||
if: github.ref == 'refs/heads/main'
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@v5
|
||||
|
||||
- uses: pnpm/action-setup@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v5
|
||||
with:
|
||||
node-version: "22"
|
||||
cache: "pnpm"
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install dependencies
|
||||
run: pnpm install
|
||||
|
||||
- name: Add auth token to .npmrc file
|
||||
run: |
|
||||
cat << EOF >> ".npmrc"
|
||||
//registry.npmjs.org/:_authToken=$NPM_TOKEN
|
||||
EOF
|
||||
env:
|
||||
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
|
||||
|
||||
- name: Create Release Pull Request or Publish packages
|
||||
id: changesets
|
||||
uses: changesets/action@v1
|
||||
with:
|
||||
commit: "chore: version packages"
|
||||
title: "chore: version packages"
|
||||
# Custom version script
|
||||
version: pnpm -w run version
|
||||
# Custom publish script
|
||||
publish: pnpm -w run publish
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
|
||||
UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN }}
|
||||
LLAMA_PARSE_PYPI_TOKEN: ${{ secrets.LLAMA_PARSE_PYPI_TOKEN }}
|
||||
@@ -3,3 +3,10 @@ __pycache__/
|
||||
*.pyc
|
||||
.DS_Store
|
||||
.idea
|
||||
.env*
|
||||
.ipynb_checkpoints*
|
||||
*_cache/
|
||||
node_modules/
|
||||
.turbo/
|
||||
dist/
|
||||
.npmrc
|
||||
|
||||
@@ -15,24 +15,26 @@ repos:
|
||||
- id: end-of-file-fixer
|
||||
- id: mixed-line-ending
|
||||
- id: trailing-whitespace
|
||||
exclude: ^ts/llama_cloud_services/src/client/
|
||||
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
||||
rev: v0.1.5
|
||||
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix, --exit-non-zero-on-fix]
|
||||
exclude: ".*poetry.lock"
|
||||
exclude: ".*uv.lock|examples/"
|
||||
- repo: https://github.com/psf/black-pre-commit-mirror
|
||||
rev: 23.10.1
|
||||
hooks:
|
||||
- id: black-jupyter
|
||||
name: black-src
|
||||
alias: black
|
||||
exclude: ".*poetry.lock"
|
||||
exclude: ".*uv.lock|examples/extract/solar_panel_e2e_comparison.ipynb"
|
||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||
rev: v1.0.1
|
||||
hooks:
|
||||
- id: mypy
|
||||
exclude: ^py/tests|^py/unit_tests|^examples
|
||||
additional_dependencies:
|
||||
[
|
||||
"types-requests",
|
||||
@@ -46,7 +48,7 @@ repos:
|
||||
[
|
||||
--disallow-untyped-defs,
|
||||
--ignore-missing-imports,
|
||||
--python-version=3.8,
|
||||
--python-version=3.10,
|
||||
]
|
||||
- repo: https://github.com/adamchainz/blacken-docs
|
||||
rev: 1.16.0
|
||||
@@ -58,17 +60,19 @@ repos:
|
||||
additional_dependencies: [black==23.10.1]
|
||||
# Using PEP 8's line length in docs prevents excess left/right scrolling
|
||||
args: [--line-length=79]
|
||||
- repo: https://github.com/pre-commit/mirrors-prettier
|
||||
rev: v3.0.3
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: prettier
|
||||
exclude: poetry.lock
|
||||
- id: lint-staged
|
||||
name: Run lint-staged for TS files
|
||||
entry: pnpm -w exec lint-staged
|
||||
language: system
|
||||
pass_filenames: false
|
||||
- repo: https://github.com/codespell-project/codespell
|
||||
rev: v2.2.6
|
||||
hooks:
|
||||
- id: codespell
|
||||
additional_dependencies: [tomli]
|
||||
exclude: ^(poetry.lock|examples)
|
||||
exclude: ^(uv.lock|docs|ts|examples|pnpm-lock.yaml)
|
||||
args:
|
||||
[
|
||||
"--ignore-words-list",
|
||||
@@ -83,6 +87,6 @@ repos:
|
||||
rev: v0.23.1
|
||||
hooks:
|
||||
- id: toml-sort-fix
|
||||
exclude: ".*poetry.lock"
|
||||
exclude: ".*uv.lock"
|
||||
|
||||
exclude: .github/ISSUE_TEMPLATE
|
||||
exclude: ^(.github/ISSUE_TEMPLATE|ts/llama_cloud_services/src/client|pnpm-lock.yaml)
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
# Python
|
||||
|
||||
## Installation
|
||||
|
||||
This project uses uv. Create a virtual environment, and run `uv sync`
|
||||
|
||||
## Versioning (Maintainers only)
|
||||
|
||||
Before merging your changes, make sure to bump the versions.
|
||||
|
||||
Make a version bump to `pyproject.toml`. If the underlying dependency on the llamacloud platform OpenAPI
|
||||
sdk needs bumping, make sure to bring that in as well. If updating dependencies, run `uv lock`.
|
||||
|
||||
The legacy `llama_parse` package re-exports some of `llama_cloud_services` in the old namespace. The
|
||||
versions need to be kept consistent to sidecar it with `llama_cloud_services`. Bump its version in `llama_parse/pyproject.toml`, and also bump its dependency version of `llama-cloud-services` to match.
|
||||
|
||||
**Note**: Don't worry about updating the `llama_parse/poetry.lock` file when bumping versions. The GitHub action will automatically run `poetry lock` for the llama_parse package during the build process (though it doesn't commit the updated lockfile back to the repo).
|
||||
|
||||
You can also do this with `./scripts/version-bump.py set 0.x.x` if you have `uv` installed.
|
||||
|
||||
Once the change is merged, push a tag `git tag -a v0.x.x -m 0.x.x` and `git push origin v0.x.x`.
|
||||
|
||||
This tagging step can be done with `./scripts/version-bump.py tag`.
|
||||
|
||||
# Typescript
|
||||
|
||||
## Installation
|
||||
|
||||
...
|
||||
|
||||
## Versioning
|
||||
|
||||
...
|
||||
@@ -1,14 +0,0 @@
|
||||
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
|
||||
|
||||
help: ## Show all Makefile targets.
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
|
||||
|
||||
format: ## Run code autoformatters (black).
|
||||
pre-commit install
|
||||
git ls-files | xargs pre-commit run black --files
|
||||
|
||||
lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy
|
||||
pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
|
||||
|
||||
test: ## Run tests via pytest
|
||||
pytest tests
|
||||
@@ -1,165 +1,15 @@
|
||||
# LlamaParse
|
||||
|
||||
[](https://pypi.org/project/llama-parse/)
|
||||
[](https://github.com/run-llama/llama_parse/graphs/contributors)
|
||||
[](https://pypi.org/project/llama-cloud-services/)
|
||||
[](https://github.com/run-llama/llama_cloud_services/graphs/contributors)
|
||||
[](https://discord.gg/dGcwcsnxhU)
|
||||
|
||||
LlamaParse is a **GenAI-native document parser** that can parse complex document data for any downstream LLM use case (RAG, agents).
|
||||
# Llama Cloud Services
|
||||
|
||||
It is really good at the following:
|
||||
|
||||
- ✅ **Broad file type support**: Parsing a variety of unstructured file types (.pdf, .pptx, .docx, .xlsx, .html) with text, tables, visual elements, weird layouts, and more.
|
||||
- ✅ **Table recognition**: Parsing embedded tables accurately into text and semi-structured representations.
|
||||
- ✅ **Multimodal parsing and chunking**: Extracting visual elements (images/diagrams) into structured formats and return image chunks using the latest multimodal models.
|
||||
- ✅ **Custom parsing**: Input custom prompt instructions to customize the output the way you want it.
|
||||
|
||||
LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index).
|
||||
|
||||
The free plan is up to 1000 pages a day. Paid plan is free 7k pages per week + 0.3c per additional page by default. There is a sandbox available to test the API [**https://cloud.llamaindex.ai/parse ↗**](https://cloud.llamaindex.ai/parse).
|
||||
|
||||
Read below for some quickstart information, or see the [full documentation](https://docs.cloud.llamaindex.ai/).
|
||||
|
||||
If you're a company interested in enterprise RAG solutions, and/or high volume/on-prem usage of LlamaParse, come [talk to us](https://www.llamaindex.ai/contact).
|
||||
|
||||
## Getting Started
|
||||
|
||||
First, login and get an api-key from [**https://cloud.llamaindex.ai/api-key ↗**](https://cloud.llamaindex.ai/api-key).
|
||||
|
||||
Then, make sure you have the latest LlamaIndex version installed.
|
||||
|
||||
**NOTE:** If you are upgrading from v0.9.X, we recommend following our [migration guide](https://pretty-sodium-5e0.notion.site/v0-10-0-Migration-Guide-6ede431dcb8841b09ea171e7f133bd77), as well as uninstalling your previous version first.
|
||||
|
||||
```
|
||||
pip uninstall llama-index # run this if upgrading from v0.9.x or older
|
||||
pip install -U llama-index --upgrade --no-cache-dir --force-reinstall
|
||||
```
|
||||
|
||||
Lastly, install the package:
|
||||
|
||||
`pip install llama-parse`
|
||||
|
||||
Now you can parse your first PDF file using the command line interface. Use the command `llama-parse [file_paths]`. See the help text with `llama-parse --help`.
|
||||
|
||||
```bash
|
||||
export LLAMA_CLOUD_API_KEY='llx-...'
|
||||
|
||||
# output as text
|
||||
llama-parse my_file.pdf --result-type text --output-file output.txt
|
||||
|
||||
# output as markdown
|
||||
llama-parse my_file.pdf --result-type markdown --output-file output.md
|
||||
|
||||
# output as raw json
|
||||
llama-parse my_file.pdf --output-raw-json --output-file output.json
|
||||
```
|
||||
|
||||
You can also create simple scripts:
|
||||
|
||||
```python
|
||||
import nest_asyncio
|
||||
|
||||
nest_asyncio.apply()
|
||||
|
||||
from llama_parse import LlamaParse
|
||||
|
||||
parser = LlamaParse(
|
||||
api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY
|
||||
result_type="markdown", # "markdown" and "text" are available
|
||||
num_workers=4, # if multiple files passed, split in `num_workers` API calls
|
||||
verbose=True,
|
||||
language="en", # Optionally you can define a language, default=en
|
||||
)
|
||||
|
||||
# sync
|
||||
documents = parser.load_data("./my_file.pdf")
|
||||
|
||||
# sync batch
|
||||
documents = parser.load_data(["./my_file1.pdf", "./my_file2.pdf"])
|
||||
|
||||
# async
|
||||
documents = await parser.aload_data("./my_file.pdf")
|
||||
|
||||
# async batch
|
||||
documents = await parser.aload_data(["./my_file1.pdf", "./my_file2.pdf"])
|
||||
```
|
||||
|
||||
## Using with file object
|
||||
|
||||
You can parse a file object directly:
|
||||
|
||||
```python
|
||||
import nest_asyncio
|
||||
|
||||
nest_asyncio.apply()
|
||||
|
||||
from llama_parse import LlamaParse
|
||||
|
||||
parser = LlamaParse(
|
||||
api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY
|
||||
result_type="markdown", # "markdown" and "text" are available
|
||||
num_workers=4, # if multiple files passed, split in `num_workers` API calls
|
||||
verbose=True,
|
||||
language="en", # Optionally you can define a language, default=en
|
||||
)
|
||||
|
||||
file_name = "my_file1.pdf"
|
||||
extra_info = {"file_name": file_name}
|
||||
|
||||
with open(f"./{file_name}", "rb") as f:
|
||||
# must provide extra_info with file_name key with passing file object
|
||||
documents = parser.load_data(f, extra_info=extra_info)
|
||||
|
||||
# you can also pass file bytes directly
|
||||
with open(f"./{file_name}", "rb") as f:
|
||||
file_bytes = f.read()
|
||||
# must provide extra_info with file_name key with passing file bytes
|
||||
documents = parser.load_data(file_bytes, extra_info=extra_info)
|
||||
```
|
||||
|
||||
## Using with `SimpleDirectoryReader`
|
||||
|
||||
You can also integrate the parser as the default PDF loader in `SimpleDirectoryReader`:
|
||||
|
||||
```python
|
||||
import nest_asyncio
|
||||
|
||||
nest_asyncio.apply()
|
||||
|
||||
from llama_parse import LlamaParse
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
|
||||
parser = LlamaParse(
|
||||
api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY
|
||||
result_type="markdown", # "markdown" and "text" are available
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
file_extractor = {".pdf": parser}
|
||||
documents = SimpleDirectoryReader(
|
||||
"./data", file_extractor=file_extractor
|
||||
).load_data()
|
||||
```
|
||||
|
||||
Full documentation for `SimpleDirectoryReader` can be found on the [LlamaIndex Documentation](https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader.html).
|
||||
|
||||
## Examples
|
||||
|
||||
Several end-to-end indexing examples can be found in the examples folder
|
||||
|
||||
- [Getting Started](examples/demo_basic.ipynb)
|
||||
- [Advanced RAG Example](examples/demo_advanced.ipynb)
|
||||
- [Raw API Usage](examples/demo_api.ipynb)
|
||||
|
||||
## Documentation
|
||||
|
||||
[https://docs.cloud.llamaindex.ai/](https://docs.cloud.llamaindex.ai/)
|
||||
|
||||
## Terms of Service
|
||||
|
||||
See the [Terms of Service Here](./TOS.pdf).
|
||||
|
||||
## Get in Touch (LlamaCloud)
|
||||
|
||||
LlamaParse is part of LlamaCloud, our e2e enterprise RAG platform that provides out-of-the-box, production-ready connectors, indexing, and retrieval over your complex data sources. We offer SaaS and VPC options.
|
||||
|
||||
LlamaCloud is currently available via waitlist (join by [creating an account](https://cloud.llamaindex.ai/)). If you're interested in state-of-the-art quality and in centralizing your RAG efforts, come [get in touch with us](https://www.llamaindex.ai/contact).
|
||||
> **⚠️ DEPRECATION NOTICE**
|
||||
>
|
||||
> This repository and its packages are deprecated and will be maintained until **May 1, 2026**.
|
||||
>
|
||||
> **Please migrate to the new packages:**
|
||||
> - **Python**: `pip install "llama-cloud>=1.0"` ([GitHub](https://github.com/run-llama/llama-cloud-py))
|
||||
> - **TypeScript**: `npm install @llamaindex/llama-cloud` ([GitHub](https://github.com/run-llama/llama-cloud-ts))
|
||||
>
|
||||
> The new packages provide the same functionality with improved performance, better support, and active development.
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
# LlamaCloud Services Examples - TypeScript
|
||||
|
||||
In this folder you will find several TypeScript end-to-end applications that contain examples regarding:
|
||||
|
||||
- [LlamaParse](./parse/)
|
||||
- [LlamaCloud Index](./index/)
|
||||
|
||||
Follow the instructions in each example folder to get started!
|
||||
@@ -0,0 +1,21 @@
|
||||
node_modules
|
||||
package-lock.json
|
||||
yarn.lock
|
||||
|
||||
.DS_Store
|
||||
.cache
|
||||
.env
|
||||
.vercel
|
||||
.output
|
||||
.nitro
|
||||
/build/
|
||||
/api/
|
||||
/server/build
|
||||
/public/build
|
||||
# Sentry Config File
|
||||
.env.sentry-build-plugin
|
||||
/test-results/
|
||||
/playwright-report/
|
||||
/blob-report/
|
||||
/playwright/.cache/
|
||||
.tanstack
|
||||
.vscode
|
||||
@@ -0,0 +1,4 @@
|
||||
**/build
|
||||
**/public
|
||||
pnpm-lock.yaml
|
||||
routeTree.gen.ts
|
||||
@@ -0,0 +1,88 @@
|
||||
# LlamaClassify Demo
|
||||
|
||||
A TypeScript demo application showcasing the power of **LlamaClassify** - an agentic document classification service from [LlamaCloud](https://cloud.llamaindex.ai). This demo allows you to classify financial documents among three different types (Cash Flow Statement, Income Statement, and Balance Sheet).
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Features](#features)
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Installation](#installation)
|
||||
- [Usage](#usage)
|
||||
- [Start the Demo](#start-the-demo)
|
||||
- [How It Works](#how-it-works)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
- [Common Issues](#common-issues)
|
||||
- [License](#license)
|
||||
- [Contributing](#contributing)
|
||||
|
||||
## Features
|
||||
|
||||
- 📄 **Document Classification**: Classify files based on well-defined rules you can customize and play around with.
|
||||
- 🤖 **Reasoning-based Actionable Insights**: Get in-depth, reasoning-based insights on the document classification, accompanied by confidence scores.
|
||||
- 🎨 **Beautiful UI**: [DaisyUI](https://daisyui.com)-based interface powered by [TanStack](https://tanstack.com)
|
||||
- ⚡ **Fast Development**: Hot reload support with development mode
|
||||
- 🛠️ **TypeScript**: Full TypeScript support with strict type checking
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Node.js (version 22 or higher)
|
||||
- pnpm package manager
|
||||
- LlamaCloud API key
|
||||
|
||||
## Installation
|
||||
|
||||
1. Clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/run-llama/llama_cloud_services
|
||||
cd llama_cloud_services/examples-ts/classify/
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
3. Set up your environment variables:
|
||||
|
||||
```bash
|
||||
# Add your API key to your environment
|
||||
export LLAMA_CLOUD_API_KEY="your-llamacloud-api-key"
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Start the Demo
|
||||
|
||||
```bash
|
||||
npm run dev
|
||||
```
|
||||
|
||||
The application will be up and running on http://localhost:3000
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Document Input**: Enter the path to your document when prompted
|
||||
2. **Parsing**: LlamaClassify, based on the rules you can find [here](./src/utils/classifier.ts), processes the document and classifies it
|
||||
3. **Results**: The classification outcome, as well as the reasoning behind it and the confidence score, are displayed in the UI.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Module Resolution Errors**: Ensure you're using Node.js 22+ and have all dependencies installed
|
||||
2. **API Key Issues**: Verify your LlamaCloud API key is correctly set
|
||||
3. **File Path Errors**: Use absolute paths or ensure relative paths are correct from the project root
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see the [LICENSE](../../LICENSE) file for details.
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Fork the repository
|
||||
2. Create a feature branch
|
||||
3. Make your changes
|
||||
4. Run `npm run format` and `npm run lint`
|
||||
5. Submit a pull request
|
||||
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"name": "tanstack-start-example-basic",
|
||||
"private": true,
|
||||
"sideEffects": false,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite dev",
|
||||
"build": "vite build && tsc --noEmit",
|
||||
"start": "node .output/server/index.mjs"
|
||||
},
|
||||
"dependencies": {
|
||||
"@tanstack/react-router": "^1.133.22",
|
||||
"@tanstack/react-router-devtools": "^1.133.22",
|
||||
"@tanstack/react-start": "^1.133.22",
|
||||
"llama-cloud-services": "file:../../ts/llama_cloud_services",
|
||||
"react": "^19.0.0",
|
||||
"react-dom": "^19.0.0",
|
||||
"tailwind-merge": "^2.6.0",
|
||||
"zod": "^3.24.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@tailwindcss/postcss": "^4.1.15",
|
||||
"@types/node": "^22.5.4",
|
||||
"@types/react": "^19.0.8",
|
||||
"@types/react-dom": "^19.0.3",
|
||||
"@vitejs/plugin-react": "^4.6.0",
|
||||
"daisyui": "^5.3.7",
|
||||
"postcss": "^8.5.1",
|
||||
"tailwindcss": "^4.1.15",
|
||||
"typescript": "^5.7.2",
|
||||
"vite": "^7.1.7",
|
||||
"vite-tsconfig-paths": "^5.1.4"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
export default {
|
||||
plugins: {
|
||||
'@tailwindcss/postcss': {},
|
||||
},
|
||||
}
|
||||
|
After Width: | Height: | Size: 3.3 KiB |
|
After Width: | Height: | Size: 21 KiB |
|
After Width: | Height: | Size: 3.8 KiB |
|
After Width: | Height: | Size: 862 B |
|
After Width: | Height: | Size: 1.1 KiB |
|
After Width: | Height: | Size: 1.1 KiB |
|
After Width: | Height: | Size: 2.0 KiB |
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"name": "",
|
||||
"short_name": "",
|
||||
"icons": [
|
||||
{
|
||||
"src": "/android-chrome-192x192.png",
|
||||
"sizes": "192x192",
|
||||
"type": "image/png"
|
||||
},
|
||||
{
|
||||
"src": "/android-chrome-512x512.png",
|
||||
"sizes": "512x512",
|
||||
"type": "image/png"
|
||||
}
|
||||
],
|
||||
"theme_color": "#ffffff",
|
||||
"background_color": "#ffffff",
|
||||
"display": "standalone"
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
import {
|
||||
ErrorComponent,
|
||||
Link,
|
||||
rootRouteId,
|
||||
useMatch,
|
||||
useRouter,
|
||||
} from '@tanstack/react-router'
|
||||
import type { ErrorComponentProps } from '@tanstack/react-router'
|
||||
|
||||
export function DefaultCatchBoundary({ error }: ErrorComponentProps) {
|
||||
const router = useRouter()
|
||||
const isRoot = useMatch({
|
||||
strict: false,
|
||||
select: (state) => state.id === rootRouteId,
|
||||
})
|
||||
|
||||
console.error('DefaultCatchBoundary Error:', error)
|
||||
|
||||
return (
|
||||
<div className="min-w-0 flex-1 p-4 flex flex-col items-center justify-center gap-6">
|
||||
<ErrorComponent error={error} />
|
||||
<div className="flex gap-2 items-center flex-wrap">
|
||||
<button
|
||||
onClick={() => {
|
||||
router.invalidate()
|
||||
}}
|
||||
className={`px-2 py-1 bg-gray-600 dark:bg-gray-700 rounded-sm text-white uppercase font-extrabold`}
|
||||
>
|
||||
Try Again
|
||||
</button>
|
||||
{isRoot ? (
|
||||
<Link
|
||||
to="/"
|
||||
className={`px-2 py-1 bg-gray-600 dark:bg-gray-700 rounded-sm text-white uppercase font-extrabold`}
|
||||
>
|
||||
Home
|
||||
</Link>
|
||||
) : (
|
||||
<Link
|
||||
to="/"
|
||||
className={`px-2 py-1 bg-gray-600 dark:bg-gray-700 rounded-sm text-white uppercase font-extrabold`}
|
||||
onClick={(e) => {
|
||||
e.preventDefault()
|
||||
window.history.back()
|
||||
}}
|
||||
>
|
||||
Go Back
|
||||
</Link>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
import { Link } from '@tanstack/react-router'
|
||||
|
||||
export function NotFound({ children }: { children?: any }) {
|
||||
return (
|
||||
<div className="space-y-2 p-2">
|
||||
<div className="text-gray-600 dark:text-gray-400">
|
||||
{children || <p>The page you are looking for does not exist.</p>}
|
||||
</div>
|
||||
<p className="flex items-center gap-2 flex-wrap">
|
||||
<button
|
||||
onClick={() => window.history.back()}
|
||||
className="bg-emerald-500 text-white px-2 py-1 rounded-sm uppercase font-black text-sm"
|
||||
>
|
||||
Go back
|
||||
</button>
|
||||
<Link
|
||||
to="/"
|
||||
className="bg-cyan-600 text-white px-2 py-1 rounded-sm uppercase font-black text-sm"
|
||||
>
|
||||
Start Over
|
||||
</Link>
|
||||
</p>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,225 @@
|
||||
/* eslint-disable */
|
||||
|
||||
// @ts-nocheck
|
||||
|
||||
// noinspection JSUnusedGlobalSymbols
|
||||
|
||||
// This file was automatically generated by TanStack Router.
|
||||
// You should NOT make any changes in this file as it will be overwritten.
|
||||
// Additionally, you should also exclude this file from your linter and/or formatter to prevent it from being checked or modified.
|
||||
|
||||
import { Route as rootRouteImport } from './routes/__root'
|
||||
import { Route as UsersRouteImport } from './routes/users'
|
||||
import { Route as IndexRouteImport } from './routes/index'
|
||||
import { Route as UsersIndexRouteImport } from './routes/users.index'
|
||||
import { Route as PostsIndexRouteImport } from './routes/posts.index'
|
||||
import { Route as UsersUserIdRouteImport } from './routes/users.$userId'
|
||||
import { Route as PostsPostIdRouteImport } from './routes/posts.$postId'
|
||||
import { Route as ApiClassifyRouteImport } from './routes/api/classify'
|
||||
import { Route as PostsPostIdDeepRouteImport } from './routes/posts_.$postId.deep'
|
||||
|
||||
const UsersRoute = UsersRouteImport.update({
|
||||
id: '/users',
|
||||
path: '/users',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const IndexRoute = IndexRouteImport.update({
|
||||
id: '/',
|
||||
path: '/',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const UsersIndexRoute = UsersIndexRouteImport.update({
|
||||
id: '/',
|
||||
path: '/',
|
||||
getParentRoute: () => UsersRoute,
|
||||
} as any)
|
||||
const PostsIndexRoute = PostsIndexRouteImport.update({
|
||||
id: '/posts/',
|
||||
path: '/posts/',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const UsersUserIdRoute = UsersUserIdRouteImport.update({
|
||||
id: '/$userId',
|
||||
path: '/$userId',
|
||||
getParentRoute: () => UsersRoute,
|
||||
} as any)
|
||||
const PostsPostIdRoute = PostsPostIdRouteImport.update({
|
||||
id: '/posts/$postId',
|
||||
path: '/posts/$postId',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const ApiClassifyRoute = ApiClassifyRouteImport.update({
|
||||
id: '/api/classify',
|
||||
path: '/api/classify',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
const PostsPostIdDeepRoute = PostsPostIdDeepRouteImport.update({
|
||||
id: '/posts_/$postId/deep',
|
||||
path: '/posts/$postId/deep',
|
||||
getParentRoute: () => rootRouteImport,
|
||||
} as any)
|
||||
|
||||
export interface FileRoutesByFullPath {
|
||||
'/': typeof IndexRoute
|
||||
'/users': typeof UsersRouteWithChildren
|
||||
'/api/classify': typeof ApiClassifyRoute
|
||||
'/posts/$postId': typeof PostsPostIdRoute
|
||||
'/users/$userId': typeof UsersUserIdRoute
|
||||
'/posts': typeof PostsIndexRoute
|
||||
'/users/': typeof UsersIndexRoute
|
||||
'/posts/$postId/deep': typeof PostsPostIdDeepRoute
|
||||
}
|
||||
export interface FileRoutesByTo {
|
||||
'/': typeof IndexRoute
|
||||
'/api/classify': typeof ApiClassifyRoute
|
||||
'/posts/$postId': typeof PostsPostIdRoute
|
||||
'/users/$userId': typeof UsersUserIdRoute
|
||||
'/posts': typeof PostsIndexRoute
|
||||
'/users': typeof UsersIndexRoute
|
||||
'/posts/$postId/deep': typeof PostsPostIdDeepRoute
|
||||
}
|
||||
export interface FileRoutesById {
|
||||
__root__: typeof rootRouteImport
|
||||
'/': typeof IndexRoute
|
||||
'/users': typeof UsersRouteWithChildren
|
||||
'/api/classify': typeof ApiClassifyRoute
|
||||
'/posts/$postId': typeof PostsPostIdRoute
|
||||
'/users/$userId': typeof UsersUserIdRoute
|
||||
'/posts/': typeof PostsIndexRoute
|
||||
'/users/': typeof UsersIndexRoute
|
||||
'/posts_/$postId/deep': typeof PostsPostIdDeepRoute
|
||||
}
|
||||
export interface FileRouteTypes {
|
||||
fileRoutesByFullPath: FileRoutesByFullPath
|
||||
fullPaths:
|
||||
| '/'
|
||||
| '/users'
|
||||
| '/api/classify'
|
||||
| '/posts/$postId'
|
||||
| '/users/$userId'
|
||||
| '/posts'
|
||||
| '/users/'
|
||||
| '/posts/$postId/deep'
|
||||
fileRoutesByTo: FileRoutesByTo
|
||||
to:
|
||||
| '/'
|
||||
| '/api/classify'
|
||||
| '/posts/$postId'
|
||||
| '/users/$userId'
|
||||
| '/posts'
|
||||
| '/users'
|
||||
| '/posts/$postId/deep'
|
||||
id:
|
||||
| '__root__'
|
||||
| '/'
|
||||
| '/users'
|
||||
| '/api/classify'
|
||||
| '/posts/$postId'
|
||||
| '/users/$userId'
|
||||
| '/posts/'
|
||||
| '/users/'
|
||||
| '/posts_/$postId/deep'
|
||||
fileRoutesById: FileRoutesById
|
||||
}
|
||||
export interface RootRouteChildren {
|
||||
IndexRoute: typeof IndexRoute
|
||||
UsersRoute: typeof UsersRouteWithChildren
|
||||
ApiClassifyRoute: typeof ApiClassifyRoute
|
||||
PostsPostIdRoute: typeof PostsPostIdRoute
|
||||
PostsIndexRoute: typeof PostsIndexRoute
|
||||
PostsPostIdDeepRoute: typeof PostsPostIdDeepRoute
|
||||
}
|
||||
|
||||
declare module '@tanstack/react-router' {
|
||||
interface FileRoutesByPath {
|
||||
'/users': {
|
||||
id: '/users'
|
||||
path: '/users'
|
||||
fullPath: '/users'
|
||||
preLoaderRoute: typeof UsersRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/': {
|
||||
id: '/'
|
||||
path: '/'
|
||||
fullPath: '/'
|
||||
preLoaderRoute: typeof IndexRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/users/': {
|
||||
id: '/users/'
|
||||
path: '/'
|
||||
fullPath: '/users/'
|
||||
preLoaderRoute: typeof UsersIndexRouteImport
|
||||
parentRoute: typeof UsersRoute
|
||||
}
|
||||
'/posts/': {
|
||||
id: '/posts/'
|
||||
path: '/posts'
|
||||
fullPath: '/posts'
|
||||
preLoaderRoute: typeof PostsIndexRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/users/$userId': {
|
||||
id: '/users/$userId'
|
||||
path: '/$userId'
|
||||
fullPath: '/users/$userId'
|
||||
preLoaderRoute: typeof UsersUserIdRouteImport
|
||||
parentRoute: typeof UsersRoute
|
||||
}
|
||||
'/posts/$postId': {
|
||||
id: '/posts/$postId'
|
||||
path: '/posts/$postId'
|
||||
fullPath: '/posts/$postId'
|
||||
preLoaderRoute: typeof PostsPostIdRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/api/classify': {
|
||||
id: '/api/classify'
|
||||
path: '/api/classify'
|
||||
fullPath: '/api/classify'
|
||||
preLoaderRoute: typeof ApiClassifyRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
'/posts_/$postId/deep': {
|
||||
id: '/posts_/$postId/deep'
|
||||
path: '/posts/$postId/deep'
|
||||
fullPath: '/posts/$postId/deep'
|
||||
preLoaderRoute: typeof PostsPostIdDeepRouteImport
|
||||
parentRoute: typeof rootRouteImport
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
interface UsersRouteChildren {
|
||||
UsersUserIdRoute: typeof UsersUserIdRoute
|
||||
UsersIndexRoute: typeof UsersIndexRoute
|
||||
}
|
||||
|
||||
const UsersRouteChildren: UsersRouteChildren = {
|
||||
UsersUserIdRoute: UsersUserIdRoute,
|
||||
UsersIndexRoute: UsersIndexRoute,
|
||||
}
|
||||
|
||||
const UsersRouteWithChildren = UsersRoute._addFileChildren(UsersRouteChildren)
|
||||
|
||||
const rootRouteChildren: RootRouteChildren = {
|
||||
IndexRoute: IndexRoute,
|
||||
UsersRoute: UsersRouteWithChildren,
|
||||
ApiClassifyRoute: ApiClassifyRoute,
|
||||
PostsPostIdRoute: PostsPostIdRoute,
|
||||
PostsIndexRoute: PostsIndexRoute,
|
||||
PostsPostIdDeepRoute: PostsPostIdDeepRoute,
|
||||
}
|
||||
export const routeTree = rootRouteImport
|
||||
._addFileChildren(rootRouteChildren)
|
||||
._addFileTypes<FileRouteTypes>()
|
||||
|
||||
import type { getRouter } from './router.tsx'
|
||||
import type { createStart } from '@tanstack/react-start'
|
||||
declare module '@tanstack/react-start' {
|
||||
interface Register {
|
||||
ssr: true
|
||||
router: Awaited<ReturnType<typeof getRouter>>
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
import { createRouter } from '@tanstack/react-router'
|
||||
import { routeTree } from './routeTree.gen'
|
||||
import { DefaultCatchBoundary } from './components/DefaultCatchBoundary'
|
||||
import { NotFound } from './components/NotFound'
|
||||
|
||||
export function getRouter() {
|
||||
const router = createRouter({
|
||||
routeTree,
|
||||
defaultPreload: 'intent',
|
||||
defaultErrorComponent: DefaultCatchBoundary,
|
||||
defaultNotFoundComponent: () => <NotFound />,
|
||||
scrollRestoration: true,
|
||||
})
|
||||
return router
|
||||
}
|
||||
@@ -0,0 +1,128 @@
|
||||
/// <reference types="vite/client" />
|
||||
import {
|
||||
HeadContent,
|
||||
Scripts,
|
||||
createRootRoute,
|
||||
} from '@tanstack/react-router'
|
||||
import * as React from 'react'
|
||||
import { DefaultCatchBoundary } from '~/components/DefaultCatchBoundary'
|
||||
import { NotFound } from '~/components/NotFound'
|
||||
import { seo } from '~/utils/seo'
|
||||
|
||||
/**
 * Root route: declares the document head (meta tags, stylesheets, icons and
 * scripts) and the shell/error/not-found components shared by every page.
 */
export const Route = createRootRoute({
  head: () => ({
    meta: [
      { charSet: 'utf-8' },
      { name: 'viewport', content: 'width=device-width, initial-scale=1' },
      ...seo({
        title: 'Financial Documents Classification Agent',
        description:
          'Classify financial documents as balance sheets, income statements and cash flow statements.',
      }),
    ],
    links: [
      // NOTE(review): daisyUI and Tailwind are loaded from a CDN at runtime;
      // fine for a demo, but a production build should bundle them instead.
      { rel: 'stylesheet', href: 'https://cdn.jsdelivr.net/npm/daisyui@5' },
      {
        rel: 'apple-touch-icon',
        sizes: '180x180',
        href: '/apple-touch-icon.png',
      },
      {
        rel: 'icon',
        type: 'image/png',
        sizes: '32x32',
        href: '/favicon-32x32.png',
      },
      {
        rel: 'icon',
        type: 'image/png',
        sizes: '16x16',
        href: '/favicon-16x16.png',
      },
      // '#ffffff' is a valid 6-digit hex colour (the previous value had 5 digits).
      { rel: 'manifest', href: '/site.webmanifest', color: '#ffffff' },
      { rel: 'icon', href: '/favicon.ico' },
    ],
    scripts: [
      {
        src: '/customScript.js',
        type: 'text/javascript',
      },
      {
        src: 'https://cdn.jsdelivr.net/npm/@tailwindcss/browser@4',
        type: 'text/javascript',
      },
    ],
  }),
  errorComponent: DefaultCatchBoundary,
  notFoundComponent: () => <NotFound />,
  shellComponent: RootDocument,
})
|
||||
|
||||
/**
 * Document shell rendered around every route: emits <html>/<head>/<body>,
 * the top navigation bar, the routed page content and the framework scripts.
 *
 * @param children - the matched route content to render below the navbar
 */
function RootDocument({ children }: { children: React.ReactNode }) {
  return (
    <html lang="en">
      <head>
        <HeadContent />
      </head>
      <body>
        <div className="navbar bg-base-100 shadow-sm">
          <div className="navbar-start">
            <div className="dropdown">
              <div
                tabIndex={0}
                role="button"
                aria-label="Open menu"
                className="btn btn-ghost btn-circle"
              >
                <svg
                  xmlns="http://www.w3.org/2000/svg"
                  className="h-5 w-5"
                  fill="none"
                  viewBox="0 0 24 24"
                  stroke="currentColor"
                >
                  <path
                    strokeLinecap="round"
                    strokeLinejoin="round"
                    strokeWidth="2"
                    d="M4 6h16M4 12h16M4 18h7"
                  />
                </svg>
              </div>
              <ul
                tabIndex={0}
                className="menu menu-lg dropdown-content bg-base-100 rounded-box z-1 mt-3 w-80 p-2 shadow"
              >
                <li><a href="/">Home</a></li>
                <li><a href="https://cloud.llamaindex.ai">Get Started with LlamaCloud</a></li>
                <li><a href="https://developers.llamaindex.ai/python/cloud/llamaclassify/getting_started/">LlamaClassify Docs</a></li>
              </ul>
            </div>
          </div>
          <div className="navbar-center">
            <a className="btn btn-ghost text-xl" href="/">Financial Documents Classification Agent</a>
          </div>
          <div className="navbar-end">
            {/* Styled anchor instead of <a><button/></a>: nesting interactive
                elements is invalid HTML. The path is /tree/<branch>/<dir>. */}
            <a
              className="btn btn-ghost btn-circle"
              aria-label="View the source on GitHub"
              href="https://github.com/run-llama/llama_cloud_services/tree/main/examples-ts/classify"
            >
              <div className="indicator">
                {/* The path below spans y = 72..557, so the viewBox must be
                    640x640; a 512-high viewBox clips the bottom of the icon. */}
                <svg
                  xmlns="http://www.w3.org/2000/svg"
                  className="h-10 w-10"
                  fill="currentColor"
                  viewBox="0 0 640 640"
                >
                  <path d="M237.9 461.4C237.9 463.4 235.6 465 232.7 465C229.4 465.3 227.1 463.7 227.1 461.4C227.1 459.4 229.4 457.8 232.3 457.8C235.3 457.5 237.9 459.1 237.9 461.4zM206.8 456.9C206.1 458.9 208.1 461.2 211.1 461.8C213.7 462.8 216.7 461.8 217.3 459.8C217.9 457.8 216 455.5 213 454.6C210.4 453.9 207.5 454.9 206.8 456.9zM251 455.2C248.1 455.9 246.1 457.8 246.4 460.1C246.7 462.1 249.3 463.4 252.3 462.7C255.2 462 257.2 460.1 256.9 458.1C256.6 456.2 253.9 454.9 251 455.2zM316.8 72C178.1 72 72 177.3 72 316C72 426.9 141.8 521.8 241.5 555.2C254.3 557.5 258.8 549.6 258.8 543.1C258.8 536.9 258.5 502.7 258.5 481.7C258.5 481.7 188.5 496.7 173.8 451.9C173.8 451.9 162.4 422.8 146 415.3C146 415.3 123.1 399.6 147.6 399.9C147.6 399.9 172.5 401.9 186.2 425.7C208.1 464.3 244.8 453.2 259.1 446.6C261.4 430.6 267.9 419.5 275.1 412.9C219.2 406.7 162.8 398.6 162.8 302.4C162.8 274.9 170.4 261.1 186.4 243.5C183.8 237 175.3 210.2 189 175.6C209.9 169.1 258 202.6 258 202.6C278 197 299.5 194.1 320.8 194.1C342.1 194.1 363.6 197 383.6 202.6C383.6 202.6 431.7 169 452.6 175.6C466.3 210.3 457.8 237 455.2 243.5C471.2 261.2 481 275 481 302.4C481 398.9 422.1 406.6 366.2 412.9C375.4 420.8 383.2 435.8 383.2 459.3C383.2 493 382.9 534.7 382.9 542.9C382.9 549.4 387.5 557.3 400.2 555C500.2 521.8 568 426.9 568 316C568 177.3 455.5 72 316.8 72zM169.2 416.9C167.9 417.9 168.2 420.2 169.9 422.1C171.5 423.7 173.8 424.4 175.1 423.1C176.4 422.1 176.1 419.8 174.4 417.9C172.8 416.3 170.5 415.6 169.2 416.9zM158.4 408.8C157.7 410.1 158.7 411.7 160.7 412.7C162.3 413.7 164.3 413.4 165 412C165.7 410.7 164.7 409.1 162.7 408.1C160.7 407.5 159.1 407.8 158.4 408.8zM190.8 444.4C189.2 445.7 189.8 448.7 192.1 450.6C194.4 452.9 197.3 453.2 198.6 451.6C199.9 450.3 199.3 447.3 197.3 445.4C195.1 443.1 192.1 442.8 190.8 444.4zM179.4 429.7C177.8 430.7 177.8 433.3 179.4 435.6C181 437.9 183.7 438.9 185 437.9C186.6 436.6 186.6 434 185 431.7C183.6 429.4 181 428.4 179.4 429.7z" />
                </svg>
              </div>
            </a>
          </div>
        </div>
        <hr />
        {children}
        <Scripts />
      </body>
    </html>
  )
}
|
||||
@@ -0,0 +1,45 @@
|
||||
import { createFileRoute } from '@tanstack/react-router'
|
||||
import { classifier, classificationRules, parsingConfig } from '~/utils/classifier'
|
||||
|
||||
/**
 * Escapes the HTML-significant characters of `value` so it can be embedded
 * in markup as plain text.
 */
function escapeHtml(value: unknown): string {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}

/** Builds a JSON response carrying the `{ result }` payload the client reads. */
function jsonResponse(payload: { result: string }, status = 200): Response {
  return new Response(JSON.stringify(payload), {
    status,
    headers: { 'Content-Type': 'application/json' },
  })
}

/**
 * POST /api/classify
 *
 * Expects multipart form data with a `file` field, runs the document through
 * the classifier and answers `{ result: string }`, where `result` is an HTML
 * fragment with one card per classified item.
 *
 * Every interpolated value (file name, predicted type, model reasoning) is
 * HTML-escaped because the client injects `result` into the page as markup.
 */
export const Route = createFileRoute('/api/classify')({
  component: RouteComponent,
  server: {
    handlers: {
      POST: async ({ request }) => {
        const body = await request.formData()
        const fl = body.get('file')
        // FormData.get() yields File | string | null; only a File is usable.
        if (!fl || typeof fl === 'string') {
          return jsonResponse({ result: 'you need to provide a file' }, 400)
        }

        try {
          const buff = await fl.arrayBuffer()
          const rawRes = await classifier.classify(
            classificationRules,
            parsingConfig,
            { fileContents: [new Uint8Array(buff)] },
          )

          const cards: string[] = []
          for (const item of rawRes.items) {
            if (!('result' in item) || !item.result) continue
            const { type, confidence, reasoning } = item.result
            // Round to one decimal to avoid float noise such as 56.99999999999999.
            const confidencePct = Math.round(confidence * 1000) / 10
            cards.push(`
              <div class="card bg-base-100 shadow-xl p-6 mb-4">
                <div class="space-y-3">
                  <p><span class="font-semibold">📄 Document:</span> ${escapeHtml(fl.name)}</p>
                  <p><span class="font-semibold">🏷️ Type:</span> <span class="badge badge-primary">${escapeHtml(type)}</span></p>
                  <p><span class="font-semibold">📊 Confidence:</span> ${escapeHtml(confidencePct)}%</p>
                  <p><span class="font-semibold">💭 Reasoning:</span> ${escapeHtml(reasoning)}</p>
                </div>
              </div>
            `)
          }
          return jsonResponse({ result: cards.join('') })
        } catch (error) {
          // Keep the `{ result }` shape so existing clients still get a message.
          console.error('Classification failed:', error)
          return jsonResponse(
            { result: 'The document could not be classified. Please try again.' },
            500,
          )
        }
      },
    },
  },
})
|
||||
|
||||
/**
 * Placeholder component for the API-only route: it renders nothing.
 * Returns `null` explicitly, since a bare `return` yields `undefined`,
 * which React versions before 18 reject as a render result.
 */
function RouteComponent() {
  return null
}
|
||||
@@ -0,0 +1,99 @@
|
||||
import { createFileRoute } from '@tanstack/react-router'
|
||||
import { useRef, useState } from 'react'
|
||||
|
||||
export const Route = createFileRoute('/')({
  component: Home,
})

/**
 * Landing page: lets the user pick a PDF, posts it to `/api/classify` and
 * shows the classification returned by the server.
 */
function Home() {
  const [file, setFile] = useState<null | File>(null)
  const fileInputRef = useRef<HTMLInputElement>(null)
  const [reply, setReply] = useState<null | string>(null)
  const [error, setError] = useState<null | string>(null)
  const [loading, setLoading] = useState<boolean>(false)

  /** Stores the first selected file, if any. */
  const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
    const selectedFile = event.target.files?.[0]
    if (selectedFile) {
      setFile(selectedFile)
    }
  }

  /** Resets the selection, the native input and any previous outcome. */
  const handleClearFile = () => {
    setFile(null)
    if (fileInputRef.current) {
      // The native input keeps its own value; clear it so the same file can be re-picked.
      fileInputRef.current.value = ''
    }
    setReply(null)
    setError(null)
  }

  /** Uploads the selected file and stores either the reply or an error message. */
  const handleClassify = async () => {
    // Ignore clicks without a file or while a request is already in flight.
    if (!file || loading) return

    setReply(null)
    setError(null)
    setLoading(true)
    try {
      const formData = new FormData()
      formData.append('file', file)

      const res = await fetch('/api/classify', {
        method: 'POST',
        body: formData,
      })
      if (!res.ok) {
        throw new Error(`Classification request failed with status ${res.status}`)
      }

      const data = await res.json()
      setReply(data.result)
    } catch (err) {
      console.error('Error:', err)
      // Surface the failure in the UI instead of only logging it.
      setError('Something went wrong while classifying the document. Please try again.')
    } finally {
      setLoading(false)
    }
  }

  return (
    <div className="flex flex-col justify-center items-center gap-y-8">
      <br />
      <h1 className="text-xl font-bold text-gray-700">AI-Powered financial document classification</h1>
      <h2 className="text-lg font-semibold text-gray-500">Need help sorting out the financial documents jungle? Let our classification agent handle it!</h2>
      <fieldset className="fieldset bg-base-100 border-base-300 rounded-box w-200 border p-4">
        <legend className="fieldset-legend text-lg">Upload your financial document here</legend>
        <label className="label flex justify-center">
          <input type="file" className="file-input" onChange={handleFileChange} accept='application/pdf' ref={fileInputRef} />
        </label>
      </fieldset>
      {file && (
        <div className="flex flex-col justify-center items-center gap-y-8">
          <p className="text-sm text-gray-600">Selected file: {file.name}</p>
          <div className='grid grid-cols-2 gap-x-6'>
            <button
              type="button"
              className='btn bg-gray-500 text-white shadow-lg hover:bg-gray-600 hover:shadow-xl rounded'
              onClick={handleClassify}
              disabled={loading}
            >
              Classify
            </button>
            <button
              onClick={handleClearFile}
              type="button"
              className="px-4 py-2 bg-red-300 text-black rounded hover:bg-red-400 hover:shadow-xl shadow-lg"
            >
              Clear
            </button>
          </div>
        </div>
      )}
      {loading && (
        <span className="loading loading-spinner text-primary"></span>
      )}
      {error && (
        <div role="alert" className="alert alert-error max-w-2xl w-full">
          {error}
        </div>
      )}
      {reply && (
        // NOTE(review): `reply` is server-generated HTML injected as-is; this is
        // only safe if /api/classify escapes every value it interpolates.
        <div
          className="max-w-2xl w-full"
          dangerouslySetInnerHTML={{ __html: reply }}
        />
      )}
    </div>
  )
}
|
||||
@@ -0,0 +1,23 @@
|
||||
import { LlamaClassify, ClassifierRule, ClassifyParsingConfiguration } from "llama-cloud-services"
|
||||
|
||||
export const classifier = new LlamaClassify(process.env.LLAMA_CLOUD_API_KEY);
|
||||
|
||||
export const classificationRules: ClassifierRule[] = [
|
||||
{
|
||||
description: "Shows a company's assets, liabilities, and shareholders' equity at a specific point in time, providing a snapshot of financial position.",
|
||||
type: "balance_sheet"
|
||||
},
|
||||
{
|
||||
description: "Reports cash inflows and outflows from operating, investing, and financing activities, highlighting liquidity and cash management.",
|
||||
type: "cash_flow_statement"
|
||||
},
|
||||
{
|
||||
description: "Summarizes revenues, expenses, and profits over a period, indicating financial performance and profitability.",
|
||||
type: "income_statement"
|
||||
},
|
||||
];
|
||||
|
||||
export const parsingConfig: ClassifyParsingConfiguration = {
|
||||
lang: "en",
|
||||
max_pages: 20,
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
/**
 * Builds the list of SEO/social meta tag descriptors for a page.
 *
 * @param title - page title; also used for the Twitter and Open Graph titles
 * @param description - optional summary shared by the description tags
 * @param keywords - optional comma-separated keywords
 * @param image - optional preview image URL; adds the image/card tags when set
 * @returns the tag descriptors, omitting every tag whose content was not provided
 */
export const seo = ({
  title,
  description,
  keywords,
  image,
}: {
  title: string
  description?: string
  image?: string
  keywords?: string
}) => {
  const tags = [
    { title },
    { name: 'description', content: description },
    { name: 'keywords', content: keywords },
    { name: 'twitter:title', content: title },
    { name: 'twitter:description', content: description },
    // NOTE(review): these handles look like template leftovers; confirm the
    // account that should be credited for this app.
    { name: 'twitter:creator', content: '@tannerlinsley' },
    { name: 'twitter:site', content: '@tannerlinsley' },
    { name: 'og:type', content: 'website' },
    { name: 'og:title', content: title },
    { name: 'og:description', content: description },
    ...(image
      ? [
          { name: 'twitter:image', content: image },
          { name: 'twitter:card', content: 'summary_large_image' },
          { name: 'og:image', content: image },
        ]
      : []),
  ]

  // Drop tags for omitted optional fields so no content-less <meta> is emitted.
  return tags.filter((tag) => 'title' in tag || tag.content !== undefined)
}
|
||||
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"include": ["**/*.ts", "**/*.tsx"],
|
||||
"compilerOptions": {
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"jsx": "react-jsx",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "Bundler",
|
||||
"lib": ["DOM", "DOM.Iterable", "ES2022"],
|
||||
"isolatedModules": true,
|
||||
"resolveJsonModule": true,
|
||||
"skipLibCheck": true,
|
||||
"target": "ES2022",
|
||||
"allowJs": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"baseUrl": ".",
|
||||
"paths": {
|
||||
"~/*": ["./src/*"]
|
||||
},
|
||||
"noEmit": true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
import { tanstackStart } from '@tanstack/react-start/plugin/vite'
|
||||
import { defineConfig } from 'vite'
|
||||
import tsConfigPaths from 'vite-tsconfig-paths'
|
||||
import viteReact from '@vitejs/plugin-react'
|
||||
|
||||
export default defineConfig({
|
||||
server: {
|
||||
port: 3000,
|
||||
},
|
||||
plugins: [
|
||||
tsConfigPaths({
|
||||
projects: ['./tsconfig.json'],
|
||||
}),
|
||||
tanstackStart({
|
||||
srcDirectory: 'src',
|
||||
}),
|
||||
viteReact(),
|
||||
],
|
||||
})
|
||||
@@ -0,0 +1,122 @@
|
||||
# LlamaExtract Demo
|
||||
|
||||
A TypeScript demo application showcasing the power of **LlamaExtract** - a structured data extraction agentic service from [LlamaCloud](https://cloud.llamaindex.ai). This demo allows you to extract structured information from scientific papers and render it as nicely formatted markdown.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Features](#features)
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Installation](#installation)
|
||||
- [Usage](#usage)
|
||||
- [Start the Demo](#start-the-demo)
|
||||
- [Development Mode](#development-mode)
|
||||
- [Build the Project](#build-the-project)
|
||||
- [Code Quality](#code-quality)
|
||||
- [Quick Commands Reference](#quick-commands-reference)
|
||||
- [How It Works](#how-it-works)
|
||||
- [API Dependencies](#api-dependencies)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
- [Common Issues](#common-issues)
|
||||
- [License](#license)
|
||||
- [Contributing](#contributing)
|
||||
|
||||
## Features
|
||||
|
||||
- 📄 **Structured Data Extraction**: Extract data from your files effortlessly, and structure them the way you want!
|
||||
- 🤖 **Markdown Rendering**: Generate markdown directly from your extracted data
|
||||
- 🎨 **Beautiful CLI**: Styled console interface with colors and ASCII art
|
||||
- ⚡ **Fast Development**: Hot reload support with watch mode
|
||||
- 🛠️ **TypeScript**: Full TypeScript support with strict type checking
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Node.js (version 18 or higher)
|
||||
- npm package manager (bundled with Node.js)
|
||||
- LlamaCloud API key
|
||||
|
||||
## Installation
|
||||
|
||||
1. Clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/run-llama/llama_cloud_services
|
||||
cd llama_cloud_services/examples-ts/extract/
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
3. Set up your environment variables:
|
||||
|
||||
```bash
|
||||
# Add your API key to your environment
|
||||
export LLAMA_CLOUD_API_KEY="your-llamacloud-api-key"
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Start the Demo
|
||||
|
||||
```bash
|
||||
npm run start
|
||||
```
|
||||
|
||||
The application will display a welcome screen and prompt you to enter the path to a document you'd like to process.
|
||||
|
||||
### Development Mode
|
||||
|
||||
For development with hot reload:
|
||||
|
||||
```bash
|
||||
npm run dev
|
||||
```
|
||||
|
||||
### Build the Project
|
||||
|
||||
```bash
|
||||
npm run build
|
||||
```
|
||||
|
||||
### Code Quality
|
||||
|
||||
Format code:
|
||||
|
||||
```bash
|
||||
npm run format
|
||||
```
|
||||
|
||||
Lint code:
|
||||
|
||||
```bash
|
||||
npm run lint
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Document Input**: Enter the path to your document when prompted
|
||||
2. **Parsing**: LlamaExtract, based on the schema you can find [here](./src/schema.ts), processes the document and extracts structured data
|
||||
3. **Markdown Rendering**: The extracted content is rendered into beautiful markdown
|
||||
4. **Results**: View the results directly in your terminal
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Module Resolution Errors**: Ensure you're using Node.js 18+ and have all dependencies installed
|
||||
2. **API Key Issues**: Verify your LlamaCloud API key is correctly set
|
||||
3. **File Path Errors**: Use absolute paths or ensure relative paths are correct from the project root
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see the [LICENSE](../../LICENSE) file for details.
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Fork the repository
|
||||
2. Create a feature branch
|
||||
3. Make your changes
|
||||
4. Run `npm run format` and `npm run lint`
|
||||
5. Submit a pull request
|
||||
@@ -0,0 +1,14 @@
|
||||
import js from "@eslint/js";
|
||||
import globals from "globals";
|
||||
import tseslint from "typescript-eslint";
|
||||
import { defineConfig } from "eslint/config";
|
||||
|
||||
// ESLint flat config: the JS recommended rules plus the typescript-eslint
// recommended set, applied to every JS/TS source file.
export default defineConfig([
  {
    files: ["**/*.{js,mjs,cjs,ts,mts,cts}"],
    plugins: { js },
    extends: ["js/recommended"],
    // The demo is a Node.js CLI (it reads process.env and process.stdin),
    // so declare Node globals rather than browser ones.
    languageOptions: { globals: globals.node },
  },
  tseslint.configs.recommended,
]);
|
||||
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"name": "llama-extract-demo",
|
||||
"version": "0.1.0",
|
||||
"description": "Demo for LlamaExtract in TypeScript",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"There are no tests\"",
|
||||
"start": "npm exec tsx src/index.ts",
|
||||
"lint": "eslint ./src/",
|
||||
"format": "prettier --write ./src/",
|
||||
"build": "tsc",
|
||||
"dev": "npm exec tsx --watch src/index.ts"
|
||||
},
|
||||
"author": "LlamaIndex",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"cli-markdown": "^3.5.1",
|
||||
"consola": "^3.4.2",
|
||||
"figlet": "^1.8.2",
|
||||
"llama-cloud-services": "file:../../ts/llama_cloud_services",
|
||||
"marked": "^15.0.12",
|
||||
"marked-terminal": "^7.3.0",
|
||||
"picocolors": "^1.1.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.32.0",
|
||||
"@types/figlet": "^1.7.0",
|
||||
"@types/marked-terminal": "^6.1.1",
|
||||
"@types/node": "^24.2.0",
|
||||
"eslint": "^9.32.0",
|
||||
"globals": "^16.3.0",
|
||||
"jiti": "^2.5.1",
|
||||
"prettier": "^3.6.2",
|
||||
"typescript": "^5.9.2",
|
||||
"typescript-eslint": "^8.39.0"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
import { LlamaExtract, ExtractConfig } from "llama-cloud-services";
|
||||
import cliMarkdown from "cli-markdown";
|
||||
import { logger } from "./logger";
|
||||
import pc from "picocolors";
|
||||
import { consoleInput, renderLogo } from "./utils";
|
||||
import { dataSchema } from "./schema";
|
||||
import { renderMarkdown, ResearchData } from "./markdown";
|
||||
|
||||
/**
 * Interactive entry point of the LlamaExtract demo.
 *
 * Prints the banner, then repeatedly asks for a file path, extracts structured
 * data from that file and prints it as markdown, until the user types "quit".
 *
 * @returns 0 on a normal exit, 1 when the API key is not configured
 */
export async function main(): Promise<number> {
  const apiKey = process.env.LLAMA_CLOUD_API_KEY;
  if (!apiKey) {
    // Fail fast with an actionable message instead of an opaque API error later.
    logger.error(
      "LLAMA_CLOUD_API_KEY is not set. Export it before starting the demo.",
    );
    return 1;
  }

  const extractClient = new LlamaExtract(
    apiKey,
    "https://api.cloud.llamaindex.ai",
  );
  await renderLogo();
  logger.log(
    `Welcome to ${pc.bold(
      pc.magentaBright("LlamaExtract Demo✨"),
    )}, our demo for ${pc.bold(pc.green("LlamaExtract"))}, a ${pc.bold(
      pc.cyan("LlamaCloud☁️"),
    )} (https://cloud.llamaindex.ai) product!\nIn this demo we are going to try extracting relevant information ${pc.bold(
      pc.yellowBright("from scientific papers"),
    )}. Type the path to the paper you would like to process below👇\nIf you wish to exit, just type ${pc.bold(
      pc.gray("quit"),
    )}.\n`,
  );

  while (true) {
    // Trim so stray whitespace neither breaks "quit" nor the file path.
    const userInput = (await consoleInput()).trim();
    if (userInput.toLowerCase() === "quit") {
      break;
    }
    if (!userInput) {
      continue;
    }
    try {
      const generatedData = await extractClient.extract(
        dataSchema,
        {} as ExtractConfig,
        userInput,
      );
      if (!generatedData?.data) {
        logger.error("Error processing file: the extraction returned no data");
        continue;
      }
      const research = renderMarkdown(generatedData.data as ResearchData);
      logger.log(`${pc.bold(pc.cyan("Extracted information:✨"))}\n`);
      logger.log(cliMarkdown(research));
    } catch (error) {
      logger.error(`Error processing file: ${error}`);
    }
  }
  return 0;
}

// Propagate main()'s status as the process exit code; unexpected failures
// are logged and reported as a non-zero exit.
main()
  .then((code) => {
    process.exitCode = code;
  })
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
|
||||
@@ -0,0 +1,8 @@
|
||||
import { createConsola } from "consola";
|
||||
import type { ConsolaInstance } from "consola";
|
||||
|
||||
// Shared console logger for the demo; the date prefix is disabled in its output.
const loggerOptions = { formatOptions: { date: false } };

export const logger: ConsolaInstance = createConsola(loggerOptions);
|
||||
@@ -0,0 +1,172 @@
|
||||
type Author = {
|
||||
name: string;
|
||||
affiliation?: string;
|
||||
email?: string;
|
||||
};
|
||||
|
||||
type Methodology = {
|
||||
approach?: string;
|
||||
participants?: string;
|
||||
methods?: string[];
|
||||
};
|
||||
|
||||
type Result = {
|
||||
finding?: string;
|
||||
significance?: string;
|
||||
supportingData?: string;
|
||||
};
|
||||
|
||||
type Reference = {
|
||||
title: string;
|
||||
authors: string;
|
||||
year?: string;
|
||||
relevance?: string;
|
||||
};
|
||||
|
||||
type Discussion = {
|
||||
implications?: string[];
|
||||
limitations?: string[];
|
||||
futureWork?: string[];
|
||||
};
|
||||
|
||||
type Publication = {
|
||||
journal?: string;
|
||||
year: string;
|
||||
doi?: string;
|
||||
url?: string;
|
||||
};
|
||||
|
||||
export type ResearchData = {
|
||||
title: string;
|
||||
authors: Author[];
|
||||
abstract: string;
|
||||
keywords?: string[];
|
||||
mainFindings: string[];
|
||||
methodology?: Methodology;
|
||||
results?: Result[];
|
||||
discussion?: Discussion;
|
||||
references?: Reference[];
|
||||
publication?: Publication;
|
||||
};
|
||||
|
||||
/**
 * Renders extracted research-paper data as a markdown document.
 *
 * Sections are emitted in a fixed order (title, authors, abstract, keywords,
 * main findings, methodology, results, discussion, references, publication);
 * optional sections are skipped when their data is absent or empty.
 *
 * @param data - the extracted paper data
 * @returns the markdown text, with sections joined by newlines
 */
export function renderMarkdown(data: ResearchData): string {
  const {
    title,
    abstract,
    keywords,
    methodology,
    results,
    discussion,
    references,
    publication,
  } = data;
  // Callers may pass unvalidated extraction output, so tolerate missing
  // "required" arrays instead of crashing on `.map` of undefined.
  const authors = data.authors ?? [];
  const mainFindings = data.mainFindings ?? [];

  /** Formats each item as a markdown bullet, one per line. */
  const bullets = (items: string[]): string =>
    items.map((item) => `- ${item}`).join("\n");

  const md: string[] = [];

  md.push(`# ${title}\n`);

  // Authors
  md.push(`## Authors`);
  md.push(
    authors
      .map((author) => {
        const affiliation = author.affiliation
          ? `, *${author.affiliation}*`
          : "";
        const email = author.email ? ` (${author.email})` : "";
        return `- **${author.name}**${affiliation}${email}`;
      })
      .join("\n"),
  );

  // Abstract
  md.push(`\n## Abstract\n${abstract}`);

  // Keywords
  if (keywords && keywords.length > 0) {
    md.push(`\n## Keywords\n${bullets(keywords)}`);
  }

  // Main Findings
  md.push(`\n## Main Findings\n${bullets(mainFindings)}`);

  // Methodology
  if (methodology) {
    md.push(`\n## Methodology`);
    if (methodology.approach) md.push(`**Approach:** ${methodology.approach}`);
    if (methodology.participants)
      md.push(`**Participants:** ${methodology.participants}`);
    if (methodology.methods?.length) {
      md.push(`**Methods:**\n${bullets(methodology.methods)}`);
    }
  }

  // Results
  if (results?.length) {
    md.push(`\n## Results`);
    results.forEach((result, i) => {
      md.push(`\n### Result ${i + 1}`);
      if (result.finding) md.push(`- **Finding:** ${result.finding}`);
      if (result.significance)
        md.push(`- **Significance:** ${result.significance}`);
      if (result.supportingData)
        md.push(`- **Supporting Data:** ${result.supportingData}`);
    });
  }

  // Discussion
  if (discussion) {
    md.push(`\n## Discussion`);
    if (discussion.implications?.length) {
      md.push(`### Implications\n${bullets(discussion.implications)}`);
    }
    if (discussion.limitations?.length) {
      md.push(`### Limitations\n${bullets(discussion.limitations)}`);
    }
    if (discussion.futureWork?.length) {
      md.push(`### Future Work\n${bullets(discussion.futureWork)}`);
    }
  }

  // References
  if (references?.length) {
    md.push(`\n## References`);
    references.forEach((ref, i) => {
      const year = ref.year ? ` (${ref.year})` : "";
      md.push(`\n**[${i + 1}]** ${ref.title} — *${ref.authors}*${year}`);
      if (ref.relevance) md.push(`> ${ref.relevance}`);
    });
  }

  // Publication Info
  if (publication) {
    md.push(`\n## Publication`);
    if (publication.journal) md.push(`- **Journal:** ${publication.journal}`);
    if (publication.year) md.push(`- **Year:** ${publication.year}`);
    if (publication.doi) md.push(`- **DOI:** ${publication.doi}`);
    if (publication.url)
      md.push(`- **URL:** [${publication.url}](${publication.url})`);
  }

  return md.join("\n");
}
|
||||
@@ -0,0 +1,169 @@
|
||||
export const dataSchema = {
|
||||
type: "object",
|
||||
required: ["title", "authors", "abstract", "mainFindings"],
|
||||
properties: {
|
||||
title: {
|
||||
type: "string",
|
||||
description: "The full title of the research paper",
|
||||
},
|
||||
authors: {
|
||||
type: "array",
|
||||
description: "List of all authors of the paper",
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
name: {
|
||||
type: "string",
|
||||
description: "Full name of the author",
|
||||
},
|
||||
affiliation: {
|
||||
type: "string",
|
||||
description:
|
||||
"Institution or organization the author is affiliated with",
|
||||
},
|
||||
email: {
|
||||
type: "string",
|
||||
description: "Contact email of the author if provided",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
abstract: {
|
||||
type: "string",
|
||||
description: "Complete abstract or summary of the paper",
|
||||
},
|
||||
keywords: {
|
||||
type: "array",
|
||||
description:
|
||||
"Key terms and phrases that describe the paper's main topics",
|
||||
items: {
|
||||
type: "string",
|
||||
},
|
||||
},
|
||||
mainFindings: {
|
||||
type: "array",
|
||||
description: "Key findings, conclusions, or contributions of the paper",
|
||||
items: {
|
||||
type: "string",
|
||||
},
|
||||
},
|
||||
methodology: {
|
||||
type: "object",
|
||||
description: "Research methods and approaches used",
|
||||
properties: {
|
||||
approach: {
|
||||
type: "string",
|
||||
description: "Overall research approach or study design",
|
||||
},
|
||||
participants: {
|
||||
type: "string",
|
||||
description: "Description of study participants or data sources",
|
||||
},
|
||||
methods: {
|
||||
type: "array",
|
||||
description: "Specific methods, techniques, or tools used",
|
||||
items: {
|
||||
type: "string",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
results: {
|
||||
type: "array",
|
||||
description: "Main results and outcomes of the research",
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
finding: {
|
||||
type: "string",
|
||||
description: "Description of the specific result or finding",
|
||||
},
|
||||
significance: {
|
||||
type: "string",
|
||||
description:
|
||||
"Statistical significance or importance of the finding",
|
||||
},
|
||||
supportingData: {
|
||||
type: "string",
|
||||
description: "Relevant statistics, measurements, or data points",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
discussion: {
|
||||
type: "object",
|
||||
properties: {
|
||||
implications: {
|
||||
type: "array",
|
||||
description: "Theoretical or practical implications of the findings",
|
||||
items: {
|
||||
type: "string",
|
||||
},
|
||||
},
|
||||
limitations: {
|
||||
type: "array",
|
||||
description: "Study limitations or constraints",
|
||||
items: {
|
||||
type: "string",
|
||||
},
|
||||
},
|
||||
futureWork: {
|
||||
type: "array",
|
||||
description: "Suggested future research directions",
|
||||
items: {
|
||||
type: "string",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
references: {
|
||||
type: "array",
|
||||
description:
|
||||
"Key papers cited that are crucial to understanding this work",
|
||||
items: {
|
||||
type: "object",
|
||||
properties: {
|
||||
title: {
|
||||
type: "string",
|
||||
description: "Title of the cited paper",
|
||||
},
|
||||
authors: {
|
||||
type: "string",
|
||||
description: "Authors of the cited paper",
|
||||
},
|
||||
year: {
|
||||
type: "string",
|
||||
description: "Publication year",
|
||||
},
|
||||
relevance: {
|
||||
type: "string",
|
||||
description: "Why this reference is important to the current paper",
|
||||
},
|
||||
},
|
||||
required: ["title", "authors"],
|
||||
},
|
||||
},
|
||||
publication: {
|
||||
type: "object",
|
||||
properties: {
|
||||
journal: {
|
||||
type: "string",
|
||||
description: "Name of the journal or conference",
|
||||
},
|
||||
year: {
|
||||
type: "string",
|
||||
description: "Year of publication",
|
||||
},
|
||||
doi: {
|
||||
type: "string",
|
||||
description: "Digital Object Identifier (DOI) of the paper",
|
||||
},
|
||||
url: {
|
||||
type: "string",
|
||||
description: "URL where the paper can be accessed",
|
||||
},
|
||||
},
|
||||
required: ["year"],
|
||||
},
|
||||
},
|
||||
};
|
||||
@@ -0,0 +1,4 @@
|
||||
declare module "cli-markdown" {
|
||||
function cliMarkdown(input: string): string;
|
||||
export default cliMarkdown;
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
import * as readline from "readline/promises";
|
||||
import figlet from "figlet";
|
||||
import pc from "picocolors";
|
||||
|
||||
/**
 * Prints the demo banner: the "Extract Demo" ASCII-art logo in bold bright
 * red, followed by a gray 60-character rule, with blank lines around it.
 */
export async function renderLogo(): Promise<void> {
  const figletOptions = {
    font: "ANSI Shadow",
    horizontalLayout: "default",
    verticalLayout: "default",
    width: 100,
    whitespaceBreak: true,
  } as const;

  const banner = pc.bold(
    pc.redBright(figlet.textSync("Extract Demo", figletOptions)),
  );
  const divider = pc.gray("─".repeat(60));

  // Blank lines before and after act as padding around the banner.
  for (const line of ["\n", banner, divider, "\n"]) {
    console.log(line);
  }
}
|
||||
|
||||
/**
 * Prompts on stdout for a file path and resolves with the line typed on stdin.
 *
 * @returns the raw answer entered by the user
 */
export async function consoleInput(): Promise<string> {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  try {
    return await rl.question("Path to your file: ");
  } finally {
    // Always release stdin, even when the prompt is aborted or rejects.
    rl.close();
  }
}
|
||||
@@ -0,0 +1,131 @@
|
||||
# LlamaCloud Index Demo
|
||||
|
||||
A TypeScript demo application showcasing the power of **LlamaCloud Index** - a fully automated document ingestion and retrieval service offered within [LlamaCloud](https://cloud.llamaindex.ai). This demo allows you to ask questions, retrieve relevant contextual information, and generate AI-powered responses using OpenAI's GPT models.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Features](#features)
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Installation](#installation)
|
||||
- [Usage](#usage)
|
||||
- [Start the Demo](#start-the-demo)
|
||||
- [Development Mode](#development-mode)
|
||||
- [Build the Project](#build-the-project)
|
||||
- [Code Quality](#code-quality)
|
||||
- [Quick Commands Reference](#quick-commands-reference)
|
||||
- [How It Works](#how-it-works)
|
||||
- [API Dependencies](#api-dependencies)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
- [Common Issues](#common-issues)
|
||||
- [License](#license)
|
||||
- [Contributing](#contributing)
|
||||
|
||||
## Features
|
||||
|
||||
- 🤖 **RAG**: Simple-yet-effective Retrieval Augmented Generation pipeline built on top of LlamaCloud Index and OpenAI
|
||||
- 🎨 **Beautiful CLI**: Styled console interface with colors and ASCII art
|
||||
- ⚡ **Fast Development**: Hot reload support with watch mode
|
||||
- 🛠️ **TypeScript**: Full TypeScript support with strict type checking
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Node.js (version 18 or higher)
|
||||
- pnpm package manager
|
||||
- OpenAI API key
|
||||
- LlamaCloud API key
|
||||
- An existing LlamaCloud Index pipeline
|
||||
|
||||
## Installation
|
||||
|
||||
1. Clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/run-llama/llama_cloud_services
|
||||
cd llama_cloud_services/examples-ts/index/
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
|
||||
```bash
|
||||
pnpm install
|
||||
```
|
||||
|
||||
3. Set up your environment variables:
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY="your-openai-api-key"
|
||||
export LLAMA_CLOUD_API_KEY="your-llamacloud-api-key"
|
||||
export PIPELINE_NAME="your-pipeline-name"
|
||||
```
|
||||
|
||||
4. Or write them into a `.env` file:
|
||||
|
||||
```env
|
||||
OPENAI_API_KEY="your-openai-api-key"
|
||||
LLAMA_CLOUD_API_KEY="your-llamacloud-api-key"
|
||||
PIPELINE_NAME="your-pipeline-name"
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Start the Demo
|
||||
|
||||
```bash
|
||||
pnpm run start
|
||||
```
|
||||
|
||||
The application will display a welcome screen and prompt you to start chatting!
|
||||
|
||||
### Development Mode
|
||||
|
||||
For development with hot reload:
|
||||
|
||||
```bash
|
||||
pnpm run dev
|
||||
```
|
||||
|
||||
### Build the Project
|
||||
|
||||
```bash
|
||||
pnpm run build
|
||||
```
|
||||
|
||||
### Code Quality
|
||||
|
||||
Format code:
|
||||
|
||||
```bash
|
||||
pnpm run format
|
||||
```
|
||||
|
||||
Lint code:
|
||||
|
||||
```bash
|
||||
pnpm run lint
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Message Input**: Enter a message
|
||||
2. **Retrieval**: Several nodes are retrieved from the LlamaCloud index you specified
|
||||
3. **AI Response Generation**: The retrieved information is passed to the AI model along with its relevance score, and a reply to your original message is generated from that context.
|
||||
4. **Results**: View the AI-generated response in your terminal
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Module Resolution Errors**: Ensure you're using Node.js 18+ and have all dependencies installed
|
||||
2. **API Key Issues**: Verify your OpenAI and LlamaCloud API keys are correctly set
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see the [LICENSE](../../LICENSE) file for details.
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Fork the repository
|
||||
2. Create a feature branch
|
||||
3. Make your changes
|
||||
4. Run `pnpm run format` and `pnpm run lint`
|
||||
5. Submit a pull request
|
||||
@@ -0,0 +1,15 @@
|
||||
import js from "@eslint/js";
|
||||
import globals from "globals";
|
||||
import tseslint from "typescript-eslint";
|
||||
import { defineConfig } from "eslint/config";
|
||||
|
||||
// Flat ESLint config: the `js/recommended` rules for all JS/TS sources,
// followed by typescript-eslint's recommended set.
export default defineConfig([
  {
    files: ["**/*.{js,mjs,cjs,ts,mts,cts}"],
    plugins: { js },
    extends: ["js/recommended"],
    // The demo is a Node.js CLI (it uses `process.stdin` / `process.env`),
    // so declare Node globals instead of browser ones.
    languageOptions: { globals: globals.node },
  },
  { files: ["**/*.js"], languageOptions: { sourceType: "script" } },
  tseslint.configs.recommended,
]);
|
||||
@@ -0,0 +1,48 @@
|
||||
{
|
||||
"name": "llama-chat",
|
||||
"version": "0.1.0",
|
||||
"description": "Demo for LlamaCloud Index in TypeScript",
|
||||
"type": "module",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"There are no tests\"",
|
||||
"start": "pnpm exec tsx src/index.ts",
|
||||
"lint": "eslint ./src/",
|
||||
"format": "prettier --write ./src/",
|
||||
"build": "tsc",
|
||||
"dev": "pnpm exec tsx --watch src/index.ts"
|
||||
},
|
||||
"keywords": [
|
||||
"ai",
|
||||
"rag",
|
||||
"retrieval",
|
||||
"pipeline",
|
||||
"llms",
|
||||
"chatbot"
|
||||
],
|
||||
"author": "LlamaIndex",
|
||||
"license": "MIT",
|
||||
"packageManager": "pnpm@10.12.4",
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.32.0",
|
||||
"@types/figlet": "^1.7.0",
|
||||
"@types/node": "^24.1.0",
|
||||
"@typescript-eslint/eslint-plugin": "^8.38.0",
|
||||
"@typescript-eslint/parser": "^8.38.0",
|
||||
"eslint": "^9.32.0",
|
||||
"globals": "^16.3.0",
|
||||
"jiti": "^2.5.1",
|
||||
"prettier": "^3.6.2",
|
||||
"typescript": "^5.8.3",
|
||||
"typescript-eslint": "^8.38.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ai-sdk/openai": "^1.3.23",
|
||||
"ai": "^4.3.19",
|
||||
"consola": "^3.4.2",
|
||||
"dotenv": "^17.2.1",
|
||||
"figlet": "^1.8.2",
|
||||
"llama-cloud-services": "link:../../ts/llama_cloud_services",
|
||||
"picocolors": "^1.1.1"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
import { LlamaCloudIndex } from "llama-cloud-services";
|
||||
import { logger } from "./logger";
|
||||
import pc from "picocolors";
|
||||
import {
|
||||
consoleInput,
|
||||
retrievalAugmentedGeneration,
|
||||
renderLogo,
|
||||
} from "./utils";
|
||||
import dotenv from "dotenv";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
/**
 * Runs the interactive LlamaChat loop.
 *
 * Creates a retriever (top 5 results) over the LlamaCloud index named by the
 * PIPELINE_NAME environment variable, prints the banner and welcome message,
 * then answers each question the user types until they enter "quit".
 *
 * @returns 0 once the user quits.
 * @throws Error when PIPELINE_NAME is not set.
 */
export async function main(): Promise<number> {
  const pipelineName = process.env.PIPELINE_NAME;
  if (!pipelineName) {
    // Fail fast with a clear message instead of handing `undefined` to the index.
    throw new Error("The PIPELINE_NAME environment variable is not set.");
  }

  const index = new LlamaCloudIndex({
    name: pipelineName,
    projectName: "Default",
    apiKey: process.env.LLAMA_CLOUD_API_KEY, // can provide API-key in the constructor or in the env
  });
  const retriever = index.asRetriever({
    similarityTopK: 5,
  });

  await renderLogo();
  logger.log(
    `Welcome to ${pc.bold(
      pc.magentaBright("✨LlamaChat✨"),
    )}, our demo for ${pc.bold(pc.green("Index🦙"))}, a ${pc.bold(
      pc.cyan("LlamaCloud☁️"),
    )} (https://cloud.llamaindex.ai) product!.\nType a question below, and you will get an answer!👇\nIf you wish to exit, just type ${pc.bold(
      pc.gray("quit"),
    )}.\n`,
  );

  while (true) {
    // Trim so that stray whitespace (e.g. "quit ") does not defeat the exit check.
    const userInput = (await consoleInput()).trim();
    if (userInput.toLowerCase() === "quit") {
      break;
    }
    if (userInput === "") {
      // Nothing to ask; prompt again rather than querying with an empty string.
      continue;
    }

    try {
      const nodes = await retriever.retrieve(userInput);
      const summary = await retrievalAugmentedGeneration(nodes, userInput);
      logger.log(`${pc.bold(pc.magentaBright("LlamaChat✨:"))}\n${summary}`);
    } catch (error) {
      // A failed request should not end the chat session.
      logger.error(`Error processing your request: ${error}`);
    }
  }
  return 0;
}

main().catch((error) => {
  console.error(error);
  // Signal the failure to the calling shell instead of exiting with 0.
  process.exitCode = 1;
});
|
||||
@@ -0,0 +1,8 @@
|
||||
import { createConsola } from "consola";
|
||||
import type { ConsolaInstance } from "consola";
|
||||
|
||||
/** Shared consola logger for the demo, created with the `date` format option set to false. */
export const logger: ConsolaInstance = createConsola({ formatOptions: { date: false } });
|
||||
@@ -0,0 +1,56 @@
|
||||
import { generateText } from "ai";
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import { NodeWithScore, MetadataMode } from "llamaindex";
|
||||
import * as readline from "readline/promises";
|
||||
import figlet from "figlet";
|
||||
import pc from "picocolors";
|
||||
|
||||
/**
 * Prints the "LlamaChat" banner to the console.
 *
 * The banner text is rendered by figlet in the "ANSI Shadow" font, styled
 * bold bright-yellow with picocolors, and followed by a gray 60-character
 * rule. A "\n" is logged before the banner and after the rule.
 */
export async function renderLogo(): Promise<void> {
  const banner = pc.bold(
    pc.yellowBright(
      figlet.textSync("LlamaChat", {
        font: "ANSI Shadow",
        horizontalLayout: "default",
        verticalLayout: "default",
        width: 100,
        whitespaceBreak: true,
      }),
    ),
  );
  const divider = pc.gray("─".repeat(60));

  // Emit padding, banner, divider, padding — in that order.
  for (const chunk of ["\n", banner, divider, "\n"]) {
    console.log(chunk);
  }
}
|
||||
|
||||
/**
 * Shows the bright-cyan "You✨:" prompt and resolves with the line the user
 * enters on stdin.
 *
 * @returns The text entered by the user.
 */
export async function consoleInput(): Promise<string> {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  try {
    return await rl.question(pc.cyanBright("You✨:"));
  } finally {
    // Close the interface even when the question rejects, so it is never leaked.
    rl.close();
  }
}
|
||||
|
||||
/**
 * Answers `prompt` with the OpenAI "gpt-4.1" model, using the retrieved
 * nodes as context.
 *
 * Every node becomes one `{information, relevanceScore}` entry in the prompt.
 * A node without a score is reported as -1, which is the sentinel the prompt
 * text describes to the model (previously the string "no score" was emitted,
 * contradicting that description).
 *
 * @param nodes - Retrieved nodes together with their optional scores.
 * @param prompt - The user's original question.
 * @returns The text generated by the model.
 */
export async function retrievalAugmentedGeneration(
  nodes: NodeWithScore[],
  prompt: string,
): Promise<string> {
  const context = nodes
    .map(
      (node) =>
        `\t{information: '${node.node.getContent(
          MetadataMode.ALL,
        )}', relevanceScore: '${node.score ?? -1}'}\n`,
    )
    .join("");

  const { text } = await generateText({
    model: openai("gpt-4.1"),
    prompt: `[\n${context}\n]\n\nBased on the information you are given and on the relevance score of that (where -1 means no score available), answer to this user prompt: '${prompt}'`,
  });

  return text;
}
|
||||
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "ES2022",
|
||||
"lib": ["ES2022"],
|
||||
"outDir": "./dist",
|
||||
"rootDir": "./src",
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"skipLibCheck": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"declaration": true,
|
||||
"declarationMap": true,
|
||||
"sourceMap": true,
|
||||
"types": ["node"],
|
||||
"moduleResolution": "bundler",
|
||||
"allowSyntheticDefaultImports": true,
|
||||
"resolveJsonModule": true
|
||||
},
|
||||
"include": ["src/**/*"],
|
||||
"exclude": ["node_modules", "dist"]
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
# LlamaParse Demo
|
||||
|
||||
A TypeScript demo application showcasing the power of **LlamaParse** - an intelligent document parsing service from [LlamaCloud](https://cloud.llamaindex.ai). This demo allows you to parse various document formats and generate AI-powered summaries using OpenAI's GPT models.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Features](#features)
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Installation](#installation)
|
||||
- [Usage](#usage)
|
||||
- [Start the Demo](#start-the-demo)
|
||||
- [Development Mode](#development-mode)
|
||||
- [Build the Project](#build-the-project)
|
||||
- [Code Quality](#code-quality)
|
||||
- [Quick Commands Reference](#quick-commands-reference)
|
||||
- [How It Works](#how-it-works)
|
||||
- [API Dependencies](#api-dependencies)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
- [Common Issues](#common-issues)
|
||||
- [License](#license)
|
||||
- [Contributing](#contributing)
|
||||
|
||||
## Features
|
||||
|
||||
- 📄 **Document Parsing**: Parse PDFs, Word docs, and other formats using LlamaParse
|
||||
- 🤖 **AI Summaries**: Generate intelligent summaries using OpenAI GPT-4.1
|
||||
- 🎨 **Beautiful CLI**: Styled console interface with colors and ASCII art
|
||||
- ⚡ **Fast Development**: Hot reload support with watch mode
|
||||
- 🛠️ **TypeScript**: Full TypeScript support with strict type checking
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Node.js (version 18 or higher)
|
||||
- pnpm package manager
|
||||
- OpenAI API key
|
||||
- LlamaCloud API key
|
||||
|
||||
## Installation
|
||||
|
||||
1. Clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/run-llama/llama_cloud_services
|
||||
cd llama_cloud_services/examples-ts/parse/
|
||||
```
|
||||
|
||||
2. Install dependencies:
|
||||
|
||||
```bash
|
||||
pnpm install
|
||||
```
|
||||
|
||||
3. Set up your environment variables:
|
||||
|
||||
```bash
|
||||
# Add your API keys to your environment
|
||||
export OPENAI_API_KEY="your-openai-api-key"
|
||||
export LLAMA_CLOUD_API_KEY="your-llamacloud-api-key"
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Start the Demo
|
||||
|
||||
```bash
|
||||
pnpm run start
|
||||
```
|
||||
|
||||
The application will display a welcome screen and prompt you to enter the path to a document you'd like to process.
|
||||
|
||||
### Development Mode
|
||||
|
||||
For development with hot reload:
|
||||
|
||||
```bash
|
||||
pnpm run dev
|
||||
```
|
||||
|
||||
### Build the Project
|
||||
|
||||
```bash
|
||||
pnpm run build
|
||||
```
|
||||
|
||||
### Code Quality
|
||||
|
||||
Format code:
|
||||
|
||||
```bash
|
||||
pnpm run format
|
||||
```
|
||||
|
||||
Lint code:
|
||||
|
||||
```bash
|
||||
pnpm run lint
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Document Input**: Enter the path to your document when prompted
|
||||
2. **Parsing**: LlamaParse processes the document and extracts structured content
|
||||
3. **AI Summary**: The extracted content is sent to OpenAI GPT-4.1 for summarization
|
||||
4. **Results**: View the AI-generated summary in your terminal
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Module Resolution Errors**: Ensure you're using Node.js 18+ and have all dependencies installed
|
||||
2. **API Key Issues**: Verify your OpenAI and LlamaCloud API keys are correctly set
|
||||
3. **File Path Errors**: Use absolute paths or ensure relative paths are correct from the project root
|
||||
|
||||
## License
|
||||
|
||||
MIT License - see the [LICENSE](../../LICENSE) file for details.
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Fork the repository
|
||||
2. Create a feature branch
|
||||
3. Make your changes
|
||||
4. Run `pnpm run format` and `pnpm run lint`
|
||||
5. Submit a pull request
|
||||
@@ -0,0 +1,15 @@
|
||||
import js from "@eslint/js";
|
||||
import globals from "globals";
|
||||
import tseslint from "typescript-eslint";
|
||||
import { defineConfig } from "eslint/config";
|
||||
|
||||
// Flat ESLint config: the `js/recommended` rules for all JS/TS sources,
// followed by typescript-eslint's recommended set.
export default defineConfig([
  {
    files: ["**/*.{js,mjs,cjs,ts,mts,cts}"],
    plugins: { js },
    extends: ["js/recommended"],
    // The demo is a Node.js CLI (it uses `process.stdin`),
    // so declare Node globals instead of browser ones.
    languageOptions: { globals: globals.node },
  },
  { files: ["**/*.js"], languageOptions: { sourceType: "script" } },
  tseslint.configs.recommended,
]);
|
||||
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"name": "llamaparse-demo",
|
||||
"version": "0.1.0",
|
||||
"description": "Demo for LlamaParse in TypeScript",
|
||||
"type": "module",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"There are no tests\"",
|
||||
"start": "pnpm exec tsx src/index.ts",
|
||||
"lint": "eslint ./src/",
|
||||
"format": "prettier --write ./src/",
|
||||
"build": "tsc",
|
||||
"dev": "pnpm exec tsx --watch src/index.ts"
|
||||
},
|
||||
"keywords": [
|
||||
"ai",
|
||||
"ocr",
|
||||
"parsing",
|
||||
"intelligent-document-processing",
|
||||
"pdf",
|
||||
"llms"
|
||||
],
|
||||
"author": "LlamaIndex",
|
||||
"license": "MIT",
|
||||
"packageManager": "pnpm@10.12.4",
|
||||
"devDependencies": {
|
||||
"@eslint/js": "^9.32.0",
|
||||
"@types/figlet": "^1.7.0",
|
||||
"@types/node": "^24.1.0",
|
||||
"@typescript-eslint/eslint-plugin": "^8.38.0",
|
||||
"@typescript-eslint/parser": "^8.38.0",
|
||||
"eslint": "^9.32.0",
|
||||
"globals": "^16.3.0",
|
||||
"jiti": "^2.5.1",
|
||||
"prettier": "^3.6.2",
|
||||
"typescript": "^5.8.3",
|
||||
"typescript-eslint": "^8.38.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ai-sdk/openai": "^1.3.23",
|
||||
"ai": "^4.3.19",
|
||||
"consola": "^3.4.2",
|
||||
"figlet": "^1.8.2",
|
||||
"llama-cloud-services": "link:../../ts/llama_cloud_services",
|
||||
"picocolors": "^1.1.1"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
import { LlamaParseReader } from "llama-cloud-services";
|
||||
import { logger } from "./logger";
|
||||
import pc from "picocolors";
|
||||
import { consoleInput, generateSummary, renderLogo } from "./utils";
|
||||
|
||||
/**
 * Runs the interactive LlamaParse demo loop.
 *
 * Prints the banner and welcome message, then for every path the user types
 * loads the document with LlamaParse (markdown result type) and logs an
 * AI-generated summary, until the user enters "quit".
 *
 * @returns 0 once the user quits.
 */
export async function main(): Promise<number> {
  const reader = new LlamaParseReader({ resultType: "markdown" });

  await renderLogo();
  logger.log(
    `Welcome to ${pc.bold(
      pc.magentaBright("✨LlamaParse Demo✨"),
    )}, our demo for ${pc.bold(pc.green("LlamaParse🦙"))}, a ${pc.bold(
      pc.cyan("LlamaCloud☁️"),
    )} (https://cloud.llamaindex.ai) product!.\nType the path to the document you would like to process below👇\nIf you wish to exit, just type ${pc.bold(
      pc.gray("quit"),
    )}.\n`,
  );

  while (true) {
    // Trim so that stray whitespace around a path or "quit" is ignored.
    const userInput = (await consoleInput()).trim();
    if (userInput.toLowerCase() === "quit") {
      break;
    }
    if (userInput === "") {
      // No path given; prompt again rather than trying to load "".
      continue;
    }

    try {
      const documents = await reader.loadData(userInput);
      const summary = await generateSummary(documents);
      logger.log(`${pc.bold(pc.cyan("AI-generated summary✨"))}:\n${summary}`);
    } catch (error) {
      // A file that fails to process should not end the session.
      logger.error(`Error processing file: ${error}`);
    }
  }
  return 0;
}

main().catch((error) => {
  console.error(error);
  // Signal the failure to the calling shell instead of exiting with 0.
  process.exitCode = 1;
});
|
||||
@@ -0,0 +1,8 @@
|
||||
import { createConsola } from "consola";
|
||||
import type { ConsolaInstance } from "consola";
|
||||
|
||||
/** Shared consola logger for the demo, created with the `date` format option set to false. */
export const logger: ConsolaInstance = createConsola({ formatOptions: { date: false } });
|
||||
@@ -0,0 +1,51 @@
|
||||
import { generateText } from "ai";
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import { Document } from "llamaindex";
|
||||
import * as readline from "readline/promises";
|
||||
import figlet from "figlet";
|
||||
import pc from "picocolors";
|
||||
|
||||
/**
 * Prints the "LlamaParse Demo" banner to the console.
 *
 * The banner text is rendered by figlet in the "ANSI Shadow" font, styled
 * bold bright-magenta with picocolors, and followed by a gray 60-character
 * rule. A "\n" is logged before the banner and after the rule.
 */
export async function renderLogo(): Promise<void> {
  const banner = pc.bold(
    pc.magentaBright(
      figlet.textSync("LlamaParse Demo", {
        font: "ANSI Shadow",
        horizontalLayout: "default",
        verticalLayout: "default",
        width: 100,
        whitespaceBreak: true,
      }),
    ),
  );
  const divider = pc.gray("─".repeat(60));

  // Emit padding, banner, divider, padding — in that order.
  for (const chunk of ["\n", banner, divider, "\n"]) {
    console.log(chunk);
  }
}
|
||||
|
||||
/**
 * Prompts with "Path to your file: " and resolves with the line the user
 * enters on stdin.
 *
 * @returns The text entered by the user.
 */
export async function consoleInput(): Promise<string> {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  try {
    return await rl.question("Path to your file: ");
  } finally {
    // Close the interface even when the question rejects, so it is never leaked.
    rl.close();
  }
}
|
||||
|
||||
/**
 * Asks the OpenAI "gpt-4.1" model for a summary of the given documents.
 *
 * The text of every document is concatenated, each followed by a
 * "\n\n---\n\n" separator, and wrapped in a `<chat>` envelope together with
 * the summarization instruction.
 *
 * @param documents - Documents whose `text` should be summarized.
 * @returns The summary text generated by the model.
 */
export async function generateSummary(documents: Document[]): Promise<string> {
  const mainText = documents
    .map((document) => `${document.text}\n\n---\n\n`)
    .join("");

  const { text } = await generateText({
    model: openai("gpt-4.1"),
    // The envelope must start with an opening <chat> tag; it previously began
    // with a closing </chat>, leaving the markup unbalanced.
    prompt: `<chat>\n\t<text>${mainText}</text>\n\t<instructions>Could you please generate a summary of the given text?</instructions>\n</chat>`,
  });

  return text;
}
|
||||
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "ES2022",
|
||||
"lib": ["ES2022"],
|
||||
"outDir": "./dist",
|
||||
"rootDir": "./src",
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"skipLibCheck": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"declaration": true,
|
||||
"declarationMap": true,
|
||||
"sourceMap": true,
|
||||
"types": ["node"],
|
||||
"moduleResolution": "bundler",
|
||||
"allowSyntheticDefaultImports": true,
|
||||
"resolveJsonModule": true
|
||||
},
|
||||
"include": ["src/**/*"],
|
||||
"exclude": ["node_modules", "dist"]
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
# LlamaCloud Services Examples - Python
|
||||
> **⚠️ DEPRECATION NOTICE**
|
||||
>
|
||||
> This repository and its packages are deprecated and will be maintained until **May 1, 2026**.
|
||||
>
|
||||
> **Please migrate to the new packages:**
|
||||
> - **Python**: `pip install llama-cloud>=1.0` ([GitHub](https://github.com/run-llama/llama-cloud-py))
|
||||
> - **TypeScript**: `npm install @llamaindex/llama-cloud` ([GitHub](https://github.com/run-llama/llama-cloud-ts))
|
||||
>
|
||||
> The new packages provide the same functionality with improved performance, better support, and active development.
|
||||
|
||||
|
||||
In this folder you will find several Python notebooks that contain examples regarding:
|
||||
|
||||
- [LlamaParse](./parse/)
|
||||
- [LlamaExtract](./extract/)
|
||||
- [LlamaCloudIndex](./index/)
|
||||
|
||||
Follow the instructions in each notebook to get started!
|
||||
@@ -1,302 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# LlamaParse Agent\n",
|
||||
"\n",
|
||||
"This demo walks through using an OpenAI Agent with [LlamaParse](https://cloud.llamaindex.ai)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install llama-parse llama-index llama-index-postprocessor-sbert-rerank"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\"\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core import Settings\n",
|
||||
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
|
||||
"from llama_index.llms.openai import OpenAI\n",
|
||||
"\n",
|
||||
"Settings.embed_model = OpenAIEmbedding(model=\"text-embedding-3-small\")\n",
|
||||
"Settings.llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Parsing \n",
|
||||
"\n",
|
||||
"For parsing, lets use a [recent paper](https://huggingface.co/papers/2403.09611) on Multi-Modal pretraining"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget https://arxiv.org/pdf/2403.09611.pdf -O paper.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Below, we can tell the parser to skip content we don't want. In this case, the references section will just add noise to a RAG system."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"parser = LlamaParse(\n",
|
||||
" result_type=\"markdown\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id 81251f39-01be-434e-99e8-1c1b83b82098\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"documents = await parser.aload_data(\"paper.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Embeddings have been explicitly disabled. Using MockEmbedding.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"41it [00:00, 26765.21it/s]\n",
|
||||
"100%|██████████| 41/41 [00:13<00:00, 2.98it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()\n",
|
||||
"\n",
|
||||
"from llama_index.core.node_parser import (\n",
|
||||
" MarkdownElementNodeParser,\n",
|
||||
" SentenceSplitter,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# explicitly extract tables with the MarkdownElementNodeParser\n",
|
||||
"node_parser = MarkdownElementNodeParser(num_workers=8)\n",
|
||||
"nodes = node_parser.get_nodes_from_documents(documents)\n",
|
||||
"nodes, objects = node_parser.get_nodes_and_objects(nodes)\n",
|
||||
"\n",
|
||||
"# Chain splitters to ensure chunk size requirements are met\n",
|
||||
"nodes = SentenceSplitter(chunk_size=512, chunk_overlap=20).get_nodes_from_documents(\n",
|
||||
" nodes\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Chat over the paper, lets find out what it is about!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core import VectorStoreIndex, SummaryIndex\n",
|
||||
"\n",
|
||||
"vector_index = VectorStoreIndex(nodes=nodes)\n",
|
||||
"summary_index = SummaryIndex(nodes=nodes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.agent.openai import OpenAIAgent\n",
|
||||
"from llama_index.core.tools import QueryEngineTool, ToolMetadata\n",
|
||||
"from llama_index.postprocessor.colbert_rerank import ColbertRerank\n",
|
||||
"\n",
|
||||
"tools = [\n",
|
||||
" QueryEngineTool(\n",
|
||||
" vector_index.as_query_engine(\n",
|
||||
" similarity_top_k=8, node_postprocessors=[ColbertRerank(top_n=3)]\n",
|
||||
" ),\n",
|
||||
" metadata=ToolMetadata(\n",
|
||||
" name=\"search\",\n",
|
||||
" description=\"Search the document, pass the entire user message in the query\",\n",
|
||||
" ),\n",
|
||||
" ),\n",
|
||||
" QueryEngineTool(\n",
|
||||
" summary_index.as_query_engine(),\n",
|
||||
" metadata=ToolMetadata(\n",
|
||||
" name=\"summarize\",\n",
|
||||
" description=\"Summarize the document using the user message\",\n",
|
||||
" ),\n",
|
||||
" ),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"agent = OpenAIAgent.from_tools(tools=tools, verbose=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Added user message to memory: What is the summary of the paper?\n",
|
||||
"=== Calling Function ===\n",
|
||||
"Calling function: summarize with args: {\"input\":\"summary\"}\n",
|
||||
"Got output: The research focuses on developing Multimodal Large Language Models (MLLMs) by incorporating image-caption, interleaved image-text, and text-only data for pre-training. It highlights the importance of factors like the image encoder, resolution, and token count, while downplaying the design of the vision-language connector. With models scaling up to 30B parameters, the MM1 family demonstrates impressive performance in pre-training metrics and competitive outcomes on diverse multimodal benchmarks. It demonstrates abilities such as in-context learning and multi-image reasoning, aiming to provide valuable insights for creating MLLMs that benefit the research community.\n",
|
||||
"========================\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# note -- this will take a while with local LLMs, its sending every node in the document to the LLM\n",
|
||||
"resp = agent.chat(\"What is the summary of the paper?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The summary of the paper highlights the development of Multimodal Large Language Models (MLLMs) by incorporating image-caption, interleaved image-text, and text-only data for pre-training. The research emphasizes factors like the image encoder, resolution, and token count, while de-emphasizing the design of the vision-language connector. The MM1 family of models, scaling up to 30B parameters, shows impressive performance in pre-training metrics and competitive outcomes on various multimodal benchmarks. These models demonstrate capabilities such as in-context learning and multi-image reasoning, aiming to provide valuable insights for creating MLLMs that benefit the research community.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(str(resp))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Added user message to memory: How do the authors evaluate their work?\n",
|
||||
"=== Calling Function ===\n",
|
||||
"Calling function: search with args: {\"input\":\"evaluation methods\"}\n",
|
||||
"Got output: The evaluation methods involve synthesizing all benchmark results into a single meta-average number to simplify comparisons. This is achieved by normalizing the evaluation metrics with respect to a baseline configuration, standardizing the results for each task, adjusting every metric by dividing it by its respective baseline, and then averaging across all metrics.\n",
|
||||
"========================\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"resp = agent.chat(\"How do the authors evaluate their work?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The authors evaluate their work by synthesizing all benchmark results into a single meta-average number to simplify comparisons. They normalize the evaluation metrics with respect to a baseline configuration, standardize the results for each task, adjust every metric by dividing it by its respective baseline, and then average across all metrics for evaluation.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(str(resp))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llama-parse-aNC435Vv-py3.10",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
sample_files/
|
||||
@@ -0,0 +1,815 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Batch Parse with LlamaCloud Directories\n",
|
||||
"\n",
|
||||
"This notebook demonstrates how to use LlamaCloud's batch processing API to parse multiple files in a directory. The workflow includes:\n",
|
||||
"\n",
|
||||
"1. **Creating a Directory** - Set up a directory to organize your files\n",
|
||||
"2. **Uploading Files** - Upload multiple files to the directory\n",
|
||||
"3. **Starting a Batch Parse Job** - Kick off batch processing on all files\n",
|
||||
"4. **Monitoring Progress** - Check the status and view results\n",
|
||||
"\n",
|
||||
"This is useful when you need to parse many documents at once, as the batch API handles the orchestration and provides progress tracking."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0c2b5e1a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
 **⚠️ DEPRECATION">
"> **⚠️ DEPRECATION NOTICE**\n>\n> This example uses the deprecated `llama-cloud-services` package, which will be maintained until **May 1, 2026**.\n>\n> **Please migrate to:**\n> - **Python**: `pip install \"llama-cloud>=1.0\"` ([GitHub](https://github.com/run-llama/llama-cloud-py))\n> - **New Package Documentation**: https://docs.cloud.llamaindex.ai/\n>\n> The new package provides the same functionality with improved performance and support."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup and Installation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install llama-cloud python-dotenv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"import httpx\n",
|
||||
"\n",
|
||||
"# Load environment variables\n",
|
||||
"load_dotenv()\n",
|
||||
"\n",
|
||||
"# Set your API key\n",
|
||||
"LLAMA_CLOUD_API_KEY = os.environ.get(\"LLAMA_CLOUD_API_KEY\", \"llx-...\")\n",
|
||||
"\n",
|
||||
"# Optional: Set base URL (defaults to https://api.cloud.llamaindex.ai if not set)\n",
|
||||
"LLAMA_CLOUD_BASE_URL = os.environ.get(\n",
|
||||
" \"LLAMA_CLOUD_BASE_URL\", \"https://api.cloud.llamaindex.ai\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Optional: Set project_id if you have one, otherwise it will use your default project\n",
|
||||
"PROJECT_ID = os.environ.get(\"LLAMA_CLOUD_PROJECT_ID\", None)\n",
|
||||
"\n",
|
||||
"print(\"✅ API key configured\")\n",
|
||||
"print(f\" Base URL: {LLAMA_CLOUD_BASE_URL}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup HTTP Client\n",
|
||||
"\n",
|
||||
"Since the current version of the llama-cloud SDK has some issues with the beta endpoints, we'll use direct HTTP requests with httpx for reliability."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create HTTP client with authentication\n",
|
||||
"headers = {\n",
|
||||
" \"Authorization\": f\"Bearer {LLAMA_CLOUD_API_KEY}\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(\"✅ HTTP client configured\")\n",
|
||||
"print(f\" Using base URL: {LLAMA_CLOUD_BASE_URL}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 1: Create a Directory\n",
|
||||
"\n",
|
||||
"First, we'll create a directory to organize our files. Directories help you group related files together for batch processing."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"# Create a directory with a timestamp in the name\n",
|
||||
"timestamp = datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
|
||||
"directory_name = f\"batch-parse-demo-{timestamp}\"\n",
|
||||
"\n",
|
||||
"# Create directory using HTTP request\n",
|
||||
"response = httpx.post(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/directories\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": PROJECT_ID},\n",
|
||||
" json={\n",
|
||||
" \"name\": directory_name,\n",
|
||||
" \"description\": \"Demo directory for batch parse example\",\n",
|
||||
" },\n",
|
||||
" timeout=60.0,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code in [200, 201]:\n",
|
||||
" directory = response.json()\n",
|
||||
" directory_id = directory[\"id\"]\n",
|
||||
" project_id = directory[\"project_id\"]\n",
|
||||
"\n",
|
||||
" print(f\"✅ Created directory: {directory['name']}\")\n",
|
||||
" print(f\" Directory ID: {directory_id}\")\n",
|
||||
" print(f\" Project ID: {project_id}\")\n",
|
||||
"else:\n",
|
||||
" raise Exception(\n",
|
||||
" f\"Failed to create directory: {response.status_code} - {response.text}\"\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 2: Upload Files to the Directory\n",
|
||||
"\n",
|
||||
"Now we'll upload some files to our directory. For this demo, we'll download some sample PDFs and upload them.\n",
|
||||
"\n",
|
||||
"You can replace these with your own files."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a directory for sample files\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"os.makedirs(\"sample_files\", exist_ok=True)\n",
|
||||
"\n",
|
||||
"# Sample documents to download\n",
|
||||
"sample_docs = {\n",
|
||||
" \"attention.pdf\": \"https://arxiv.org/pdf/1706.03762.pdf\",\n",
|
||||
" \"bert.pdf\": \"https://arxiv.org/pdf/1810.04805.pdf\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Download sample documents\n",
|
||||
"for filename, url in sample_docs.items():\n",
|
||||
" filepath = f\"sample_files/{filename}\"\n",
|
||||
" if not os.path.exists(filepath):\n",
|
||||
" print(f\"📥 Downloading {filename}...\")\n",
|
||||
" response = requests.get(url)\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" with open(filepath, \"wb\") as f:\n",
|
||||
" f.write(response.content)\n",
|
||||
" print(f\" ✅ Downloaded {filename}\")\n",
|
||||
" else:\n",
|
||||
" print(f\" ❌ Failed to download {filename}\")\n",
|
||||
" else:\n",
|
||||
" print(f\"📁 {filename} already exists\")\n",
|
||||
"\n",
|
||||
"print(\"\\n✅ Sample files ready!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-10",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Upload Files to Directory\n",
|
||||
"\n",
|
||||
"Now let's upload the files to our directory using the `upload_file_to_directory` endpoint."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-11",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"uploaded_files = []\n",
|
||||
"\n",
|
||||
"# Workaround: Use direct HTTP requests instead of SDK due to SDK bug\n",
|
||||
"import httpx\n",
|
||||
"\n",
|
||||
"for filename in os.listdir(\"sample_files\"):\n",
|
||||
" if filename.endswith(\".pdf\"):\n",
|
||||
" filepath = f\"sample_files/{filename}\"\n",
|
||||
"\n",
|
||||
" print(f\"📤 Uploading {filename}...\")\n",
|
||||
"\n",
|
||||
" # Upload file using direct HTTP request (SDK has a bug with file uploads)\n",
|
||||
" with open(filepath, \"rb\") as f:\n",
|
||||
" # Prepare the multipart form data correctly\n",
|
||||
" files = {\"upload_file\": (filename, f, \"application/pdf\")}\n",
|
||||
"\n",
|
||||
" # Make the request directly\n",
|
||||
" response = httpx.post(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/directories/{directory_id}/files/upload\",\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" files=files,\n",
|
||||
" headers={\"Authorization\": f\"Bearer {LLAMA_CLOUD_API_KEY}\"},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code in [200, 201]:\n",
|
||||
" directory_file = response.json()\n",
|
||||
" uploaded_files.append(directory_file)\n",
|
||||
" print(f\" ✅ Uploaded: {directory_file.get('display_name')}\")\n",
|
||||
" print(f\" File ID: {directory_file.get('id')}\")\n",
|
||||
" else:\n",
|
||||
" print(f\" ❌ Upload failed: {response.status_code}\")\n",
|
||||
" print(f\" Error: {response.text[:200]}\")\n",
|
||||
"\n",
|
||||
"print(f\"\\n✅ Uploaded {len(uploaded_files)} files to directory\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-12",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 3: Create a Batch Parse Job\n",
|
||||
"\n",
|
||||
"Now that we have files in our directory, let's create a batch parse job to process them all at once.\n",
|
||||
"\n",
|
||||
"The batch processing API uses the same configuration as LlamaParse."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-13",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Configure the parse job\n",
|
||||
"# This configuration will apply to all files in the directory\n",
|
||||
"job_config = {\n",
|
||||
" \"job_name\": \"parse_raw_file_job\", # Must match the JobNames enum value\n",
|
||||
" \"partitions\": {},\n",
|
||||
" \"parameters\": {\n",
|
||||
" \"type\": \"parse\",\n",
|
||||
" \"lang\": \"en\",\n",
|
||||
" \"fast_mode\": True,\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(\"✅ Job configuration created\")\n",
|
||||
"print(f\" Language: {job_config['parameters']['lang']}\")\n",
|
||||
"print(f\" Fast mode: {job_config['parameters']['fast_mode']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-14",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Submit the Batch Job\n",
|
||||
"\n",
|
||||
"Now let's submit the batch job to process all files in the directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-15",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(f\"🚀 Submitting batch parse job for directory: {directory_id}\")\n",
|
||||
"print(f\" Processing {len(uploaded_files)} files...\\n\")\n",
|
||||
"\n",
|
||||
"# Submit batch job using HTTP request\n",
|
||||
"response = httpx.post(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" json={\n",
|
||||
" \"directory_id\": directory_id,\n",
|
||||
" \"job_config\": job_config,\n",
|
||||
" \"page_size\": 100, # Number of files to fetch per batch\n",
|
||||
" \"continue_as_new_threshold\": 10, # Workflow continuation threshold\n",
|
||||
" },\n",
|
||||
" timeout=60.0,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code in [200, 201]:\n",
|
||||
" batch_job = response.json()\n",
|
||||
" batch_job_id = batch_job[\"id\"]\n",
|
||||
"\n",
|
||||
" print(\"✅ Batch job submitted successfully!\")\n",
|
||||
" print(f\" Batch Job ID: {batch_job_id}\")\n",
|
||||
" print(f\" Workflow ID: {batch_job.get('workflow_id')}\")\n",
|
||||
" print(f\" Status: {batch_job.get('status')}\")\n",
|
||||
" print(f\" Total Items: {batch_job.get('total_items')}\")\n",
|
||||
"else:\n",
|
||||
" raise Exception(\n",
|
||||
" f\"Failed to create batch job: {response.status_code} - {response.text}\"\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-16",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 4: Monitor Job Progress\n",
|
||||
"\n",
|
||||
"Now let's monitor the batch job progress. We'll poll the status endpoint to see how the job is progressing."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-17",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def print_job_status(status_data):\n",
|
||||
" \"\"\"Helper function to print job status in a readable format.\"\"\"\n",
|
||||
" job = status_data[\"job\"]\n",
|
||||
" progress_pct = status_data[\"progress_percentage\"]\n",
|
||||
"\n",
|
||||
" print(f\"\\n{'='*60}\")\n",
|
||||
" print(f\"Job Status: {job['status']}\")\n",
|
||||
" print(f\"{'='*60}\")\n",
|
||||
" print(f\"Total Items: {job['total_items']}\")\n",
|
||||
" print(f\"Completed: {job['processed_items']}\")\n",
|
||||
" print(f\"Failed: {job['failed_items']}\")\n",
|
||||
" print(f\"Skipped: {job['skipped_items']}\")\n",
|
||||
" print(f\"Progress: {progress_pct:.1f}%\")\n",
|
||||
"\n",
|
||||
" if job.get(\"completed_at\"):\n",
|
||||
" print(f\"Completed At: {job['completed_at']}\")\n",
|
||||
" elif job.get(\"started_at\"):\n",
|
||||
" print(f\"Started At: {job['started_at']}\")\n",
|
||||
"\n",
|
||||
" print(f\"{'='*60}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Poll for status updates\n",
|
||||
"print(\"🔄 Monitoring batch job progress...\")\n",
|
||||
"print(\n",
|
||||
" \"Note: It may take a few seconds for the workflow to initialize and count files.\\n\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"max_polls = 60 # Maximum number of status checks (increased for longer jobs)\n",
|
||||
"poll_interval = 10 # Seconds between checks\n",
|
||||
"\n",
|
||||
"for i in range(max_polls):\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing/{batch_job_id}\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" status_data = response.json()\n",
|
||||
" print_job_status(status_data)\n",
|
||||
"\n",
|
||||
" # Check if job is complete\n",
|
||||
" job_status = status_data[\"job\"][\"status\"]\n",
|
||||
" if job_status in [\"completed\", \"failed\", \"cancelled\"]:\n",
|
||||
" print(f\"\\n✅ Job finished with status: {job_status}\")\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" if i < max_polls - 1:\n",
|
||||
" print(f\"\\n⏳ Waiting {poll_interval} seconds before next check...\")\n",
|
||||
" time.sleep(poll_interval)\n",
|
||||
" else:\n",
|
||||
" print(f\"Error getting status: {response.status_code} - {response.text}\")\n",
|
||||
" break\n",
|
||||
"else:\n",
|
||||
" print(f\"\\n⚠️ Reached maximum polling attempts. Job may still be running.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-18",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 5: View Job Items\n",
|
||||
"\n",
|
||||
"Let's look at the individual items in the batch job to see which files were processed successfully."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-19",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get all items in the batch job\n",
|
||||
"response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing/{batch_job_id}/items\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id, \"limit\": 100},\n",
|
||||
" timeout=60.0,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code == 200:\n",
|
||||
" items_response = response.json()\n",
|
||||
"\n",
|
||||
" print(f\"\\n📋 Batch Job Items ({items_response['total_size']} total)\")\n",
|
||||
" print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
" for item in items_response[\"items\"]:\n",
|
||||
" status_emoji = (\n",
|
||||
" \"✅\"\n",
|
||||
" if item[\"status\"] == \"completed\"\n",
|
||||
" else \"❌\"\n",
|
||||
" if item[\"status\"] == \"failed\"\n",
|
||||
" else \"⏳\"\n",
|
||||
" )\n",
|
||||
" print(f\"{status_emoji} {item['item_name']}\")\n",
|
||||
" print(f\" Status: {item['status']}\")\n",
|
||||
" print(f\" Item ID: {item['item_id']}\")\n",
|
||||
"\n",
|
||||
" if item.get(\"error_message\"):\n",
|
||||
" print(f\" Error: {item['error_message']}\")\n",
|
||||
"\n",
|
||||
" print()\n",
|
||||
"else:\n",
|
||||
" print(f\"Error listing items: {response.status_code} - {response.text}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-20",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 6: Retrieve Processing Results\n",
|
||||
"\n",
|
||||
"For each completed file, we can retrieve the processing results to see where the parsed output is stored."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-21",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get processing results for a specific item\n",
|
||||
"if items_response[\"items\"]:\n",
|
||||
" first_item = items_response[\"items\"][0]\n",
|
||||
"\n",
|
||||
" print(f\"\\n🔍 Processing results for: {first_item['item_name']}\")\n",
|
||||
" print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing/items/{first_item['item_id']}/processing-results\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" results = response.json()\n",
|
||||
"\n",
|
||||
" print(f\"Item: {results['item_name']}\")\n",
|
||||
" print(f\"Total processing runs: {len(results['processing_results'])}\\n\")\n",
|
||||
"\n",
|
||||
" for i, result in enumerate(results[\"processing_results\"], 1):\n",
|
||||
" print(f\"Run {i}:\")\n",
|
||||
" print(f\" Job Type: {result['job_type']}\")\n",
|
||||
" print(f\" Processed At: {result['processed_at']}\")\n",
|
||||
" print(f\" Parameters Hash: {result['parameters_hash']}\")\n",
|
||||
"\n",
|
||||
" if result.get(\"output_s3_path\"):\n",
|
||||
" print(f\" Output S3 Path: {result['output_s3_path']}\")\n",
|
||||
"\n",
|
||||
" if result.get(\"output_metadata\"):\n",
|
||||
" print(f\" Output Metadata: {result['output_metadata']}\")\n",
|
||||
"\n",
|
||||
" print()\n",
|
||||
" else:\n",
|
||||
" print(f\"Error getting results: {response.status_code} - {response.text}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cell-22",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Optional: List All Batch Jobs\n",
|
||||
"\n",
|
||||
"You can also list all batch jobs in your project to see the history of batch processing operations."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cell-23",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# List all parse jobs in the project\n",
|
||||
"response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/beta/batch-processing\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id, \"job_type\": \"parse\", \"limit\": 10},\n",
|
||||
" timeout=60.0,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if response.status_code == 200:\n",
|
||||
" jobs_response = response.json()\n",
|
||||
"\n",
|
||||
" print(f\"\\n📊 Recent Batch Parse Jobs ({jobs_response['total_size']} total)\")\n",
|
||||
" print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
" for job in jobs_response[\"items\"]:\n",
|
||||
" status_emoji = (\n",
|
||||
" \"✅\"\n",
|
||||
" if job[\"status\"] == \"completed\"\n",
|
||||
" else \"❌\"\n",
|
||||
" if job[\"status\"] == \"failed\"\n",
|
||||
" else \"⏳\"\n",
|
||||
" )\n",
|
||||
" print(f\"{status_emoji} Job ID: {job['id']}\")\n",
|
||||
" print(f\" Status: {job['status']}\")\n",
|
||||
" print(f\" Directory: {job['directory_id']}\")\n",
|
||||
" print(f\" Total Items: {job['total_items']}\")\n",
|
||||
" print(f\" Completed: {job['processed_items']}\")\n",
|
||||
" print(f\" Created: {job['created_at']}\")\n",
|
||||
" print()\n",
|
||||
"else:\n",
|
||||
" print(f\"Error listing jobs: {response.status_code} - {response.text}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "uug7591rkq",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 7: Retrieve Parsed Text Results\n",
|
||||
"\n",
|
||||
"Once the batch job is complete, each BatchJobItem will have a `job_id` field that maps to a parse job ID. We can use this ID with the standard parse client methods to fetch the actual parsed text results."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "vpp0vxtc0y",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get all completed items and their job IDs\n",
|
||||
"completed_items = [\n",
|
||||
" item for item in items_response[\"items\"] if item[\"status\"] == \"completed\"\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"print(f\"📄 Found {len(completed_items)} completed items\\n\")\n",
|
||||
"print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
"# Display the job_id for each completed item\n",
|
||||
"for item in completed_items:\n",
|
||||
" print(f\"📝 {item['item_name']}\")\n",
|
||||
" print(f\" Item ID: {item['item_id']}\")\n",
|
||||
" print(f\" Parse Job ID: {item['job_id']}\")\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4gck6hwpnl6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Fetch Parsed Text for a Specific Document\n",
|
||||
"\n",
|
||||
"Now let's use the `job_id` to retrieve the actual parsed text content using the parse client methods."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "g191kvgxxvk",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get the parsed text for the first completed item\n",
|
||||
"if completed_items:\n",
|
||||
" first_completed = completed_items[0]\n",
|
||||
"\n",
|
||||
" print(f\"📖 Retrieving parsed text for: {first_completed['item_name']}\")\n",
|
||||
" print(f\" Using Parse Job ID: {first_completed['job_id']}\\n\")\n",
|
||||
" print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
" # Use the job_id to fetch the parse result\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/parsing/job/{first_completed['job_id']}/result/text\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" parse_result = response.text\n",
|
||||
"\n",
|
||||
" print(f\"✅ Retrieved parsed text ({len(parse_result)} characters)\\n\")\n",
|
||||
"\n",
|
||||
" # Display first 1000 characters as a preview\n",
|
||||
" print(\"Preview (first 1000 characters):\")\n",
|
||||
" print(\"-\" * 80)\n",
|
||||
" print(parse_result[:1000])\n",
|
||||
" print(\"-\" * 80)\n",
|
||||
"\n",
|
||||
" if len(parse_result) > 1000:\n",
|
||||
" print(f\"\\n... and {len(parse_result) - 1000} more characters\")\n",
|
||||
" else:\n",
|
||||
" print(\n",
|
||||
" f\"Error retrieving parse result: {response.status_code} - {response.text}\"\n",
|
||||
" )\n",
|
||||
"else:\n",
|
||||
" print(\"⚠️ No completed items found to retrieve results from\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2olccb4l8fj",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retrieve Parsed Results in Other Formats\n",
|
||||
"\n",
|
||||
"You can also retrieve the parsed results in JSON or Markdown format using different client methods."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "lcqsfxiw0sr",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if completed_items:\n",
|
||||
" first_completed = completed_items[0]\n",
|
||||
"\n",
|
||||
" print(\n",
|
||||
" f\"📋 Retrieving parse results in different formats for: {first_completed['item_name']}\\n\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Get as JSON (includes structured data with pages, images, etc.)\n",
|
||||
" print(\"1️⃣ Retrieving as JSON...\")\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/parsing/job/{first_completed['job_id']}/result/json\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" json_result = response.json()\n",
|
||||
" print(f\" ✅ JSON result with {len(json_result['pages'])} pages\")\n",
|
||||
" print(f\" Keys: {list(json_result.keys())}\\n\")\n",
|
||||
" else:\n",
|
||||
" print(f\" Error: {response.status_code}\\n\")\n",
|
||||
"\n",
|
||||
" # Get as Markdown\n",
|
||||
" print(\"2️⃣ Retrieving as Markdown...\")\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/parsing/job/{first_completed['job_id']}/result/markdown\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" markdown_result = response.text\n",
|
||||
" print(f\" ✅ Markdown result ({len(markdown_result)} characters)\\n\")\n",
|
||||
"\n",
|
||||
" # Display markdown preview\n",
|
||||
" print(\"Markdown Preview (first 500 characters):\")\n",
|
||||
" print(\"-\" * 80)\n",
|
||||
" print(markdown_result[:500])\n",
|
||||
" print(\"-\" * 80)\n",
|
||||
"\n",
|
||||
" if len(markdown_result) > 500:\n",
|
||||
" print(f\"\\n... and {len(markdown_result) - 500} more characters\")\n",
|
||||
" else:\n",
|
||||
" print(f\" Error: {response.status_code}\")\n",
|
||||
"else:\n",
|
||||
" print(\"⚠️ No completed items found to retrieve results from\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "lr61wqkfq3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Batch Process All Parsed Results\n",
|
||||
"\n",
|
||||
"You can also loop through all completed items to retrieve and process all the parsed results."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "kltydf9xzkl",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Process all completed items\n",
|
||||
"print(f\"🔄 Processing all {len(completed_items)} completed items...\\n\")\n",
|
||||
"print(f\"{'='*80}\\n\")\n",
|
||||
"\n",
|
||||
"all_results = {}\n",
|
||||
"\n",
|
||||
"for item in completed_items:\n",
|
||||
" print(f\"📄 Processing: {item['item_name']}\")\n",
|
||||
" print(f\" Parse Job ID: {item['job_id']}\")\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" # Retrieve the parsed text for this item\n",
|
||||
" response = httpx.get(\n",
|
||||
" f\"{LLAMA_CLOUD_BASE_URL}/api/v1/parsing/job/{item['job_id']}/result/text\",\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"project_id\": project_id},\n",
|
||||
" timeout=60.0,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" parsed_text = response.text\n",
|
||||
"\n",
|
||||
" all_results[item[\"item_name\"]] = {\n",
|
||||
" \"job_id\": item[\"job_id\"],\n",
|
||||
" \"text\": parsed_text,\n",
|
||||
" \"length\": len(parsed_text),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" print(f\" ✅ Retrieved {len(parsed_text)} characters\")\n",
|
||||
" else:\n",
|
||||
" all_results[item[\"item_name\"]] = {\n",
|
||||
" \"job_id\": item[\"job_id\"],\n",
|
||||
" \"error\": f\"HTTP {response.status_code}\",\n",
|
||||
" }\n",
|
||||
" print(f\" ❌ Error: HTTP {response.status_code}\")\n",
|
||||
"\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\" ❌ Error: {str(e)}\")\n",
|
||||
" all_results[item[\"item_name\"]] = {\"job_id\": item[\"job_id\"], \"error\": str(e)}\n",
|
||||
"\n",
|
||||
" print()\n",
|
||||
"\n",
|
||||
"print(f\"{'='*80}\")\n",
|
||||
"print(f\"\\n✅ Processed {len(all_results)} items\")\n",
|
||||
"print(f\"\\nSummary:\")\n",
|
||||
"for name, result in all_results.items():\n",
|
||||
" if \"error\" in result:\n",
|
||||
" print(f\" ❌ {name}: Error - {result['error']}\")\n",
|
||||
" else:\n",
|
||||
" print(f\" ✅ {name}: {result['length']:,} characters\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,529 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c148b65e-e8a6-476e-86ba-bf6a73d479c7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# RAG over the Caltrain Weekend Schedule \n",
|
||||
"\n",
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_parse/blob/main/examples/caltrain/caltrain_text_mode.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
|
||||
"\n",
|
||||
"This example shows off LlamaParse parsing capabilities to build a functioning query pipeline over the Caltrain weekend schedule, a big timetable containing all trains northbound and southbound and their stops in various cities.\n",
|
||||
"\n",
|
||||
"Naive parsing solutions fail to preserve this tabular layout, leading to LLM hallucinations. In contrast, LlamaParse text mode lays out the table spatially in a neat format, enabling more sophisticated LLMs like gpt-4o to understand the spacing and reason over all the numbers.\n",
|
||||
"\n",
|
||||
"**NOTE**: LlamaParse markdown mode doesn't quite work yet - it's in development!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ef115dbe-b834-4639-828e-e2c11aef710b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"Download the data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e6ae2e38-30c9-4865-aa13-47780bc3848f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "335ce1d0-757a-4f09-846e-21c409768871",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget \"https://www.caltrain.com/media/31602/download?inline?inline\" -O caltrain_schedule_weekend.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "45fa9120-65bb-4772-9db7-53e7cecf9adc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize LlamaParse\n",
|
||||
"\n",
|
||||
"Initialize LlamaParse in `text` mode, which represents complex documents (including text, tables, and figures) as nicely formatted text."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "54aa9579-84d4-49bc-ab54-5474e69c1188",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/jerryliu/Programming/llama_parse/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id 5f73353a-1f4b-480d-9eea-58d1d22b75f6\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"docs = LlamaParse(result_type=\"text\").load_data(\"./caltrain_schedule_weekend.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "602756b2-9ea1-4519-a8e3-c773ec624205",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Take a look at the text below (and zoom out in your browser to really get the effect!). You'll see that the entire table is nicely laid out."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4928281a-591a-4653-b451-b2b8112a7101",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ZONE 2ZONE 3ZONE 4ZONE 4 ZONE 3ZONE 2ZONE 1ZONE 1\n",
|
||||
" Printer-Friendly Caltrain Schedule\n",
|
||||
" Northbound – WEEKEND SERVICE to SAN FRANCISCO 2XX Local\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" Train No. 221 225 229 233 237 241 245 249 253 257 261 265 269 273 *277 *281\n",
|
||||
" Service Types L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2\n",
|
||||
" Tamien 7:12a 9:05a 10:05a 11:05a 1:05p 3:05p 5:05p 7:05p 9:05p 11:05p\n",
|
||||
" San Jose Diridon 7:19a 9:12a 10:12a 11:12a 12:12p 1:12p 2:12p 3:12p 4:12p 5:12p 6:12p 7:12p 8:12p 9:12p 10:19p 11:12p\n",
|
||||
" Santa Clara 7:25a 9:18a 10:18a 11:18a 12:18p 1:18p 2:18p 3:18p 4:18p 5:18p 6:18p 7:18p 8:18p 9:18p 10:25p 11:18p\n",
|
||||
" Lawrence 7:31a 9:24a 10:24a 11:24a 12:24p 1:24p 2:24p 3:24p 4:24p 5:24p 6:24p 7:24p 8:24p 9:24p 10:31p 11:24p\n",
|
||||
" Sunnyvale 7:35a 9:28a 10:28a 11:28a 12:28p 1:28p 2:28p 3:28p 4:28p 5:28p 6:28p 7:28p 8:28p 9:28p 10:35p 11:28p\n",
|
||||
" Mountain View 7:40a 9:34a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:40p 11:34p\n",
|
||||
" San Antonio 7:43a 9:37a 10:37a 11:37a 12:37p 1:37p 2:37p 3:37p 4:37p 5:37p 6:37p 7:37p 8:37p 9:37p 10:44p 11:37p\n",
|
||||
" California Ave 7:48a 9:42a 10:42a 11:42a 12:42p 1:42p 2:42p 3:42p 4:42p 5:42p 6:42p 7:42p 8:42p 9:42p 10:48p 11:42p\n",
|
||||
" Palo Alto 7:52a 9:46a 10:46a 11:46a 12:46p 1:46p 2:46p 3:46p 4:46p 5:46p 6:46p 7:46p 8:46p 9:46p 10:53p 11:46p\n",
|
||||
" Menlo Park 7:55a 9:50a 10:50a 11:50a 12:50p 1:50p 2:50p 3:50p 4:50p 5:50p 6:50p 7:50p 8:50p 9:50p 10:56p 11:50p\n",
|
||||
" Redwood City 8:01a 9:56a 10:56a 11:56a 12:56p 1:56p 2:56p 3:56p 4:56p 5:56p 6:56p 7:56p 8:56p 9:56p 11:02p 11:56p\n",
|
||||
" San Carlos 8:05a 10:01a 11:01a 12:01p 1:01p 2:01p 3:01p 4:01p 5:01p 6:01p 7:01p 8:01p 9:01p 10:01p 11:07p 12:01a\n",
|
||||
" Belmont 8:09a 10:04a 11:04a 12:04p 1:04p 2:04p 3:04p 4:04p 5:04p 6:04p 7:04p 8:04p 9:04p 10:04p 11:10p 12:04a\n",
|
||||
" Hillsdale 8:12a 10:08a 11:08a 12:08p 1:08p 2:08p 3:08p 4:08p 5:08p 6:08p 7:08p 8:08p 9:08p 10:08p 11:14p 12:08a\n",
|
||||
" Hayward Park 8:15a 10:11a 11:11a 12:11p 1:11p 2:11p 3:11p 4:11p 5:11p 6:11p 7:11p 8:11p 9:11p 10:11p 11:17p 12:11a\n",
|
||||
" San Mateo 8:19a 10:15a 11:15a 12:15p 1:15p 2:15p 3:15p 4:15p 5:15p 6:15p 7:15p 8:15p 9:15p 10:15p 11:21p 12:15a\n",
|
||||
" Burlingame 8:22a 10:19a 11:19a 12:19p 1:19p 2:19p 3:19p 4:19p 5:19p 6:19p 7:19p 8:19p 9:19p 10:19p 11:25p 12:19a\n",
|
||||
" Broadway 8:25a 10:22a 11:22a 12:22p 1:22p 2:22p 3:22p 4:22p 5:22p 6:22p 7:22p 8:22p 9:22p 10:22p 11:28p 12:22a\n",
|
||||
" Millbrae 8:29a 10:26a 11:26a 12:26p 1:26p 2:26p 3:26p 4:26p 5:26p 6:26p 7:26p 8:26p 9:26p 10:26p 11:32p 12:26a\n",
|
||||
" San Bruno 8:34a 10:30a 11:30a 12:30p 1:30p 2:30p 3:30p 4:30p 5:30p 6:30p 7:30p 8:30p 9:30p 10:30p 11:37p 12:30a\n",
|
||||
" S. San Francisco 8:38a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:34p 11:41p 12:34a\n",
|
||||
" Bayshore 8:44a 10:41a 11:41a 12:41p 1:41p 2:41p 3:41p 4:41p 5:41p 6:41p 7:41p 8:41p 9:41p 10:41p 11:47p 12:41a\n",
|
||||
" 22 ndStreet 8:50a 10:46a 11:46a 12:46p 1:46p 2:46p 3:46p 4:46p 5:46p 6:46p 7:46p 8:46p 9:46p 10:46p 11:53p 12:46a\n",
|
||||
" San Francisco 8:56a 10:52a 11:53a 12:53p 1:52p 2:52p 3:52p 4:52p 5:52p 6:52p 7:52p 8:52p 9:52p 10:52p 11:59p 12:52a\n",
|
||||
" *On SAP Center event days, Train 277 or Train 281departure from San Jose Diridon station may be delayed and will depart no later than 10:30p or 11:30p respectively.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" Southbound – WEEKEND SERVICE to SAN JOSE 2XX Local\n",
|
||||
" Train No. 224 228 232 236 240 244 248 252 256 260 264 268 272 276 280 284\n",
|
||||
" Service Types L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2\n",
|
||||
" San Francisco 8:28a 9:58a 10:58a 11:58a 12:58p 1:58p 2:58p 3:58p 4:58p 5:58p 6:58p 7:58p 8:58p 9:58p 10:58p 12:05a\n",
|
||||
" 22 ndStreet 8:33a 10:03a 11:03a 12:03p 1:03p 2:03p 3:03p 4:03p 5:03p 6:03p 7:03p 8:03p 9:03p 10:03p 11:03p 12:10a\n",
|
||||
" Bayshore 8:38a 10:08a 11:08a 12:08p 1:08p 2:08p 3:08p 4:08p 5:08p 6:08p 7:08p 8:08p 9:08p 10:08p 11:08p 12:15a\n",
|
||||
" S. San Francisco 8:45a 10:15a 11:15a 12:15p 1:15p 2:15p 3:15p 4:15p 5:15p 6:15p 7:15p 8:15p 9:15p 10:15p 11:15p 12:22a\n",
|
||||
" San Bruno 8:49a 10:19a 11:19a 12:19p 1:19p 2:19p 3:19p 4:19p 5:19p 6:19p 7:19p 8:19p 9:19p 10:19p 11:19p 12:26a\n",
|
||||
" Millbrae 8:53a 10:24a 11:24a 12:24p 1:24p 2:24p 3:24p 4:24p 5:24p 6:24p 7:24p 8:24p 9:24p 10:24p 11:24p 12:31a\n",
|
||||
" Broadway 8:57a 10:27a 11:27a 12:27p 1:27p 2:27p 3:27p 4:27p 5:27p 6:27p 7:27p 8:27p 9:27p 10:27p 11:27p 12:35a\n",
|
||||
" Burlingame 9:00a 10:31a 11:31a 12:31p 1:31p 2:31p 3:31p 4:31p 5:31p 6:31p 7:31p 8:31p 9:31p 10:31p 11:31p 12:38a\n",
|
||||
" San Mateo 9:04a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:34p 11:34p 12:41a\n",
|
||||
" Hayward Park 9:07a 10:37a 11:37a 12:37p 1:37p 2:37p 3:37p 4:37p 5:37p 6:37p 7:37p 8:37p 9:37p 10:37p 11:37p 12:45a\n",
|
||||
" Hillsdale 9:10a 10:41a 11:41a 12:41p 1:41p 2:41p 3:41p 4:41p 5:41p 6:41p 7:41p 8:41p 9:41p 10:41p 11:41p 12:48a\n",
|
||||
" Belmont 9:14a 10:44a 11:44a 12:44p 1:44p 2:44p 3:44p 4:44p 5:44p 6:44p 7:44p 8:44p 9:44p 10:44p 11:44p 12:52a\n",
|
||||
" San Carlos 9:17a 10:48a 11:48a 12:48p 1:48p 2:48p 3:48p 4:48p 5:48p 6:48p 7:48p 8:48p 9:48p 10:48p 11:48p 12:55a\n",
|
||||
" Redwood City 9:21a 10:52a 11:52a 12:52p 1:52p 2:52p 3:52p 4:52p 5:52p 6:52p 7:52p 8:52p 9:52p 10:52p 11:52p 12:59a\n",
|
||||
" Menlo Park 9:28a 10:58a 11:58a 12:58p 1:58p 2:58p 3:58p 4:58p 5:58p 6:58p 7:58p 8:58p 9:58p 10:58p 11:58p 1:05a\n",
|
||||
" Palo Alto 9:32a 11:02a 12:02p 1:02p 2:02p 3:02p 4:02p 5:02p 6:02p 7:02p 8:02p 9:02p 10:02p 11:02p 12:02a 1:09a\n",
|
||||
" California Avenue 9:36a 11:06a 12:06p 1:06p 2:06p 3:06p 4:06p 5:06p 6:06p 7:06p 8:06p 9:06p 10:06p 11:06p 12:06a 1:12a\n",
|
||||
" San Antonio 9:41a 11:11a 12:11p 1:11p 2:11p 3:11p 4:11p 5:11p 6:11p 7:11p 8:11p 9:11p 10:11p 11:11p 12:10a 1:17a\n",
|
||||
" Mountain View 9:45a 11:16a 12:16p 1:16p 2:16p 3:16p 4:16p 5:16p 6:16p 7:16p 8:16p 9:16p 10:16p 11:16p 12:15a 1:21a\n",
|
||||
" Sunnyvale 9:51a 11:21a 12:21p 1:21p 2:21p 3:21p 4:21p 5:21p 6:21p 7:21p 8:21p 9:21p 10:21p 11:21p 12:20a 1:26a\n",
|
||||
" Lawrence 9:55a 11:26a 12:26p 1:26p 2:26p 3:26p 4:26p 5:26p 6:26p 7:26p 8:26p 9:26p 10:26p 11:26p 12:25a 1:31a\n",
|
||||
" Santa Clara 10:01a 11:32a 12:32p 1:32p 2:32p 3:32p 4:32p 5:32p 6:32p 7:32p 8:32p 9:32p 10:32p 11:32p 12:31a 1:37a\n",
|
||||
" San Jose Diridon 10:10a 11:40a 12:40p 1:38p 2:40p 3:38p 4:40p 5:38p 6:40p 7:38p 8:40p 9:38p 10:40p 11:38p 12:39a 1:44a\n",
|
||||
" Tamien 10:15a 11:45a 12:45p 2:45p 4:45p 6:45p 8:45p 10:45p 12:44a 1:49a\n",
|
||||
" EFFECTIVE September 12, 2022 Timetable subject to change without notice.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8f5064d4-3e33-4f67-9b2e-46787161538f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize Query Engine\n",
|
||||
"\n",
|
||||
"We now initialize a query engine over this data. Here we use a baseline summary index, which doesn't do vector indexing/chunking and instead dumps the entire text into the prompt.\n",
|
||||
"\n",
|
||||
"We see that the LLM (gpt-4o) is able to provide all the stops for train no. 237 northbound."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b3e985b6-9d38-449f-9cf9-aae166824eed",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core import SummaryIndex\n",
|
||||
"from llama_index.llms.openai import OpenAI\n",
|
||||
"\n",
|
||||
"llm = OpenAI(model=\"gpt-4o\")\n",
|
||||
"index = SummaryIndex.from_documents(docs)\n",
|
||||
"query_engine = index.as_query_engine(llm=llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "66eb0976-2cd6-4b14-9083-124baae9ed5d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = query_engine.query(\n",
|
||||
" \"What are the stops (and times) for train no 237 northbound?\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7dc6f275-07f4-429e-9335-f50982fe974c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The stops and times for train no. 237 northbound are as follows:\n",
|
||||
"\n",
|
||||
"- San Jose Diridon: 12:12 PM\n",
|
||||
"- Santa Clara: 12:18 PM\n",
|
||||
"- Lawrence: 12:24 PM\n",
|
||||
"- Sunnyvale: 12:28 PM\n",
|
||||
"- Mountain View: 12:34 PM\n",
|
||||
"- San Antonio: 12:37 PM\n",
|
||||
"- California Ave: 12:42 PM\n",
|
||||
"- Palo Alto: 12:46 PM\n",
|
||||
"- Menlo Park: 12:50 PM\n",
|
||||
"- Redwood City: 12:56 PM\n",
|
||||
"- San Carlos: 1:01 PM\n",
|
||||
"- Belmont: 1:04 PM\n",
|
||||
"- Hillsdale: 1:08 PM\n",
|
||||
"- Hayward Park: 1:11 PM\n",
|
||||
"- San Mateo: 1:15 PM\n",
|
||||
"- Burlingame: 1:19 PM\n",
|
||||
"- Broadway: 1:22 PM\n",
|
||||
"- Millbrae: 1:26 PM\n",
|
||||
"- San Bruno: 1:30 PM\n",
|
||||
"- S. San Francisco: 1:34 PM\n",
|
||||
"- Bayshore: 1:41 PM\n",
|
||||
"- 22nd Street: 1:46 PM\n",
|
||||
"- San Francisco: 1:52 PM\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(str(response))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "229c4cb0-cf94-4a9f-bc7c-590388f50c1f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"response = query_engine.query(\n",
|
||||
" \"What are all the trains (and times) that end at Tamien going Southbound?\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6cf9fce0-5067-48f6-a7ef-62aa9e2edc3d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"It gets most of the answers correct, though it misses two trains (232 and 280)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "51cf03ff-7728-4815-ab72-3bf54fc4a2c0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The trains that end at Tamien going Southbound are:\n",
|
||||
"\n",
|
||||
"- Train 224 at 10:15a\n",
|
||||
"- Train 228 at 11:45a\n",
|
||||
"- Train 240 at 2:45p\n",
|
||||
"- Train 248 at 4:45p\n",
|
||||
"- Train 256 at 6:45p\n",
|
||||
"- Train 264 at 8:45p\n",
|
||||
"- Train 272 at 10:45p\n",
|
||||
"- Train 284 at 1:49a\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(str(response))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e51e7feb-b74f-4101-8963-933ac7ec9763",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Try Baseline\n",
|
||||
"\n",
|
||||
"In contrast, we try a baseline approach with the default PDF reader (PyPDF) in `SimpleDirectoryReader`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "364e5155-cc75-4302-a754-9444ae28e6b1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core import SimpleDirectoryReader\n",
|
||||
"from llama_index.core import SummaryIndex\n",
|
||||
"from llama_index.llms.openai import OpenAI\n",
|
||||
"\n",
|
||||
"llm = OpenAI(model=\"gpt-4o\")\n",
|
||||
"input_file = \"caltrain_schedule_weekend.pdf\"\n",
|
||||
"reader = SimpleDirectoryReader(input_files=[input_file])\n",
|
||||
"base_docs = reader.load_data()\n",
|
||||
"index = SummaryIndex.from_documents(base_docs)\n",
|
||||
"base_query_engine = index.as_query_engine(llm=llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a4011389-2d27-4a1a-bf8d-7309da28ab15",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Southbound – WEEKEND SERVICE to SAN JOSE\n",
|
||||
"Train No. 224 228 232 236 240 244 248 252 256 260 264 268 272 276 280 284\n",
|
||||
"Service Types L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2\n",
|
||||
"San Francisco 8:28a 9:58a 10:58a 11:58a 12:58p 1:58p 2:58p 3:58p 4:58p 5:58p 6:58p 7:58p 8:58p 9:58p 10:58p 12:05a\n",
|
||||
"22nd Street 8:33a 10:03a 11:03a 12:03p 1:03p 2:03p 3:03p 4:03p 5:03p 6:03p 7:03p 8:03p 9:03p 10:03p 11:03p 12:10a\n",
|
||||
"Bayshore 8:38a 10:08a 11:08a 12:08p 1:08p 2:08p 3:08p 4:08p 5:08p 6:08p 7:08p 8:08p 9:08p 10:08p 11:08p 12:15a\n",
|
||||
"S. San Francisco 8:45a 10:15a 11:15a 12:15p 1:15p 2:15p 3:15p 4:15p 5:15p 6:15p 7:15p 8:15p 9:15p 10:15p 11:15p 12:22a\n",
|
||||
"San Bruno 8:49a 10:19a 11:19a 12:19p 1:19p 2:19p 3:19p 4:19p 5:19p 6:19p 7:19p 8:19p 9:19p 10:19p 11:19p 12:26a\n",
|
||||
"Millbrae 8:53a 10:24a 11:24a 12:24p 1:24p 2:24p 3:24p 4:24p 5:24p 6:24p 7:24p 8:24p 9:24p 10:24p 11:24p 12:31a\n",
|
||||
"Broadway 8:57a 10:27a 11:27a 12:27p 1:27p 2:27p 3:27p 4:27p 5:27p 6:27p 7:27p 8:27p 9:27p 10:27p 11:27p 12:35a\n",
|
||||
"Burlingame 9:00a 10:31a 11:31a 12:31p 1:31p 2:31p 3:31p 4:31p 5:31p 6:31p 7:31p 8:31p 9:31p 10:31p 11:31p 12:38a\n",
|
||||
"San Mateo 9:04a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:34p 11:34p 12:41a\n",
|
||||
"Hayward Park 9:07a 10:37a 11:37a 12:37p 1:37p 2:37p 3:37p 4:37p 5:37p 6:37p 7:37p 8:37p 9:37p 10:37p 11:37p 12:45a\n",
|
||||
"Hillsdale 9:10a 10:41a 11:41a 12:41p 1:41p 2:41p 3:41p 4:41p 5:41p 6:41p 7:41p 8:41p 9:41p 10:41p 11:41p 12:48a\n",
|
||||
"Belmont 9:14a 10:44a 11:44a 12:44p 1:44p 2:44p 3:44p 4:44p 5:44p 6:44p 7:44p 8:44p 9:44p 10:44p 11:44p 12:52a\n",
|
||||
"San Carlos 9:17a 10:48a 11:48a 12:48p 1:48p 2:48p 3:48p 4:48p 5:48p 6:48p 7:48p 8:48p 9:48p 10:48p 11:48p 12:55a\n",
|
||||
"Redwood City 9:21a 10:52a 11:52a 12:52p 1:52p 2:52p 3:52p 4:52p 5:52p 6:52p 7:52p 8:52p 9:52p 10:52p 11:52p 12:59a\n",
|
||||
"Menlo Park 9:28a 10:58a 11:58a 12:58p 1:58p 2:58p 3:58p 4:58p 5:58p 6:58p 7:58p 8:58p 9:58p 10:58p 11:58p 1:05a\n",
|
||||
"Palo Alto 9:32a 11:02a 12:02p 1:02p 2:02p 3:02p 4:02p 5:02p 6:02p 7:02p 8:02p 9:02p 10:02p 11:02p 12:02a 1:09a\n",
|
||||
"California Avenue 9:36a 11:06a 12:06p 1:06p 2:06p 3:06p 4:06p 5:06p 6:06p 7:06p 8:06p 9:06p 10:06p 11:06p 12:06a 1:12a\n",
|
||||
"San Antonio 9:41a 11:11a 12:11p 1:11p 2:11p 3:11p 4:11p 5:11p 6:11p 7:11p 8:11p 9:11p 10:11p 11:11p 12:10a 1:17a\n",
|
||||
"Mountain View 9:45a 11:16a 12:16p 1:16p 2:16p 3:16p 4:16p 5:16p 6:16p 7:16p 8:16p 9:16p 10:16p 11:16p 12:15a 1:21a\n",
|
||||
"Sunnyvale 9:51a 11:21a 12:21p 1:21p 2:21p 3:21p 4:21p 5:21p 6:21p 7:21p 8:21p 9:21p 10:21p 11:21p 12:20a 1:26a\n",
|
||||
"Lawrence 9:55a 11:26a 12:26p 1:26p 2:26p 3:26p 4:26p 5:26p 6:26p 7:26p 8:26p 9:26p 10:26p 11:26p 12:25a 1:31a\n",
|
||||
"Santa Clara 10:01a 11:32a 12:32p 1:32p 2:32p 3:32p 4:32p 5:32p 6:32p 7:32p 8:32p 9:32p 10:32p 11:32p 12:31a 1:37a\n",
|
||||
"San Jose Diridon 10:10a 11:40a 12:40p 1:38p 2:40p 3:38p 4:40p 5:38p 6:40p 7:38p 8:40p 9:38p 10:40p 11:38p 12:39a 1:44a\n",
|
||||
"Tamien 10:15a 11:45a 12:45p 2:45p 4:45p 6:45p 8:45p 10:45p 12:44a 1:49aPrinter-Friendly Caltrain Schedule\n",
|
||||
"Northbound – WEEKEND SERVICE to SAN FRANCISCO\n",
|
||||
"Train No. 221 225 229 233 237 241 245 249 253 257 261 265 269 273 *277 *281\n",
|
||||
"Service Types L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2 L2\n",
|
||||
"Tamien 7:12a 9:05a 10:05a 11:05a 1:05p 3:05p 5:05p 7:05p 9:05p 11:05p\n",
|
||||
"San Jose Diridon 7:19a 9:12a 10:12a 11:12a 12:12p 1:12p 2:12p 3:12p 4:12p 5:12p 6:12p 7:12p 8:12p 9:12p 10:19p 11:12p\n",
|
||||
"Santa Clara 7:25a 9:18a 10:18a 11:18a 12:18p 1:18p 2:18p 3:18p 4:18p 5:18p 6:18p 7:18p 8:18p 9:18p 10:25p 11:18p\n",
|
||||
"Lawrence 7:31a 9:24a 10:24a 11:24a 12:24p 1:24p 2:24p 3:24p 4:24p 5:24p 6:24p 7:24p 8:24p 9:24p 10:31p 11:24p\n",
|
||||
"Sunnyvale 7:35a 9:28a 10:28a 11:28a 12:28p 1:28p 2:28p 3:28p 4:28p 5:28p 6:28p 7:28p 8:28p 9:28p 10:35p 11:28p\n",
|
||||
"Mountain View 7:40a 9:34a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:40p 11:34p\n",
|
||||
"San Antonio 7:43a 9:37a 10:37a 11:37a 12:37p 1:37p 2:37p 3:37p 4:37p 5:37p 6:37p 7:37p 8:37p 9:37p 10:44p 11:37p\n",
|
||||
"California Ave 7:48a 9:42a 10:42a 11:42a 12:42p 1:42p 2:42p 3:42p 4:42p 5:42p 6:42p 7:42p 8:42p 9:42p 10:48p 11:42p\n",
|
||||
"Palo Alto 7:52a 9:46a 10:46a 11:46a 12:46p 1:46p 2:46p 3:46p 4:46p 5:46p 6:46p 7:46p 8:46p 9:46p 10:53p 11:46p\n",
|
||||
"Menlo Park 7:55a 9:50a 10:50a 11:50a 12:50p 1:50p 2:50p 3:50p 4:50p 5:50p 6:50p 7:50p 8:50p 9:50p 10:56p 11:50p\n",
|
||||
"Redwood City 8:01a 9:56a 10:56a 11:56a 12:56p 1:56p 2:56p 3:56p 4:56p 5:56p 6:56p 7:56p 8:56p 9:56p 11:02p 11:56p\n",
|
||||
"San Carlos 8:05a 10:01a 11:01a 12:01p 1:01p 2:01p 3:01p 4:01p 5:01p 6:01p 7:01p 8:01p 9:01p 10:01p 11:07p 12:01a\n",
|
||||
"Belmont 8:09a 10:04a 11:04a 12:04p 1:04p 2:04p 3:04p 4:04p 5:04p 6:04p 7:04p 8:04p 9:04p 10:04p 11:10p 12:04a\n",
|
||||
"Hillsdale 8:12a 10:08a 11:08a 12:08p 1:08p 2:08p 3:08p 4:08p 5:08p 6:08p 7:08p 8:08p 9:08p 10:08p 11:14p 12:08a\n",
|
||||
"Hayward Park 8:15a 10:11a 11:11a 12:11p 1:11p 2:11p 3:11p 4:11p 5:11p 6:11p 7:11p 8:11p 9:11p 10:11p 11:17p 12:11a\n",
|
||||
"San Mateo 8:19a 10:15a 11:15a 12:15p 1:15p 2:15p 3:15p 4:15p 5:15p 6:15p 7:15p 8:15p 9:15p 10:15p 11:21p 12:15a\n",
|
||||
"Burlingame 8:22a 10:19a 11:19a 12:19p 1:19p 2:19p 3:19p 4:19p 5:19p 6:19p 7:19p 8:19p 9:19p 10:19p 11:25p 12:19a\n",
|
||||
"Broadway 8:25a 10:22a 11:22a 12:22p 1:22p 2:22p 3:22p 4:22p 5:22p 6:22p 7:22p 8:22p 9:22p 10:22p 11:28p 12:22a\n",
|
||||
"Millbrae 8:29a 10:26a 11:26a 12:26p 1:26p 2:26p 3:26p 4:26p 5:26p 6:26p 7:26p 8:26p 9:26p 10:26p 11:32p 12:26a\n",
|
||||
"San Bruno 8:34a 10:30a 11:30a 12:30p 1:30p 2:30p 3:30p 4:30p 5:30p 6:30p 7:30p 8:30p 9:30p 10:30p 11:37p 12:30a\n",
|
||||
"S. San Francisco 8:38a 10:34a 11:34a 12:34p 1:34p 2:34p 3:34p 4:34p 5:34p 6:34p 7:34p 8:34p 9:34p 10:34p 11:41p 12:34a\n",
|
||||
"Bayshore 8:44a 10:41a 11:41a 12:41p 1:41p 2:41p 3:41p 4:41p 5:41p 6:41p 7:41p 8:41p 9:41p 10:41p 11:47p 12:41a\n",
|
||||
"22nd Street 8:50a 10:46a 11:46a 12:46p 1:46p 2:46p 3:46p 4:46p 5:46p 6:46p 7:46p 8:46p 9:46p 10:46p 11:53p 12:46a\n",
|
||||
"San Francisco 8:56a 10:52a 11:53a 12:53p 1:52p 2:52p 3:52p 4:52p 5:52p 6:52p 7:52p 8:52p 9:52p 10:52p 11:59p 12:52aZONE 2 ZONE 3 ZONE 4 ZONE 4 ZONE 3 ZONE 2 ZONE 1 ZONE 12XX Local\n",
|
||||
"2XX Local\n",
|
||||
"EFFECTIVE September 12, 2022 Timetable subject to change without notice. *On SAP Center event days, Train 277 or Train 281departure from San Jose Diridon station may be delayed and will depart no later than 10:30p or 11:30p respectively.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(base_docs[0].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "42203c70-7ca7-4200-bf47-6282eefca3bf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"base_response = base_query_engine.query(\n",
|
||||
" \"What are the stops (and times) for train no 237 northbound?\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "06aa47b6-0f31-4b2d-90f0-bf6c74befd38",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train No. 237 northbound stops at the following stations and times:\n",
|
||||
"\n",
|
||||
"- Tamien: 1:05p\n",
|
||||
"- San Jose Diridon: 1:12p\n",
|
||||
"- Santa Clara: 1:18p\n",
|
||||
"- Lawrence: 1:24p\n",
|
||||
"- Sunnyvale: 1:28p\n",
|
||||
"- Mountain View: 1:34p\n",
|
||||
"- San Antonio: 1:37p\n",
|
||||
"- California Ave: 1:42p\n",
|
||||
"- Palo Alto: 1:46p\n",
|
||||
"- Menlo Park: 1:50p\n",
|
||||
"- Redwood City: 1:56p\n",
|
||||
"- San Carlos: 2:01p\n",
|
||||
"- Belmont: 2:04p\n",
|
||||
"- Hillsdale: 2:08p\n",
|
||||
"- Hayward Park: 2:11p\n",
|
||||
"- San Mateo: 2:15p\n",
|
||||
"- Burlingame: 2:19p\n",
|
||||
"- Broadway: 2:22p\n",
|
||||
"- Millbrae: 2:26p\n",
|
||||
"- San Bruno: 2:30p\n",
|
||||
"- S. San Francisco: 2:34p\n",
|
||||
"- Bayshore: 2:41p\n",
|
||||
"- 22nd Street: 2:46p\n",
|
||||
"- San Francisco: 2:52p\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(str(base_response))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4f3c1de7-3351-4cd8-991c-34a777952194",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"base_response = base_query_engine.query(\n",
|
||||
" \"What are all the trains (and times) that end at Tamien going Southbound?\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "513b1007-7508-4fb1-836c-de9353433a67",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that the trains don't line up with the times!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "108edb92-76af-406b-a139-8b9e7c6528f2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The trains that end at Tamien going Southbound are:\n",
|
||||
"\n",
|
||||
"- Train 224 at 10:15a\n",
|
||||
"- Train 228 at 11:45a\n",
|
||||
"- Train 240 at 2:45p\n",
|
||||
"- Train 252 at 4:45p\n",
|
||||
"- Train 264 at 6:45p\n",
|
||||
"- Train 276 at 8:45p\n",
|
||||
"- Train 284 at 10:45p\n",
|
||||
"- Train 284 at 12:44a\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(str(base_response))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llama_parse",
|
||||
"language": "python",
|
||||
"name": "llama_parse"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,759 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Advanced RAG with LlamaParse\n",
|
||||
"\n",
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_parse/blob/main/examples/demo_advanced.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
|
||||
"\n",
|
||||
"This notebook is a complete walkthrough for using LlamaParse with advanced indexing/retrieval techniques in LlamaIndex over the Apple 10K Filing. \n",
|
||||
"\n",
|
||||
"This allows us to ask sophisticated questions that aren't possible with \"naive\" parsing/indexing techniques with existing models.\n",
|
||||
"\n",
|
||||
"Note: this example uses `llama_index` version 0.10.4 or later."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install llama-index\n",
|
||||
"!pip install llama-index-core==0.10.6.post1\n",
|
||||
"!pip install llama-index-embeddings-openai\n",
|
||||
"!pip install llama-index-postprocessor-flag-embedding-reranker\n",
|
||||
"!pip install git+https://github.com/FlagOpen/FlagEmbedding.git\n",
|
||||
"!pip install llama-parse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget \"https://s2.q4cdn.com/470004039/files/doc_financials/2021/q4/_10-K-2021-(As-Filed).pdf\" -O apple_2021_10k.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Configure API access and model settings for OpenAI and LlamaParse."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio\n",
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# API access to llama-cloud\n",
|
||||
"os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\"\n",
|
||||
"\n",
|
||||
"# Using OpenAI API for embeddings/llms\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.llms.openai import OpenAI\n",
|
||||
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
|
||||
"from llama_index.core import VectorStoreIndex\n",
|
||||
"from llama_index.core import Settings\n",
|
||||
"\n",
|
||||
"embed_model = OpenAIEmbedding(model=\"text-embedding-3-small\")\n",
|
||||
"llm = OpenAI(model=\"gpt-3.5-turbo-0125\")\n",
|
||||
"\n",
|
||||
"Settings.llm = llm\n",
|
||||
"Settings.embed_model = embed_model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using brand new `LlamaParse` PDF reader for PDF Parsing\n",
|
||||
"\n",
|
||||
"We also compare two different retrieval/query engine strategies:\n",
|
||||
"1. Using raw Markdown text as nodes to build an index, then applying a simple query engine to generate the results;\n",
|
||||
"2. Using `MarkdownElementNodeParser` to parse the `LlamaParse` Markdown output, then building a recursive retriever query engine for generation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id cac11eca-71db-4dab-b72b-c67d31e551f3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"documents = LlamaParse(result_type=\"markdown\").load_data(\"./apple_2021_10k.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from copy import deepcopy\n",
|
||||
"from llama_index.core.schema import TextNode\n",
|
||||
"from llama_index.core import VectorStoreIndex\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_page_nodes(docs, separator=\"\\n---\\n\"):\n",
|
||||
" \"\"\"Split each document into page node, by separator.\"\"\"\n",
|
||||
" nodes = []\n",
|
||||
" for doc in docs:\n",
|
||||
" doc_chunks = doc.text.split(separator)\n",
|
||||
" for doc_chunk in doc_chunks:\n",
|
||||
" node = TextNode(\n",
|
||||
" text=doc_chunk,\n",
|
||||
" metadata=deepcopy(doc.metadata),\n",
|
||||
" )\n",
|
||||
" nodes.append(node)\n",
|
||||
"\n",
|
||||
" return nodes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page_nodes = get_page_nodes(documents)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core.node_parser import MarkdownElementNodeParser\n",
|
||||
"\n",
|
||||
"node_parser = MarkdownElementNodeParser(\n",
|
||||
" llm=OpenAI(model=\"gpt-3.5-turbo-0125\"), num_workers=8\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nodes = node_parser.get_nodes_from_documents(documents)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"base_nodes, objects = node_parser.get_nodes_and_objects(nodes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"This table provides information about a company's state of incorporation or organization and its corresponding I.R.S. Employer Identification Number.,\\nwith the following table title:\\nCompany Incorporation Information,\\nwith the following columns:\\n- California: None\\n- 94-2404110: None\\n\""
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"objects[0].get_content()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# dump both indexed tables and page text into the vector index\n",
|
||||
"recursive_index = VectorStoreIndex(nodes=base_nodes + objects + page_nodes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Apple Inc.\n",
|
||||
"\n",
|
||||
"**CONSOLIDATED STATEMENTS OF OPERATIONS (In millions, except number of shares which are reflected in thousands and per share amounts)**\n",
|
||||
"| |September 25, 2021|September 26, 2020|September 28, 2019|\n",
|
||||
"|---|---|---|---|\n",
|
||||
"|Net sales:|$297,392|$220,747|$213,883|\n",
|
||||
"|Products| | | |\n",
|
||||
"|Services|$68,425|$53,768|$46,291|\n",
|
||||
"|Total net sales|$365,817|$274,515|$260,174|\n",
|
||||
"|Cost of sales:| | | |\n",
|
||||
"|Products|$192,266|$151,286|$144,996|\n",
|
||||
"|Services|$20,715|$18,273|$16,786|\n",
|
||||
"|Total cost of sales|$212,981|$169,559|$161,782|\n",
|
||||
"|Gross margin|$152,836|$104,956|$98,392|\n",
|
||||
"|Operating expenses:| | | |\n",
|
||||
"|Research and development|$21,914|$18,752|$16,217|\n",
|
||||
"|Selling, general and administrative|$21,973|$19,916|$18,245|\n",
|
||||
"|Total operating expenses|$43,887|$38,668|$34,462|\n",
|
||||
"|Operating income|$108,949|$66,288|$63,930|\n",
|
||||
"|Other income/(expense), net|$258|$803|$1,807|\n",
|
||||
"|Income before provision for income taxes|$109,207|$67,091|$65,737|\n",
|
||||
"|Provision for income taxes|$14,527|$9,680|$10,481|\n",
|
||||
"|Net income|$94,680|$57,411|$55,256|\n",
|
||||
"|Earnings per share:| | | |\n",
|
||||
"|Basic|$5.67|$3.31|$2.99|\n",
|
||||
"|Diluted|$5.61|$3.28|$2.97|\n",
|
||||
"|Shares used in computing earnings per share:| | | |\n",
|
||||
"|Basic|16,701,272|17,352,119|18,471,336|\n",
|
||||
"|Diluted|16,864,919|17,528,214|18,595,651|\n",
|
||||
"\n",
|
||||
"See accompanying Notes to Consolidated Financial Statements.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(page_nodes[31].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker\n",
|
||||
"\n",
|
||||
"reranker = FlagEmbeddingReranker(\n",
|
||||
" top_n=5,\n",
|
||||
" model=\"BAAI/bge-reranker-large\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"recursive_query_engine = recursive_index.as_query_engine(\n",
|
||||
" similarity_top_k=5, node_postprocessors=[reranker], verbose=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"233\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(len(nodes))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup Baseline\n",
|
||||
"\n",
|
||||
"For comparison, we set up a naive RAG pipeline with default parsing and standard chunking, indexing, and retrieval."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core import SimpleDirectoryReader\n",
|
||||
"\n",
|
||||
"reader = SimpleDirectoryReader(input_files=[\"apple_2021_10k.pdf\"])\n",
|
||||
"base_docs = reader.load_data()\n",
|
||||
"raw_index = VectorStoreIndex.from_documents(base_docs)\n",
|
||||
"raw_query_engine = raw_index.as_query_engine(\n",
|
||||
" similarity_top_k=5, node_postprocessors=[reranker]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using the new `LlamaParse` for PDF parsing and retrieving tables with two different methods\n",
|
||||
"We compare the base query engine against the recursive query engine with tables."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Table Query Task: Queries for Table Question Answering"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"***********Basic Query Engine***********\n",
|
||||
"The purchases of marketable securities in 2020 amounted to $163.4 billion.\n",
|
||||
"\u001b[1;3;38;2;11;159;203mRetrieval entering 59368b87-e602-4bd1-88a7-7526fd6ab83f: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query Purchases of marketable securities in 2020\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering dfd97f47-eb4d-4bab-8a22-9bbbc0096a4b: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query Purchases of marketable securities in 2020\n",
|
||||
"\u001b[0m\n",
|
||||
"***********New LlamaParse+ Recursive Retriever Query Engine***********\n",
|
||||
"$114,938\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"Purchases of marketable securities in 2020\"\n",
|
||||
"\n",
|
||||
"response_1 = raw_query_engine.query(query)\n",
|
||||
"print(\"\\n***********Basic Query Engine***********\")\n",
|
||||
"print(response_1)\n",
|
||||
"\n",
|
||||
"response_2 = recursive_query_engine.query(query)\n",
|
||||
"print(\"\\n***********New LlamaParse+ Recursive Retriever Query Engine***********\")\n",
|
||||
"print(response_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"This table provides information on hedged assets and liabilities for the years 2021 and 2020, including current and non-current marketable securities and term debt.,\n",
|
||||
"with the following table title:\n",
|
||||
"Hedged Assets and Liabilities Summary,\n",
|
||||
"with the following columns:\n",
|
||||
"- 2021: None\n",
|
||||
"- 2020: None\n",
|
||||
"\n",
|
||||
"| |2021|2020|\n",
|
||||
"|---|---|---|\n",
|
||||
"|Hedged assets/(liabilities):| | |\n",
|
||||
"|Current and non-current marketable securities|$15,954|$16,270|\n",
|
||||
"|Current and non-current term debt|$(17,857)|$(21,033)|\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(response_2.source_nodes[2].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"***********Basic Query Engine***********\n",
|
||||
"0.03%, 0.75%, 1.43%\n",
|
||||
"\u001b[1;3;38;2;11;159;203mRetrieval entering a5afa785-217f-4e72-87cf-15da11632ec0: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query effective interest rates of all debt issuances in 2021\n",
|
||||
"\u001b[0m\n",
|
||||
"***********New LlamaParse+ Recursive Retriever Query Engine***********\n",
|
||||
"0.48% – 0.63%, 0.03% – 4.78%, 0.75% – 2.81%, 1.43% – 2.86%\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"effective interest rates of all debt issuances in 2021\"\n",
|
||||
"\n",
|
||||
"response_1 = raw_query_engine.query(query)\n",
|
||||
"print(\"\\n***********Basic Query Engine***********\")\n",
|
||||
"print(response_1)\n",
|
||||
"\n",
|
||||
"response_2 = recursive_query_engine.query(query)\n",
|
||||
"print(\"\\n***********New LlamaParse+ Recursive Retriever Query Engine***********\")\n",
|
||||
"print(response_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Term Debt\n",
|
||||
"As of September 25, 2021 , the Company had outstanding floating- and fixed-rate notes with varying maturities for an aggregate \n",
|
||||
"principal amount of $118.1 billion (collectively the “Notes”). The Notes are senior unsecured obligations and interest is payable in \n",
|
||||
"arrears. The following table provides a summary of the Company’s term debt as of September 25, 2021 and September 26, \n",
|
||||
"2020 :\n",
|
||||
"Maturities\n",
|
||||
"(calendar year)2021 2020\n",
|
||||
"Amount\n",
|
||||
"(in millions)Effective\n",
|
||||
"Interest RateAmount\n",
|
||||
"(in millions)Effective\n",
|
||||
"Interest Rate\n",
|
||||
"2013 – 2020 debt issuances:\n",
|
||||
"Floating-rate notes 2022 $ 1,750 0.48% – 0.63% $ 2,250 0.60% – 1.39%\n",
|
||||
"Fixed-rate 0.000% – 4.650% notes 2022 – 2060 95,813 0.03% – 4.78% 103,828 0.03% – 4.78%\n",
|
||||
"Second quarter 2021 debt issuance:\n",
|
||||
"Fixed-rate 0.700% – 2.800% notes 2026 – 2061 14,000 0.75% – 2.81% — — %\n",
|
||||
"Fourth quarter 2021 debt issuance:\n",
|
||||
"Fixed-rate 1.400% – 2.850% notes 2028 – 2061 6,500 1.43% – 2.86% — — %\n",
|
||||
"Total term debt 118,063 106,078 \n",
|
||||
"Unamortized premium/(discount) and issuance \n",
|
||||
"costs, net (380) (314) \n",
|
||||
"Hedge accounting fair value adjustments 1,036 1,676 \n",
|
||||
"Less: Current portion of term debt (9,613) (8,773) \n",
|
||||
"Total non-current portion of term debt $ 109,106 $ 98,667 \n",
|
||||
"To manage interest rate risk on certain of its U.S. dollar–denominated fixed- or floating-rate notes, the Company has entered into \n",
|
||||
"interest rate swaps to effectively convert the fixed interest rates to floating interest rates or the floating interest rates to fixed \n",
|
||||
"interest rates on a portion of these notes. Additionally, to manage foreign currency risk on certain of its foreign currency–\n",
|
||||
"denominated notes, the Company has entered into foreign currency swaps to effectively convert these notes to U.S. dollar–\n",
|
||||
"denominated notes.\n",
|
||||
"The effective interest rates for the Notes include the interest on the Notes, amortization of the discount or premium and, if \n",
|
||||
"applicable, adjustments related to hedging. The Company recogni zed $2.6 billion , $2.8 billion and $3.2 billion of interest expense \n",
|
||||
"on its term debt for 2021 , 2020 and 2019 , respectively.\n",
|
||||
"The future principal payments for the Company’s Notes as of September 25, 2021 , are as follows (in millions):\n",
|
||||
"2022 $ 9,583 \n",
|
||||
"2023 11,391 \n",
|
||||
"2024 10,202 \n",
|
||||
"2025 10,914 \n",
|
||||
"2026 11,408 \n",
|
||||
"Thereafter 64,565 \n",
|
||||
"Total term debt $ 118,063 \n",
|
||||
"As of September 25, 2021 and September 26, 2020 , the fair value of the Company’s Notes, based on Level 2 inputs, was $125.3 \n",
|
||||
"billion and $117.1 billion , respectively.\n",
|
||||
"Apple Inc. | 2021 Form 10-K | 45\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(response_1.source_nodes[0].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"***********Basic Query Engine***********\n",
|
||||
"The U.S. Tax Cuts and Jobs Act of 2017 had an impact on income taxes in 2020, as evidenced by a decrease in the provision for income taxes compared to the prior year.\n",
|
||||
"\u001b[1;3;38;2;11;159;203mRetrieval entering b9416f35-ebf1-45d6-9a29-b59e435ab42d: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query Impacts of the U.S. Tax Cuts and Jobs Act of 2017 on income taxes in 2020\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 8d8d5733-ff30-4535-9376-7f761b5900ea: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query Impacts of the U.S. Tax Cuts and Jobs Act of 2017 on income taxes in 2020\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 82f301e5-199a-4aa2-bbdf-ef97898c0326: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query Impacts of the U.S. Tax Cuts and Jobs Act of 2017 on income taxes in 2020\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 86f666b4-254b-487f-9870-8ee09aef07a9: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query Impacts of the U.S. Tax Cuts and Jobs Act of 2017 on income taxes in 2020\n",
|
||||
"\u001b[0m\n",
|
||||
"***********New LlamaParse+ Recursive Retriever Query Engine***********\n",
|
||||
"The U.S. Tax Cuts and Jobs Act of 2017 had a negative impact on income taxes in 2020.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"Impacts of the U.S. Tax Cuts and Jobs Act of 2017 on income taxes in 2020\"\n",
|
||||
"\n",
|
||||
"response_1 = raw_query_engine.query(query)\n",
|
||||
"print(\"\\n***********Basic Query Engine***********\")\n",
|
||||
"print(response_1)\n",
|
||||
"\n",
|
||||
"response_2 = recursive_query_engine.query(query)\n",
|
||||
"print(\"\\n***********New LlamaParse+ Recursive Retriever Query Engine***********\")\n",
|
||||
"print(response_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Other Income/(Expense), Net\n",
|
||||
"The following table shows the detail of OI&E for 2021 , 2020 and 2019 (in millions):\n",
|
||||
"2021 2020 2019\n",
|
||||
"Interest and dividend income $ 2,843 $ 3,763 $ 4,961 \n",
|
||||
"Interest expense (2,645) (2,873) (3,576) \n",
|
||||
"Other income/(expense), net 60 (87) 422 \n",
|
||||
"Total other income/(expense), net $ 258 $ 803 $ 1,807 \n",
|
||||
"Note 5 – Income Taxe s\n",
|
||||
"Provision for Income Taxes and Effective Tax Rat e\n",
|
||||
"The provision for income taxes for 2021 , 2020 and 2019 , consisted of the following (in millions):\n",
|
||||
"2021 2020 2019\n",
|
||||
"Federal:\n",
|
||||
"Current $ 8,257 $ 6,306 $ 6,384 \n",
|
||||
"Deferred (7,176) (3,619) (2,939) \n",
|
||||
"Total 1,081 2,687 3,445 \n",
|
||||
"State:\n",
|
||||
"Current 1,620 455 475 \n",
|
||||
"Deferred (338) 21 (67) \n",
|
||||
"Total 1,282 476 408 \n",
|
||||
"Foreign:\n",
|
||||
"Current 9,424 3,134 3,962 \n",
|
||||
"Deferred 2,740 3,383 2,666 \n",
|
||||
"Total 12,164 6,517 6,628 \n",
|
||||
"Provision for income taxes $ 14,527 $ 9,680 $ 10,481 \n",
|
||||
"The foreign provision for income taxes is based on foreign pretax earnings of $68.7 billion , $38.1 billion and $44.3 billion in 2021 , \n",
|
||||
"2020 and 2019 , respectively.\n",
|
||||
"A reconciliation of the provision for income taxes, with the amount computed by applying the statutory federal income tax rate \n",
|
||||
"(21% in 2021 , 2020 and 2019 ) to income before provision for income taxes for 2021 , 2020 and 2019 , is as follows (dollars in \n",
|
||||
"millions):\n",
|
||||
"2021 2020 2019\n",
|
||||
"Computed expected tax $ 22,933 $ 14,089 $ 13,805 \n",
|
||||
"State taxes, net of federal effect 1,151 423 423 \n",
|
||||
"Impacts of the U.S. Tax Cuts and Jobs Act of 2017 — (582) — \n",
|
||||
"Earnings of foreign subsidiaries (4,715) (2,534) (2,625) \n",
|
||||
"Foreign-derived intangible income deduction (1,372) (169) (149) \n",
|
||||
"Research and development credit, net (1,033) (728) (548) \n",
|
||||
"Excess tax benefits from equity awards (2,137) (930) (639) \n",
|
||||
"Other (300) 111 214 \n",
|
||||
"Provision for income taxes $ 14,527 $ 9,680 $ 10,481 \n",
|
||||
"Effective tax rate 13.3% 14.4% 15.9% \n",
|
||||
"Apple Inc. | 2021 Form 10-K | 41\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(response_1.source_nodes[0].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"***********Basic Query Engine***********\n",
|
||||
"$3,619 million in 2019, $7,176 million in 2020, and $1,081 million in 2021\n",
|
||||
"\u001b[1;3;38;2;11;159;203mRetrieval entering 12b1355a-f9e6-4b08-a19a-3ffc00dc5b9f: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query federal deferred tax in 2019-2021\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 82f301e5-199a-4aa2-bbdf-ef97898c0326: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query federal deferred tax in 2019-2021\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 8d8d5733-ff30-4535-9376-7f761b5900ea: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query federal deferred tax in 2019-2021\n",
|
||||
"\u001b[0m\n",
|
||||
"***********New LlamaParse+ Recursive Retriever Query Engine***********\n",
|
||||
"$2,939, $3,619, $7,176\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"federal deferred tax in 2019-2021\"\n",
|
||||
"\n",
|
||||
"response_1 = raw_query_engine.query(query)\n",
|
||||
"print(\"\\n***********Basic Query Engine***********\")\n",
|
||||
"print(response_1)\n",
|
||||
"\n",
|
||||
"response_2 = recursive_query_engine.query(query)\n",
|
||||
"print(\"\\n***********New LlamaParse+ Recursive Retriever Query Engine***********\")\n",
|
||||
"print(response_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"***********Basic Query Engine***********\n",
|
||||
"State deferred income tax for 2019: $454 million\n",
|
||||
"State deferred income tax for 2020: $21 million\n",
|
||||
"State deferred income tax for 2021: -$338 million\n",
|
||||
"\u001b[1;3;38;2;11;159;203mRetrieval entering 12b1355a-f9e6-4b08-a19a-3ffc00dc5b9f: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query give me the deferred state income tax in 2019-2021 (include +/-)\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 8d8d5733-ff30-4535-9376-7f761b5900ea: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query give me the deferred state income tax in 2019-2021 (include +/-)\n",
|
||||
"\u001b[0m\n",
|
||||
"***********New LlamaParse+ Recursive Retriever Query Engine***********\n",
|
||||
"Deferred state income tax for the years 2019-2021:\n",
|
||||
"- 2019: ($67) million\n",
|
||||
"- 2020: $21 million\n",
|
||||
"- 2021: ($338) million\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"give me the deferred state income tax in 2019-2021 (include +/-)\"\n",
|
||||
"\n",
|
||||
"response_1 = raw_query_engine.query(query)\n",
|
||||
"print(\"\\n***********Basic Query Engine***********\")\n",
|
||||
"print(response_1)\n",
|
||||
"\n",
|
||||
"response_2 = recursive_query_engine.query(query)\n",
|
||||
"print(\"\\n***********New LlamaParse+ Recursive Retriever Query Engine***********\")\n",
|
||||
"print(response_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Summary of income tax provisions for Federal, State, and Foreign entities over the years 2019, 2020, and 2021.,\n",
|
||||
"with the following table title:\n",
|
||||
"Income Tax Provisions by Entity and Year,\n",
|
||||
"with the following columns:\n",
|
||||
"- Entity: The type of entity (Federal, State, Foreign)\n",
|
||||
"- 2019: Income tax provisions for the year 2019\n",
|
||||
"- 2020: Income tax provisions for the year 2020\n",
|
||||
"- 2021: Income tax provisions for the year 2021\n",
|
||||
"\n",
|
||||
"| |2021|2020|2019|\n",
|
||||
"|---|---|---|---|\n",
|
||||
"|Federal:| | | |\n",
|
||||
"|Current|$8,257|$6,306|$6,384|\n",
|
||||
"|Deferred|(7,176)|(3,619)|(2,939)|\n",
|
||||
"|Total|1,081|2,687|3,445|\n",
|
||||
"|State:| | | |\n",
|
||||
"|Current|1,620|455|475|\n",
|
||||
"|Deferred|(338)|21|(67)|\n",
|
||||
"|Total|1,282|476|408|\n",
|
||||
"|Foreign:| | | |\n",
|
||||
"|Current|9,424|3,134|3,962|\n",
|
||||
"|Deferred|2,740|3,383|2,666|\n",
|
||||
"|Total|12,164|6,517|6,628|\n",
|
||||
"|Provision for income taxes|$14,527|$9,680|$10,481|\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(response_2.source_nodes[0].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"***********Basic Query Engine***********\n",
|
||||
"$1,620 million in 2019, $455 million in 2020, $475 million in 2021\n",
|
||||
"\u001b[1;3;38;2;11;159;203mRetrieval entering 82f301e5-199a-4aa2-bbdf-ef97898c0326: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query current state taxes per year in 2019-2021 (include +/-)\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering 8d8d5733-ff30-4535-9376-7f761b5900ea: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query current state taxes per year in 2019-2021 (include +/-)\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering b9416f35-ebf1-45d6-9a29-b59e435ab42d: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query current state taxes per year in 2019-2021 (include +/-)\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203mRetrieval entering a029e464-575f-4dd6-afad-7cc0bbc5dbf9: TextNode\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200mRetrieving from object TextNode with query current state taxes per year in 2019-2021 (include +/-)\n",
|
||||
"\u001b[0m\n",
|
||||
"***********New LlamaParse+ Recursive Retriever Query Engine***********\n",
|
||||
"$475 in 2019, $455 in 2020, $1,620 in 2021.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"current state taxes per year in 2019-2021 (include +/-)\"\n",
|
||||
"\n",
|
||||
"response_1 = raw_query_engine.query(query)\n",
|
||||
"print(\"\\n***********Basic Query Engine***********\")\n",
|
||||
"print(response_1)\n",
|
||||
"\n",
|
||||
"response_2 = recursive_query_engine.query(query)\n",
|
||||
"print(\"\\n***********New LlamaParse+ Recursive Retriever Query Engine***********\")\n",
|
||||
"print(response_2)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llama_parse",
|
||||
"language": "python",
|
||||
"name": "llama_parse"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -1,136 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Using the Raw API\n",
|
||||
"\n",
|
||||
"This notebook walks through how to use the raw API to upload a file and retrieve the parsed result."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--2024-02-02 11:11:39-- https://arxiv.org/pdf/1706.03762.pdf\n",
|
||||
"Resolving arxiv.org (arxiv.org)... 151.101.131.42, 151.101.3.42, 151.101.67.42, ...\n",
|
||||
"Connecting to arxiv.org (arxiv.org)|151.101.131.42|:443... connected.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 2215244 (2.1M) [application/pdf]\n",
|
||||
"Saving to: ‘./attention.pdf’\n",
|
||||
"\n",
|
||||
"./attention.pdf 100%[===================>] 2.11M --.-KB/s in 0.08s \n",
|
||||
"\n",
|
||||
"2024-02-02 11:11:39 (27.3 MB/s) - ‘./attention.pdf’ saved [2215244/2215244]\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!wget \"https://arxiv.org/pdf/1706.03762.pdf\" -O \"./attention.pdf\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"api_key = \"llx-...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import mimetypes\n",
|
||||
"import requests\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"headers = {\"Authorization\": f\"Bearer {api_key}\"}\n",
|
||||
"file_path = \"./attention.pdf\"\n",
|
||||
"base_url = \"https://api.cloud.llamaindex.ai/api/parsing\"\n",
|
||||
"\n",
|
||||
"with open(file_path, \"rb\") as f:\n",
|
||||
" mime_type = mimetypes.guess_type(file_path)[0]\n",
|
||||
" files = {\"file\": (f.name, f, mime_type)}\n",
|
||||
"\n",
|
||||
" # send the request, upload the file\n",
|
||||
" url = f\"{base_url}/upload\"\n",
|
||||
" response = requests.post(url, headers=headers, files=files)\n",
|
||||
"\n",
|
||||
"response.raise_for_status()\n",
|
||||
"# get the job id for the result_url\n",
|
||||
"job_id = response.json()[\"id\"]\n",
|
||||
"result_type = \"text\" # or \"markdown\"\n",
|
||||
"result_url = f\"{base_url}/job/{job_id}/result/{result_type}\"\n",
|
||||
"\n",
|
||||
"# check for the result until it's ready\n",
|
||||
"while True:\n",
|
||||
" response = requests.get(result_url, headers=headers)\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" time.sleep(2)\n",
|
||||
"\n",
|
||||
"# download the result\n",
|
||||
"result = response.json()\n",
|
||||
"output = result[result_type]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Provided proper attribution is provided, Google hereby grants permission to\n",
|
||||
" reproduce the tables and figures in this paper solely for use in journalistic or\n",
|
||||
" scholarly works.\n",
|
||||
" Attention Is All You Need\n",
|
||||
"arXiv:1706.03762v7 [cs.CL] 2 Aug 2023\n",
|
||||
" Ashish Vaswani∗ Noam Shazeer∗ Niki Parmar∗ Jakob Uszkoreit∗\n",
|
||||
" Google Brain Google Brain Google Research Google Research\n",
|
||||
" avaswani@google.com noam@google.com nikip@google.com usz@google.com\n",
|
||||
" Llion Jones∗ Aidan N. Gomez∗ † Łukasz Kaiser∗\n",
|
||||
" Google Research University of Toronto \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(output[:1000])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llama-parse-aNC435Vv-py3.11",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,295 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Using llama-parse with AstraDB"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this notebook, we show a basic RAG-style example that uses `llama-parse` to parse a PDF document, store the corresponding document into a vector store (`AstraDB`) and finally, perform some basic queries against that store. The notebook is modeled after the quick start notebooks and hence is meant as a way of getting started with `llama-parse`, backed by a vector database."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Requirements"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# First, install the required dependencies\n",
|
||||
"%pip install --quiet llama-index llama-parse llama-index-vector-stores-astra-db llama-index-llms-openai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configuration"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import openai\n",
|
||||
"\n",
|
||||
"from getpass import getpass\n",
|
||||
"\n",
|
||||
"# Get all required API keys and parameters\n",
|
||||
"llama_cloud_api_key = getpass(\"Enter your Llama Index Cloud API Key: \")\n",
|
||||
"api_endpoint = input(\"Enter your Astra DB API Endpoint: \")\n",
|
||||
"token = getpass(\"Enter your Astra DB Token: \")\n",
|
||||
"namespace = (\n",
|
||||
" input(\"Enter your Astra DB namespace (optional, must exist on Astra): \") or None\n",
|
||||
")\n",
|
||||
"openai_api_key = getpass(\"Enter your OpenAI API Key: \")\n",
|
||||
"\n",
|
||||
"os.environ[\"LLAMA_CLOUD_API_KEY\"] = llama_cloud_api_key\n",
|
||||
"openai.api_key = openai_api_key"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# llama-parse is async-first, running the sync code in a notebook requires the use of nest_asyncio\n",
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Using llama-parse to parse a PDF"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Download complete.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Grab a PDF from Arxiv for indexing\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"# The URL of the file you want to download\n",
|
||||
"url = \"https://arxiv.org/pdf/1706.03762.pdf\"\n",
|
||||
"# The local path where you want to save the file\n",
|
||||
"file_path = \"./attention.pdf\"\n",
|
||||
"\n",
|
||||
"# Perform the HTTP request\n",
|
||||
"response = requests.get(url)\n",
|
||||
"\n",
|
||||
"# Check if the request was successful\n",
|
||||
"if response.status_code == 200:\n",
|
||||
" # Open the file in binary write mode and save the content\n",
|
||||
" with open(file_path, \"wb\") as file:\n",
|
||||
" file.write(response.content)\n",
|
||||
" print(\"Download complete.\")\n",
|
||||
"else:\n",
|
||||
" print(\"Error downloading the file.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id ce3909a7-54cf-438b-849a-fe9a903b0c71\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"documents = LlamaParse(result_type=\"text\").load_data(file_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'rmer - model architecture.\\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\\nrespectively.\\n3.1 Encoder and Decoder Stacks\\nEncoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two\\nsub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-\\nwise fully connected feed-forward network. We employ a residual connection [11] around each of\\nthe two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is\\nLayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer\\nitself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding\\nlayers, produce outputs of dimension dmodel = 512.\\nDecoder: The decoder is also composed of a stack of N = 6 identical layers. In addition '"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Take a quick look at some of the parsed text from the document:\n",
|
||||
"documents[0].get_content()[10000:11000]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Storing into Astra DB"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.vector_stores.astra_db import AstraDBVectorStore\n",
|
||||
"\n",
|
||||
"astra_db_store = AstraDBVectorStore(\n",
|
||||
" token=token,\n",
|
||||
" api_endpoint=api_endpoint,\n",
|
||||
" namespace=namespace,\n",
|
||||
" collection_name=\"astra_v_table_llamaparse\",\n",
|
||||
" embedding_dimension=1536,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core.node_parser import SimpleNodeParser\n",
|
||||
"\n",
|
||||
"node_parser = SimpleNodeParser()\n",
|
||||
"\n",
|
||||
"nodes = node_parser.get_nodes_from_documents(documents)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
|
||||
"from llama_index.core import VectorStoreIndex, StorageContext\n",
|
||||
"\n",
|
||||
"storage_context = StorageContext.from_defaults(vector_store=astra_db_store)\n",
|
||||
"\n",
|
||||
"index = VectorStoreIndex(\n",
|
||||
" nodes=nodes,\n",
|
||||
" storage_context=storage_context,\n",
|
||||
" embed_model=OpenAIEmbedding(api_key=openai_api_key),\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Simple RAG Example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_engine = index.as_query_engine(similarity_top_k=15)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"***********New LlamaParse+ Basic Query Engine***********\n",
|
||||
"Multi-Head Attention is also known as multi-headed self-attention.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query = \"What is Multi-Head Attention also known as?\"\n",
|
||||
"\n",
|
||||
"response_1 = query_engine.query(query)\n",
|
||||
"print(\"\\n***********New LlamaParse+ Basic Query Engine***********\")\n",
|
||||
"print(response_1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'We used beam search as described in the previous section, but no\\ncheckpoint averaging. We present these results in Table 3.\\nIn Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions,\\nkeeping the amount of computation constant, as described in Section 3.2.2. While single-head\\nattention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads.\\nIn Table 3 rows (B), we observe that reducing the attention key size dk hurts model quality. This\\nsuggests that determining compatibility is not easy and that a more sophisticated compatibility\\nfunction than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected,\\nbigger models are better, and dropout is very helpful in avoiding over-fitting. In row (E) we replace our\\nsinusoidal positional encoding with learned positional embeddings [9], and observe nearly identical\\nresults to the base model.\\n6.3 English Constituency Parsing\\nTo evaluate if the Transformer can generalize to other tasks we performed experiments on English\\nconstituency parsing. This task presents specific challenges: the output is subject to strong structural\\nconstraints and is significantly longer than the input. Furthermore, RNN sequence-to-sequence\\nmodels have not been able to attain state-of-the-art results in small-data regimes [37].\\nWe trained a 4-layer transformer with dmodel = 1024 on the Wall Street Journal (WSJ) portion of the\\nPenn Treebank [25], about 40K training sentences. We also trained it in a semi-supervised setting,\\nusing the larger high-confidence and BerkleyParser corpora from with approximately 17M sentences\\n[37]. 
We used a vocabulary of 16K tokens for the WSJ only setting and a vocabulary of 32K tokens\\nfor the semi-supervised setting.\\nWe performed only a small number of experiments to select the dropout, both attention and residual\\n(section 5.4), learning rates and beam size on the Section 22 development set, all other parameters\\nremained unchanged from the English-to-German base translation model. During inference, we\\n 9\\n---\\nTable 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23\\nof WSJ)\\n Parser Training WSJ 23 F1\\n Vinyals & Kaiser el al. (2014) [37] WSJ only, discriminative 88.3\\n Petrov et al. (2006) [29] WSJ only, discriminative 90.4\\n Zhu et al. (2013) [40] WSJ only, discriminative 90.4\\n Dyer et al. (2016) [8] WSJ only, discriminative 91.7\\n Transformer (4 layers) WSJ only, discriminative 91.3\\n Zhu et al. (2013) [40] semi-supervised 91.3\\n Huang & Harper (2009) [14] semi-supervised 91.3\\n McClosky et al. (2006) [26] semi-supervised 92.1\\n Vinyals & Kaiser el al. (2014) [37] semi-supervised 92.1\\n Transformer (4 layers) semi-supervised 92.7\\n Luong et al. (2015) [23] multi-task 93.0\\n Dyer et al. (2016) [8] generative 93.3\\nincreased the maximum output length to input length + 300. 
We used a beam size of 21 and α = 0.3\\nfor both WSJ only and the semi-supervised setting.\\nOur results in Table 4 show that despite the lack of task-specific tuning our model performs sur-\\nprisingly well, yielding better results than all previously reported models with the exception of the\\nRecurrent Neural Network Grammar [8].\\nIn contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley-\\nParser [29] even when training only on the WSJ training set of 40K sentences.\\n7 Conclusion\\nIn this work, we presented the Transformer, the first sequence transduction model based entirely on\\nattention, replacing the recurrent layers most commonly used in encoder-decoder architectures with\\nmulti-headed self-attention.\\nFor translation tasks, the Transformer can be trained significantly faster than architectures based\\non recurrent or convolutional layers.'"
|
||||
]
|
||||
},
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Take a look at one of the source nodes from the response\n",
|
||||
"response_1.source_nodes[0].get_content()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -1,183 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# LlamaParse Usage"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install llama-index llama-parse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--2024-02-02 11:10:10-- https://arxiv.org/pdf/1706.03762.pdf\n",
|
||||
"Resolving arxiv.org (arxiv.org)... 151.101.131.42, 151.101.3.42, 151.101.67.42, ...\n",
|
||||
"Connecting to arxiv.org (arxiv.org)|151.101.131.42|:443... connected.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 2215244 (2.1M) [application/pdf]\n",
|
||||
"Saving to: ‘./attention.pdf’\n",
|
||||
"\n",
|
||||
"./attention.pdf 100%[===================>] 2.11M --.-KB/s in 0.08s \n",
|
||||
"\n",
|
||||
"2024-02-02 11:10:10 (25.9 MB/s) - ‘./attention.pdf’ saved [2215244/2215244]\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!wget \"https://arxiv.org/pdf/1706.03762.pdf\" -O \"./attention.pdf\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# llama-parse is async-first, running the sync code in a notebook requires the use of nest_asyncio\n",
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id dd0b8e31-0c09-4497-b78a-cc1c92f1d6cf\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"documents = LlamaParse(result_type=\"text\").load_data(\"./attention.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ad\n",
|
||||
"relying entirely on an attention mechanism to draw global dependencies between input and output.\n",
|
||||
"The Transformer allows for significantly more parallelization and can reach a new state of the art in\n",
|
||||
"translation quality after being trained for as little as twelve hours on eight P100 GPUs.\n",
|
||||
"2 Background\n",
|
||||
"The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU\n",
|
||||
"[16], ByteNet [18] and ConvS2S [9], all of which use convolutional neural networks as basic building\n",
|
||||
"block, computing hidden representations in parallel for all input and output positions. In these models,\n",
|
||||
"the number of operations required to relate signals from two arbitrary input or output positions grows\n",
|
||||
"in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes\n",
|
||||
"it more difficult to learn dependencies between distant positions [12]. In the Transformer this is\n",
|
||||
"reduced to a constant number of operations, albeit at the cost of reduced effective res\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(documents[0].text[6000:7000])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id d4531453-1bbb-48c4-8324-ae9fea9f2fa2\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"documents = LlamaParse(result_type=\"markdown\").load_data(\"./attention.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ction describes the training regime for our models.\n",
|
||||
"\n",
|
||||
"##### Training Data and Batching\n",
|
||||
"\n",
|
||||
"We trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million\n",
|
||||
"sentence pairs. Sentences were encoded using byte-pair encoding [3], which has a shared source-\n",
|
||||
"target vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT\n",
|
||||
"2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece\n",
|
||||
"vocabulary [38]. Sentence pairs were batched together by approximate sequence length. Each training\n",
|
||||
"batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000\n",
|
||||
"target tokens.\n",
|
||||
"\n",
|
||||
"##### Hardware and Schedule\n",
|
||||
"\n",
|
||||
"We trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using\n",
|
||||
"the hyperparameters described throughout the paper, each training step took about 0.4 seconds. We\n",
|
||||
"trained the base models for a total of 100,000 steps or 12 hours. For our big models,(described on the\n",
|
||||
"bo...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(documents[0].text[20000:21000] + \"...\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -1,531 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# LlamaParse - Fast checking Insurance Contract for Coverage\n",
|
||||
"\n",
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_parse/blob/main/examples/demo_insurance.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
|
||||
"\n",
|
||||
"In this notebook we will look at how LlamaParse can be used to extract structured coverage information from an insurance policy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Installation of required packages"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install llama-index llama-parse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Download an insurance policy fron IRDAI\n",
|
||||
"\n",
|
||||
"The Insurance Regulatory and Development Authority of India (IRDAI) maintains a great resource: https://policyholder.gov.in/web/guest/non-life-insurance-products where all insurance policies available in India are publicly available for download! Let's download a complex health insurance policy as an example."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget \"https://policyholder.gov.in/documents/37343/931203/NBHTGBP22011V012223.pdf/c392bcc1-f6a8-cadd-ab84-495b3273d2c3?version=1.0&t=1669350459879&download=true\" -O \"./policy.pdf\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initializing LlamaIndex and LlamaParse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# llama-parse is async-first, running the sync code in a notebook requires the use of nest_asyncio\n",
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\"\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.llms.openai import OpenAI\n",
|
||||
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
|
||||
"from llama_index.core import VectorStoreIndex\n",
|
||||
"from llama_index.core import Settings\n",
|
||||
"\n",
|
||||
"# for the purpose of this example, we will use the small model embedding and gpt3.5\n",
|
||||
"embed_model = OpenAIEmbedding(model=\"text-embedding-3-small\")\n",
|
||||
"llm = OpenAI(model=\"gpt-3.5-turbo-0125\")\n",
|
||||
"\n",
|
||||
"Settings.llm = llm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Vanilla Approach - Parse the Policy with LlamaParse into Markdown"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id b8946573-c911-4e00-8921-1bad1cda3d64\n",
|
||||
"......"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"documents = LlamaParse(result_type=\"markdown\").load_data(\"./policy.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"## Preamble\n",
|
||||
"\n",
|
||||
"This ‘Travel Infinity’ Policy is a contract of insurance between You and Us which is subject to payment of full premium in advance and the terms, conditions and exclusions of this Policy. Expense incurred outside the policy period will NOT be covered. Unutilized Sum Insured will expire at the end of the policy year. All applicable benefits, details and limits are mentioned in your Certificate of insurance. We will cover only allopathic treatments in this policy.\n",
|
||||
"\n",
|
||||
"## Defined Terms\n",
|
||||
"\n",
|
||||
"The terms listed below in this Section and used elsewhere in the Policy in Initial Capitals shall have the meaning set out against them in this Section.\n",
|
||||
"\n",
|
||||
"### Standard Definitions\n",
|
||||
"\n",
|
||||
"|2.1|Accident or Accidental|means sudden, unforeseen and involuntary event caused by external, visible and violent means.|\n",
|
||||
"|---|---|---|\n",
|
||||
"|2.2|Co-payment|means a cost sharing requirement under a health insurance policy that provides that the policyholder/insured will bear a specified percentage of the admissible claims a\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(documents[0].text[0:1000])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Markdown Element Node Parser\n",
|
||||
"Our markdown element node parser works well for parsing the markdown output of LlamaParse into a set of table and text nodes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core.node_parser import MarkdownElementNodeParser\n",
|
||||
"\n",
|
||||
"node_parser = MarkdownElementNodeParser(\n",
|
||||
" llm=OpenAI(model=\"gpt-3.5-turbo-0125\"), num_workers=8\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nodes = node_parser.get_nodes_from_documents(documents)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"base_nodes, objects = node_parser.get_nodes_and_objects(nodes)\n",
|
||||
"\n",
|
||||
"recursive_index = VectorStoreIndex(nodes=base_nodes + objects)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_engine = recursive_index.as_query_engine(similarity_top_k=25)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Querying the model for coverage"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"You are covered for the expenses incurred on any alternate travel booking under any mode of transport, up to the limit of the Sum Insured as mentioned in the Certificate of insurance, if the delay of the airlines was caused due to specific reasons outlined in the policy. The amount you are covered for will depend on the specific terms and conditions of your policy, including the maximum coverage limit specified in the Certificate of insurance.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query_1 = \"My trip was delay and I paid 45, how much am I cover for?\"\n",
|
||||
"\n",
|
||||
"response_1 = query_engine.query(query_1)\n",
|
||||
"print(str(response_1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The information is split across the document which leads to retrieval issues. Let's try some parsing instructions to improve our result."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id ec9e77c9-6ad9-4c9b-9efb-c9f659b0d481\n",
|
||||
"....."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"documents_with_instruction = LlamaParse(\n",
|
||||
" result_type=\"markdown\",\n",
|
||||
" parsing_instruction=\"\"\"\n",
|
||||
"This document is an insurance policy.\n",
|
||||
"When a benefits/coverage/exlusion is describe in the document ammend to it add a text in the follwing benefits string format (where coverage could be an exclusion).\n",
|
||||
"\n",
|
||||
"For {nameofrisk} and in this condition {whenDoesThecoverageApply} the coverage is {coverageDescription}. \n",
|
||||
" \n",
|
||||
"If the document contain a benefits TABLE that describe coverage amounts, do not ouput it as a table, but instead as a list of benefits string.\n",
|
||||
" \n",
|
||||
"\"\"\",\n",
|
||||
").load_data(\"./policy.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let see how the 2 parsing compare (change target page to explore)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"## Inpatient treatment\n",
|
||||
"\n",
|
||||
"Claim Form (filled and signed by pe Insured)\n",
|
||||
"Hospital Daily Cash\n",
|
||||
"Release of Medical information Form (filled and signed by pe Insured)\n",
|
||||
"Waiver of Deductible\n",
|
||||
"Original papological and diagnostic reports, discharge summary indoor case papers (if any) and prescriptions issued by pe treating Medical practitioner or Network Provider\n",
|
||||
"Optional Co-payment\n",
|
||||
"Adventure Sports Cover\n",
|
||||
"Home to Home Cover\n",
|
||||
"Passport and Visa copy wip Entry Stamp of Country of Visit and exit Stamp from India\n",
|
||||
"Extension to in-patient care\n",
|
||||
"Ambulance Charge\n",
|
||||
"FIR report of police (if applicable)\n",
|
||||
"\n",
|
||||
"## Out-patient treatment\n",
|
||||
"\n",
|
||||
"Cancer Screening & Mammographic Examination\n",
|
||||
"Original bills and receipts for:\n",
|
||||
"1. Charges paid towards Hospital accommodation, nursing facilities, and oper medical services rendered\n",
|
||||
"2. Fees paid to pe Medical Practitioner and for special nursing charges\n",
|
||||
"3. Charges incurred towards any and all test and / or examinations rendered in connection wip pe treatment\n",
|
||||
"4. Charges incurred towards medicines or drugs purchased from a registered pharmacy oper pan pe Network provider duly supported by pe prescriptions of pe Medical Practitioner attending to pe Insured Person\n",
|
||||
"5. Any oper document as required by pe Company to assist pe Claim\n",
|
||||
"\n",
|
||||
"## Medical evacuation\n",
|
||||
"\n",
|
||||
"Medical reports and transportation details issued by the evacuation agency, prescriptions and medical report by the attending Medical Practitioner furnishing the name of the Insured Person and details of treatment rendered along with the statement confirming the necessity of evacuation.\n",
|
||||
"\n",
|
||||
"Documentary proof for expenses incurred towards the Medical Evacuation.\n",
|
||||
"\n",
|
||||
"## Compassionate visit\n",
|
||||
"\n",
|
||||
"A certificate from the Medical Practitioner recommending the presence in the form of special assistance to be rendered by an additional member during the entire period of hospitalization. The certificate shall also specify the minimum period in which person is admitted in the hospital.\n",
|
||||
"\n",
|
||||
"Discharge summary of the Hospital furnishing details including the date of admission and date of discharge.\n",
|
||||
"\n",
|
||||
"Stamped boarding pass with invoice used for the travel by the Immediate Family Member.\n",
|
||||
"\n",
|
||||
"Copy passport of Immediate Family Member with entry and exit stamp.\n",
|
||||
"\n",
|
||||
"## Escort of Minor Child\n",
|
||||
"\n",
|
||||
"A certificate from the Medical Practitioner specifying the minimum period of Hospitalization.\n",
|
||||
"\n",
|
||||
"Discharge summary of the Hospital furnishing details including the date of admission and date of discharge.\n",
|
||||
"\n",
|
||||
"Stamped Boarding pass used for the return travel of the child to the Country of Residence.\n",
|
||||
"\n",
|
||||
"Stamped Boarding pass of the attendant from the Country of Residence to the place of hospitalization (if attendant is necessary).\n",
|
||||
"\n",
|
||||
"Copy of passport of the child with entry and exit stamp.\n",
|
||||
"\n",
|
||||
"## Upgradation to Business Class\n",
|
||||
"\n",
|
||||
"A certificate from the Medical Practitioner specifying the minimum period of Hospitalization.\n",
|
||||
"\n",
|
||||
"Discharge summary of the Hospital furnishing the details including the date of admission and date of discharge.\n",
|
||||
"\n",
|
||||
"Product Name: Travel infinity | Product UIN: NBHTGBP22011V012223\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"=========================================================\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Insurance Policy\n",
|
||||
"\n",
|
||||
"## Benefits:\n",
|
||||
"\n",
|
||||
"- For Inpatient treatment and in this condition when admitted to a hospital, the coverage is reimbursement for medical expenses incurred.\n",
|
||||
"- For Hospital Daily Cash and in this condition when hospitalized, the coverage is daily cash benefit.\n",
|
||||
"- For Waiver of Deductible and in this condition when a deductible is applicable, the coverage is waiver of the deductible amount.\n",
|
||||
"- For Optional Co-payment and in this condition when a co-payment is required, the coverage is optional co-payment.\n",
|
||||
"- For Adventure Sports Cover and in this condition when participating in adventure sports, the coverage is coverage for injuries related to adventure sports.\n",
|
||||
"- For Home to Home Cover and in this condition when requiring medical evacuation, the coverage is assistance for repatriation to home country.\n",
|
||||
"- For Extension to in-patient care and in this condition when extended hospital stay is necessary, the coverage is extension of coverage for in-patient care.\n",
|
||||
"- For Ambulance Charge and in this condition when ambulance services are utilized, the coverage is reimbursement for ambulance charges.\n",
|
||||
"- For Out-patient treatment and in this condition when receiving outpatient medical care, the coverage is reimbursement for outpatient medical expenses.\n",
|
||||
"- For Cancer Screening & Mammographic Examination and in this condition when undergoing cancer screening or mammographic examination, the coverage is coverage for these preventive services.\n",
|
||||
"- For New Born baby Cover and in this condition when a newborn is covered under the policy, the coverage is medical expenses coverage for the newborn.\n",
|
||||
"- For Maternity and in this condition when maternity services are required, the coverage is coverage for maternity expenses.\n",
|
||||
"- For Complete pre-existing disease cover and in this condition when seeking treatment for pre-existing conditions, the coverage is coverage for pre-existing conditions.\n",
|
||||
"- For Medical sum insured replenishment in case of hospitalization due to accident and in this condition when hospitalized due to an accident, the coverage is replenishment of the sum insured.\n",
|
||||
"- For Waiver of sublimit for insured above 60 years of age and in this condition when the insured is above 60 years of age, the coverage is waiver of sublimits.\n",
|
||||
"- For Psychiatric Counseling and in this condition when seeking psychiatric counseling, the coverage is coverage for psychiatric counseling services.\n",
|
||||
"- For Physiotherapy and in this condition when undergoing physiotherapy, the coverage is coverage for physiotherapy sessions.\n",
|
||||
"- For Terrorism cover and in this condition when affected by terrorism, the coverage is coverage for medical expenses related to terrorism incidents.\n",
|
||||
"- For Medical tele-consultation and in this condition when consulting a medical practitioner remotely, the coverage is coverage for tele-consultation services.\n",
|
||||
"- For Medical evacuation and in this condition when requiring medical evacuation, the coverage is coverage for medical evacuation services.\n",
|
||||
"- For Compassionate visit and in this condition when requiring a compassionate visit, the coverage is coverage for travel expenses for a family member to visit.\n",
|
||||
"- For Escort of Minor Child and in this condition when escorting a minor child for medical treatment, the coverage is coverage for escort services for the child.\n",
|
||||
"- For Upgradation to Business Class and in this condition when requiring upgradation to business class for medical travel, the coverage is coverage for upgradation to business class.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"target_page = 45\n",
|
||||
"pages_vanilla = documents[0].text.split(\"\\n---\\n\")\n",
|
||||
"pages_with_instructions = documents_with_instruction[0].text.split(\"\\n---\\n\")\n",
|
||||
"\n",
|
||||
"print(pages_vanilla[target_page])\n",
|
||||
"print(\"\\n\\n=========================================================\\n\\n\")\n",
|
||||
"print(pages_with_instructions[target_page])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"node_parser_instruction = MarkdownElementNodeParser(\n",
|
||||
" llm=OpenAI(model=\"gpt-3.5-turbo-0125\"), num_workers=8\n",
|
||||
")\n",
|
||||
"nodes_instruction = node_parser.get_nodes_from_documents(documents_with_instruction)\n",
|
||||
"(\n",
|
||||
" base_nodes_instruction,\n",
|
||||
" objects_instruction,\n",
|
||||
") = node_parser_instruction.get_nodes_and_objects(nodes_instruction)\n",
|
||||
"\n",
|
||||
"recursive_index_instruction = VectorStoreIndex(\n",
|
||||
" nodes=base_nodes_instruction + objects_instruction\n",
|
||||
")\n",
|
||||
"query_engine_instruction = recursive_index_instruction.as_query_engine(\n",
|
||||
" similarity_top_k=25\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Comparing Instruction-Augmented Parsing vs. Vanilla Parsing\n",
|
||||
"\n",
|
||||
"When we parse the document with natural language instructions to add context on insurance coverage, we are able to correctly answer a wide range of queries in our RAG pipeline. In contrast, a RAG pipeline built with the vanilla method is not able to answer these queries."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Vanilla:\n",
|
||||
"You are covered for the amount you paid due to the trip delay, up to the limit specified in the certificate of insurance.\n",
|
||||
"With instructions:\n",
|
||||
"For Trip Delay coverage, you are covered for a fixed benefit amount as mentioned in the certificate of insurance for every block of hours of delay.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query_1 = \"My trip was delayed and I paid 45, how much am I covered for?\"\n",
|
||||
"\n",
|
||||
"response_1 = query_engine.query(query_1)\n",
|
||||
"print(\"Vanilla:\")\n",
|
||||
"print(response_1)\n",
|
||||
"\n",
|
||||
"print(\"With instructions:\")\n",
|
||||
"response_1_i = query_engine_instruction.query(query_1)\n",
|
||||
"print(response_1_i)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Looking at the policy it says in list I that one expense not covered is Baby food"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Vanilla:\n",
|
||||
"Baby food is not explicitly mentioned in the provided context information regarding insurance coverages and benefits.\n",
|
||||
"With instructions:\n",
|
||||
"Baby food is excluded from coverage according to the policy terms.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query_2 = \"I just had a baby, is baby food covered?\"\n",
|
||||
"\n",
|
||||
"response_2 = query_engine.query(query_2)\n",
|
||||
"print(\"Vanilla:\")\n",
|
||||
"print(response_2)\n",
|
||||
"\n",
|
||||
"print(\"With instructions:\")\n",
|
||||
"response_2_i = query_engine_instruction.query(query_2)\n",
|
||||
"print(response_2_i)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Vanilla:\n",
|
||||
"Gauze used in your operation would typically be covered under the \"Emergency In-patient Medical Treatment\" or \"Emergency In-patient Medical Treatment with OPD\" benefits of the policy.\n",
|
||||
"With instructions:\n",
|
||||
"Gauze is not covered for use in your operation as it falls under the category of items that are excluded from coverage in the insurance policy.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query_3 = \"How is gauze used in my operation covered?\"\n",
|
||||
"\n",
|
||||
"response_3 = query_engine.query(query_3)\n",
|
||||
"print(\"Vanilla:\")\n",
|
||||
"print(response_3)\n",
|
||||
"\n",
|
||||
"print(\"With instructions:\")\n",
|
||||
"response_3_i = query_engine_instruction.query(query_3)\n",
|
||||
"print(response_3_i)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llama_parse",
|
||||
"language": "python",
|
||||
"name": "llama_parse"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -1,444 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "28d15ea5-a3eb-4ee5-9d91-8dbd95e53129",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Multi-Language Support in LlamaParse\n",
|
||||
"\n",
|
||||
"LlamaParse supports users to specify a `language` parameter before uploading documents, giving users better OCR capabilities over non-English PDFs, parsing images into more accurate representations.\n",
|
||||
"\n",
|
||||
"You can specify 80+ different languages: see this file for a full list of supported languages: https://github.com/run-llama/llama_parse/blob/main/llama_parse/base.py.\n",
|
||||
"\n",
|
||||
"This notebook shows a demo of this in action. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "15539193-2f5c-4ecf-9ca4-9aee6f888468",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install llama-index llama-parse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "87322210-c21c-43d6-b459-2e8a828ac576",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# llama-parse is async-first, running the sync code in a notebook requires the use of nest_asyncio\n",
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2b5cabdf-342a-42d2-8ad4-0ba7c46cdfb9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load in a French PDF\n",
|
||||
"\n",
|
||||
"We load in the 2022 annual report from Agence France Tresor."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e81e0a08-3a99-42e6-adcc-00bb4ce1c3d4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget \"https://www.dropbox.com/scl/fi/fxg17log5ydwoflhxmgrb/treasury_report.pdf?rlkey=mdintk0o2uuzkple26vc4v6fd&dl=1\" -O treasury_report.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ecfc578c-3c7f-4ec1-aa06-51565c28632b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id 476966e1-9e04-49e7-a5dc-952b053b8b94\n",
|
||||
"......"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"parser = LlamaParse(result_type=\"text\", language=\"fr\")\n",
|
||||
"documents = parser.load_data(\"./treasury_report.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0c37db27-3496-4a59-918b-701c9ad7706d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" ET GESTION DE LA DETTE DE L’ÉTAT\n",
|
||||
" P.56 FOCUS OAT VERTES\n",
|
||||
" P.60 CONTRÔLE DES RISQUES & POST-MARCHÉ\n",
|
||||
" Chiffres de l’exercice 2022 P.64 À 105\n",
|
||||
" P.65 ACTIVITÉ DE L’AFT\n",
|
||||
" P.84 RAPPORT STATISTIQUE\n",
|
||||
" FICHES TECHNIQUES GLOSSAIRES LISTE DES ABRÉVIATIONS\n",
|
||||
" P.106 P.118 P.122\n",
|
||||
" AGENCE FRANCE TRÉSOR - RAPPORT D’ACTIVITÉ 2022 3\n",
|
||||
"---\n",
|
||||
" Édito\n",
|
||||
" 111 Avec une croissance\n",
|
||||
" de +2,5 %, la France a illustré\n",
|
||||
" une nouvelle fois sa résilience\n",
|
||||
" économique face aux chocs.\n",
|
||||
"4 AGENCE FRANCE TRÉSOR - RAPPORT D’ACTIVITÉ 2022\n",
|
||||
"---\n",
|
||||
" L’économie française en 2022 :\n",
|
||||
" résilience face aux chocs géopolitiques\n",
|
||||
" et économiques\n",
|
||||
" sa résilience économique face aux lors du dernier trimestre de 2022.\n",
|
||||
"LE DÉBUT DE chocs. Cette croissance a été permise Malgré un climat des affaires impacté\n",
|
||||
"L’ANNÉE 2022 grâce à une forte demande intérieure par l’inflation, le soutien apporté\n",
|
||||
" alimentée par le dynamisme de aux TPE/PME leur a permis de faire\n",
|
||||
"SEMBLAIT l’investissement et, en dépit de face aux défis énergétiques tout en\n",
|
||||
" l’inflation, d’une résilience de la préservant l’emploi.\n",
|
||||
"ENGAGÉ DANS consommation des ménages sur une\n",
|
||||
" grande partie de l’année. Afin de combattre l’inflation qui a\n",
|
||||
"UNE DYNAMIQUE largement dépassé la cible de 2 %,\n",
|
||||
" Le taux d’inflation des prix à la la BCE, de concert avec les banques\n",
|
||||
"EFFICACE DE consommation français est resté l’un centrales des principales économies\n",
|
||||
"SORTIE DE CRISE des plus bas d’Europe avec +6,0 % développées, a adapté sa fonction de\n",
|
||||
" en 2022, s’appuyant, d’une part, sur réaction en mettant fin aux politiques\n",
|
||||
"PORTÉE PAR l’atout structurel que représente un d’assouplissement monétaire qu’elle\n",
|
||||
" mix énergétique parmi les moins menait depuis la crise financière de\n",
|
||||
"UNE REPRISE exposés à la Russie et, d’autre part, 2008. Ainsi, dès juillet 2022, et pour\n",
|
||||
" sur les politiques proactives du la première fois en 10 ans, la BCE a\n",
|
||||
"ÉCONOMIQUE gouvernement avec la mise en place augmenté ses taux directeurs. Les\n",
|
||||
" du bouclier tarifaire, de la remise taux d’emprunts de l’État à 10 ans se\n",
|
||||
"INÉDITE carburant et du chèque énergie. sont ainsi progressivement éloignés\n",
|
||||
"AMORCÉE Ces dispositifs, temporaires, ont de leur territoire négatif pour\n",
|
||||
" été progressivement supprimés : la atteindre 3,10 % en fin d’année.\n",
|
||||
"EN 2021. remise carburant, d’abord prolongée\n",
|
||||
" jusqu’à mi-novembre a pris fin Cette décision s’est également\n",
|
||||
"Le déclenchement de la guerre en en décembre 2022, tandis que le accompagnée de la fin du\n",
|
||||
"Ukraine par la Russie dès février a chèque énergie exceptionnel a pris programme d’achat d’urgence (PEPP)\n",
|
||||
"rebattu les cartes de cet équilibre, fin en mars 2023. mis en place pendant la pandémie,\n",
|
||||
"provoquant des bouleversements suivi de la réduction progressive de\n",
|
||||
"majeurs sur les plans géopolitiques et Le marché du travail français a par son bilan, à un rythme mensuel de 15\n",
|
||||
"économiques, avec le déploiement ailleurs montré toute sa robustesse, milliards d’euros par mois.\n",
|
||||
"de sanctions à l’encontre de la Russie la dynamique de reprise initiée en\n",
|
||||
"et une forte poussée inflationniste. 2021 ainsi que l’effet des réformes L’Agence France Trésor a fait face à ce\n",
|
||||
"Face à cette situation, les principales structurelles engagées les années contexte de grands bouleversements\n",
|
||||
"banques centrales mondiales, dont précédentes permettant au taux géopolitiques, économiques et\n",
|
||||
"la Banque centrale européenne d’emploi des Français âgés de 15 à 64 financiers en s’appuyant sur ses\n",
|
||||
"(BCE), ont engagé une politique de ans d’atteindre fin 2022 un niveau principes de régularité, de prévisibilité\n",
|
||||
"normalisation monétaire rapide de 68,1 %, un record depuis 1975. et de transparence. Cette stratégie\n",
|
||||
"pour lutter contre l’inflation. La reprise économique de début s’est de nouveau révélée robuste et,\n",
|
||||
"Parallèlement, le gouvernement d’année et les effets positifs du plan alliée à l’engagement et à l’efficacité\n",
|
||||
"français a mis en place des mesures France Relance ont permis la création de ses équipes, ainsi qu’à la qualité\n",
|
||||
"(à hauteur de 43,6 milliards d’euros de 337 100 emplois, essentiellement de crédit de la signature de la France,\n",
|
||||
"sur l’année 2022) pour protéger les dans le secteur salarié marchand. Ce lui a permis d’accomplir sa mission\n",
|
||||
"entreprises et les ménages. dynamisme a aussi conduit à la chute de financement de l’action publique\n",
|
||||
" du taux de chômage, atteignant son au bénéfice de tous.\n",
|
||||
"Avec une croissance de +2,5 %, la niveau le plus bas depuis mars 2008\n",
|
||||
"France a illustré une nouvelle fois avec 7,2 % de demandeurs d’emploi\n",
|
||||
" Emmanuel Moulin\n",
|
||||
" DIRECTEUR GÉNÉRAL DU TRÉSOR\n",
|
||||
" ET PRÉSIDENT DE L’AFT\n",
|
||||
" AGENCE FRANCE TRÉSOR - RAPPORT D’ACTIVITÉ 2022 5\n",
|
||||
"---\n",
|
||||
" du directeur général Le mot\n",
|
||||
" 011 En 2022, le choc d’inflation\n",
|
||||
" et la normalisation\n",
|
||||
" de la politique monétaire\n",
|
||||
" ont mis fin à une décennie\n",
|
||||
" de taux historiquement bas.\n",
|
||||
"6 AGENCE FRANCE TRÉSOR - RAPPORT D’ACTIVITÉ 2022\n",
|
||||
"---\n",
|
||||
" MALGRÉ UN CONTEXTE DE MARCHÉ MOUVEMENTÉ ET LES MESURES D’AMPLEUR\n",
|
||||
" PRISES POUR LIMITER L’IMPACT DE L’INFLATION SUR LES MÉNAGES ET\n",
|
||||
" LES ENTREPRISES, LE PROGRAMME DE FINANCEMENT À MOYEN ET LONG TERME\n",
|
||||
" EST DEMEURÉ INCHANGÉ À 260 MILLIARDS D’EUROS, STABLE PAR RAPPORT\n",
|
||||
" À 2021, ET LA DETTE DE COURT TERME A ÉTÉ RÉDUITE DE 7 MILLIARDS D’EUROS.\n",
|
||||
"En janvier 2022, la normalisation de d’obligations indexées sur l’inflation, la dette de court terme a été réduite\n",
|
||||
"la politique monétaire en zone euro sur lequel a été enregistré un de 7 milliards d’euros. En effet, le\n",
|
||||
"était une perspective de moyen supplément d’indexation supérieur dynamisme des recettes fiscales et\n",
|
||||
"terme. Quelques semaines plus tard, de 17 milliards d’euros à celui de la trésorerie levée lors de la crise\n",
|
||||
"l’invasion de l’Ukraine par la Russie l’année 2021. Il s’est également sanit\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(documents[0].get_content()[1000:10000])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "be161577-7b1e-4710-b721-f549feb8e6d0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Download Chinese PDF"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ac332ea3-cfff-4216-b292-62410a26c336",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--2024-02-28 16:41:26-- https://www.dropbox.com/scl/fi/g5ojyzk4m44hl7neut6vc/chinese_pdf.pdf?rlkey=45reu51kjvdvic6zucr8v9sh3&dl=1\n",
|
||||
"Resolving www.dropbox.com (www.dropbox.com)... 162.125.13.18\n",
|
||||
"Connecting to www.dropbox.com (www.dropbox.com)|162.125.13.18|:443... connected.\n",
|
||||
"HTTP request sent, awaiting response... 302 Found\n",
|
||||
"Location: https://uc7a03fdb7d960dbedb23e9298ab.dl.dropboxusercontent.com/cd/0/inline/COJ69Wg2e7wH9S0ELzl4j4znoonRSQS-JJrH6mxy_vcrvY-KV7f10kMyQH6IYmtfMh_9xcDNOYnLkWkwMTYItwE1XQB5nqXbjmLJ4jLbDrMeu7-b49m796ctxevwnp7k1_U/file?dl=1# [following]\n",
|
||||
"--2024-02-28 16:41:27-- https://uc7a03fdb7d960dbedb23e9298ab.dl.dropboxusercontent.com/cd/0/inline/COJ69Wg2e7wH9S0ELzl4j4znoonRSQS-JJrH6mxy_vcrvY-KV7f10kMyQH6IYmtfMh_9xcDNOYnLkWkwMTYItwE1XQB5nqXbjmLJ4jLbDrMeu7-b49m796ctxevwnp7k1_U/file?dl=1\n",
|
||||
"Resolving uc7a03fdb7d960dbedb23e9298ab.dl.dropboxusercontent.com (uc7a03fdb7d960dbedb23e9298ab.dl.dropboxusercontent.com)... 162.125.13.15\n",
|
||||
"Connecting to uc7a03fdb7d960dbedb23e9298ab.dl.dropboxusercontent.com (uc7a03fdb7d960dbedb23e9298ab.dl.dropboxusercontent.com)|162.125.13.15|:443... connected.\n",
|
||||
"HTTP request sent, awaiting response... 302 Found\n",
|
||||
"Location: /cd/0/inline2/COKEp-d6ZqzrIIaPRlanov72wwnd7GX5eNSPnsxug0A8pOpek8hO6eFxp84cY3_NMBRsAqtX-IIVPpcfYHNoV__mpu1SsOV8wV8a68DwVKaVJRJriY_KV8lEFocvLgf7c7mhrREbIJ1UBN2fx6S_qWegwVIen1z1-pw-K7icMnA3EKJNqM9DFtqx9ct0FI4vdYGsv8ckLF26WgAhs96k1cHn-VRJle4SKstdYs8EmBxiuFLXZRCL3gljwAsLu3J6WRvis9v7VJ2zNhgrcT-ZnVujlpQGoGWLLPmREKffK608Xfz1XE35DzO28e_mm4SUPRfsP2mvIUrJUtUrhobR4siqQRGojxi0S7-da4Y7fpB4Tw/file?dl=1 [following]\n",
|
||||
"--2024-02-28 16:41:27-- https://uc7a03fdb7d960dbedb23e9298ab.dl.dropboxusercontent.com/cd/0/inline2/COKEp-d6ZqzrIIaPRlanov72wwnd7GX5eNSPnsxug0A8pOpek8hO6eFxp84cY3_NMBRsAqtX-IIVPpcfYHNoV__mpu1SsOV8wV8a68DwVKaVJRJriY_KV8lEFocvLgf7c7mhrREbIJ1UBN2fx6S_qWegwVIen1z1-pw-K7icMnA3EKJNqM9DFtqx9ct0FI4vdYGsv8ckLF26WgAhs96k1cHn-VRJle4SKstdYs8EmBxiuFLXZRCL3gljwAsLu3J6WRvis9v7VJ2zNhgrcT-ZnVujlpQGoGWLLPmREKffK608Xfz1XE35DzO28e_mm4SUPRfsP2mvIUrJUtUrhobR4siqQRGojxi0S7-da4Y7fpB4Tw/file?dl=1\n",
|
||||
"Reusing existing connection to uc7a03fdb7d960dbedb23e9298ab.dl.dropboxusercontent.com:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 8074860 (7.7M) [application/binary]\n",
|
||||
"Saving to: ‘chinese_pdf.pdf’\n",
|
||||
"\n",
|
||||
"chinese_pdf.pdf 100%[===================>] 7.70M 37.9MB/s in 0.2s \n",
|
||||
"\n",
|
||||
"2024-02-28 16:41:28 (37.9 MB/s) - ‘chinese_pdf.pdf’ saved [8074860/8074860]\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!wget \"https://www.dropbox.com/scl/fi/g5ojyzk4m44hl7neut6vc/chinese_pdf.pdf?rlkey=45reu51kjvdvic6zucr8v9sh3&dl=1\" -O chinese_pdf.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "45235b17-08f0-48f1-92aa-06711225860b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id 0089f0b6-29ee-4e94-a8bf-49a137666f15\n",
|
||||
".........."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"parser = LlamaParse(result_type=\"text\", language=\"ch_sim\")\n",
|
||||
"documents = parser.load_data(\"./chinese_pdf.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f0d546cc-6549-4cf5-8b37-0896f4e8d43d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"中国投资有限责任公司2022年度报告 5\n",
|
||||
"---\n",
|
||||
"企业文化与核心价值观\n",
|
||||
"使命 核心价值观\n",
|
||||
" 致力于实现国家外汇资金多元化投资,在可接受风险范围内 责任 合力\n",
|
||||
" 实现股东权益最大化,以服务于国家经济发展和深化金融体\n",
|
||||
" 制改革的需要 忠于使命、勤勉尽责 立足大局、有效协同\n",
|
||||
" 是公司遵奉的核心价值取向 是实现公司可持续发展的关键\n",
|
||||
" 愿景 专业 进取\n",
|
||||
" 成为受人尊重的国际一流主权财富基金 坚持良好的专业精神和职业操守 求知进取、追求卓越\n",
|
||||
" 是公司成功的基石 是公司成功和发展壮大的内驱力\n",
|
||||
"---\n",
|
||||
"01 我们将一以贯之地践行全球发展倡议,充分维护投资东道国利益,\n",
|
||||
" 积极投身可持续投资,助力世界经济实现更高质量、更有韧性的发展。\n",
|
||||
" 致 辞\n",
|
||||
" 3 中国投资有限责任公司2022年度报告 中国投资有限责任公司2022年度报告 4\n",
|
||||
"---\n",
|
||||
" “行之力则知愈进,知之深则行愈达。”站在新的历史起点上,中投公司\n",
|
||||
" 将继续秉承精益求精、追求卓越的专业精神,与国内外合作伙伴一起深化\n",
|
||||
" 合作,共聚力量、共迎挑战、共享成果,开启打造世界一流主权财富基金\n",
|
||||
" 的新篇章,为助力全球经济发展作出新贡献! #Ave彭纯\n",
|
||||
" 董事长\n",
|
||||
" 2022年,是中投公司成立十五周年。\n",
|
||||
"董事长致辞 自2007年成立以来,中投公司坚守长期机构投资者定位,坚持国际化、市场化、专业化、负责任原则,搭\n",
|
||||
" 建起符合大型国际投资机构特点的治理架构,形成了系统完备的投资管理体系,经受住了国际金融危机、世纪\n",
|
||||
" 疫情等多个历史罕见的风险与挑战。如今,公司对外投资业务覆盖国际市场主要资产类别以及全球110多个国家\n",
|
||||
" 和地区,培养了一支高素质专业化的投资管理人才队伍,搭建了互利共赢的投资合作“朋友圈”,长期投资收\n",
|
||||
" 益超越董事会制定的考核目标,为促进国家外汇资产保值增值、服务国内国际双循环作出了积极贡献,在推动\n",
|
||||
" 全球投资合作、助力世界经济增长中贡献了中投力量,书写了中国主权财富基金不平凡的创业发展史。\n",
|
||||
"5 中国投资有限责任公司2022年度报告 中国投资有限责任公司2022年度报告 6\n",
|
||||
"---\n",
|
||||
" 2022年以来,全球地缘政治风险显著攀升,产业链供应链持续调整重构,美欧央行大幅加息,国际资本 我们守正创新,坚决践行双碳与可持续发展理念。更加包容、更加普惠、更有韧性的发展是全球\n",
|
||||
"市场剧烈震荡,MSCI全球股票指数、彭博全球债券指数一度自高点下跌超过22%、13%。面对风高浪急的国 可持续发展的关键。我们积极履行负责任投资者理念,制定《关于践行双碳目标和可持续投资行动的意见》,\n",
|
||||
"际环境和前所未有的巨大挑战,公司保持战略定力,发挥长期机构投资者优势,不断优化资产配置和投资策 积极开展气候变化、能源转型等主题投资。我们发布《运营碳中和行动计划》,明确时间表和路线图,全力实\n",
|
||||
"略,着力提升总组合韧性,加强重点领域风险防控,年度投资收益跑赢大市;截至2022年底,过去十年对外 现节能减排目标。我们探索以绿色资源引领乡村发展的新方法,在四个定点帮扶县持续推进巩固脱贫成果与乡\n",
|
||||
"投资年化净收益率按美元计算为6.43%,超出十年业绩目标26个基点;自成立以来累计年化国有资本增值率达 村振兴的有效衔接,助力民生保障与产业扶持,积极履行企业社会责任。\n",
|
||||
"到12.67%,圆满完成五年战略规划主要目标任务。 面向未来,我们坚信,发展与合作是破解全球性问题的“钥匙”。中投公司将一以贯之地践行全球发展倡\n",
|
||||
" 我们矢志不渝,积极打造世界一流主权财富基金。长期资本对于促进世界经济持续发展有着不 议,秉持互利共赢理念,以资本为纽带,促进国际产业交流合作,推动世界互联互通;充分维护投资东道国利\n",
|
||||
"可替代的作用。我们坚持国际化、市场化、专业化、负责任原则,快速恢复常态化对外交流交往,按照互利共 益,与东道国共创价值、共享价值;积极投身可持续投资,推动被投企业履行社会责任,助力世界经济实现更\n",
|
||||
"赢原则深化与国内外各类机构合作,持续为世界经济发展提供长期资本支持。我们积极创新对外投资方式,稳 高质量、更有韧性的发展。\n",
|
||||
"健运行多支新型双边基金,新设相关投资合作平台,深入推进中国市场价值创造,促进被投资公司拓展市场空\n",
|
||||
"间,助推国际投资与产业合作高质量发展。 经济全球化的潮流不可阻挡。我们呼吁各国携起手来,做多边主义的坚定维护者,打造更加开放有序的投\n",
|
||||
" 资环境,便利资本和资源要素在全球顺畅流动。我们尊重各方的利益关切,在开放中捕捉投资机遇,以务实合\n",
|
||||
" 我们直面挑战,着力加强自主投资能力建设。面对持续动荡的国际金融市场,我们锚定配置方 作应对共同挑战,并肩前进分享发展红利,推动世界经济平稳运行和持续增长。\n",
|
||||
"向,强化研究驱动,有序实施组合调整、策略优化,及时调整公开市场投资布局,质量并重推进非公开市场投\n",
|
||||
"资,完成另类资产投资占比50%的资产配置目标,对外投资总组合的韧性和质量不断提高。我们持续深化投资 “行之力则知愈进,知之深则行愈达。”过去的十五年,是中投人不惧挑战、接续奋斗的十五\n",
|
||||
"管理体制机制改革,统一非公开市场投资决策制度流程,配强投资决策专职委员并设立支持团队,投资管理科 年。 2023年是中投人落实新一轮战略规划的开局之年。上半年,在风高浪急的国际环境下,中投公司锚定战略目\n",
|
||||
"学化、专业化水平得到进一步提升。 标,统筹好发展和安全,取得了良好业绩,实现了良好开局。近期,公司部分董事更换,我们对离任董事在指导和支\n",
|
||||
" 持公司完善公司治理、深化投资管理体制机制改革、应对国际市场风险挑战等方面所作的贡献表示衷心感谢,对新\n",
|
||||
" 我们勇担使命,坚定走好中国特色金融发展之路。面对新征程新要求,我们坚持发挥“积极股 任董事表示热烈欢迎。站在新的历史起点上,中投公司将完整、准确、全面贯彻新发展理念,积极助力构建新发展格\n",
|
||||
"东”作用,督促控参股金融企业优化产品服务、加大资源倾斜力度,全力支持稳经济稳增长。我们积极创新完 局,牢牢把握高质量发展首要任务,继续秉承精益求精、追求卓越的专业精神,与国内外合作伙伴一起深化合作,共\n",
|
||||
"善“汇金模式”,推动优化国有金融资本布局,以市场化方式参与问题金融机构救助,助力金融市场稳定健康 聚力量、共迎挑战、共享成果,开启打造世界一流主权财富基金的新篇章,为助力全球经济发展作出新贡献!\n",
|
||||
"发展。我们主动适应新形势新要求,围绕国有金融资本管理体系建设等重大课题深入研究,压实派出董事自主\n",
|
||||
"履职责任,不断提升机构化履职能力。\n",
|
||||
" 我们坚守底线,持续夯实全面风险管理体系。面对风高浪急的国际环境,我们优化风险管理委员\n",
|
||||
"会设置,修订全面风险管理基本制度,增加风险类别的覆盖度,全面提升风险预见、应对、处置水平。在对外投\n",
|
||||
"资方面,我们严守法律合规底线,健全地缘政治、气候变化等非传统风险防控机制,突出抓好流动性管理,对外\n",
|
||||
"投资总组合风险保持在董事会规定的容忍度内。在国有金融资本受托管理方面,我们建立健全控参股金融企业风\n",
|
||||
"险监测体系,全面开展多维度风险画像,推动控参股金融企业风险减存量、控增量、防变量取得积极成效。\n",
|
||||
"7 中国投资有限责任公司2022年度报告 中国投资有限责任公司2022年度报告 8\n",
|
||||
"---\n",
|
||||
"02 中投公司的组建宗旨是实现国家外汇资金多元化投资,在可接受风\n",
|
||||
" 险范围内实现股东权益最大化,以服务于国家宏观经济发展和深化\n",
|
||||
" 公 司 介 绍 金融体制改革的需要。\n",
|
||||
" 9 中国投资有限责任公司2022年度报告 中国投资有限责任公司2022年度报告 10\n",
|
||||
"---\n",
|
||||
"公司概况中国投资有限责任公司(以下简称“中投公司”)依照《中华人民共和国公司法》(以下简称“《公司 公司治理 中投公司按照《公司法》及《中国投资有限责任公司章程》(以下简称“《中投公司章程》”)中的有关规\n",
|
||||
"法》”)于2007年9月成立,总部设在北京。中投公司的初始资本金为2000亿美元,由中国财政部发行1.55万 定,设立了董事会、监事会和执行委员会(以下简称“执委会”),三者之间权责明确、独立履职、有效制衡。\n",
|
||||
"亿元人民币特别国债募集。截至2022年底,公司总资产达1.24万亿美元。 2022年,中投公司健全完善董事会、监事会运行机制,强化下设专门委员会的职能发挥,持续提升公司治\n",
|
||||
" 中投公司的组建宗旨是实现国家外汇资金多元化投资,在可接受风险范围内实现股东权益最大化,以服务于 理效能。公司根据业务发展需要,优化调整投资管理架构,完善投资决策和投后管理制度机制,深化全面风险管\n",
|
||||
"国家宏观经济发展和深化金融体制改革的需要。 理体系建设,全面提升机构化投资能力。\n",
|
||||
" 中投公司开展境外投资业务与境内金融机构股权管理工作。其中,境外投资业务由下设子公司⸺中投国际\n",
|
||||
"有限责任公司(以下简称“中投国际”)和中投海外直接投资有限责任公司(以下简称“中投海外”)承担,业\n",
|
||||
"务范围包括公开市场股票和债券投资,对冲基金和多资产,泛行业私募股权和私募信用投资,房地产、基础设\n",
|
||||
"施、资源商品、农业等领域的基金投资与直接投资,以及多双边基金管理等。 组织架构图\n",
|
||||
" 中央汇金投资有限责任公司(以下简称“中央汇金”)作为中投公司的子公司,根据国务院授权,对国有重\n",
|
||||
"点金融企业进行股权投资,以出资额为限代表国家依法对国有重点金融企业行使出资人权利和履行出资人义务。 董事会 监事会\n",
|
||||
"中央汇金不开展商业性经营活动,不干预其控股的国有重点金融企业的日常经营活动。 提名与\n",
|
||||
" 薪酬委员会\n",
|
||||
" 中投国际和中投海外开展的境外业务与中央汇金开展的境内业务之间实行严格的“防火墙”政策和措施。\n",
|
||||
" 战略与\n",
|
||||
" 社会责任\n",
|
||||
" 委员会\n",
|
||||
" 风险管理 执行 国际咨询 监督 审计\n",
|
||||
" 委员会 委员会 委员会 委员会 委员会\n",
|
||||
" 境外投资 管理与支持 境内股权\n",
|
||||
" 业务部门 部门 管理部门\n",
|
||||
"11 中国投资有限责任公司2022年度报告 中国投资有限责任公司2022年度报告 12\n",
|
||||
"---\n",
|
||||
"董事会 沈如军\n",
|
||||
" 党委委员、执行董事、副总经理\n",
|
||||
" 中投公司董事会行使《公司法》和《中投公司章程》中规定的有限责任公司董事会的职权,主要包括:审核 1964年出生,管理学博士,高级会计师。历任中国工商银行计划财务部副总经理、\n",
|
||||
"和批准公司的发展战略、经营方针和投资计划;确定公司需向股东报告的重大事项;制定公司年度预决算方案; 北京市分行副行长、财务会计部总经理、山东省分行行长,交通银行执行董事、副\n",
|
||||
"任免公司高级管理人员;决定或授权批准设立内部管理机构等。 行长。现任本公司党委委员、执行董事、副总经理。\n",
|
||||
" 董事会由执行董事、非执行董事、独立董事以及职工董事构成。 丛亮\n",
|
||||
" 2022年,面对复杂严峻的国际经济形势,董事会加强对公司重大经营管理事项的指导和督促,及时听取投 非执行董事\n",
|
||||
"资形势、经营管理、风险防控等汇报,认真审议经营计划、财务预算和决算、业绩考核等重要议题,深入谋划中 1971年出生,经济学博士。历任国家发展和改革委员会国民经济综合司副司长、司\n",
|
||||
"投公司新一轮战略规划,明确发展目标、基本原则和重点举措,为公司下一阶段改革发展描绘新的蓝图。董事会 长,国家发展和改革委员会秘书长、新闻发言人,国家发展和改革委员会副主任,\n",
|
||||
"专门委员会根据授权,重点关注关系企业长远发展的重大事项,为董事会出谋划策,推动公司高质量发展迈上新 国家粮食和物资储备局局长。现任国家发展和改革委员会副主任,并兼任本公司非\n",
|
||||
"台阶。 执行董事。\n",
|
||||
" 许宏才\n",
|
||||
" 非执行董事\n",
|
||||
"董事会成员 1963年出生,经济学学士。历任财政部预算司副司长、司长,财政部部长助理,财\n",
|
||||
" 政部副部长。现任全国人大财政经济委员会副主任委员、全国人大常委会预算工作\n",
|
||||
" 彭 纯 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(documents[0].get_content()[1000:10000])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "640f0679-7f7e-4b0a-a46d-b099ae382fe2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# download another copy with a different name to avoid hitting pdf cache\n",
|
||||
"!wget \"https://www.dropbox.com/scl/fi/g5ojyzk4m44hl7neut6vc/chinese_pdf.pdf?rlkey=45reu51kjvdvic6zucr8v9sh3&dl=1\" -O chinese_pdf2.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bfcacf90-ca67-4bfd-b023-be0af2cb18c5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id 99538f59-24f7-4f1e-ab27-4081933fa5ee\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"base_parser = LlamaParse(result_type=\"text\", language=\"en\")\n",
|
||||
"base_documents = parser.load_data(\"./chinese_pdf2.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b264ed4e-647a-4f51-9f79-fdf82b76762a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(base_documents[0].get_content()[1000:10000])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llama_parse",
|
||||
"language": "python",
|
||||
"name": "llama_parse"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,544 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# LlamaParse - Parsing comic books with parsing intructions\n",
|
||||
"Parsing intructions allow you to instruct our parsing model the same way you would instruct an LLM!\n",
|
||||
"\n",
|
||||
"They can be useful to help the parser get better results on complex document layouts, to extract data in a specific format, or to transform the document in other ways.\n",
|
||||
"\n",
|
||||
"Using Parsing Instruction you will get better results out of LlamaParse on complicated documents, and also be able to simplify your application code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Installation\n",
|
||||
"\n",
|
||||
"Parsing instructions are part of the llamaParse API. They can be accessed by directly specifying the parsing_instruction parameter in the API or by using the LlamaParse python module (which we will use for this tutorial).\n",
|
||||
"\n",
|
||||
"To install llama-parse, just get it from PIP:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Collecting llama-parse\n",
|
||||
" Downloading llama_parse-0.3.8-py3-none-any.whl (6.7 kB)\n",
|
||||
"Collecting llama-index-core>=0.10.7 (from llama-parse)\n",
|
||||
" Downloading llama_index_core-0.10.19-py3-none-any.whl (15.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.3/15.3 MB\u001b[0m \u001b[31m31.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: PyYAML>=6.0.1 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (6.0.1)\n",
|
||||
"Requirement already satisfied: SQLAlchemy[asyncio]>=1.4.49 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (2.0.28)\n",
|
||||
"Requirement already satisfied: aiohttp<4.0.0,>=3.8.6 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (3.9.3)\n",
|
||||
"Collecting dataclasses-json (from llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)\n",
|
||||
"Collecting deprecated>=1.2.9.3 (from llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n",
|
||||
"Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading dirtyjson-1.0.8-py3-none-any.whl (25 kB)\n",
|
||||
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (2023.6.0)\n",
|
||||
"Collecting httpx (from llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting llamaindex-py-client<0.2.0,>=0.1.13 (from llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading llamaindex_py_client-0.1.13-py3-none-any.whl (107 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m108.0/108.0 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: nest-asyncio<2.0.0,>=1.5.8 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (1.6.0)\n",
|
||||
"Requirement already satisfied: networkx>=3.0 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (3.2.1)\n",
|
||||
"Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (3.8.1)\n",
|
||||
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (1.25.2)\n",
|
||||
"Collecting openai>=1.1.0 (from llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading openai-1.13.3-py3-none-any.whl (227 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.4/227.4 kB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (1.5.3)\n",
|
||||
"Requirement already satisfied: pillow>=9.0.0 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (9.4.0)\n",
|
||||
"Requirement already satisfied: requests>=2.31.0 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (2.31.0)\n",
|
||||
"Requirement already satisfied: tenacity<9.0.0,>=8.2.0 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (8.2.3)\n",
|
||||
"Collecting tiktoken>=0.3.3 (from llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m43.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: tqdm<5.0.0,>=4.66.1 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (4.66.2)\n",
|
||||
"Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-index-core>=0.10.7->llama-parse) (4.10.0)\n",
|
||||
"Collecting typing-inspect>=0.8.0 (from llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n",
|
||||
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core>=0.10.7->llama-parse) (1.3.1)\n",
|
||||
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core>=0.10.7->llama-parse) (23.2.0)\n",
|
||||
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core>=0.10.7->llama-parse) (1.4.1)\n",
|
||||
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core>=0.10.7->llama-parse) (6.0.5)\n",
|
||||
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core>=0.10.7->llama-parse) (1.9.4)\n",
|
||||
"Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core>=0.10.7->llama-parse) (4.0.3)\n",
|
||||
"Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated>=1.2.9.3->llama-index-core>=0.10.7->llama-parse) (1.14.1)\n",
|
||||
"Requirement already satisfied: pydantic>=1.10 in /usr/local/lib/python3.10/dist-packages (from llamaindex-py-client<0.2.0,>=0.1.13->llama-index-core>=0.10.7->llama-parse) (2.6.3)\n",
|
||||
"Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx->llama-index-core>=0.10.7->llama-parse) (3.7.1)\n",
|
||||
"Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx->llama-index-core>=0.10.7->llama-parse) (2024.2.2)\n",
|
||||
"Collecting httpcore==1.* (from httpx->llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.8/77.8 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx->llama-index-core>=0.10.7->llama-parse) (3.6)\n",
|
||||
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx->llama-index-core>=0.10.7->llama-parse) (1.3.1)\n",
|
||||
"Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx->llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->llama-index-core>=0.10.7->llama-parse) (8.1.7)\n",
|
||||
"Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->llama-index-core>=0.10.7->llama-parse) (1.3.2)\n",
|
||||
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk<4.0.0,>=3.8.1->llama-index-core>=0.10.7->llama-parse) (2023.12.25)\n",
|
||||
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.1.0->llama-index-core>=0.10.7->llama-parse) (1.7.0)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.31.0->llama-index-core>=0.10.7->llama-parse) (3.3.2)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.31.0->llama-index-core>=0.10.7->llama-parse) (2.0.7)\n",
|
||||
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy[asyncio]>=1.4.49->llama-index-core>=0.10.7->llama-parse) (3.0.3)\n",
|
||||
"Collecting mypy-extensions>=0.3.0 (from typing-inspect>=0.8.0->llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n",
|
||||
"Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json->llama-index-core>=0.10.7->llama-parse)\n",
|
||||
" Downloading marshmallow-3.21.1-py3-none-any.whl (49 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-index-core>=0.10.7->llama-parse) (2.8.2)\n",
|
||||
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-index-core>=0.10.7->llama-parse) (2023.4)\n",
|
||||
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx->llama-index-core>=0.10.7->llama-parse) (1.2.0)\n",
|
||||
"Requirement already satisfied: packaging>=17.0 in /usr/local/lib/python3.10/dist-packages (from marshmallow<4.0.0,>=3.18.0->dataclasses-json->llama-index-core>=0.10.7->llama-parse) (23.2)\n",
|
||||
"Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.10->llamaindex-py-client<0.2.0,>=0.1.13->llama-index-core>=0.10.7->llama-parse) (0.6.0)\n",
|
||||
"Requirement already satisfied: pydantic-core==2.16.3 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.10->llamaindex-py-client<0.2.0,>=0.1.13->llama-index-core>=0.10.7->llama-parse) (2.16.3)\n",
|
||||
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->llama-index-core>=0.10.7->llama-parse) (1.16.0)\n",
|
||||
"Installing collected packages: dirtyjson, mypy-extensions, marshmallow, h11, deprecated, typing-inspect, tiktoken, httpcore, httpx, dataclasses-json, openai, llamaindex-py-client, llama-index-core, llama-parse\n",
|
||||
"Successfully installed dataclasses-json-0.6.4 deprecated-1.2.14 dirtyjson-1.0.8 h11-0.14.0 httpcore-1.0.4 httpx-0.27.0 llama-index-core-0.10.19 llama-parse-0.3.8 llamaindex-py-client-0.1.13 marshmallow-3.21.1 mypy-extensions-1.0.0 openai-1.13.3 tiktoken-0.6.0 typing-inspect-0.9.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%pip install llama-parse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API key\n",
|
||||
"\n",
|
||||
"The use of LlamaParse requires an API key which you can get here: https://cloud.llamaindex.ai/parse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Async (Notebook only)\n",
|
||||
"llama-parse is async-first, so running the code in a notebook requires the use of nest_asyncio\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Import the package"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using llamaparse for getting better results (on Manga!)\n",
|
||||
"\n",
|
||||
"Sometimes the layout of a page is unusual and you will get sub-optimal reading order results with LlamaParse. For example, when parsing manga you expect the reading order to be right to left even if the content is in English!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's download an extract of a great manga \"The manga guide to calculus\", by Hiroyuki Kojima (https://www.amazon.com/Manga-Guide-Calculus-Hiroyuki-Kojima/dp/1593271948)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--2024-03-13 13:57:19-- https://drive.usercontent.google.com/uc?id=1tZJhcpepLRdQFJFCFX50QIqLyLgqzZsY&export=download\n",
|
||||
"Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 173.194.211.132, 2607:f8b0:400c:c10::84\n",
|
||||
"Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|173.194.211.132|:443... connected.\n",
|
||||
"HTTP request sent, awaiting response... 303 See Other\n",
|
||||
"Location: https://drive.usercontent.google.com/download?id=1tZJhcpepLRdQFJFCFX50QIqLyLgqzZsY&export=download [following]\n",
|
||||
"--2024-03-13 13:57:19-- https://drive.usercontent.google.com/download?id=1tZJhcpepLRdQFJFCFX50QIqLyLgqzZsY&export=download\n",
|
||||
"Reusing existing connection to drive.usercontent.google.com:443.\n",
|
||||
"HTTP request sent, awaiting response... 200 OK\n",
|
||||
"Length: 3041634 (2.9M) [application/octet-stream]\n",
|
||||
"Saving to: ‘./manga.pdf’\n",
|
||||
"\n",
|
||||
"./manga.pdf 100%[===================>] 2.90M --.-KB/s in 0.04s \n",
|
||||
"\n",
|
||||
"2024-03-13 13:57:20 (78.6 MB/s) - ‘./manga.pdf’ saved [3041634/3041634]\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"! wget \"https://drive.usercontent.google.com/uc?id=1tZJhcpepLRdQFJFCFX50QIqLyLgqzZsY&export=download\" -O ./manga.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Without parsing instructions\n",
|
||||
"For the sake of comparison, let's first parse without any instructions."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id 25bf4202-78d8-4705-88cf-c616ae7c82af\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"vanilaParsing = LlamaParse(result_type=\"markdown\").load_data(\"./manga.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As you can see below, LlamaParse is not doing a great job here. It is interpreting the grid of comic panels as a table, and trying to fit the dialogue into a table. It's very hard to follow."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"The Asagake Times Sanda-Cho Distributor\n",
|
||||
"\n",
|
||||
"A newspaper distributor? do I have the wrong map?\n",
|
||||
"\n",
|
||||
"You’re looking It’s next for the Sanda-cho door. branch office? Everybody mistakes us for the office because we are larger. What Is a Function? 3\n",
|
||||
"---\n",
|
||||
"## Calculating the Derivative of a Constant, Linear, or Quadratic Function\n",
|
||||
"\n",
|
||||
"|1.|Let’s find the derivative of constant function f(x) = α. The differential coefficient of f(x) at x = a is|\n",
|
||||
"|---|---|\n",
|
||||
"| |lim ε→0 (f(a + ε) - f(a)) / ε = lim ε→0 (α - α) = lim ε→0 0 = 0|\n",
|
||||
"| |Thus, the derivative of f(x) is f′(x) = 0. This makes sense, since our function is constant—the rate of change is 0.|\n",
|
||||
"\n",
|
||||
"Note: The differential coefficient of f(x) at x = a is often simply called the derivative of f(x) at x = a, or just f′(a).\n",
|
||||
"\n",
|
||||
"|2.|Let’s calculate the derivative of linear function f(x) = αx + β. The derivative of f(x) at x = α is|\n",
|
||||
"|---|---|\n",
|
||||
"| |lim ε→0 (f(α + ε) - f(a)) = \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(vanilaParsing[0].text[100:1000])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Using parsing instructions\n",
|
||||
"Let's try to parse the manga with custom instructions:\n",
|
||||
"\n",
|
||||
"\"The provided document is a manga comic book. Most pages do NOT have a title. It does not contain tables. Try to reconstruct the dialogue spoken in a cohesive way.\"\n",
|
||||
"\n",
|
||||
"To do so just pass the parsing instruction as a parameter to LlamaParse:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id 88ab273e-b2a7-4f84-8e72-e9367cf6b114\n",
|
||||
"."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"parsingInstructionManga = \"\"\"The provided document is a manga comic book. Most pages do NOT have a title.\n",
|
||||
"It does not contain tables.\n",
|
||||
"Try to reconstruct the dialogue spoken in a cohesive way.\"\"\"\n",
|
||||
"withInstructionParsing = LlamaParse(\n",
|
||||
" result_type=\"markdown\", parsing_instruction=parsingInstructionManga\n",
|
||||
").load_data(\"./manga.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's see how it compare with page 3! We encourage you to play with the target page and explore other pages. As you will see, the parsing instruction allowed LlamaParse to make sense of the document!\n",
|
||||
"\n",
|
||||
"<img src=\"https://drive.usercontent.google.com/download?id=1M87rXTIZE8d5v7aHmVZVW6gW3eDGq6ks&authuser=0\" />\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The Asagake Times Sanda-Cho Distributor\n",
|
||||
"\n",
|
||||
"A newspaper distributor? do I have the wrong map?\n",
|
||||
"\n",
|
||||
"You’re looking It’s next for the Sanda-cho door. branch office? Everybody mistakes us for the office because we are larger. What Is a Function? 3\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# The Asagake Times\n",
|
||||
"\n",
|
||||
"Sanda-Cho Distributor\n",
|
||||
"\n",
|
||||
"A newspaper distributor?\n",
|
||||
"\n",
|
||||
"Do I have the wrong map?\n",
|
||||
"\n",
|
||||
"You're looking for the Sanda-cho branch office?\n",
|
||||
"\n",
|
||||
"It's next door.\n",
|
||||
"\n",
|
||||
"Everybody mistakes us for the office because we are larger.\n",
|
||||
"\n",
|
||||
"What Is a Function? 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"target_page = 1\n",
|
||||
"print(vanilaParsing[0].text.split(\"\\n---\\n\")[target_page])\n",
|
||||
"print(\"\\n\\n------------------------------------------------------------\\n\\n\")\n",
|
||||
"print(withInstructionParsing[0].text.split(\"\\n---\\n\")[target_page])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Math - doing more with parsing instuction!\n",
|
||||
"\n",
|
||||
"But this manga is about math and full of equations, why not ask the parser to output them in **LaTeX**?\n",
|
||||
"\n",
|
||||
"<img src=\"https://drive.usercontent.google.com/download?id=1tze3xcQ7axVA-vC_iZeAj_GvYcyNuYDa&authuser=0\" />"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id 3a055e64-d91e-484e-b9b0-99a2e637c08d\n",
|
||||
"."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"parsingInstructionMangaLatex = \"\"\"The provided document is a manga comic book. Most pages do NOT have a title.\n",
|
||||
"It does not contain tables.\n",
|
||||
"Try to reconstruct the dialogue spoken in a cohesive way.\n",
|
||||
"Output any math equation in LATEX markdown (between $$)\"\"\"\n",
|
||||
"withLatex = LlamaParse(\n",
|
||||
" result_type=\"markdown\", parsing_instruction=parsingInstructionMangaLatex\n",
|
||||
").load_data(\"./manga.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"[Without instruction]------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Calculating the Derivative of a Constant, Linear, or Quadratic Function\n",
|
||||
"\n",
|
||||
"|1.|Let’s find the derivative of constant function f(x) = α. The differential coefficient of f(x) at x = a is|\n",
|
||||
"|---|---|\n",
|
||||
"| |lim ε→0 (f(a + ε) - f(a)) / ε = lim ε→0 (α - α) = lim ε→0 0 = 0|\n",
|
||||
"| |Thus, the derivative of f(x) is f′(x) = 0. This makes sense, since our function is constant—the rate of change is 0.|\n",
|
||||
"\n",
|
||||
"Note: The differential coefficient of f(x) at x = a is often simply called the derivative of f(x) at x = a, or just f′(a).\n",
|
||||
"\n",
|
||||
"|2.|Let’s calculate the derivative of linear function f(x) = αx + β. The derivative of f(x) at x = α is|\n",
|
||||
"|---|---|\n",
|
||||
"| |lim ε→0 (f(α + ε) - f(a)) = lim ε→0 (α(a + ε) + β - (αa + β)) = lim ε→0 α = α|\n",
|
||||
"| |Thus, the derivative of f(x) is f′(x) = α, a constant value. This result should also be intuitive—linear functions have a constant rate of change by definition.|\n",
|
||||
"\n",
|
||||
"|3.|Let’s find the derivative of f(x) = x^2, which appeared in the story. The differential coefficient of f(x) at x = a is|\n",
|
||||
"|---|---|\n",
|
||||
"| |lim ε→0 ((a + ε)^2 - a^2) / ε = lim (a^2 + 2aε + ε^2 - a^2) / ε = lim (2aε + ε^2) = lim (2a + ε) = 2a|\n",
|
||||
"| |Thus, the differential coefficient of f(x) at x = a is 2a, or f′(a) = 2a. Therefore, the derivative of f(x) is f′(x) = 2x.|\n",
|
||||
"\n",
|
||||
"## Summary\n",
|
||||
"\n",
|
||||
"- The calculation of a limit that appears in calculus is simply a formula calculating an error.\n",
|
||||
"- A limit is used to obtain a derivative.\n",
|
||||
"- The derivative is the slope of the tangent line at a given point.\n",
|
||||
"- The derivative is nothing but the rate of change.\n",
|
||||
"\n",
|
||||
"## Chapter 1 Let’s Differentiate a Function!\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"[With instruction to output math in LATEX!]------------------------------------------------------------\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Derivative of Constant, Linear, or Quadratic Function\n",
|
||||
"\n",
|
||||
"## Calculating the Derivative of a Constant, Linear, or Quadratic Function\n",
|
||||
"\n",
|
||||
"1. Let’s find the derivative of constant function f(x) = α. The differential coefficient of f(x) at x = a is\n",
|
||||
"\n",
|
||||
"$$\n",
|
||||
"\\begin{align*}\n",
|
||||
"&\\lim_{{\\varepsilon \\to 0}} \\left( \\frac{f(a + \\varepsilon) - f(a)}{\\varepsilon} \\right) = \\lim_{{\\varepsilon \\to 0}} \\frac{\\alpha - \\alpha}{\\varepsilon} = \\lim_{{\\varepsilon \\to 0}} 0 = 0 \\\\\n",
|
||||
"\\end{align*}\n",
|
||||
"$$\n",
|
||||
"Thus, the derivative of f(x) is f′(x) = 0. This makes sense, since our function is constant—the rate of change is 0.\n",
|
||||
"\n",
|
||||
"Note: The differential coefficient of f(x) at x = a is often simply called the derivative of f(x) at x = a, or just f′(a).\n",
|
||||
"\n",
|
||||
"2. Let’s calculate the derivative of linear function f(x) = αx + β. The derivative of f(x) at x = α is\n",
|
||||
"\n",
|
||||
"$$\n",
|
||||
"\\begin{align*}\n",
|
||||
"&\\lim_{{\\varepsilon \\to 0}} \\left( \\frac{f(\\alpha + \\varepsilon) - f(a)}{\\varepsilon} \\right) = \\lim_{{\\varepsilon \\to 0}} \\frac{\\alpha(a + \\varepsilon) + \\beta - (\\alpha a + \\beta)}{\\varepsilon} = \\lim_{{\\varepsilon \\to 0}} \\alpha = \\alpha \\\\\n",
|
||||
"\\end{align*}\n",
|
||||
"$$\n",
|
||||
"Thus, the derivative of f(x) is f′(x) = α, a constant value. This result should also be intuitive—linear functions have a constant rate of change by definition.\n",
|
||||
"\n",
|
||||
"3. Let’s find the derivative of f(x) = x2. The differential coefficient of f(x) at x = a is\n",
|
||||
"\n",
|
||||
"$$\n",
|
||||
"\\begin{align*}\n",
|
||||
"&\\lim_{{\\varepsilon \\to 0}} \\left( \\frac{f(a + \\varepsilon) - f(a)}{\\varepsilon} \\right) = \\lim_{{\\varepsilon \\to 0}} \\left( (a + \\varepsilon)^2 - a^2 \\right) = \\lim_{{\\varepsilon \\to 0}} 2a\\varepsilon + \\varepsilon = \\lim_{{\\varepsilon \\to 0}} (2a + \\varepsilon) = 2a \\\\\n",
|
||||
"\\end{align*}\n",
|
||||
"$$\n",
|
||||
"Thus, the differential coefficient of f(x) at x = a is 2a, or f′(a) = 2a. Therefore, the derivative of f(x) is f′(x) = 2x.\n",
|
||||
"\n",
|
||||
"### Summary\n",
|
||||
"\n",
|
||||
"- The calculation of a limit that appears in calculus is simply a formula calculating an error.\n",
|
||||
"- A limit is used to obtain a derivative.\n",
|
||||
"- The derivative is the slope of the tangent line at a given point.\n",
|
||||
"- The derivative is nothing but the rate of change.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"target_page = 2\n",
|
||||
"print(\n",
|
||||
" \"\\n\\n[Without instruction]------------------------------------------------------------\\n\\n\"\n",
|
||||
")\n",
|
||||
"print(vanilaParsing[0].text.split(\"\\n---\\n\")[target_page])\n",
|
||||
"print(\n",
|
||||
" \"\\n\\n[With instruction to output math in LATEX!]------------------------------------------------------------\\n\\n\"\n",
|
||||
")\n",
|
||||
"print(withLatex[0].text.split(\"\\n---\\n\")[target_page])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And here is the result as rendered by https://upmath.me/ .\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"<img src=\"https://drive.usercontent.google.com/download?id=1qGo5bMGYOiIC9MnprcgEByaYjU9YII2Q&authuser=0\" />\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Over this short notebook we saw how to use parsing instructions to increase the quality and accuracy of parsing with LLamaParse!"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
@@ -1,367 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# RAG for Table Comparisons with LlamaParse + LlamaIndex\n",
|
||||
"\n",
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_parse/blob/main/examples/demo_table_comparisons.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
|
||||
"\n",
|
||||
"This notebook shows you how to do comparisons across both tabular and text data across multiple PDF documents.\n",
|
||||
"\n",
|
||||
"We load in multiple PDFs with embedded tables (2021 and 2020 10K filings for Apple) using LlamaParse, parse each into a hierarchy of tables/text objects, define a recursive retriever over each, and then compose both with a SubQuestionQueryEngine."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"Install core packages, download files, parse documents."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install llama-index\n",
|
||||
"%pip install llama-index-core\n",
|
||||
"%pip install llama-index-embeddings-openai\n",
|
||||
"%pip install llama-index-question-gen-openai\n",
|
||||
"%pip install llama-index-postprocessor-flag-embedding-reranker\n",
|
||||
"%pip install git+https://github.com/FlagOpen/FlagEmbedding.git\n",
|
||||
"%pip install llama-parse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget \"https://s2.q4cdn.com/470004039/files/doc_financials/2020/ar/_10-K-2020-(As-Filed).pdf\" -O apple_2020_10k.pdf\n",
|
||||
"!wget \"https://s2.q4cdn.com/470004039/files/doc_financials/2021/q4/_10-K-2021-(As-Filed).pdf\" -O apple_2021_10k.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Some OpenAI and LlamaParse details"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio\n",
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# API access to llama-cloud\n",
|
||||
"os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-\"\n",
|
||||
"\n",
|
||||
"# Using OpenAI API for embeddings/llms\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = \"sk-\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.llms.openai import OpenAI\n",
|
||||
"from llama_index.embeddings.openai import OpenAIEmbedding\n",
|
||||
"from llama_index.core import VectorStoreIndex\n",
|
||||
"from llama_index.core import Settings\n",
|
||||
"\n",
|
||||
"embed_model = OpenAIEmbedding(model=\"text-embedding-3-small\")\n",
|
||||
"llm = OpenAI(model=\"gpt-3.5-turbo-0125\")\n",
|
||||
"\n",
|
||||
"Settings.llm = llm\n",
|
||||
"Settings.embed_model = embed_model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using brand new `LlamaParse` PDF reader for PDF Parsing\n",
|
||||
"\n",
|
||||
"we also compare two different retrieval/query engine strategies:\n",
|
||||
"1. Using raw Markdown text as nodes for building index and apply simple query engine for generating the results;\n",
|
||||
"2. Using `MarkdownElementNodeParser` for parsing the `LlamaParse` output Markdown results and building recursive retriever query engine for generation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"docs_2021 = LlamaParse(result_type=\"markdown\").load_data(\"./apple_2021_10k.pdf\")\n",
|
||||
"docs_2020 = LlamaParse(result_type=\"markdown\").load_data(\"./apple_2020_10k.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create Recursive Retriever over each Document\n",
|
||||
"\n",
|
||||
"We define a function to get a recursive retriever from each document. The steps are the following:\n",
|
||||
"- Hierarchically parse the document using our `MarkdownElementNodeParser`, which will embed/summarize embedded tables.\n",
|
||||
"- Load into a vector store. Under the hood we will automatically store links between nodes (e.g. table summary to table text).\n",
|
||||
"- Get a query engine over the vector store, which performs retrieval/synthesis. Under the hood we will automatically perform recursive retrieval if there are links."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core.node_parser import MarkdownElementNodeParser\n",
|
||||
"\n",
|
||||
"node_parser = MarkdownElementNodeParser(\n",
|
||||
" llm=OpenAI(model=\"gpt-3.5-turbo-0125\"), num_workers=8\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pickle\n",
|
||||
"from llama_index.postprocessor.flag_embedding_reranker import (\n",
|
||||
" FlagEmbeddingReranker,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"reranker = FlagEmbeddingReranker(\n",
|
||||
" top_n=5,\n",
|
||||
" model=\"BAAI/bge-reranker-large\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def create_query_engine_over_doc(docs, nodes_save_path=None):\n",
|
||||
" \"\"\"Big function to go from document path -> recursive retriever.\"\"\"\n",
|
||||
" if nodes_save_path is not None and os.path.exists(nodes_save_path):\n",
|
||||
" raw_nodes = pickle.load(open(nodes_save_path, \"rb\"))\n",
|
||||
" else:\n",
|
||||
" raw_nodes = node_parser.get_nodes_from_documents(docs)\n",
|
||||
" if nodes_save_path is not None:\n",
|
||||
" pickle.dump(raw_nodes, open(nodes_save_path, \"wb\"))\n",
|
||||
"\n",
|
||||
" base_nodes, objects = node_parser.get_nodes_and_objects(raw_nodes)\n",
|
||||
"\n",
|
||||
" ### Construct Retrievers\n",
|
||||
" # construct top-level vector index + query engine\n",
|
||||
" vector_index = VectorStoreIndex(nodes=base_nodes + objects)\n",
|
||||
" query_engine = vector_index.as_query_engine(\n",
|
||||
" similarity_top_k=15, node_postprocessors=[reranker]\n",
|
||||
" )\n",
|
||||
" return query_engine, base_nodes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_engine_2021, nodes_2021 = create_query_engine_over_doc(\n",
|
||||
" docs_2021, nodes_save_path=\"2021_nodes.pkl\"\n",
|
||||
")\n",
|
||||
"query_engine_2020, nodes_2020 = create_query_engine_over_doc(\n",
|
||||
" docs_2020, nodes_save_path=\"2020_nodes.pkl\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core.tools import QueryEngineTool, ToolMetadata\n",
|
||||
"from llama_index.core.query_engine import SubQuestionQueryEngine\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# setup base query engine as tool\n",
|
||||
"query_engine_tools = [\n",
|
||||
" QueryEngineTool(\n",
|
||||
" query_engine=query_engine_2021,\n",
|
||||
" metadata=ToolMetadata(\n",
|
||||
" name=\"apple_2021_10k\",\n",
|
||||
" description=(\"Provides information about Apple financials for year 2021\"),\n",
|
||||
" ),\n",
|
||||
" ),\n",
|
||||
" QueryEngineTool(\n",
|
||||
" query_engine=query_engine_2020,\n",
|
||||
" metadata=ToolMetadata(\n",
|
||||
" name=\"apple_2020_10k\",\n",
|
||||
" description=(\"Provides information about Apple financials for year 2020\"),\n",
|
||||
" ),\n",
|
||||
" ),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"sub_query_engine = SubQuestionQueryEngine.from_defaults(\n",
|
||||
" query_engine_tools=query_engine_tools,\n",
|
||||
" llm=llm,\n",
|
||||
" use_async=True,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Try out Some Comparisons"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Generated 4 sub questions.\n",
|
||||
"\u001b[1;3;38;2;237;90;200m[apple_2021_10k] Q: What are the deferred assets in 2021?\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;90;149;237m[apple_2021_10k] Q: What are the deferred liabilities in 2021?\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203m[apple_2020_10k] Q: What are the deferred assets in 2020?\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;155;135;227m[apple_2020_10k] Q: What are the deferred liabilities in 2020?\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;90;149;237m[apple_2021_10k] A: $7,200\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;155;135;227m[apple_2020_10k] A: $10,138\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200m[apple_2021_10k] A: $25,176 million\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;11;159;203m[apple_2020_10k] A: $19,336\n",
|
||||
"\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"response = sub_query_engine.query(\n",
|
||||
" \"Can you compare and contrast the deferred assets and liabilities in 2021 with 2020?\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"In 2021, the deferred assets increased by $5,840 million compared to 2020, while the deferred liabilities decreased by $2,938 million in the same period.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(str(response))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Generated 2 sub questions.\n",
|
||||
"\u001b[1;3;38;2;237;90;200m[apple_2021_10k] Q: What is the total number of RSUs in Apple's 2021 financials?\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;90;149;237m[apple_2020_10k] Q: What is the total number of RSUs in Apple's 2020 financials?\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200m[apple_2021_10k] A: The total number of RSUs in Apple's 2021 financials is 240,427.\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;90;149;237m[apple_2020_10k] A: The total number of RSUs in Apple's 2020 financials is 310,778.\n",
|
||||
"\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"response = sub_query_engine.query(\n",
|
||||
" \"Can you compare and contrast the total number of RSUs in 2021 and 2020?\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Generated 2 sub questions.\n",
|
||||
"\u001b[1;3;38;2;237;90;200m[apple_2021_10k] Q: What are the risk factors mentioned in the 2021 financial report of Apple?\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;90;149;237m[apple_2020_10k] Q: What are the risk factors mentioned in the 2020 financial report of Apple?\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;237;90;200m[apple_2021_10k] A: The risk factors mentioned in the 2021 financial report of Apple include risks related to COVID-19, macroeconomic and industry risks, political events, trade and international disputes, natural disasters, public health issues, industrial accidents, credit risk, fluctuations in foreign currency exchange rates, changes in tax rates and legislation, volatility in the price of the company's stock, and exposure to legal proceedings and claims.\n",
|
||||
"\u001b[0m\u001b[1;3;38;2;90;149;237m[apple_2020_10k] A: The risk factors mentioned in the 2020 financial report of Apple include the impact of the COVID-19 pandemic on the company's business operations, financial condition, and stock price; global and regional economic conditions affecting demand for products and services; competition in global markets with rapid technological changes; potential disruptions in the supply chain due to industrial accidents or public health issues; information technology system failures or network disruptions affecting business operations; risks associated with confidential information security and potential unauthorized access; fluctuations in quarterly net sales and operating results due to various factors; stock price volatility impacting investor confidence and employee retention; financial performance risks related to changes in foreign currency exchange rates affecting sales and earnings.\n",
|
||||
"\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"response = sub_query_engine.query(\n",
|
||||
" \"Can you compare and contrast the risk factors in 2021 vs. 2020?\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The risk factors mentioned in the 2021 financial report of Apple include risks related to COVID-19, macroeconomic and industry risks, political events, trade and international disputes, natural disasters, public health issues, industrial accidents, credit risk, fluctuations in foreign currency exchange rates, changes in tax rates and legislation, volatility in the price of the company's stock, and exposure to legal proceedings and claims. In contrast, the risk factors mentioned in the 2020 financial report of Apple focused more on the impact of the COVID-19 pandemic on the company's business operations, financial condition, and stock price; global and regional economic conditions affecting demand for products and services; competition in global markets with rapid technological changes; potential disruptions in the supply chain due to industrial accidents or public health issues; information technology system failures or network disruptions affecting business operations; risks associated with confidential information security and potential unauthorized access; fluctuations in quarterly net sales and operating results due to various factors; stock price volatility impacting investor confidence and employee retention; financial performance risks related to changes in foreign currency exchange rates affecting sales and earnings.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(str(response))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llama_parse",
|
||||
"language": "python",
|
||||
"name": "llama_parse"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -1,493 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0db58db5-d4ee-4631-af5b-4fc53eb05170",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# RAG with Excel Spreadsheet using LlamaPrase\n",
|
||||
"\n",
|
||||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_parse/blob/main/examples/demo_excel.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
|
||||
"\n",
|
||||
"This notebook constructs a RAG pipeline over a simple DCF template [here](https://eqvista.com/app/uploads/2020/09/Eqvista_DCF-Excel-Template.xlsx).\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5f7d99ad-6ebd-47d0-92a7-566630b0c22a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"We first setup and load the data. If you haven't already, [download the template](https://eqvista.com/app/uploads/2020/09/Eqvista_DCF-Excel-Template.xlsx) and name it `dcf_template.xlxs` locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d867d1a6-cfcf-4f53-952a-f4a6ff2fa205",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install llama-index\n",
|
||||
"%pip install llama-parse"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "103c7983-56d3-45be-b763-d1828d07c43e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import nest_asyncio\n",
|
||||
"\n",
|
||||
"nest_asyncio.apply()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7b694b56-e04b-4d87-aa37-f0725d6b3adb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_parse import LlamaParse\n",
|
||||
"\n",
|
||||
"# api_key = \"llx-\" # get from cloud.llamaindex.ai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c4693c7-c1c8-47b4-8a8c-25d7e9ef9d2c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Started parsing the file under job_id cac11eca-d5da-4d46-90e6-321f40e11611\n",
|
||||
"Started parsing the file under job_id cac11eca-5450-4847-9da0-fa6879c4cf3a\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"parser = LlamaParse(\n",
|
||||
" # api_key=api_key, # can also be set in your env as LLAMA_CLOUD_API_KEY\n",
|
||||
" result_type=\"markdown\",\n",
|
||||
")\n",
|
||||
"docs = parser.load_data(\"./dcf_template.xlsx\")\n",
|
||||
"# docs_txt = LlamaParse(result_type=\"text\").load_data(\"./dcf_template.xlsx\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7302f1c8-e405-4cda-8ff7-1d55185816f7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Cover Page\n",
|
||||
"\n",
|
||||
"|Thank you for downloading our DCF Model excel template. This DCF Model excel template helps you to value your business using Discounted Free Cash Flow or DCF Method. | |\n",
|
||||
"|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n",
|
||||
"| | |\n",
|
||||
"| |Eqvista is an equity management software that allows companies, investors and company shareholders to track, manage, and make intelligent decisions about their companies’ equity.|\n",
|
||||
"| | |\n",
|
||||
"| |GET STARTED- IT'S FREE |\n",
|
||||
"| | |\n",
|
||||
"| |Note: This template is not professional advice and not a substitute for professional advice. |\n",
|
||||
"|Accordingly, before taking any actions based upon such information, we encourage you to consult with the appropriate professionals. | |\n",
|
||||
"| | |\n",
|
||||
"| |@Eqvista Inc. All Rights Reserved |\n",
|
||||
"---\n",
|
||||
"# DCF Model\n",
|
||||
"\n",
|
||||
"|Discounted Cash Flow Excel Template | | | | | | | | | | | |\n",
|
||||
"|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------|-----------|-----------|-----------------------|-----------|-----------------------|--------------|-----------|-----------|-----------|--------------|\n",
|
||||
"| | | | | | | | | | | | |\n",
|
||||
"|Here is a simple discounted cash flow excel template for estimating your company value based on this income valuation approach | | | | | | | | | | | |\n",
|
||||
"| | | | | | | | | | | | |\n",
|
||||
"|Instructions: | | | | | | | | | | | |\n",
|
||||
"|1) Fill out the two assumptions in yellow highlight | | | | | | | | | | | |\n",
|
||||
"|2) Fill in either the 5 year or 3 year weighted average figures in yellow highlight | | | | | | | | | | | |\n",
|
||||
"| | | | | | | | | | | | |\n",
|
||||
"|Assumptions | | | | | | | | | | | |\n",
|
||||
"|Tax Rate |20% | | | | | | | | | | |\n",
|
||||
"|Discount Rate |15% | | | | | | | | | | |\n",
|
||||
"| | | | | | | | | | | | |\n",
|
||||
"|5 Year Weighted Moving Average | | | | | | | | | | | |\n",
|
||||
"|Indication of Company Value |$242,995.43 | | | | | | | | | | |\n",
|
||||
"| | | | | | | | | | | | |\n",
|
||||
"|3 Year Weighted Moving Average | | | | | | | | | | | |\n",
|
||||
"|Indication of Company Value |$158,651.07 | | | | | | | | | | |\n",
|
||||
"| | | | | | | | | | | | |\n",
|
||||
"| |5 Year Weighted Moving Average| | | | | | | | | | |\n",
|
||||
"| |Past Years | | | | |Forecasted Future Years| | | | | |\n",
|
||||
"| |Year 1 |Year 2 |Year 3 |Year 4 |Year 5 |Year 6 |Year 7 |Year 8 |Year 9 |Year 10 |Terminal Value|\n",
|
||||
"|Pre-tax income |50,000.00 |55,000.00 |45,000.00 |52,000.00 |60,000.00 | | | | | | |\n",
|
||||
"|Income Taxes |10,000.00 |11,000.00 |9,000.00 |10,400.00 |12,000.00 | | | | | | |\n",
|
||||
"|Net Income |40,000.00 |44,000.00 |36,000.00 |41,600.00 |48,000.00 | | | | | | |\n",
|
||||
"|Depreciation Expense |5,000.00 |4,000.00 |3,000.00 |2,000.00 |1,000.00 | | | | | | |\n",
|
||||
"|Capital Expenditures |10,000.00 |8,000.00 |5,000.00 |5,000.00 |7,000.00 | | | | | | |\n",
|
||||
"|Debt Repayments |5,000.00 |5,000.00 |5,000.00 |5,000.00 |5,000.00 | | | | | | |\n",
|
||||
"|Net Cash Flow |20,000.00 |27,000.00 |23,000.00 |29,600.00 |35,000.00 |29,093.33 |29,817.78 |30,177.48 |30,469.23 |30,379.74 |287,188.00 |\n",
|
||||
"|Discounting Factor | | | | | |0.8696 |0.7561 |0.6575 |0.5718 |0.4972 |0.4972 |\n",
|
||||
"|Present Value of Future Cash Flow | | | | | |25,298.55 |22,546.52 |19,842.18 |17,420.88 |15,104.10 |142,783.19 |\n",
|
||||
"| | | | | | | | | | | | |\n",
|
||||
"| |3 Year Weighted Moving Average| | | | | | | | | | |\n",
|
||||
"| |Past Years | | |Forecasted Future Years| | | | | | | |\n",
|
||||
"| |Year 1 |Year 2 |Year 3 |Year 4 |Year 5 |Year 6 |Terminal Value| | | | |\n",
|
||||
"|Pre-tax income |50,000.00 |55,000.00 |45,000.00 | | | | | | | | |\n",
|
||||
"|Income Taxes |10,000.00 |11,000.00 |9,000.00 | | | | | | | | |\n",
|
||||
"|Net Income |40,000.00 |44,000.00 |36,000.00 | | | | | | | | |\n",
|
||||
"|Depreciation Expense |5,000.00 |4,000.00 |3,000.00 | | | | | | | | |\n",
|
||||
"|Capital Expenditures |10,000.00 |8,000.00 |5,000.00 | | | | | | | | |\n",
|
||||
"|Debt Repayments |5,000.00 |5,000.00 |5,000.00 | | | | | | | | |\n",
|
||||
"|Net Cash Flow |20,000.00 |27,000.00 |23,000.00 |23,833.33 |24,083.33 |23,819.44 |158,253.59 | | | | |\n",
|
||||
"|Discounting Factor | | | |0.8696 |0.7561 |0.6575 |0.6575 | | | | |\n",
|
||||
"|Present Value of Future Cash Flow | | | |20,724.64 |18,210.46 |15,661.67 |104,054.30 | | | | |\n",
|
||||
"| | | | | | | | | | | | |\n",
|
||||
"|Notes: | | | | | | | | | | | |\n",
|
||||
"|-We based this simple discounted cash flow excel model based on the weighted moving averages (5 year or 3 year) for simplicity, in case a constant growth rate cannot be easily determined.| | | | | | | | | | | |\n",
|
||||
"|-The factors such as Depreciation Expense, Capital Expense and Debt Repayments remain constant, so consider this when looking at the forecasted figures. | | | | | | | | | | | |\n",
|
||||
"|-For the terminal value constant growth rate, we make the assumption of the growth from the last forecasted year compared to the first forecasted year. Adjust in the formula as needed. | | | | | | | | | | | |\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1aedd4bb-7939-4fbc-8f07-d362e24d9772",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configure LLM, Setup Basic Summary Engine\n",
|
||||
"\n",
|
||||
"We setup a basic summary engine which retrieves the entire document as context to put into the prompt."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f7c056a8-d098-4ebe-9341-d9f07081067c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.llms.openai import OpenAI\n",
|
||||
"from llama_index.core import Settings\n",
|
||||
"\n",
|
||||
"llm = OpenAI(model=\"gpt-4-turbo-preview\")\n",
|
||||
"Settings.llm = llm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c0fa2630-ee1b-4ce7-91e9-f9ffff8347f9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core import SummaryIndex\n",
|
||||
"\n",
|
||||
"index = SummaryIndex.from_documents(docs)\n",
|
||||
"# index = SummaryIndex.from_documents(docs_txt)\n",
|
||||
"\n",
|
||||
"query_engine = index.as_query_engine()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1d39a075-46b8-4dcb-8aee-abd10343bedd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Define Baseline\n",
|
||||
"\n",
|
||||
"Let's define a baseline query engine over this data, using a naive parser (our PandasExcelReader, available on LlamaHub)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "632f918e-7811-4931-8a5f-4aa4850718db",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Collecting openpyxl\n",
|
||||
" Downloading openpyxl-3.1.3-py2.py3-none-any.whl (251 kB)\n",
|
||||
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m251.3/251.3 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hCollecting et-xmlfile\n",
|
||||
" Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\n",
|
||||
"Installing collected packages: et-xmlfile, openpyxl\n",
|
||||
"Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.3\n",
|
||||
"\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install llama-index-readers-file\n",
|
||||
"!pip install openpyxl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "85ff09fd-8a99-4aa4-8182-8d0cf30f7b85",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.readers.file import PandasExcelReader\n",
|
||||
"import importlib\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"base_reader = PandasExcelReader()\n",
|
||||
"base_docs = base_reader.load_data(Path(\"dcf_template.xlsx\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ba45f806-58be-4f57-bf42-2721555136cb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Discounted Cash Flow Excel Template \n",
|
||||
" \n",
|
||||
"Here is a simple discounted cash flow excel template for estimating your company value based on this income valuation approach \n",
|
||||
" \n",
|
||||
"Instructions: \n",
|
||||
"1) Fill out the two assumptions in yellow highlight \n",
|
||||
"2) Fill in either the 5 year or 3 year weighted average figures in yellow highlight \n",
|
||||
" \n",
|
||||
" \n",
|
||||
" \n",
|
||||
" \n",
|
||||
"Assumptions \n",
|
||||
"Tax Rate 0.2 \n",
|
||||
"Discount Rate 0.15 \n",
|
||||
" \n",
|
||||
"5 Year Weighted Moving Average \n",
|
||||
"Indication of Company Value 242995.4347636059 \n",
|
||||
" \n",
|
||||
"3 Year Weighted Moving Average \n",
|
||||
"Indication of Company Value 158651.0723286644 \n",
|
||||
" \n",
|
||||
" 5 Year Weighted Moving Average \n",
|
||||
" Past Years Forecasted Future Years \n",
|
||||
" Year 1 Year 2 Year 3 Year 4 Year 5 Year 6 Year 7 Year 8 Year 9 Year 10 Terminal Value\n",
|
||||
"Pre-tax income 50000 55000 45000 52000 60000 \n",
|
||||
"Income Taxes 10000 11000 9000 10400 12000 \n",
|
||||
"Net Income 40000 44000 36000 41600 48000 \n",
|
||||
"Depreciation Expense 5000 4000 3000 2000 1000 \n",
|
||||
"Capital Expenditures 10000 8000 5000 5000 7000 \n",
|
||||
"Debt Repayments 5000 5000 5000 5000 5000 \n",
|
||||
"Net Cash Flow 20000 27000 23000 29600 35000 29093.333333333332 29817.777777777774 30177.481481481478 30469.234567901232 30379.73991769547 287188.0007003137\n",
|
||||
"Discounting Factor 0.8695652173913044 0.7561436672967865 0.6575162324319883 0.5717532455930334 0.4971767352982899 0.4971767352982899\n",
|
||||
"Present Value of Future Cash Flow 25298.550724637684 22546.523839529513 19842.183927989798 17420.883754932976 15104.099911490972 142783.19260502496\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" 3 Year Weighted Moving Average \n",
|
||||
" Past Years Forecasted Future Years \n",
|
||||
" Year 1 Year 2 Year 3 Year 4 Year 5 Year 6 Terminal Value \n",
|
||||
"Pre-tax income 50000 55000 45000 \n",
|
||||
"Income Taxes 10000 11000 9000 \n",
|
||||
"Net Income 40000 44000 36000 \n",
|
||||
"Depreciation Expense 5000 4000 3000 \n",
|
||||
"Capital Expenditures 10000 8000 5000 \n",
|
||||
"Debt Repayments 5000 5000 5000 \n",
|
||||
"Net Cash Flow 20000 27000 23000 23833.333333333332 24083.333333333332 23819.44444444444 158253.58851674633 \n",
|
||||
"Discounting Factor 0.8695652173913044 0.7561436672967865 0.6575162324319883 0.6575162324319883 \n",
|
||||
"Present Value of Future Cash Flow 20724.63768115942 18210.459987397608 15661.671369734164 104054.30329037321 \n",
|
||||
" \n",
|
||||
" \n",
|
||||
"Notes: \n",
|
||||
"-We based this simple discounted cash flow excel model based on the weighted moving averages (5 year or 3 year) for simplicity, in case a constant growth rate cannot be easily determined. \n",
|
||||
"-The factors such as Depreciation Expense, Capital Expense and Debt Repayments remain constant, so consider this when looking at the forecasted figures. \n",
|
||||
"-For the terminal value constant growth rate, we make the assumption of the growth from the last forecasted year compared to the first forecasted year. Adjust in the formula as needed. \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(base_docs[1].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ff6e812f-fa94-4b0f-8907-ee70983e53f1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from llama_index.core import SummaryIndex\n",
|
||||
"\n",
|
||||
"base_index = SummaryIndex.from_documents([base_docs[1]])\n",
|
||||
"\n",
|
||||
"base_query_engine = base_index.as_query_engine()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fa75f1bc-6fed-4721-ba5e-dc5408395618",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Ask Questions over this Data\n",
|
||||
"\n",
|
||||
"Let's now ask questions over this data, using both the LlamaParse-powered pipeline and the naive pipeline."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a875a20e-a6b6-46b7-80d4-614546215ffc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_str = \"Tell me about the income taxes in the past years (year 3-5) for the 5 year WMA table\"\n",
|
||||
"response = query_engine.query(query_str)\n",
|
||||
"base_response = base_query_engine.query(query_str)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "06b0b072-f159-47c4-9cad-9f0cc0d56b28",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"******* LlamaParse RAG *******\n",
|
||||
"The income taxes in the past years (year 3 to 5) for the 5-year Weighted Moving Average table were $9,000.00 in Year 3, $10,400.00 in Year 4, and $12,000.00 in Year 5.\n",
|
||||
"******* Naive RAG *******\n",
|
||||
"The income taxes in the past years (year 3-5) for the 5 year WMA table were $9,000, $10,400, and $12,000, respectively.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"******* LlamaParse RAG *******\")\n",
|
||||
"print(str(response))\n",
|
||||
"print(\"******* Naive RAG *******\")\n",
|
||||
"print(str(base_response))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8bd0998f-4f7f-46f9-9b51-cfb510f384ee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(response.source_nodes[0].get_content())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7a93af5f-fcea-4f14-80eb-5dfad230cd8a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_str = \"Tell me about the discounting factors in year 5 for the 3 year WMA\"\n",
|
||||
"response = query_engine.query(query_str)\n",
|
||||
"base_response = base_query_engine.query(query_str)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c6d3a5fb-c32c-4dea-8f2e-956af85456a4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"******* LlamaParse RAG *******\n",
|
||||
"The discounting factor in year 5 for the 3-year Weighted Moving Average (WMA) is 0.7561.\n",
|
||||
"******* Naive RAG *******\n",
|
||||
"The discounting factor in year 5 for the 3-year Weighted Moving Average is 0.6575162324319883.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"******* LlamaParse RAG *******\")\n",
|
||||
"print(str(response))\n",
|
||||
"print(\"******* Naive RAG *******\")\n",
|
||||
"print(str(base_response))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b96f3a9b-6e99-4192-b6d6-447319d3c4fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_str = \"Tell me about the projected net cash flow in years 7-9 for the 5 year WMA\"\n",
|
||||
"response = query_engine.query(query_str)\n",
|
||||
"base_response = base_query_engine.query(query_str)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "92b419b9-25ee-4d69-98d9-56c0a45b24af",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"******* LlamaParse RAG *******\n",
|
||||
"The projected net cash flow for years 7 to 9 in the 5-year Weighted Moving Average scenario is as follows: Year 7 is $29,817.78, Year 8 is $30,177.48, and Year 9 is $30,469.23.\n",
|
||||
"******* Naive RAG *******\n",
|
||||
"The projected net cash flow for years 7 to 9 in the 5-year weighted moving average scenario is as follows: Year 7 is $29,093.33, Year 8 is $29,817.78, and Year 9 is $30,177.48.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"******* LlamaParse RAG *******\")\n",
|
||||
"print(str(response))\n",
|
||||
"print(\"******* Naive RAG *******\")\n",
|
||||
"print(str(base_response))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "llama_parse",
|
||||
"language": "python",
|
||||
"name": "llama_parse"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
After Width: | Height: | Size: 3.3 MiB |
@@ -0,0 +1,10 @@
|
||||
# Financial Modeling Assumptions
|
||||
Discount Rate: 8%
|
||||
Terminal Growth Rate: 2%
|
||||
Tax Rate: 25%
|
||||
Revenue Growth (Years 1-5): 10% per annum
|
||||
Revenue Growth (Years 6-10): 5% per annum
|
||||
Capital Expenditures as % of Revenue: 7%
|
||||
Working Capital Assumption: 3% of Revenue
|
||||
Depreciation Rate: 10% per annum
|
||||
Cost of Capital Assumption: 8%
|
||||
|
After Width: | Height: | Size: 67 KiB |