Compare commits
51 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a0ea197b28 | |||
| 74b11de9ae | |||
| c2b70436e5 | |||
| af9a9800e5 | |||
| e7bac2cbb8 | |||
| d595394243 | |||
| 27efb7b53c | |||
| 0c1fe17417 | |||
| 3f308e7ae4 | |||
| c85a17bac2 | |||
| a91672f619 | |||
| 81daa09d05 | |||
| 07be2e4555 | |||
| 4a642d576a | |||
| 8ee7108302 | |||
| a9461af96f | |||
| 4d42a32342 | |||
| 21add2715b | |||
| 3ded353c5a | |||
| b619226480 | |||
| 612f9346c5 | |||
| 90bec45008 | |||
| 5157e30fe7 | |||
| eb2d9e2b63 | |||
| 09d214522f | |||
| 8798735ea4 | |||
| 7ed859c068 | |||
| 417e6faccf | |||
| aeae13ba63 | |||
| 825d8ec9bb | |||
| 44a5c3530a | |||
| 14de11a420 | |||
| b15620ee9c | |||
| 13e7f2df0a | |||
| 888fce5060 | |||
| 148a3e4f89 | |||
| 0e10f3227f | |||
| b0667043ea | |||
| bd5eac5abd | |||
| dbb85200ac | |||
| c1023a14b8 | |||
| 8899acc989 | |||
| c0e7f51626 | |||
| 9f827eaca5 | |||
| d9fc08b05c | |||
| 8a5ba6d575 | |||
| 8204930f2b | |||
| 013fe6a153 | |||
| 01ffffd04c | |||
| 4ddbbc0ff8 | |||
| 5ffdbb5c4c |
@@ -114,7 +114,7 @@ jobs:
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Attempting to build docs..."
|
||||
make build_docs
|
||||
make docs_build
|
||||
test_datasets:
|
||||
timeout-minutes: 5
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -34,7 +34,7 @@ jobs:
|
||||
- name: Sphinx build
|
||||
shell: bash
|
||||
run: |
|
||||
make build_docs
|
||||
make docs_build
|
||||
- name: Publish Docs
|
||||
uses: peaceiris/actions-gh-pages@v3
|
||||
with:
|
||||
|
||||
@@ -3,32 +3,7 @@
|
||||
# Default target executed when no arguments are given to make.
|
||||
all: help
|
||||
|
||||
######################
|
||||
# TESTING AND COVERAGE
|
||||
######################
|
||||
|
||||
# Define a variable for the test file path.
|
||||
TEST_FILE ?= tests/unit_tests/
|
||||
|
||||
test:
|
||||
poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE)
|
||||
|
||||
test_watch:
|
||||
poetry run ptw . -- $(TEST_FILE)
|
||||
|
||||
build_docs:
|
||||
# Copy README.md to docs/index.md
|
||||
cp README.md ./docs/source/index.md
|
||||
# Append to the table of contents the contents of the file
|
||||
cat ./docs/source/toc.segment >> ./docs/source/index.md
|
||||
poetry run sphinx-build "./docs/source" "./docs/build"
|
||||
|
||||
clean_docs:
|
||||
rm -rf ./docs/build
|
||||
|
||||
######################
|
||||
# LINTING AND FORMATTING
|
||||
######################
|
||||
# LINTING AND FORMATTING:
|
||||
|
||||
# Define a variable for Python and notebook files.
|
||||
lint format: PYTHON_FILES=.
|
||||
@@ -48,19 +23,45 @@ spell_check:
|
||||
spell_fix:
|
||||
poetry run codespell --toml pyproject.toml -w
|
||||
|
||||
######################
|
||||
# HELP
|
||||
######################
|
||||
|
||||
# TESTING AND COVERAGE:
|
||||
|
||||
# Define a variable for the test file path.
|
||||
TEST_FILE ?= tests/unit_tests/
|
||||
|
||||
test:
|
||||
poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE)
|
||||
|
||||
test_watch:
|
||||
poetry run ptw . -- $(TEST_FILE)
|
||||
|
||||
|
||||
# DOCUMENTATION:
|
||||
|
||||
docs_clean:
|
||||
rm -rf ./docs/build
|
||||
|
||||
docs_build:
|
||||
# Copy README.md to docs/index.md
|
||||
cp README.md ./docs/source/index.md
|
||||
# Append to the table of contents the contents of the file
|
||||
cat ./docs/source/toc.segment >> ./docs/source/index.md
|
||||
poetry run sphinx-build "./docs/source" "./docs/build"
|
||||
|
||||
|
||||
# HELP:
|
||||
help:
|
||||
@echo '===================='
|
||||
@echo '-- LINTING --'
|
||||
@echo 'format - run code formatters'
|
||||
@echo 'lint - run linters'
|
||||
@echo 'spell_check - run codespell on the project'
|
||||
@echo 'spell_fix - run codespell on the project and fix the errors'
|
||||
@echo '-- TESTS --'
|
||||
@echo 'coverage - run unit tests and generate coverage report'
|
||||
@echo 'test - run unit tests'
|
||||
@echo 'test TEST_FILE=<test_file> - run all tests in file'
|
||||
@echo '-- DOCUMENTATION tasks are from the top-level Makefile --'
|
||||
@echo ''
|
||||
@echo 'LINTING:'
|
||||
@echo ' format - run code formatters'
|
||||
@echo ' lint - run linters'
|
||||
@echo ' spell_check - run codespell'
|
||||
@echo ' spell_fix - run codespell and fix the errors'
|
||||
@echo 'TESTS:'
|
||||
@echo ' test - run unit tests'
|
||||
@echo ' test TEST_FILE=<test_file> - run tests in <test_file>'
|
||||
@echo ' coverage - run unit tests and generate coverage report'
|
||||
@echo 'DOCUMENTATION:'
|
||||
@echo ' docs_clean - delete the docs/build directory'
|
||||
@echo ' docs_build - build the documentation'
|
||||
@echo ''
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
🚧 Under Active Development 🚧
|
||||
|
||||
# 🦜💪 LangChain Benchmarks
|
||||
# 🦜💯 LangChain Benchmarks
|
||||
|
||||
[](https://github.com/langchain-ai/langchain-benchmarks/releases)
|
||||
[](https://github.com/langchain-ai/langchain-benchmarks/actions/workflows/ci.yml)
|
||||
@@ -35,7 +33,7 @@ pip install -U langchain-benchmarks
|
||||
All the benchmarks come with an associated benchmark dataset stored in [LangSmith](https://smith.langchain.com). To take advantage of the eval and debugging experience, [sign up](https://smith.langchain.com), and set your API key in your environment:
|
||||
|
||||
```bash
|
||||
export LANGCHAIN_API_KEY=sk-...
|
||||
export LANGCHAIN_API_KEY=ls-...
|
||||
```
|
||||
|
||||
## Repo Structure
|
||||
|
||||
|
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
|
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
|
Before Width: | Height: | Size: 9.7 KiB After Width: | Height: | Size: 9.7 KiB |
|
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
@@ -1,8 +1,7 @@
|
||||
from chat_langchain.chain import chain
|
||||
from fastapi import FastAPI
|
||||
from openai_functions_agent import agent_executor as openai_functions_agent_chain
|
||||
|
||||
from langserve import add_routes
|
||||
from openai_functions_agent import agent_executor as openai_functions_agent_chain
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
@@ -1 +1,3 @@
|
||||
chromadb/
|
||||
index.md
|
||||
Untitled.ipynb
|
||||
|
||||
@@ -1,225 +1,226 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "033684fb-65b2-4586-a959-68c614741ca2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Datasets\n",
|
||||
"[](https://colab.research.google.com/github/langchain-ai/langchain-benchmarks/blob/main/docs/source/notebooks/datasets.ipynb)\n",
|
||||
"\n",
|
||||
"Here, we'll see how to work with LangSmith datasets."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -U langchain-benchmarks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "6d272fbf-710e-4a49-a0da-67e010541905",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_benchmarks import clone_public_dataset, download_public_dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "18ee0f96-e5c4-4ae9-aebf-7d8b88c51662",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's first download the dataset to the local file system"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "58b94f6d-0c91-4361-9b22-f758ffaa150a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching examples...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "5a2fad8c0c3549ec96a3b38fe8a002b0",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/21 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Done fetching examples.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"download_public_dataset(\n",
|
||||
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "841db832-b0d3-4fd1-8531-1154ec9b3caa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"we can take a look at the first two examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "664e90fc-af84-4c5f-a3dd-5d9ffe649650",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[\n",
|
||||
" {\n",
|
||||
" \"created_at\": \"2023-11-15T15:26:53.511629\",\n",
|
||||
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
|
||||
" \"id\": \"0703a989-2693-4039-a1f6-7281fc1b4cb0\",\n",
|
||||
" \"inputs\": {\n",
|
||||
" \"question\": \"do bob and alice live in the same city?\"\n",
|
||||
" },\n",
|
||||
" \"modified_at\": \"2023-11-15T15:26:53.511629\",\n",
|
||||
" \"outputs\": {\n",
|
||||
" \"expected_steps\": [\n",
|
||||
" \"find_users_by_name\",\n",
|
||||
" \"get_user_location\",\n",
|
||||
" \"get_city_for_location\",\n",
|
||||
" \"get_user_location\",\n",
|
||||
" \"get_city_for_location\"\n",
|
||||
" ],\n",
|
||||
" \"order_matters\": false,\n",
|
||||
" \"reference\": \"no\"\n",
|
||||
" },\n",
|
||||
" \"runs\": []\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"created_at\": \"2023-11-15T15:26:53.491359\",\n",
|
||||
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
|
||||
" \"id\": \"b258b95a-9524-4da7-b758-c5481109322d\",\n",
|
||||
" \"inputs\": {\n",
|
||||
" \"question\": \"Is it likely that Donna is outside with an umbrella at this time?\"\n",
|
||||
" },\n",
|
||||
" \"modified_at\": \"2023-11-15T15:26:53.491359\",\n",
|
||||
" \"outputs\": {\n",
|
||||
" \"expected_steps\": [\n",
|
||||
" \"find_users_by_name\",\n",
|
||||
" \"get_user_location\",\n",
|
||||
" \"get_current_time_for_location\",\n",
|
||||
" \"get_current_weather_for_location\"\n",
|
||||
" ],\n",
|
||||
" \"order_matters\": false,\n",
|
||||
" \"reference\": \"yes\"\n",
|
||||
" },\n",
|
||||
" \"runs\": []\n",
|
||||
" }\n",
|
||||
"]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"with open(\"./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json\", \"r\", encoding=\"utf-8\") as f:\n",
|
||||
" print(json.dumps(json.load(f)[:2], indent=2, sort_keys=True))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2c6cf01f-466b-406d-b4c7-2395747780fd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can also clone the dataset to our local tenant"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e4dea4df-2f1c-436b-a71c-49ffb2295ccc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Executing this command will clone the dataset to your own LangSmith tenant. \n",
|
||||
"For this to work you must have a [LangSmith account](https://smith.langchain.com/) set up."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# Get from https://smith.langchain.com/settings\n",
|
||||
"os.environ[\"LANGCHAIN_API_KEY\"] = \"ls_...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "18d0b905-2a6a-4752-a7cb-8653bd9049e3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clone_public_dataset(\n",
|
||||
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\",\n",
|
||||
" dataset_name=\"Agent Dataset\",\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "033684fb-65b2-4586-a959-68c614741ca2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Datasets\n",
|
||||
"\n",
|
||||
"Here, we'll see how to work with LangSmith datasets."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "474292e6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -U langchain-benchmarks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "6d272fbf-710e-4a49-a0da-67e010541905",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_benchmarks import clone_public_dataset, download_public_dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "18ee0f96-e5c4-4ae9-aebf-7d8b88c51662",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's first download the dataset to the local file system"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "58b94f6d-0c91-4361-9b22-f758ffaa150a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching examples...\n"
|
||||
]
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "5a2fad8c0c3549ec96a3b38fe8a002b0",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/21 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Done fetching examples.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"download_public_dataset(\n",
|
||||
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "841db832-b0d3-4fd1-8531-1154ec9b3caa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"we can take a look at the first two examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "664e90fc-af84-4c5f-a3dd-5d9ffe649650",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[\n",
|
||||
" {\n",
|
||||
" \"created_at\": \"2023-11-15T15:26:53.511629\",\n",
|
||||
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
|
||||
" \"id\": \"0703a989-2693-4039-a1f6-7281fc1b4cb0\",\n",
|
||||
" \"inputs\": {\n",
|
||||
" \"question\": \"do bob and alice live in the same city?\"\n",
|
||||
" },\n",
|
||||
" \"modified_at\": \"2023-11-15T15:26:53.511629\",\n",
|
||||
" \"outputs\": {\n",
|
||||
" \"expected_steps\": [\n",
|
||||
" \"find_users_by_name\",\n",
|
||||
" \"get_user_location\",\n",
|
||||
" \"get_city_for_location\",\n",
|
||||
" \"get_user_location\",\n",
|
||||
" \"get_city_for_location\"\n",
|
||||
" ],\n",
|
||||
" \"order_matters\": false,\n",
|
||||
" \"reference\": \"no\"\n",
|
||||
" },\n",
|
||||
" \"runs\": []\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"created_at\": \"2023-11-15T15:26:53.491359\",\n",
|
||||
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
|
||||
" \"id\": \"b258b95a-9524-4da7-b758-c5481109322d\",\n",
|
||||
" \"inputs\": {\n",
|
||||
" \"question\": \"Is it likely that Donna is outside with an umbrella at this time?\"\n",
|
||||
" },\n",
|
||||
" \"modified_at\": \"2023-11-15T15:26:53.491359\",\n",
|
||||
" \"outputs\": {\n",
|
||||
" \"expected_steps\": [\n",
|
||||
" \"find_users_by_name\",\n",
|
||||
" \"get_user_location\",\n",
|
||||
" \"get_current_time_for_location\",\n",
|
||||
" \"get_current_weather_for_location\"\n",
|
||||
" ],\n",
|
||||
" \"order_matters\": false,\n",
|
||||
" \"reference\": \"yes\"\n",
|
||||
" },\n",
|
||||
" \"runs\": []\n",
|
||||
" }\n",
|
||||
"]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"with open(\"./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json\", \"r\", encoding=\"utf-8\") as f:\n",
|
||||
" print(json.dumps(json.load(f)[:2], indent=2, sort_keys=True))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2c6cf01f-466b-406d-b4c7-2395747780fd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can also clone the dataset to our local tenant"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e4dea4df-2f1c-436b-a71c-49ffb2295ccc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Executing this command will clone the dataset to your own LangSmith tenant. \n",
|
||||
"For this to work you must have a [LangSmith account](https://smith.langchain.com/) set up."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7eb38ea6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# Get from https://smith.langchain.com/settings\n",
|
||||
"os.environ[\"LANGCHAIN_API_KEY\"] = \"ls_...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "18d0b905-2a6a-4752-a7cb-8653bd9049e3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clone_public_dataset(\n",
|
||||
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\",\n",
|
||||
" dataset_name=\"Agent Dataset\",\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
||||
@@ -688,8 +688,6 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"df = test_run.to_dataframe().join(claude_test_run.to_dataframe(), rsuffix=\"_claude\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 1,
|
||||
"id": "86912590-a90a-4351-8ab4-89192cdee1e7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -26,19 +26,24 @@
|
||||
"<tr><th>Name </th><th>Type </th><th>Dataset ID </th><th>Description </th></tr>\n",
|
||||
"</thead>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>Email Extraction</td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d\" target=\"_blank\" rel=\"noopener\">36bdfe7d-3cd1-4b36-b957-d12d95810a2b</a></td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
|
||||
"<tr><td>Email Extraction</td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d\" target=\"_blank\" rel=\"noopener\">a1742786-bde5-4f51-a1d8-e148e5251ddb</a></td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
|
||||
"\n",
|
||||
"Some additional cleanup of the data was done by hand after the initial pass.\n",
|
||||
"\n",
|
||||
"See https://github.com/jacoblee93/oss-model-extraction-evals. </td></tr>\n",
|
||||
"<tr><td>Chat Extraction </td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d\" target=\"_blank\" rel=\"noopener\">00f4444c-9460-4a82-b87a-f50096f1cfef</a></td><td>A dataset meant to test the ability of an LLM to extract and infer\n",
|
||||
"structured information from a dialogue. The dialogue is between a user and a support\n",
|
||||
"engineer. Outputs should be structured as a JSON object and test both the ability\n",
|
||||
"of the LLM to correctly structure the information and its ability to perform simple \n",
|
||||
"classification tasks. </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"Registry(tasks=[ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['email'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['email'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{email}\\n```'))]))])"
|
||||
"Registry(tasks=[ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{input}\\n```'))])), ExtractionTask(name='Chat Extraction', dataset_id='https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d', description='A dataset meant to test the ability of an LLM to extract and infer\\nstructured information from a dialogue. The dialogue is between a user and a support\\nengineer. Outputs should be structured as a JSON object and test both the ability\\nof the LLM to correctly structure the information and its ability to perform simple \\nclassification tasks.', schema=<class 'langchain_benchmarks.extraction.tasks.chat_extraction.schema.GenerateTicket'>, instructions=ChatPromptTemplate(input_variables=['dialogue'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpdesk assistant responsible with extracting information and generating tickets. Dialogues are between a user and a support engineer.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['dialogue'], template='Generate a ticket for the following question-response pair:\\n<Dialogue>\\n{dialogue}\\n</Dialogue>'))]))])"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -85,9 +90,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 2,
|
||||
"id": "9c7865bd-8251-4579-85a3-f9085d96f497",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
@@ -115,7 +122,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -286,7 +286,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"vectorstore = Chroma(\n",
|
||||
" collection_name=f\"lcbm-b-huggingface-gte-base\",\n",
|
||||
" collection_name=\"lcbm-b-huggingface-gte-base\",\n",
|
||||
" embedding_function=embeddings,\n",
|
||||
" persist_directory=\"./chromadb\",\n",
|
||||
")\n",
|
||||
|
||||
@@ -0,0 +1,610 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9fa3470d-9448-4792-9f65-6978fc58cf84",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Multi-modal eval: Baseline\n",
|
||||
"\n",
|
||||
"`Multi-modal slide decks` is a public dataset that contains a dataset of question-answer pairs from slide decks with visual content.\n",
|
||||
"\n",
|
||||
"The question-answer pairs are derived from the visual content in the decks, testing the ability of RAG to perform visual reasoning.\n",
|
||||
"\n",
|
||||
"As a baseline, we evaluate this dataset using text-based RAG pipeline, below.\n",
|
||||
"\n",
|
||||
"This will not reason about visual content and will simply load the text from the slides. \n",
|
||||
"\n",
|
||||
"## Pre-requisites"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "47220461-d4e9-4f1d-9c57-672ca947ca0d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install -U langchain langsmith langchain_benchmarks\n",
|
||||
"# %pip install --quiet chromadb openai pypdf pandas"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "196de967-6de6-40da-aa75-e836923ab5e3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
|
||||
"env_vars = [\"LANGCHAIN_API_KEY\", \"OPENAI_API_KEY\"]\n",
|
||||
"for var in env_vars:\n",
|
||||
" if var not in os.environ:\n",
|
||||
" os.environ[var] = getpass.getpass(prompt=f\"Enter your {var}: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "10da8e11-6288-4131-bd60-d5aa86928acc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Dataset\n",
|
||||
"\n",
|
||||
"We can browse the available LangChain benchmark datasets for retrieval."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "2ff97905-14a6-413c-99be-58b7a9c8d4c1",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
"<thead>\n",
|
||||
"<tr><th>Name </th><th>Type </th><th>Dataset ID </th><th>Description </th></tr>\n",
|
||||
"</thead>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>LangChain Docs Q&A </td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d\" target=\"_blank\" rel=\"noopener\">452ccafc-18e1-4314-885b-edd735f17b9d</a></td><td>Questions and answers based on a snapshot of the LangChain python docs.\n",
|
||||
"\n",
|
||||
"The environment provides the documents and the retriever information.\n",
|
||||
"\n",
|
||||
"Each example is composed of a question and reference answer.\n",
|
||||
"\n",
|
||||
"Success is measured based on the accuracy of the answer relative to the reference answer.\n",
|
||||
"We also measure the faithfulness of the model's response relative to the retrieved documents (if any). </td></tr>\n",
|
||||
"<tr><td>Semi-structured Reports</td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d\" target=\"_blank\" rel=\"noopener\">c47d9617-ab99-4d6e-a6e6-92b8daf85a7d</a></td><td>Questions and answers based on PDFs containing tables and charts.\n",
|
||||
"\n",
|
||||
"The task provides the raw documents as well as factory methods to easily index them\n",
|
||||
"and create a retriever.\n",
|
||||
"\n",
|
||||
"Each example is composed of a question and reference answer.\n",
|
||||
"\n",
|
||||
"Success is measured based on the accuracy of the answer relative to the reference answer.\n",
|
||||
"We also measure the faithfulness of the model's response relative to the retrieved documents (if any). </td></tr>\n",
|
||||
"<tr><td>Multi-modal slide decks</td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d\" target=\"_blank\" rel=\"noopener\">40afc8e7-9d7e-44ed-8971-2cae1eb59731</a></td><td>This public dataset is a work-in-progress and will be extended over time.\n",
|
||||
" \n",
|
||||
"Questions and answers based on slide decks containing visual tables and charts.\n",
|
||||
"\n",
|
||||
"Each example is composed of a question and reference answer.\n",
|
||||
"\n",
|
||||
"Success is measured based on the accuracy of the answer relative to the reference answer. </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"Registry(tasks=[RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", get_docs=<function load_cached_docs at 0x104485800>, retriever_factories={'basic': <function _chroma_retriever_factory at 0x1360289a0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x136028a40>, 'hyde': <function _chroma_hyde_retriever_factory at 0x136028ae0>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x126ba2660>}), RetrievalTask(name='Semi-structured Reports', dataset_id='https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", get_docs=<function load_docs at 0x136029620>, retriever_factories={'basic': <function _chroma_retriever_factory at 0x1360296c0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x136029760>, 'hyde': <function _chroma_hyde_retriever_factory at 0x136029800>}, architecture_factories={}), RetrievalTask(name='Multi-modal slide decks', dataset_id='https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d', description='This public dataset is a work-in-progress and will be extended over time.\\n \\nQuestions and answers based on slide decks containing visual tables and charts.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\n', get_docs={}, retriever_factories={}, architecture_factories={})])"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain_benchmarks import clone_public_dataset, registry\n",
|
||||
"\n",
|
||||
"registry = registry.filter(Type=\"RetrievalTask\")\n",
|
||||
"registry"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2fb7dc3d-28f1-4c28-b0d0-3784d04b81ce",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"`Multi-modal slide decks` is the relevant dataset for our task."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "219a4141-4a5f-48e4-ae05-5a824e2193fd",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>Name </td><td>Multi-modal slide decks </td></tr>\n",
|
||||
"<tr><td>Type </td><td>RetrievalTask </td></tr>\n",
|
||||
"<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d\" target=\"_blank\" rel=\"noopener\">40afc8e7-9d7e-44ed-8971-2cae1eb59731</a></td></tr>\n",
|
||||
"<tr><td>Description </td><td>This public dataset is a work-in-progress and will be extended over time.\n",
|
||||
" \n",
|
||||
"Questions and answers based on slide decks containing visual tables and charts.\n",
|
||||
"\n",
|
||||
"Each example is composed of a question and reference answer.\n",
|
||||
"\n",
|
||||
"Success is measured based on the accuracy of the answer relative to the reference answer. </td></tr>\n",
|
||||
"<tr><td>Retriever Factories </td><td> </td></tr>\n",
|
||||
"<tr><td>Architecture Factories</td><td> </td></tr>\n",
|
||||
"<tr><td>get_docs </td><td>{} </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"RetrievalTask(name='Multi-modal slide decks', dataset_id='https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d', description='This public dataset is a work-in-progress and will be extended over time.\\n \\nQuestions and answers based on slide decks containing visual tables and charts.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\n', get_docs={}, retriever_factories={}, architecture_factories={})"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"task = registry[\"Multi-modal slide decks\"]\n",
|
||||
"task"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2d6569b5-e79a-41b7-9745-c2f8a1dd704e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Clone the dataset so that it's available in our LangSmith datasets."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "d2caa086-9549-4c74-bba9-ba80d5a7b218",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Dataset Multi-modal slide decks already exists. Skipping.\n",
|
||||
"You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08a29acb-5ad6-42ce-a482-574c9e2e5306.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bf350917-a1e5-46f4-81cd-c1678ab9220f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Fetch the associated PDFs from remote cache for the dataset so that we can perform ingestion."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "99ce6afb-2317-4bc1-9faf-4f828095ad91",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_benchmarks.rag.tasks.multi_modal_slide_decks import get_file_names\n",
|
||||
"\n",
|
||||
"file_names = list(get_file_names()) # PosixPath"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "848a4cdb-6c08-4c01-81ce-16ab83a7fdff",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load\n",
|
||||
"\n",
|
||||
"Load and split the files for indexing."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "6ce85810-98a7-406e-b44e-ce860ac35986",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"There are 98 text elements in DDOG_Q3_earnings_deck.pdf\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PyPDFLoader\n",
|
||||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def load_and_split(file):\n",
|
||||
" \"\"\"\n",
|
||||
" Load and split PDF files\n",
|
||||
" :param file: PosixPath path for pdf\n",
|
||||
" :return: A list of text chunks\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" loader = PyPDFLoader(str(file))\n",
|
||||
" pdf_pages = loader.load()\n",
|
||||
"\n",
|
||||
" text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
|
||||
" chunk_size=100, chunk_overlap=50\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Get chunks\n",
|
||||
" docs = text_splitter.split_documents(pdf_pages)\n",
|
||||
" texts = [d.page_content for d in docs]\n",
|
||||
" print(f\"There are {len(texts)} text elements in {file.name}\")\n",
|
||||
" return texts\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"texts = []\n",
|
||||
"for fi in file_names:\n",
|
||||
" texts.extend(load_and_split(fi))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eb01925d-b7d1-47a1-bd90-805178d3c4a9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Index\n",
|
||||
"\n",
|
||||
"Embed (OpenAIEmbeddings) and store splits in a vectorstore (Chroma)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "ceb31f71-45fb-4b12-bc1c-31981de334bb",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings import OpenAIEmbeddings\n",
|
||||
"from langchain.vectorstores import Chroma\n",
|
||||
"\n",
|
||||
"vectorstore_baseline = Chroma.from_texts(\n",
|
||||
" texts=texts, collection_name=\"baseline-multi-modal\", embedding=OpenAIEmbeddings()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"retriever_baseline = vectorstore_baseline.as_retriever()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e6dcbb01-f480-456d-b972-c732eb26c393",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## RAG\n",
|
||||
"\n",
|
||||
"Create a pipeline for retrieval of relevant chunks based on semantic similarity to the input question.\n",
|
||||
"\n",
|
||||
"Pass the images to GPT-4 for answer synthesis."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "ea233664-e527-42f1-a820-0c2271e16c20",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.prompts import ChatPromptTemplate\n",
|
||||
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||
"from langchain.schema.runnable import RunnablePassthrough\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def rag_chain(retriever):\n",
|
||||
" \"\"\"\n",
|
||||
" RAG pipeline for the indexed presentations\n",
|
||||
" :param retriever: PosixPath path for pdf\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" # Prompt template\n",
|
||||
" template = \"\"\"Answer the question based only on the following context, which can include text and tables:\n",
|
||||
" {context}\n",
|
||||
" Question: {question}\n",
|
||||
" \"\"\"\n",
|
||||
" prompt = ChatPromptTemplate.from_template(template)\n",
|
||||
"\n",
|
||||
" # LLM\n",
|
||||
" model = ChatOpenAI(temperature=0, model=\"gpt-4\")\n",
|
||||
"\n",
|
||||
" # RAG pipeline\n",
|
||||
" chain = (\n",
|
||||
" {\n",
|
||||
" \"context\": retriever | (lambda x: \"\\n\\n\".join([i.page_content for i in x])),\n",
|
||||
" \"question\": RunnablePassthrough(),\n",
|
||||
" }\n",
|
||||
" | prompt\n",
|
||||
" | model\n",
|
||||
" | StrOutputParser()\n",
|
||||
" )\n",
|
||||
" return chain\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Create RAG chain\n",
|
||||
"chain = rag_chain(retriever_baseline)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "95df1446-143d-4f4c-a15b-2a379266d8bf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Eval\n",
|
||||
"\n",
|
||||
"Run evaluation on our dataset:\n",
|
||||
"\n",
|
||||
"* `task.name` is the dataset of QA pairs that we cloned\n",
|
||||
"* `eval_config` specifies the [LangSmith evaluator](https://docs.smith.langchain.com/evaluation/evaluator-implementations#correctness-qa-evaluation) for our dataset, which will use GPT-4 as a grader\n",
|
||||
"* The grader will evaluate the chain-generated answer to each question relative to ground truth"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "479ce09d-642e-4b3b-9e4e-e9c2b7f0e9ca",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"View the evaluation results for project '866f-baseline' at:\n",
|
||||
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08a29acb-5ad6-42ce-a482-574c9e2e5306/compare?selectedSessions=30199d47-50d7-4c5c-a55a-e74157e05951\n",
|
||||
"\n",
|
||||
"View all tests for Dataset Multi-modal slide decks at:\n",
|
||||
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08a29acb-5ad6-42ce-a482-574c9e2e5306\n",
|
||||
"[------------------------------------------------->] 10/10"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<h3>Experiment Results:</h3>"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.HTML object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>output</th>\n",
|
||||
" <th>feedback.COT Contextual Accuracy</th>\n",
|
||||
" <th>error</th>\n",
|
||||
" <th>execution_time</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>10.000000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>10.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unique</th>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>top</th>\n",
|
||||
" <td>Datadog has 20 total customers.</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>freq</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.200000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>4.674478</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.421637</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.864273</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>3.307960</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>4.113816</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>4.700962</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>5.018359</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>max</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>6.188082</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" output feedback.COT Contextual Accuracy \\\n",
|
||||
"count 10 10.000000 \n",
|
||||
"unique 10 NaN \n",
|
||||
"top Datadog has 20 total customers. NaN \n",
|
||||
"freq 1 NaN \n",
|
||||
"mean NaN 0.200000 \n",
|
||||
"std NaN 0.421637 \n",
|
||||
"min NaN 0.000000 \n",
|
||||
"25% NaN 0.000000 \n",
|
||||
"50% NaN 0.000000 \n",
|
||||
"75% NaN 0.000000 \n",
|
||||
"max NaN 1.000000 \n",
|
||||
"\n",
|
||||
" error execution_time \n",
|
||||
"count 0 10.000000 \n",
|
||||
"unique 0 NaN \n",
|
||||
"top NaN NaN \n",
|
||||
"freq NaN NaN \n",
|
||||
"mean NaN 4.674478 \n",
|
||||
"std NaN 0.864273 \n",
|
||||
"min NaN 3.307960 \n",
|
||||
"25% NaN 4.113816 \n",
|
||||
"50% NaN 4.700962 \n",
|
||||
"75% NaN 5.018359 \n",
|
||||
"max NaN 6.188082 "
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import uuid\n",
|
||||
"\n",
|
||||
"from langchain.smith import RunEvalConfig\n",
|
||||
"from langsmith.client import Client\n",
|
||||
"\n",
|
||||
"# Evaluator configuration\n",
|
||||
"client = Client()\n",
|
||||
"eval_config = RunEvalConfig(\n",
|
||||
" evaluators=[\"cot_qa\"],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Experiments\n",
|
||||
"chain_map = {\n",
|
||||
" \"baseline\": chain,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Run evaluation\n",
|
||||
"run_id = uuid.uuid4().hex[:4]\n",
|
||||
"test_runs = {}\n",
|
||||
"for project_name, chain in chain_map.items():\n",
|
||||
" test_runs[project_name] = client.run_on_dataset(\n",
|
||||
" dataset_name=task.name,\n",
|
||||
" llm_or_chain_factory=lambda: (lambda x: x[\"Question\"]) | chain,\n",
|
||||
" evaluation=eval_config,\n",
|
||||
" verbose=True,\n",
|
||||
" project_name=f\"{run_id}-{project_name}\",\n",
|
||||
" project_metadata={\"chain\": project_name},\n",
|
||||
" )"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -412,8 +412,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from functools import partial\n",
|
||||
"\n",
|
||||
"from langsmith.client import Client\n",
|
||||
"\n",
|
||||
"from langchain_benchmarks.rag import get_eval_config\n",
|
||||
@@ -0,0 +1,317 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b6856d11-40d5-48e5-9eb3-423f479933a1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Semi-structured eval: Chunk size tuning\n",
|
||||
"\n",
|
||||
"`Semi-structured Reports` is a public dataset that contains question-answer pairs from documents with text and tables.\n",
|
||||
"\n",
|
||||
"The question-answer pairs are derived from the tables as well as some of the paragraphs in the docs.\n",
|
||||
"\n",
|
||||
"We evaluation performance of various chunk sizes with RAG. \n",
|
||||
"\n",
|
||||
"## Pre-requisites"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c387b660-967d-4d2f-8c38-af125f7b7a8b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install -U langchain langsmith langchain_benchmarks\n",
|
||||
"# %pip install --quiet chromadb openai pypdf tiktoken fireworks-ai"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e9e332b1-7da4-47fc-8d9a-4d65fbfc6953",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
|
||||
"env_vars = [\"LANGCHAIN_API_KEY\", \"OPENAI_API_KEY\", \"FIREWORKS_API_KEY\"]\n",
|
||||
"for var in env_vars:\n",
|
||||
" if var not in os.environ:\n",
|
||||
" os.environ[var] = getpass.getpass(prompt=f\"Enter your {var}: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b1a19f23-468c-4aeb-a0e9-0765a85f3f0b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Dataset\n",
|
||||
"\n",
|
||||
"Fetch the associated PDFs from remote cache for the dataset so that we can perform ingestion."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "a94d9aa5-acd8-4032-ad8f-f995dec4d13c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"from langchain_benchmarks import clone_public_dataset, registry\n",
|
||||
"from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names\n",
|
||||
"\n",
|
||||
"# Task\n",
|
||||
"task = registry[\"Semi-structured Reports\"]\n",
|
||||
"\n",
|
||||
"# Files used\n",
|
||||
"paths = list(get_file_names())\n",
|
||||
"files = [str(p) for p in paths]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "12b52285-358c-4752-ad6b-25ffb629e309",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Clone the dataset so that it's available in our LangSmith datasets."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "1ecca7af-c3e7-42d1-97dd-c7d9777207cb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Dataset Semi-structured Reports already exists. Skipping.\n",
|
||||
"You can access the dataset at https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/6549a3a5-1cb9-463f-951d-0166cb9cf45c.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "64f37705-0190-4b7a-9d88-63bfd904fbd9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load and index\n",
|
||||
"\n",
|
||||
"We load each file, split it, embed with `OpenAIEmbeddings`, and create an index with `Chroma` vectorstore."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7eb9e333-77e6-48f9-b221-9bded023b978",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatFireworks, ChatOpenAI\n",
|
||||
"from langchain.document_loaders import PyPDFLoader\n",
|
||||
"from langchain.embeddings import OpenAIEmbeddings\n",
|
||||
"from langchain.prompts import ChatPromptTemplate\n",
|
||||
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||
"from langchain.schema.runnable import RunnablePassthrough\n",
|
||||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||||
"from langchain.vectorstores import Chroma\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def load_and_split(file, token_count, split_document=True):\n",
|
||||
" \"\"\"\n",
|
||||
" Load and optionally split PDF files.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" file (str): File path.\n",
|
||||
" token_count (int): Token count for splitting.\n",
|
||||
" split_document (bool): Flag for splitting or returning pages.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" loader = PyPDFLoader(file)\n",
|
||||
" pdf_pages = loader.load()\n",
|
||||
"\n",
|
||||
" if split_document:\n",
|
||||
" text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
|
||||
" chunk_size=token_count, chunk_overlap=50\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" docs = text_splitter.split_documents(pdf_pages)\n",
|
||||
" texts = [d.page_content for d in docs]\n",
|
||||
" else:\n",
|
||||
" texts = [d.page_content for d in pdf_pages]\n",
|
||||
"\n",
|
||||
" print(f\"There are {len(texts)} text elements\")\n",
|
||||
" return texts\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def load_files(files, token_count, split_document):\n",
|
||||
" \"\"\"\n",
|
||||
" Load files.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" files (list): List of file names.\n",
|
||||
" dir (str): Directory path.\n",
|
||||
" token_count (int): Token count for splitting.\n",
|
||||
" split_document (bool): Flag for splitting documents.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" texts = []\n",
|
||||
" for fi in files:\n",
|
||||
" texts.extend(load_and_split(fi, token_count, split_document))\n",
|
||||
" return texts\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def make_retriever(texts, expt):\n",
|
||||
" \"\"\"\n",
|
||||
" Make vector store.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" texts (list): List of texts.\n",
|
||||
" expt (str): Experiment name.\n",
|
||||
" \"\"\"\n",
|
||||
" vectorstore = Chroma.from_texts(\n",
|
||||
" texts=texts, collection_name=expt, embedding=OpenAIEmbeddings()\n",
|
||||
" )\n",
|
||||
" retriever = vectorstore.as_retriever()\n",
|
||||
" return retriever\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def rag_chain(retriever, llm):\n",
|
||||
" \"\"\"\n",
|
||||
" RAG chain.\n",
|
||||
"\n",
|
||||
" Args:\n",
|
||||
" retriever: The retriever to use.\n",
|
||||
" llm: The llm to use.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" # Prompt template\n",
|
||||
" template = \"\"\"Answer the question based only on the following context, which can include text and tables:\n",
|
||||
" {context}\n",
|
||||
" Question: {question}\n",
|
||||
" \"\"\"\n",
|
||||
" prompt = ChatPromptTemplate.from_template(template)\n",
|
||||
"\n",
|
||||
" # LLM\n",
|
||||
" if llm == \"mixtral\":\n",
|
||||
" model = ChatFireworks(\n",
|
||||
" model=\"accounts/fireworks/models/mixtral-8x7b-instruct\", temperature=0\n",
|
||||
" )\n",
|
||||
" else:\n",
|
||||
" model = ChatOpenAI(temperature=0, model=\"gpt-4\")\n",
|
||||
"\n",
|
||||
" # RAG pipeline\n",
|
||||
" chain = (\n",
|
||||
" {\n",
|
||||
" \"context\": retriever | (lambda x: \"\\n\\n\".join([i.page_content for i in x])),\n",
|
||||
" \"question\": RunnablePassthrough(),\n",
|
||||
" }\n",
|
||||
" | prompt\n",
|
||||
" | model\n",
|
||||
" | StrOutputParser()\n",
|
||||
" )\n",
|
||||
" return chain\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Experiment configurations\n",
|
||||
"experiments = [\n",
|
||||
" (None, False, \"page_split-oai\", \"oai\"),\n",
|
||||
" (50, True, \"50_tok_split-oai\", \"oai\"),\n",
|
||||
" (100, True, \"100_tok_split-oai\", \"oai\"),\n",
|
||||
" (250, True, \"250_tok_split-oai\", \"oai\"),\n",
|
||||
" (250, True, \"250_tok_split-mixtral\", \"mixtral\"),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Run\n",
|
||||
"stor_chain = {}\n",
|
||||
"for token_count, split_document, expt, llm in experiments:\n",
|
||||
" texts = load_files(files, token_count, split_document)\n",
|
||||
" retriever = make_retriever(texts, expt)\n",
|
||||
" stor_chain[expt] = rag_chain(retriever, llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "29515a91-3cb1-41bd-a2d4-6cf6ce7806c2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Eval\n",
|
||||
"\n",
|
||||
"Run eval onm our dataset, `Semi-structured Reports`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "edd2e7f9-b3f6-4885-bf05-96f1c1758b20",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import uuid\n",
|
||||
"\n",
|
||||
"from langchain.smith import RunEvalConfig\n",
|
||||
"from langsmith.client import Client\n",
|
||||
"\n",
|
||||
"# Config\n",
|
||||
"client = Client()\n",
|
||||
"eval_config = RunEvalConfig(\n",
|
||||
" evaluators=[\"cot_qa\"],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Experiments\n",
|
||||
"chain_map = {\n",
|
||||
" \"page_split\": stor_chain[\"page_split-oai\"],\n",
|
||||
" \"baseline-50-tok\": stor_chain[\"50_tok_split-oai\"],\n",
|
||||
" \"baseline-100-tok\": stor_chain[\"100_tok_split-oai\"],\n",
|
||||
" \"baseline-250-tok\": stor_chain[\"250_tok_split-oai\"],\n",
|
||||
" \"baseline-250-tok-mixtral\": stor_chain[\"250_tok_split-mixtral\"],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Run evaluation\n",
|
||||
"run_id = uuid.uuid4().hex[:4]\n",
|
||||
"test_runs = {}\n",
|
||||
"for project_name, chain in chain_map.items():\n",
|
||||
" test_runs[project_name] = client.run_on_dataset(\n",
|
||||
" dataset_name=task.name,\n",
|
||||
" llm_or_chain_factory=lambda: (lambda x: x[\"question\"]) | chain,\n",
|
||||
" evaluation=eval_config,\n",
|
||||
" verbose=True,\n",
|
||||
" project_name=f\"{run_id}-{project_name}\",\n",
|
||||
" project_metadata={\"chain\": project_name},\n",
|
||||
" )"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||