Compare commits

...

23 Commits

Author SHA1 Message Date
Leonid Ganeline a0ea197b28 updated Makefile (#153)
Cleaned up `makefile`
2023-12-20 09:24:06 -05:00
Eugene Yurtsev 74b11de9ae Update evaluators (#157)
Update to remove user warning
2023-12-19 17:30:24 -05:00
William FH c2b70436e5 Add runnable agent factory (#156)
Not sure if it's "easier" but it involves less thinking about
benchmarking abstractions
2023-12-19 13:39:08 -08:00
Eugene Yurtsev af9a9800e5 Register the new dataset (#155)
Register the new dataset
2023-12-19 15:01:38 -05:00
Eugene Yurtsev e7bac2cbb8 Change multiverse math to multiverse math (tiny) and add another multiverse math set (#154)
* This PR adds a multiverse math consisting of 20 questions.
* Question about rounding has been removed to simplify evaluation.
2023-12-19 14:57:37 -05:00
Eugene Yurtsev d595394243 Update Math Evaluator (#152)
Try another evaluator that ignores the question
2023-12-19 13:52:13 -05:00
William FH 27efb7b53c Add Gemini (#151) 2023-12-18 20:27:59 -08:00
William FH 0c1fe17417 Add to toc (#149) 2023-12-18 18:10:41 -08:00
William FH 3f308e7ae4 Update Benchmark (#148)
- Ran all benchmarks again
- Add options to overwrite or archive existing test runs
- Updated some of the aggregation logic at the end
2023-12-18 17:29:03 -08:00
William FH c85a17bac2 Include assistant factory in benchmark all (#147) 2023-12-18 13:42:36 -08:00
Eugene Yurtsev a91672f619 Update notebooks (#146)
Update notebooks for tool usage

- Use task.get_eval_config()
- Add benchmark all to introduction
2023-12-18 12:03:20 -05:00
Eugene Yurtsev 81daa09d05 Update example in multiverse math (#145)
Update example
2023-12-18 11:19:40 -05:00
William FH 07be2e4555 OAI Assistant (#144)
Similar to our functions agent
2023-12-17 10:35:10 -08:00
Eugene Yurtsev 4a642d576a Update openai function factory, update benchmark all (#143)
* Update open ai agent factory to be consistent with other factories.
* Update benchmark all to add anthropic sdk.
2023-12-15 09:44:05 -05:00
William FH 8ee7108302 Run w/o langsmith (#137) 2023-12-14 21:09:49 -08:00
William FH a9461af96f Parser Fix (#142)
Needs to be the tool parser not the functions parser
2023-12-14 21:09:38 -08:00
William FH 4d42a32342 🐶 (#136) 2023-12-14 21:07:42 -08:00
Eugene Yurtsev 21add2715b Add anthropic agent based on tool user repo (#139)
For benchmarking, this is comparing against anthropic implementation.
2023-12-14 22:21:55 -05:00
William FH 3ded353c5a Fix openai output parser used (#138) 2023-12-14 18:19:41 -08:00
William FH b619226480 Add Anyscale Model (#135) 2023-12-14 15:32:16 -08:00
Eugene Yurtsev 612f9346c5 Update benchmark all notebook (#134)
Benchmark all
2023-12-14 16:33:07 -05:00
Eugene Yurtsev 90bec45008 Version 0.0.10 (#133)
Bump requirement on minimal langsmith client
2023-12-14 13:31:13 -05:00
Eugene Yurtsev 5157e30fe7 Update min langsmith client (#132)
Update min langsmith client
2023-12-14 13:29:57 -05:00
36 changed files with 2697 additions and 769 deletions
+1 -1
View File
@@ -114,7 +114,7 @@ jobs:
shell: bash
run: |
echo "Attempting to build docs..."
make build_docs
make docs_build
test_datasets:
timeout-minutes: 5
runs-on: ubuntu-latest
+1 -1
View File
@@ -34,7 +34,7 @@ jobs:
- name: Sphinx build
shell: bash
run: |
make build_docs
make docs_build
- name: Publish Docs
uses: peaceiris/actions-gh-pages@v3
with:
+41 -40
View File
@@ -3,32 +3,7 @@
# Default target executed when no arguments are given to make.
all: help
######################
# TESTING AND COVERAGE
######################
# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/
test:
poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE)
test_watch:
poetry run ptw . -- $(TEST_FILE)
build_docs:
# Copy README.md to docs/index.md
cp README.md ./docs/source/index.md
# Append to the table of contents the contents of the file
cat ./docs/source/toc.segment >> ./docs/source/index.md
poetry run sphinx-build "./docs/source" "./docs/build"
clean_docs:
rm -rf ./docs/build
######################
# LINTING AND FORMATTING
######################
# LINTING AND FORMATTING:
# Define a variable for Python and notebook files.
lint format: PYTHON_FILES=.
@@ -48,19 +23,45 @@ spell_check:
spell_fix:
poetry run codespell --toml pyproject.toml -w
######################
# HELP
######################
# TESTING AND COVERAGE:
# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/
test:
poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE)
test_watch:
poetry run ptw . -- $(TEST_FILE)
# DOCUMENTATION:
docs_clean:
rm -rf ./docs/build
docs_build:
# Copy README.md to docs/index.md
cp README.md ./docs/source/index.md
# Append to the table of contents the contents of the file
cat ./docs/source/toc.segment >> ./docs/source/index.md
poetry run sphinx-build "./docs/source" "./docs/build"
# HELP:
help:
@echo '===================='
@echo '-- LINTING --'
@echo 'format - run code formatters'
@echo 'lint - run linters'
@echo 'spell_check - run codespell on the project'
@echo 'spell_fix - run codespell on the project and fix the errors'
@echo '-- TESTS --'
@echo 'coverage - run unit tests and generate coverage report'
@echo 'test - run unit tests'
@echo 'test TEST_FILE=<test_file> - run all tests in file'
@echo '-- DOCUMENTATION tasks are from the top-level Makefile --'
@echo ''
@echo 'LINTING:'
@echo ' format - run code formatters'
@echo ' lint - run linters'
@echo ' spell_check - run codespell'
@echo ' spell_fix - run codespell and fix the errors'
@echo 'TESTS:'
@echo ' test - run unit tests'
@echo ' test TEST_FILE=<test_file> - run tests in <test_file>'
@echo ' coverage - run unit tests and generate coverage report'
@echo 'DOCUMENTATION:'
@echo ' docs_clean - delete the docs/build directory'
@echo ' docs_build - build the documentation'
@echo ''
View File
View File
+223 -222
View File
@@ -1,225 +1,226 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "033684fb-65b2-4586-a959-68c614741ca2",
"metadata": {},
"source": [
"# Datasets\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain-benchmarks/blob/main/docs/source/notebooks/datasets.ipynb)\n",
"\n",
"Here, we'll see how to work with LangSmith datasets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -U langchain-benchmarks"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6d272fbf-710e-4a49-a0da-67e010541905",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks import clone_public_dataset, download_public_dataset"
]
},
{
"cell_type": "markdown",
"id": "18ee0f96-e5c4-4ae9-aebf-7d8b88c51662",
"metadata": {},
"source": [
"Let's first download the dataset to the local file system"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "58b94f6d-0c91-4361-9b22-f758ffaa150a",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching examples...\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a2fad8c0c3549ec96a3b38fe8a002b0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/21 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done fetching examples.\n"
]
}
],
"source": [
"download_public_dataset(\n",
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "841db832-b0d3-4fd1-8531-1154ec9b3caa",
"metadata": {},
"source": [
"we can take a look at the first two examples"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "664e90fc-af84-4c5f-a3dd-5d9ffe649650",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" {\n",
" \"created_at\": \"2023-11-15T15:26:53.511629\",\n",
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
" \"id\": \"0703a989-2693-4039-a1f6-7281fc1b4cb0\",\n",
" \"inputs\": {\n",
" \"question\": \"do bob and alice live in the same city?\"\n",
" },\n",
" \"modified_at\": \"2023-11-15T15:26:53.511629\",\n",
" \"outputs\": {\n",
" \"expected_steps\": [\n",
" \"find_users_by_name\",\n",
" \"get_user_location\",\n",
" \"get_city_for_location\",\n",
" \"get_user_location\",\n",
" \"get_city_for_location\"\n",
" ],\n",
" \"order_matters\": false,\n",
" \"reference\": \"no\"\n",
" },\n",
" \"runs\": []\n",
" },\n",
" {\n",
" \"created_at\": \"2023-11-15T15:26:53.491359\",\n",
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
" \"id\": \"b258b95a-9524-4da7-b758-c5481109322d\",\n",
" \"inputs\": {\n",
" \"question\": \"Is it likely that Donna is outside with an umbrella at this time?\"\n",
" },\n",
" \"modified_at\": \"2023-11-15T15:26:53.491359\",\n",
" \"outputs\": {\n",
" \"expected_steps\": [\n",
" \"find_users_by_name\",\n",
" \"get_user_location\",\n",
" \"get_current_time_for_location\",\n",
" \"get_current_weather_for_location\"\n",
" ],\n",
" \"order_matters\": false,\n",
" \"reference\": \"yes\"\n",
" },\n",
" \"runs\": []\n",
" }\n",
"]\n"
]
}
],
"source": [
"import json\n",
"\n",
"with open(\"./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json\", \"r\", encoding=\"utf-8\") as f:\n",
" print(json.dumps(json.load(f)[:2], indent=2, sort_keys=True))"
]
},
{
"cell_type": "markdown",
"id": "2c6cf01f-466b-406d-b4c7-2395747780fd",
"metadata": {},
"source": [
"We can also clone the dataset to our local tenant"
]
},
{
"cell_type": "markdown",
"id": "e4dea4df-2f1c-436b-a71c-49ffb2295ccc",
"metadata": {},
"source": [
"Executing this command will clone the dataset to your own LangSmith tenant. \n",
"For this to work you must have a [LangSmith account](https://smith.langchain.com/) set up."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Get from https://smith.langchain.com/settings\n",
"os.environ[\"LANGCHAIN_API_KEY\"] = \"ls_...\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18d0b905-2a6a-4752-a7cb-8653bd9049e3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"clone_public_dataset(\n",
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\",\n",
" dataset_name=\"Agent Dataset\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
"cells": [
{
"cell_type": "markdown",
"id": "033684fb-65b2-4586-a959-68c614741ca2",
"metadata": {},
"source": [
"# Datasets\n",
"\n",
"Here, we'll see how to work with LangSmith datasets."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "474292e6",
"metadata": {},
"outputs": [],
"source": [
"%pip install -U langchain-benchmarks"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6d272fbf-710e-4a49-a0da-67e010541905",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks import clone_public_dataset, download_public_dataset"
]
},
{
"cell_type": "markdown",
"id": "18ee0f96-e5c4-4ae9-aebf-7d8b88c51662",
"metadata": {},
"source": [
"Let's first download the dataset to the local file system"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "58b94f6d-0c91-4361-9b22-f758ffaa150a",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching examples...\n"
]
},
"nbformat": 4,
"nbformat_minor": 5
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a2fad8c0c3549ec96a3b38fe8a002b0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/21 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done fetching examples.\n"
]
}
],
"source": [
"download_public_dataset(\n",
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "841db832-b0d3-4fd1-8531-1154ec9b3caa",
"metadata": {},
"source": [
"We can take a look at the first two examples"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "664e90fc-af84-4c5f-a3dd-5d9ffe649650",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" {\n",
" \"created_at\": \"2023-11-15T15:26:53.511629\",\n",
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
" \"id\": \"0703a989-2693-4039-a1f6-7281fc1b4cb0\",\n",
" \"inputs\": {\n",
" \"question\": \"do bob and alice live in the same city?\"\n",
" },\n",
" \"modified_at\": \"2023-11-15T15:26:53.511629\",\n",
" \"outputs\": {\n",
" \"expected_steps\": [\n",
" \"find_users_by_name\",\n",
" \"get_user_location\",\n",
" \"get_city_for_location\",\n",
" \"get_user_location\",\n",
" \"get_city_for_location\"\n",
" ],\n",
" \"order_matters\": false,\n",
" \"reference\": \"no\"\n",
" },\n",
" \"runs\": []\n",
" },\n",
" {\n",
" \"created_at\": \"2023-11-15T15:26:53.491359\",\n",
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
" \"id\": \"b258b95a-9524-4da7-b758-c5481109322d\",\n",
" \"inputs\": {\n",
" \"question\": \"Is it likely that Donna is outside with an umbrella at this time?\"\n",
" },\n",
" \"modified_at\": \"2023-11-15T15:26:53.491359\",\n",
" \"outputs\": {\n",
" \"expected_steps\": [\n",
" \"find_users_by_name\",\n",
" \"get_user_location\",\n",
" \"get_current_time_for_location\",\n",
" \"get_current_weather_for_location\"\n",
" ],\n",
" \"order_matters\": false,\n",
" \"reference\": \"yes\"\n",
" },\n",
" \"runs\": []\n",
" }\n",
"]\n"
]
}
],
"source": [
"import json\n",
"\n",
"with open(\"./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json\", \"r\", encoding=\"utf-8\") as f:\n",
" print(json.dumps(json.load(f)[:2], indent=2, sort_keys=True))"
]
},
{
"cell_type": "markdown",
"id": "2c6cf01f-466b-406d-b4c7-2395747780fd",
"metadata": {},
"source": [
"We can also clone the dataset to our local tenant"
]
},
{
"cell_type": "markdown",
"id": "e4dea4df-2f1c-436b-a71c-49ffb2295ccc",
"metadata": {},
"source": [
"Executing this command will clone the dataset to your own LangSmith tenant. \n",
"For this to work you must have a [LangSmith account](https://smith.langchain.com/) set up."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7eb38ea6",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Get from https://smith.langchain.com/settings\n",
"os.environ[\"LANGCHAIN_API_KEY\"] = \"ls_...\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18d0b905-2a6a-4752-a7cb-8653bd9049e3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"clone_public_dataset(\n",
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\",\n",
" dataset_name=\"Agent Dataset\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -661,7 +661,7 @@
},
"outputs": [],
"source": [
"from typing import Any, Dict, List, Type\n",
"from typing import Any, Dict, Type\n",
"\n",
"from langchain.chat_models import ChatAnthropic\n",
"from langchain.output_parsers.xml import XMLOutputParser\n",
@@ -1123,7 +1123,7 @@
},
"outputs": [],
"source": [
"from typing import Any, Dict, List, Type\n",
"from typing import Any, Dict, Type\n",
"\n",
"from langchain.chat_models import ChatAnthropic\n",
"from langchain.output_parsers.xml import XMLOutputParser\n",
@@ -1602,7 +1602,6 @@
"\n",
"from langchain.chat_models import ChatFireworks\n",
"from langchain.output_parsers.json import parse_json_markdown\n",
"from langchain.schema.output_parser import StrOutputParser\n",
"\n",
"llama_prompt = ChatPromptTemplate.from_messages(\n",
" [\n",
@@ -1996,8 +1995,6 @@
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = (\n",
" test_run.to_dataframe()\n",
" .join(claude_test_run.to_dataframe(), rsuffix=\"_claude\")\n",
@@ -688,8 +688,6 @@
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = test_run.to_dataframe().join(claude_test_run.to_dataframe(), rsuffix=\"_claude\")"
]
},
@@ -286,7 +286,7 @@
")\n",
"\n",
"vectorstore = Chroma(\n",
" collection_name=f\"lcbm-b-huggingface-gte-base\",\n",
" collection_name=\"lcbm-b-huggingface-gte-base\",\n",
" embedding_function=embeddings,\n",
" persist_directory=\"./chromadb\",\n",
")\n",
@@ -412,8 +412,6 @@
}
],
"source": [
"from functools import partial\n",
"\n",
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.rag import get_eval_config\n",
@@ -118,8 +118,6 @@
"metadata": {},
"outputs": [],
"source": [
"from langchain.callbacks.manager import CallbackManager\n",
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
"from langchain.chat_models import ChatFireworks, ChatOpenAI\n",
"from langchain.document_loaders import PyPDFLoader\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
@@ -0,0 +1,600 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "60bb467d-861d-4b07-a48d-8e5aa177c969",
"metadata": {
"tags": []
},
"source": [
"# Running Locally\n",
"\n",
"The LangChain benchmarks package is best used with LangSmith. You can create a free account [here](https://smith.langchain.com/) and read the [docs here](https://docs.smith.langchain.com/).\n",
"\n",
"\n",
"If you are unable to make an account, you can still run these benchmarks locally without an account.\n",
"\n",
"Below is an example."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a00a1a5f-43ef-4445-a792-8bf6a5f74643",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Prove that we can run without LangSmith\n",
"import os\n",
"\n",
"_ = [\n",
" os.environ.pop(key)\n",
" for key in list(os.environ.keys())\n",
" if key.startswith(\"LANGCHAIN_\")\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<tbody>\n",
"<tr><td>Name </td><td>Multiverse Math </td></tr>\n",
"<tr><td>Type </td><td>ToolUsageTask </td></tr>\n",
"<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d\" target=\"_blank\" rel=\"noopener\">594f9f60-30a0-49bf-b075-f44beabf546a</a></td></tr>\n",
"<tr><td>Description</td><td>An environment that contains a few basic math operations, but with altered results.\n",
"\n",
"For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n",
"\n",
"The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math. </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x137b70360>, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain_benchmarks import registry\n",
"\n",
"task = registry[\"Multiverse Math\"]\n",
"task"
]
},
{
"cell_type": "markdown",
"id": "3821e4b0-8e67-418a-840c-470fcde42df0",
"metadata": {},
"source": [
"## Eval\n",
"\n",
"Let's evaluate an agent now. Nothing will be saved to LangSmith, so be sure to save the test results to your file system if you want to use them later."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bb6a27e067fa4887beaa78a28d8d431d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Running Evaluation: 0%| | 0/10 [00:00<?, ?example/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<h3>Experiment Results:</h3>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>inputs.question</th>\n",
" <th>outputs.input</th>\n",
" <th>outputs.output</th>\n",
" <th>outputs.intermediate_steps</th>\n",
" <th>feedback.Intermediate steps correctness</th>\n",
" <th>feedback.# steps / # expected steps</th>\n",
" <th>feedback.correctness</th>\n",
" <th>error</th>\n",
" <th>execution_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>10.0</td>\n",
" <td>10.0</td>\n",
" <td>10.0</td>\n",
" <td>0</td>\n",
" <td>10.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>multiply the result of (log of 100 to base 10)...</td>\n",
" <td>multiply the result of (log of 100 to base 10)...</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>1.453172</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.496547</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.763208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.963885</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>1.593439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>1.870549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>1.957470</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" inputs.question \\\n",
"count 10 \n",
"unique 10 \n",
"top multiply the result of (log of 100 to base 10)... \n",
"freq 1 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" outputs.input outputs.output \\\n",
"count 10 10 \n",
"unique 10 1 \n",
"top multiply the result of (log of 100 to base 10)... \n",
"freq 1 10 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" outputs.intermediate_steps feedback.Intermediate steps correctness \\\n",
"count 10 10.0 \n",
"unique 1 NaN \n",
"top [] NaN \n",
"freq 10 NaN \n",
"mean NaN 0.0 \n",
"std NaN 0.0 \n",
"min NaN 0.0 \n",
"25% NaN 0.0 \n",
"50% NaN 0.0 \n",
"75% NaN 0.0 \n",
"max NaN 0.0 \n",
"\n",
" feedback.# steps / # expected steps feedback.correctness error \\\n",
"count 10.0 10.0 0 \n",
"unique NaN NaN 0 \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
"mean 0.0 0.0 NaN \n",
"std 0.0 0.0 NaN \n",
"min 0.0 0.0 NaN \n",
"25% 0.0 0.0 NaN \n",
"50% 0.0 0.0 NaN \n",
"75% 0.0 0.0 NaN \n",
"max 0.0 0.0 NaN \n",
"\n",
" execution_time \n",
"count 10.000000 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
"mean 1.453172 \n",
"std 0.496547 \n",
"min 0.763208 \n",
"25% 0.963885 \n",
"50% 1.593439 \n",
"75% 1.870549 \n",
"max 1.957470 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import uuid\n",
"\n",
"from langchain_benchmarks.tool_usage import agents, get_eval_config\n",
"from langchain_benchmarks.utils import run_without_langsmith\n",
"\n",
"experiment_uuid = uuid.uuid4().hex[:4]\n",
"\n",
"\n",
"models = [\"gpt-3.5-turbo-1106\"]\n",
"\n",
"for model in models:\n",
" print()\n",
" eval_config = get_eval_config(output_evaluation=\"qa_math\")\n",
" agent_factory = agents.OpenAIAgentFactory(task, model=model)\n",
" test_run = run_without_langsmith(\n",
" # This will clone the dataset locally if not already there\n",
" path_or_token_id=task.dataset_id,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=eval_config,\n",
" verbose=True,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "da3015b0-61b2-4748-ab0f-a0239bb74d58",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>inputs.question</th>\n",
" <th>outputs.input</th>\n",
" <th>outputs.output</th>\n",
" <th>outputs.intermediate_steps</th>\n",
" <th>feedback.Intermediate steps correctness</th>\n",
" <th>feedback.# steps / # expected steps</th>\n",
" <th>feedback.correctness</th>\n",
" <th>error</th>\n",
" <th>execution_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>27c44572-6c67-4129-a95a-fe1509c350be</th>\n",
" <td>multiply the result of (log of 100 to base 10)...</td>\n",
" <td>multiply the result of (log of 100 to base 10)...</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>0.763208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2a20a13d-050e-4a16-84ff-22d9582f1449</th>\n",
" <td>after calculating the sin of 1.5 radians, divi...</td>\n",
" <td>after calculating the sin of 1.5 radians, divi...</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>1.413695</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67867526-791a-452f-b534-ef2c1f5efd20</th>\n",
" <td>ecoli divides every 20 minutes. How many cells...</td>\n",
" <td>ecoli divides every 20 minutes. How many cells...</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>1.773183</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4ac33c1a-62f0-4da4-9455-07b582f6ff52</th>\n",
" <td>calculate 101 to the power of 0.5 to 4 digits ...</td>\n",
" <td>calculate 101 to the power of 0.5 to 4 digits ...</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>1.819677</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2e82a924-8382-425e-8738-daa2d912e9fe</th>\n",
" <td>convert 15 degrees to radians</td>\n",
" <td>convert 15 degrees to radians</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>1.957470</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" inputs.question \\\n",
"27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n",
"2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n",
"67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n",
"4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n",
"2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n",
"\n",
" outputs.input \\\n",
"27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n",
"2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n",
"67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n",
"4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n",
"2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n",
"\n",
" outputs.output \\\n",
"27c44572-6c67-4129-a95a-fe1509c350be \n",
"2a20a13d-050e-4a16-84ff-22d9582f1449 \n",
"67867526-791a-452f-b534-ef2c1f5efd20 \n",
"4ac33c1a-62f0-4da4-9455-07b582f6ff52 \n",
"2e82a924-8382-425e-8738-daa2d912e9fe \n",
"\n",
" outputs.intermediate_steps \\\n",
"27c44572-6c67-4129-a95a-fe1509c350be [] \n",
"2a20a13d-050e-4a16-84ff-22d9582f1449 [] \n",
"67867526-791a-452f-b534-ef2c1f5efd20 [] \n",
"4ac33c1a-62f0-4da4-9455-07b582f6ff52 [] \n",
"2e82a924-8382-425e-8738-daa2d912e9fe [] \n",
"\n",
" feedback.Intermediate steps correctness \\\n",
"27c44572-6c67-4129-a95a-fe1509c350be 0 \n",
"2a20a13d-050e-4a16-84ff-22d9582f1449 0 \n",
"67867526-791a-452f-b534-ef2c1f5efd20 0 \n",
"4ac33c1a-62f0-4da4-9455-07b582f6ff52 0 \n",
"2e82a924-8382-425e-8738-daa2d912e9fe 0 \n",
"\n",
" feedback.# steps / # expected steps \\\n",
"27c44572-6c67-4129-a95a-fe1509c350be 0.0 \n",
"2a20a13d-050e-4a16-84ff-22d9582f1449 0.0 \n",
"67867526-791a-452f-b534-ef2c1f5efd20 0.0 \n",
"4ac33c1a-62f0-4da4-9455-07b582f6ff52 0.0 \n",
"2e82a924-8382-425e-8738-daa2d912e9fe 0.0 \n",
"\n",
" feedback.correctness error \\\n",
"27c44572-6c67-4129-a95a-fe1509c350be 0 None \n",
"2a20a13d-050e-4a16-84ff-22d9582f1449 0 None \n",
"67867526-791a-452f-b534-ef2c1f5efd20 0 None \n",
"4ac33c1a-62f0-4da4-9455-07b582f6ff52 0 None \n",
"2e82a924-8382-425e-8738-daa2d912e9fe 0 None \n",
"\n",
" execution_time \n",
"27c44572-6c67-4129-a95a-fe1509c350be 0.763208 \n",
"2a20a13d-050e-4a16-84ff-22d9582f1449 1.413695 \n",
"67867526-791a-452f-b534-ef2c1f5efd20 1.773183 \n",
"4ac33c1a-62f0-4da4-9455-07b582f6ff52 1.819677 \n",
"2e82a924-8382-425e-8738-daa2d912e9fe 1.957470 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# You can interact with the object directly or as a flattened dataframe\n",
"df = test_run.to_dataframe()\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1bf4ea77-147f-4687-a2c6-7528a6eba08d",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"output.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
File diff suppressed because one or more lines are too long
+375 -159
View File
@@ -2,8 +2,10 @@
"cells": [
{
"cell_type": "markdown",
"id": "6728b05f-e3bb-487a-8818-e0d5d18b5501",
"metadata": {},
"id": "1c9df2ed-3496-45c6-8b1b-e12776a02a0f",
"metadata": {
"tags": []
},
"source": [
"# Introduction\n",
"\n",
@@ -17,42 +19,11 @@
"* Can the agent use more than 10 tools effectively?\n",
"* Can the agent correctly incorporate information returned by the tool (and ignore internal knowledge)?\n",
"\n",
"To help in this evaluation, each task is associated with a LangSmith dataset that includes input/output examples of varying difficulties."
]
},
{
"cell_type": "markdown",
"id": "e274faca-26fc-470b-8485-5a81b83e2c54",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "markdown",
"id": "cbe7a63b-04f3-4121-9fe6-5ce772527e85",
"metadata": {},
"source": [
"How does one evaluate an agent? Given a particular task and input, an agent uses tools to produce an output AND/OR change the state of the environment.\n",
"To help in this evaluation, each task is associated with a LangSmith dataset that includes input/output examples of varying difficulties.\n",
"\n",
"To evaluate an agent, we can check the following:\n",
"\n",
"1. Did the agent use the expected tools?\n",
"2. Did the agent use the tools in the most effective way; e.g., was the order of tool invocation correct?\n",
"3. Did the environment end up in the correct final state after the agent used the tools? (e.g., does my calendar contain all the scheduled meetings?)\n",
"4. Did the agent output match the expected reference output?"
]
},
{
"cell_type": "markdown",
"id": "34bb4fb7-085a-4f8f-a670-3ad7b479d8b4",
"metadata": {
"tags": []
},
"source": [
"## Schema\n",
"\n",
"To make it possible to evaluate different agent implementations, we're using a standardized schema, we'll illustrate it with the following example taken from tool usage:\n",
"To make it possible to evaluate different agent implementations, we're using a standardized schema. We'll illustrate it with the following example taken from tool usage.\n",
"\n",
"### Dataset\n",
"\n",
@@ -105,32 +76,7 @@
" \"intermediate_steps\": [... \"find_locations_by_name\" ...], // list of the intermediate steps taken by the agent (see format in LangChain)\n",
" \"state\": .., // Can be anything, this is the state of the environment after the agent has taken all of its actions (optional key)\n",
"}\n",
"```\n",
"\n",
"## Standard Evaluator\n",
"\n",
"This task is associated with a standard evaluator that can be used to benchmark different aspects of tool usage.\n",
"\n",
"Specifically:\n",
"\n",
"1. Use an LLM to grade Compare output to reference using an LLM that grades the response.\n",
"2. Compare equality of expected_steps to the list of tools in intermediate_steps -- simple list equality\n",
"3. Compare the state of the environment against expected state (if present in the dataset and in the agent)\n",
"4. It does not use `order_matters` at the moment"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5af4134d-5c96-472c-b575-21f9be46e02d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks.tool_usage import get_eval_config\n",
"\n",
"run_eval_config = get_eval_config()"
"```"
]
},
{
@@ -199,7 +145,7 @@
"</table>"
],
"text/plain": [
"Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x7f8f5f01a520>, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \"), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x7f8f5f01aa20>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\"), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=<function get_environment at 0x7f8f5f01a020>, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\"), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x7f8f5f019a80>, instructions='You are requested to solve math questions in an alternate mathematical universe. 
The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.')])"
"Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x12778be20>, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x1277c0360>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=<function get_environment at 0x12778b920>, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. 
The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x12778b240>, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})])"
]
},
"execution_count": 2,
@@ -252,7 +198,7 @@
"</table>"
],
"text/plain": [
"ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x7f8f5f01aa20>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\")"
"ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x1277c0360>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})"
]
},
"execution_count": 3,
@@ -311,10 +257,10 @@
{
"data": {
"text/plain": [
"[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=<class 'pydantic.v1.main.aSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f8f5eba2980>),\n",
" StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=<class 'pydantic.v1.main.bSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f8f5eba2a20>),\n",
" StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=<class 'pydantic.v1.main.cSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f8f5eba2ac0>),\n",
" StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=<class 'pydantic.v1.main.dSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f8f5eba2b60>)]"
"[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=<class 'pydantic.v1.main.aSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x1277c18a0>),\n",
" StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=<class 'pydantic.v1.main.bSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x1277c13a0>),\n",
" StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=<class 'pydantic.v1.main.cSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x1277c19e0>),\n",
" StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=<class 'pydantic.v1.main.dSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x1277c1800>)]"
]
},
"execution_count": 4,
@@ -380,26 +326,354 @@
"id": "8d39b9b3-d4da-49bc-b3db-8a4165b1db55",
"metadata": {},
"source": [
"## Agent Factory\n",
"## Creating an agent\n",
"\n",
"For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n",
"So now that you know how the test environment works, it's time to define an agent! \n",
"\n",
"The `AgentExecutor` should accept `question` as an input and include the fields `output`, `intermediate_steps` and potentially `state` in its response -- for this we\n",
"will wrap the agent executor in an adapter (`apply_agent_executor_adapter`) that will help match the expected schema.\n",
"\n",
"Please reference the LangChain documentation to see how to [use and implement agents](https://python.langchain.com/docs/modules/agents/)"
"We will follow the example in the LangChain documentation to [define an OpenAI tool-using agent](https://python.langchain.com/docs/modules/agents/). "
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 26,
"id": "8827186a-8ed3-43c7-956c-71342e0a7bf2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.agents.format_scratchpad.openai_tools import (\n",
" format_to_openai_tool_messages,\n",
")\n",
"from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser\n",
"from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
"from langchain.tools.render import (\n",
" format_tool_to_openai_function,\n",
" format_tool_to_openai_tool,\n",
")\n",
"from langchain_community.chat_models import ChatOpenAI\n",
"from langchain_core.runnables import RunnableParallel\n",
"\n",
"tools = task.create_environment().tools\n",
"formatted_tools = [format_tool_to_openai_tool(t) for t in tools]\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-1106\", temperature=0, model_kwargs={\"seed\": 42})\n",
"# Compose the llm call with the tools' JSON schemas\n",
"llm_with_tools = llm.bind(tools=formatted_tools)\n",
"format_inputs = RunnableParallel(\n",
" {\n",
" \"input\": lambda x: x[\"input\"],\n",
" \"agent_scratchpad\": lambda x: format_to_openai_tool_messages(\n",
" x[\"intermediate_steps\"]\n",
" ),\n",
" }\n",
")\n",
"\n",
"prompt = ChatPromptTemplate.from_messages(\n",
" [\n",
" (\n",
" \"system\",\n",
" \"You are very powerful assistant, but bad at calculating lengths of words.\",\n",
" ),\n",
" (\"user\", \"{input}\"),\n",
" MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n",
" ]\n",
")\n",
"agent_definition = (\n",
" # Input to this pipeline is a dictionary with \"input\" and \"intermediate_steps\" keys\n",
" format_inputs | prompt | llm_with_tools | OpenAIToolsAgentOutputParser()\n",
")"
]
},
{
"cell_type": "markdown",
"id": "7614ab73-dc66-4f2e-9eeb-ff1711c113d0",
"metadata": {},
"source": [
"### Agent Factory\n",
"\n",
"As discussed above, each test environment tracks state. We want to create a new environment for each data point to avoid cross-contamination between rows in the dataset.\n",
"\n",
"We do this by defining an agent factory. Below, we integrate our agent into a `CustomRunnableAgentFactory`, which helps create the environment and agent executor for each data point."
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "629416b3-b5d6-45ad-9bda-4f0642a0eb13",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks.tool_usage.agents import CustomRunnableAgentFactory\n",
"\n",
"agent_factory = CustomRunnableAgentFactory(task, agent=agent_definition)"
]
},
{
"cell_type": "markdown",
"id": "7f06cf25-6766-4ea5-a566-36af045bdcf4",
"metadata": {},
"source": [
"Let's check that the agent works"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "755f7920-831b-4595-8c6d-cca22c935198",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain import globals\n",
"\n",
"globals.set_verbose(True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2804eae-5b0b-4a38-9dff-363a4fe8f324",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"agent = agent_factory()\n",
"agent.invoke({\"question\": \"abc\"})"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "2aa68a11-d268-4868-a862-309801201989",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"globals.set_verbose(False)"
]
},
{
"cell_type": "markdown",
"id": "e3bce984-7c9c-4f6e-a51b-01c3e2b6e00a",
"metadata": {},
"source": [
"## Benchmarking\n",
"\n",
"How does one evaluate an agent? Given a particular task and input, an agent uses tools to produce an output AND/OR change the state of the environment.\n",
"\n",
"To evaluate an agent, we can check the following:\n",
"\n",
"1. Did the agent use the expected tools?\n",
"2. Did the agent use the tools in the most effective way; e.g., was the order of tool invocation correct?\n",
"3. Did the environment end up in the correct final state after the agent used the tools? (e.g., does my calendar contain all the scheduled meetings?)\n",
"4. Did the agent output match the expected reference output?\n",
"\n",
"Each task is associated with a standard evaluator that performs an evaluation appropriate for the task; for example:\n",
"\n",
"1. Compare the output to the reference using an LLM that grades the response.\n",
"2. Compare equality of expected_steps to the list of tools in intermediate_steps -- simple list equality\n",
"3. Compare the state of the environment against expected state (if present in the dataset and in the agent)"
]
},
{
"cell_type": "markdown",
"id": "5e9e5817-3b9d-4a1e-8ee8-692d39aa68ca",
"metadata": {},
"source": [
"This evaluator will be used below when we benchmark on all tasks!"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "c88bd6e1-f77e-4668-a143-096929e897ee",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"RunEvalConfig(evaluators=[], custom_evaluators=[<langchain_benchmarks.tool_usage.evaluators.AgentTrajectoryEvaluator object at 0x15699ed10>], reference_key=None, prediction_key=None, input_key=None, eval_llm=None)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eval_config = task.get_eval_config()\n",
"eval_config"
]
},
{
"cell_type": "markdown",
"id": "044c7f91-9bb3-44b5-802d-f9f444ddeff9",
"metadata": {},
"source": [
"Set up code to run against all tasks"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "60466447-eb37-4204-a497-fe47e8d8dd70",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import datetime\n",
"\n",
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks import (\n",
" __version__,\n",
" clone_public_dataset,\n",
" model_registry,\n",
" registry,\n",
")\n",
"from langchain_benchmarks.rate_limiting import RateLimiter\n",
"from langchain_benchmarks.tool_usage.agents import (\n",
" AnthropicToolUserFactory,\n",
" CustomAgentFactory,\n",
" CustomRunnableAgentFactory,\n",
" OpenAIAgentFactory,\n",
" OpenAIAssistantFactory,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c448d139-9923-4cf6-af49-cbf3dff46bdc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import uuid\n",
"\n",
"experiment_uuid = uuid.uuid4().hex[:]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "200df769-4dd9-453b-8500-219c1d5305f6",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"tests = [\n",
" # 2-tuple of (architecture, model name)\n",
" (\"openai_functions\", \"gpt-3.5-turbo-1106\"), # Requires OpenAI Creds\n",
" (\"openai_functions\", \"gpt-3.5-turbo-0613\"),\n",
" (\"openai_functions\", \"gpt-4-1106-preview\"),\n",
" (\"openai_functions\", \"gpt-4-0613\"),\n",
" (\"openai_functions\", \"mistral-7b-instruct-v0.1\"), # Requires AnyScale creds\n",
" # Requires Anthropic Creds and Setting up Anthropics Tool Usage package.\n",
" # (\n",
" # \"anthropic_tool_user\",\n",
" # \"claude-2.1\",\n",
" # ),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ddf7355-7db9-4adc-bc1e-f04c3d0ec57d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"client = Client() # Launch langsmith client for cloning datasets\n",
"today = datetime.date.today().isoformat()\n",
"rate_limiter = RateLimiter(requests_per_second=2)\n",
"\n",
"for task in registry:\n",
" if task.type != \"ToolUsageTask\":\n",
" continue\n",
"\n",
" dataset_name = task.name\n",
" clone_public_dataset(task.dataset_id, dataset_name=dataset_name)\n",
"\n",
" for arch, model in tests:\n",
" print()\n",
" print(f\"Benchmarking {task.name} with model: {model} and arch: {arch}\")\n",
" eval_config = task.get_eval_config()\n",
"\n",
" if arch == \"openai_functions\":\n",
" agent_factory = OpenAIAgentFactory(\n",
" task, model=model, rate_limiter=rate_limiter\n",
" )\n",
" elif arch == \"custom_agent\":\n",
" agent_factory = CustomAgentFactory(\n",
" task, model=model, rate_limiter=rate_limiter\n",
" )\n",
" elif arch == \"custom_runnable_agent\":\n",
" # For this, the model would have to be a runnable object\n",
" agent_factory = CustomRunnableAgentFactory(task, agent=model)\n",
" elif arch == \"anthropic_tool_user\":\n",
" agent_factory = AnthropicToolUserFactory(task)\n",
" else:\n",
" raise ValueError()\n",
"\n",
" client.run_on_dataset(\n",
" dataset_name=dataset_name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=eval_config,\n",
" verbose=False,\n",
" project_name=f\"{model}-{task.name}-{today}-{experiment_uuid}\",\n",
" tags=[model],\n",
" concurrency_level=5,\n",
" project_metadata={\n",
" \"model\": model,\n",
" \"id\": experiment_uuid,\n",
" \"task\": task.name,\n",
" \"date\": today,\n",
" \"langchain_benchmarks_version\": __version__,\n",
" \"arch\": arch,\n",
" },\n",
" )"
]
},
{
"cell_type": "markdown",
"id": "4c0a6505-693d-46e5-9ed1-e33e0044b040",
"metadata": {},
"source": [
"## Advanced Usage\n",
"\n",
"The following sections demonstrate slightly more \"advanced\" usage if you want to completely customize the agent runtime in a way that is compatible with our test runner.\n",
"\n",
"### Custom Agent Factory\n",
"\n",
"If you want even more configurability beyond what the `CustomRunnableAgentFactory` provides, you can create your own `AgentFactory` using the following pattern.\n",
"\n",
"The `AgentExecutor` should accept `question` as an input and include the fields `output`, `intermediate_steps` and potentially `state` in its response -- for this we\n",
"will wrap the agent executor in an adapter (`apply_agent_executor_adapter`) that will help match the expected schema."
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "bca8ad69-9956-451c-b639-ea30c77d982f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.agents import AgentExecutor, AgentType, Tool, initialize_agent\n",
"from langchain.agents import AgentType, initialize_agent\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"from langchain_benchmarks.schema import ExtractionTask\n",
@@ -408,7 +682,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 19,
"id": "44839ebe-48ea-4d5b-87b4-2ad72acacb71",
"metadata": {
"tags": []
@@ -442,106 +716,48 @@
},
{
"cell_type": "code",
"execution_count": 9,
"id": "755f7920-831b-4595-8c6d-cca22c935198",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain import globals\n",
"\n",
"globals.set_verbose(True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "1b18952b-43b8-4f30-a0d9-e7763eb05b13",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"agent_factory = AgentFactory(task, model=\"gpt-3.5-turbo-1106\")"
]
},
{
"cell_type": "markdown",
"id": "c31a81e5-b3d6-42e5-895d-0c4dc8413738",
"metadata": {
"tags": []
},
"source": [
"Let's check that the agent works"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c2804eae-5b0b-4a38-9dff-363a4fe8f324",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"agent = agent_factory()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "eb0bb2bf-5f53-4f59-a73f-2144fe850d50",
"execution_count": 24,
"id": "8b6108e4-c7cc-42e8-a23d-89c7b94fab6c",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m\n",
"Invoking: `a` with `{}`\n",
"\n",
"\n",
"\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `b` with `{}`\n",
"\n",
"\n",
"\u001b[0m\u001b[33;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `c` with `{}`\n",
"\n",
"\n",
"\u001b[0m\u001b[38;5;200m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3mYou've successfully typed \"abc\"! Is there anything else you'd like to do?\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"{'input': 'abc',\n",
" 'output': 'You\\'ve successfully typed \"abc\"! Is there anything else you\\'d like to do?',\n",
" 'intermediate_steps': [(AgentActionMessageLog(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{}', 'name': 'a'}})]),\n",
"{'input': 'xypxy',\n",
" 'output': 'I have typed \"xypxy\" as you requested.',\n",
" 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n",
" 'OK'),\n",
" (AgentActionMessageLog(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{}', 'name': 'b'}})]),\n",
" (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n",
" 'OK'),\n",
" (AgentActionMessageLog(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{}', 'name': 'c'}})]),\n",
" (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'p'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'p'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"p\"\\n}', 'name': 'type_letter'}})]),\n",
" 'OK'),\n",
" (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n",
" 'OK'),\n",
" (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n",
" 'OK')],\n",
" 'state': 'abc'}"
" 'state': 'xypxy'}"
]
},
"execution_count": 12,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.invoke({\"input\": \"abc\"})"
"agent_factory = AgentFactory(task, \"gpt-4\")\n",
"agent = agent_factory()\n",
"agent.invoke({\"question\": \"xypxy\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9bdf9328-0103-48d3-8dfc-933423db9796",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -560,7 +776,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.11.2"
}
},
"nbformat": 4,
@@ -40,19 +40,19 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks import clone_public_dataset, registry"
"from langchain_benchmarks import registry"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "1aef2b32-a5df-421f-8be3-a2ef27372ece",
"metadata": {
"tags": []
@@ -75,10 +75,10 @@
"</table>"
],
"text/plain": [
"ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x7f0466942ca0>, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.')"
"ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x7f94df105a80>, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})"
]
},
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -96,27 +96,6 @@
"Clone the dataset associated with this task"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "70369f67-deb4-467a-801a-6d38c3d0460d",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset Multiverse Math already exists. Skipping.\n",
"You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0.\n"
]
}
],
"source": [
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
]
},
{
"cell_type": "markdown",
"id": "cede4edd-884d-4330-a186-5058b712394b",
@@ -129,7 +108,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "e2439d0c-ccb9-4f5b-a127-548725025a98",
"metadata": {
"tags": []
@@ -138,14 +117,14 @@
{
"data": {
"text/plain": [
"[StructuredTool(name='multiply', description='multiply(a: float, b: float) -> float - Multiply two numbers; a * b.', args_schema=<class 'pydantic.v1.main.multiplySchemaSchema'>, func=<function multiply at 0x7f04669422a0>),\n",
" StructuredTool(name='add', description='add(a: float, b: float) -> float - Add two numbers; a + b.', args_schema=<class 'pydantic.v1.main.addSchemaSchema'>, func=<function add at 0x7f04669427a0>),\n",
" StructuredTool(name='divide', description='divide(a: float, b: float) -> float - Divide two numbers; a / b.', args_schema=<class 'pydantic.v1.main.divideSchemaSchema'>, func=<function divide at 0x7f0466942700>),\n",
" StructuredTool(name='subtract', description='subtract(a: float, b: float) -> float - Subtract two numbers; a - b.', args_schema=<class 'pydantic.v1.main.subtractSchemaSchema'>, func=<function subtract at 0x7f0466942980>),\n",
" StructuredTool(name='power', description='power(a: float, b: float) -> float - Raise a number to a power; a ** b.', args_schema=<class 'pydantic.v1.main.powerSchemaSchema'>, func=<function power at 0x7f0466942a20>)]"
"[StructuredTool(name='multiply', description='multiply(a: float, b: float) -> float - Multiply two numbers; a * b.', args_schema=<class 'pydantic.v1.main.multiplySchemaSchema'>, func=<function multiply at 0x7f94e0148a40>),\n",
" StructuredTool(name='add', description='add(a: float, b: float) -> float - Add two numbers; a + b.', args_schema=<class 'pydantic.v1.main.addSchemaSchema'>, func=<function add at 0x7f94df105580>),\n",
" StructuredTool(name='divide', description='divide(a: float, b: float) -> float - Divide two numbers; a / b.', args_schema=<class 'pydantic.v1.main.divideSchemaSchema'>, func=<function divide at 0x7f94df104540>),\n",
" StructuredTool(name='subtract', description='subtract(a: float, b: float) -> float - Subtract two numbers; a - b.', args_schema=<class 'pydantic.v1.main.subtractSchemaSchema'>, func=<function subtract at 0x7f94df105760>),\n",
" StructuredTool(name='power', description='power(a: float, b: float) -> float - Raise a number to a power; a ** b.', args_schema=<class 'pydantic.v1.main.powerSchemaSchema'>, func=<function power at 0x7f94df105800>)]"
]
},
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -165,7 +144,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "f5a100bd-6e19-498f-8a36-393b5c19bcb9",
"metadata": {
"tags": []
@@ -177,7 +156,7 @@
"8.8"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -196,7 +175,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "31afb08b-17b8-4866-86c1-ee24e804415c",
"metadata": {
"tags": []
@@ -208,7 +187,7 @@
"'You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.'"
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -310,8 +289,9 @@
"\n",
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import get_eval_config\n",
"from langchain_benchmarks import clone_public_dataset\n",
"\n",
"clone_public_dataset(task.dataset_id, dataset_name=task.name)\n",
"experiment_uuid = uuid.uuid4().hex[:4]\n",
"\n",
"client = Client()\n",
@@ -320,16 +300,11 @@
"\n",
"for model in models:\n",
" print()\n",
"\n",
" # qa_math uses a custom prompt to grade the output\n",
" # The prompt guides the LLM to ignore whether the TRUE ANSWER is factually\n",
" # correct\n",
" eval_config = get_eval_config(output_evaluation=\"qa_math\")\n",
" agent_factory = agents.OpenAIAgentFactory(task, model=model)\n",
" test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=eval_config,\n",
" evaluation=task.get_eval_config(),\n",
" verbose=False,\n",
" project_name=f\"multiverse-math-{model}-{experiment_uuid}\",\n",
" tags=[model],\n",
@@ -66,22 +66,16 @@
"outputs": [],
"source": [
"import json\n",
"from copy import deepcopy\n",
"from functools import partial\n",
"from typing import Sequence, Tuple\n",
"\n",
"from langchain.agents import AgentExecutor, AgentType, Tool, initialize_agent\n",
"from langchain.agents.format_scratchpad import format_to_openai_function_messages\n",
"from langchain.agents import AgentExecutor\n",
"from langchain.agents.structured_chat.output_parser import (\n",
" AgentAction,\n",
" AgentFinish,\n",
" StructuredChatOutputParser,\n",
")\n",
"from langchain.chains.openai_functions.base import convert_to_openai_function\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.output_parsers.json import SimpleJsonOutputParser, parse_json_markdown\n",
"from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
"from langchain.schema.output_parser import StrOutputParser\n",
"from langchain.output_parsers.json import parse_json_markdown\n",
"from langchain.prompts import ChatPromptTemplate\n",
"from langchain.tools import tool\n",
"from langchain_core.runnables import RunnableLambda\n",
"\n",
@@ -353,8 +347,6 @@
"\n",
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import get_eval_config\n",
"\n",
"experiment_uuid = uuid.uuid4().hex[:4]\n",
"\n",
"client = Client()\n",
@@ -53,7 +53,7 @@
},
"outputs": [],
"source": [
"from langchain_benchmarks import clone_public_dataset, registry"
"from langchain_benchmarks import registry"
]
},
{
@@ -76,35 +76,6 @@
"task = registry[\"Tool Usage - Relational Data\"]"
]
},
{
"cell_type": "markdown",
"id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
"metadata": {},
"source": [
"Clone the dataset associaetd with this task"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "70369f67-deb4-467a-801a-6d38c3d0460d",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset Tool Usage - Relational Data already exists. Skipping.\n",
"You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826.\n"
]
}
],
"source": [
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
]
},
{
"cell_type": "markdown",
"id": "110bdafa-bdab-4194-90c9-46416d14b2f9",
@@ -217,7 +188,7 @@
},
"outputs": [],
"source": [
"from langchain.agents import AgentType, Tool, initialize_agent\n",
"from langchain.agents import AgentType, initialize_agent\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"from langchain_benchmarks.schema import ExtractionTask\n",
@@ -407,7 +378,9 @@
"\n",
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import get_eval_config\n",
"from langchain_benchmarks import clone_public_dataset\n",
"\n",
"clone_public_dataset(task.dataset_id, dataset_name=task.name) # Clone dataset\n",
"\n",
"experiment_uuid = uuid.uuid4().hex[:4]\n",
"\n",
@@ -417,12 +390,11 @@
"\n",
"for model in models:\n",
" print()\n",
" eval_config = get_eval_config()\n",
" agent_factory = AgentFactory(task, model=model)\n",
" test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=eval_config,\n",
" evaluation=task.get_eval_config(),\n",
" verbose=False,\n",
" project_name=f\"tool-usage-relational-data-{model}-{experiment_uuid}\",\n",
" tags=[model],\n",
@@ -23,19 +23,19 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 12,
"id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks import clone_public_dataset, registry"
"from langchain_benchmarks import registry"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 13,
"id": "1aef2b32-a5df-421f-8be3-a2ef27372ece",
"metadata": {
"tags": []
@@ -60,10 +60,10 @@
"</table>"
],
"text/plain": [
"ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x7f3e404877e0>, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \")"
"ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x7f1791bd2480>, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'})"
]
},
"execution_count": 2,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -73,35 +73,6 @@
"task"
]
},
{
"cell_type": "markdown",
"id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
"metadata": {},
"source": [
"Clone the dataset associaetd with this task"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "70369f67-deb4-467a-801a-6d38c3d0460d",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset Tool Usage - Typewriter (1 tool) already exists. Skipping.\n",
"You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3.\n"
]
}
],
"source": [
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
]
},
{
"cell_type": "markdown",
"id": "fc78a3e1-80da-4607-98c3-a99c2037e7ca",
@@ -118,7 +89,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"id": "64e538ae-5cf2-4cd5-a312-25ee6924e869",
"metadata": {
"tags": []
@@ -130,7 +101,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "5516a34b-1e9b-4f1e-9462-cfc4d5bc29f9",
"metadata": {
"tags": []
@@ -139,10 +110,10 @@
{
"data": {
"text/plain": [
"[StructuredTool(name='type_letter', description='type_letter(letter: str) -> str - Print the given letter on the paper.', args_schema=<class 'pydantic.v1.main.type_letterSchemaSchema'>, func=<function create_typer.<locals>.type_letter at 0x7f3e404a07c0>)]"
"[StructuredTool(name='type_letter', description='type_letter(letter: str) -> str - Print the given letter on the paper.', args_schema=<class 'pydantic.v1.main.type_letterSchemaSchema'>, func=<function create_typer.<locals>.type_letter at 0x7f1791bd3f60>)]"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -153,7 +124,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"id": "80501e1a-f1f6-4b38-8637-894503029d86",
"metadata": {
"tags": []
@@ -165,11 +136,34 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "3f352e32-fdb6-4d9e-b1c4-3d78b4f50646",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'OK'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tool.invoke({\"letter\": \"a\"})"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ec9c2e68-b55e-4087-bc1a-c38f4cfd401b",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
@@ -182,36 +176,13 @@
"output_type": "execute_result"
}
],
"source": [
"tool.invoke({\"letter\": \"a\"})"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ec9c2e68-b55e-4087-bc1a-c38f4cfd401b",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'OK'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tool.invoke({\"letter\": \"b\"})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"id": "2cc5b174-25a4-4d5a-8535-56ecea62ea81",
"metadata": {
"tags": []
@@ -223,7 +194,7 @@
"'ab'"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -246,7 +217,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "e2acab1e-78a7-4198-8e79-4529c95ce7e2",
"metadata": {
"tags": []
@@ -263,7 +234,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"id": "ceaa8edf-292b-48a1-be94-e6bfea0e75b1",
"metadata": {
"tags": []
@@ -274,16 +245,16 @@
"text/plain": [
"{'input': 'abc',\n",
" 'output': 'a, b, c',\n",
" 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"a\"\\n}', 'name': 'type_letter'}})]),\n",
" 'intermediate_steps': [(OpenAIToolAgentAction(tool='type_letter', tool_input={'letter': 'a'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'a'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_uXlSgkG7N9nBCjYPB6SZn0n4', 'function': {'arguments': '{\\n \"letter\": \"a\"\\n}', 'name': 'type_letter'}, 'type': 'function'}]})], tool_call_id='call_uXlSgkG7N9nBCjYPB6SZn0n4'),\n",
" 'OK'),\n",
" (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"b\"\\n}', 'name': 'type_letter'}})]),\n",
" (OpenAIToolAgentAction(tool='type_letter', tool_input={'letter': 'b'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'b'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_dbSJb120AxFn55XcJHR0xH1I', 'function': {'arguments': '{\\n \"letter\": \"b\"\\n}', 'name': 'type_letter'}, 'type': 'function'}]})], tool_call_id='call_dbSJb120AxFn55XcJHR0xH1I'),\n",
" 'OK'),\n",
" (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n \"letter\": \"c\"\\n}', 'name': 'type_letter'}})]),\n",
" (OpenAIToolAgentAction(tool='type_letter', tool_input={'letter': 'c'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'c'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_sFV4km9Jd9BOGO7A3oo1op0b', 'function': {'arguments': '{\\n \"letter\": \"c\"\\n}', 'name': 'type_letter'}, 'type': 'function'}]})], tool_call_id='call_sFV4km9Jd9BOGO7A3oo1op0b'),\n",
" 'OK')],\n",
" 'state': 'abc'}"
]
},
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -297,8 +268,14 @@
"id": "3821e4b0-8e67-418a-840c-470fcde42df0",
"metadata": {},
"source": [
"## Eval\n",
"\n",
"## Eval"
]
},
{
"cell_type": "markdown",
"id": "bc860fc6-89db-4929-926a-69b6320616ab",
"metadata": {},
"source": [
"Let's evaluate an agent now"
]
},
@@ -341,9 +318,11 @@
"\n",
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import get_eval_config\n",
"from langchain_benchmarks import clone_public_dataset\n",
"\n",
"experiment_uuid = uuid.uuid4().hex[:4]\n",
"clone_public_dataset(task.dataset_id, dataset_name=task.name)\n",
"\n",
"\n",
"client = Client()\n",
"\n",
@@ -352,12 +331,11 @@
"for model in models:\n",
" # Will evaluate the trajectory and state, but not the output which is meaningless for this task.\n",
" print()\n",
" eval_config = get_eval_config(output_evaluation=\"none\")\n",
" agent_factory = agents.OpenAIAgentFactory(task, model=model)\n",
" test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=eval_config,\n",
" evaluation=task.get_eval_config(),\n",
" verbose=False,\n",
" project_name=f\"typewriter-1-{model}-{experiment_uuid}\",\n",
" tags=[model],\n",
@@ -37,7 +37,7 @@
},
"outputs": [],
"source": [
"from langchain_benchmarks import clone_public_dataset, registry"
"from langchain_benchmarks import registry"
]
},
{
@@ -71,7 +71,7 @@
"</table>"
],
"text/plain": [
"ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x7f6cd20e4f40>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\")"
"ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x7f1b23b13240>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})"
]
},
"execution_count": 2,
@@ -84,35 +84,6 @@
"task"
]
},
{
"cell_type": "markdown",
"id": "bc33a639-3caf-4314-8ea7-1c7c8b1d114d",
"metadata": {},
"source": [
"Clone the dataset associaetd with this task"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "70369f67-deb4-467a-801a-6d38c3d0460d",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset Tool Usage - Typewriter (26 tools) already exists. Skipping.\n",
"You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478.\n"
]
}
],
"source": [
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
]
},
{
"cell_type": "markdown",
"id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
@@ -301,9 +272,7 @@
"source": [
"## Eval\n",
"\n",
"Let's evaluate an agent now.\n",
"\n",
"Eval code below has not been run yet."
"Let's evaluate an agent now."
]
},
{
@@ -319,7 +288,10 @@
"\n",
"from langsmith.client import Client\n",
"\n",
"from langchain_benchmarks.tool_usage import get_eval_config\n",
"from langchain_benchmarks import clone_public_dataset\n",
"\n",
"# Clone the dataset\n",
"clone_public_dataset(task.dataset_id, dataset_name=task.name)\n",
"\n",
"experiment_uuid = uuid.uuid4().hex[:4]\n",
"\n",
@@ -329,13 +301,11 @@
"\n",
"for model in models:\n",
" print()\n",
" # The eval config will evaluate the state, but not the output which is meaningless for this task.\n",
" eval_config = get_eval_config(output_evaluation=\"none\")\n",
" agent_factory = agents.OpenAIAgentFactory(task, model=model)\n",
" test_run = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=agent_factory,\n",
" evaluation=eval_config,\n",
" evaluation=task.get_eval_config(),\n",
" verbose=False,\n",
" concurrency_level=1,\n",
" project_name=f\"typewriter-26-{model}-{experiment_uuid}\",\n",
+7
View File
@@ -17,6 +17,7 @@
./notebooks/tool_usage/multiverse_math
./notebooks/tool_usage/typewriter_1
./notebooks/tool_usage/typewriter_26
./notebooks/tool_usage/benchmark_all_tasks
```
```{toctree}
@@ -42,3 +43,9 @@
./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval
./notebooks/retrieval/comparing_techniques
```
```{toctree}
:maxdepth: 2
:caption: Benchmarking Without LangSmith
./notebooks/run_without_langsmith
```
+29 -1
View File
@@ -252,7 +252,35 @@ _ANTHROPIC_MODELS = [
},
),
]
_GOOGLE_GENAI_MODELS = [
RegisteredModel(
provider="google-genai",
name="gemini-pro",
description="Gemini Pro is a large model from Google trained on a diverse set of tasks.",
type="chat",
params={
"model": "gemini-pro",
"convert_system_message_to_human": True,
},
)
]
_ANYSCALE_MODELS = [
RegisteredModel(
provider="anyscale",
name="mistral-7b-instruct-v0.1",
description="Mistral 7B model fine-tuned for function-calling.",
type="chat",
params={
"model": "mistralai/Mistral-7B-Instruct-v0.1",
},
),
]
model_registry = ModelRegistry(
registered_models=_OPEN_AI_MODELS + _FIREWORKS_MODELS + _ANTHROPIC_MODELS
registered_models=_OPEN_AI_MODELS
+ _FIREWORKS_MODELS
+ _ANYSCALE_MODELS
+ _ANTHROPIC_MODELS
+ _GOOGLE_GENAI_MODELS
)
+1
View File
@@ -21,6 +21,7 @@ registry = Registry(
type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK,
relational_data.RELATIONAL_DATA_TASK,
multiverse_math.MULTIVERSE_MATH,
multiverse_math.MULTIVERSE_MATH_TINY,
email_task.EMAIL_EXTRACTION_TASK,
chat_extraction.CHAT_EXTRACTION_TASK,
LANGCHAIN_DOCS_TASK,
+12 -2
View File
@@ -251,9 +251,9 @@ class Registry:
self.tasks.append(task)
Provider = Literal["fireworks", "openai", "anthropic"]
# Names of the model providers accepted by the registry. "google-genai" is
# listed because the default class-path and default-URL lookups in this module
# both handle that provider name.
Provider = Literal["fireworks", "openai", "anthropic", "anyscale", "google-genai"]
ModelType = Literal["chat", "llm"]
AUTHORIZED_NAMESPACES = {"langchain"}
AUTHORIZED_NAMESPACES = {"langchain", "langchain_google_genai"}
def _get_model_class_from_path(
@@ -284,9 +284,15 @@ def _get_default_path(provider: str, type_: ModelType) -> str:
paths = {
("fireworks", "chat"): "langchain.chat_models.fireworks.ChatFireworks",
("fireworks", "llm"): "langchain.llms.fireworks.Fireworks",
("anyscale", "chat"): "langchain.chat_models.anyscale.ChatAnyscale",
("anyscale", "llm"): "langchain.llms.anyscale.Anyscale",
("openai", "chat"): "langchain.chat_models.openai.ChatOpenAI",
("openai", "llm"): "langchain.llms.openai.OpenAI",
("anthropic", "chat"): "langchain.chat_models.anthropic.ChatAnthropic",
(
"google-genai",
"chat",
): "langchain_google_genai.chat_models.ChatGoogleGenerativeAI",
}
if (provider, type_) not in paths:
@@ -303,6 +309,10 @@ def _get_default_url(provider: str, type_: ModelType) -> Optional[str]:
return "https://platform.openai.com/docs/models"
elif provider == "anthropic":
return "https://docs.anthropic.com/claude/reference/selecting-a-model"
elif provider == "anyscale":
return "https://docs.endpoints.anyscale.com/category/supported-models"
elif provider == "google-genai":
return "https://ai.google.dev/"
else:
return None
@@ -1,7 +1,23 @@
from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
from langchain_benchmarks.tool_usage.agents.anthropic_tool_user import (
AnthropicToolUserFactory,
)
from langchain_benchmarks.tool_usage.agents.experimental.factory import (
CustomAgentFactory,
)
from langchain_benchmarks.tool_usage.agents.openai_assistant import (
OpenAIAssistantFactory,
)
from langchain_benchmarks.tool_usage.agents.openai_functions import OpenAIAgentFactory
from langchain_benchmarks.tool_usage.agents.runnable_agent import (
CustomRunnableAgentFactory,
)
__all__ = ["OpenAIAgentFactory", "apply_agent_executor_adapter", "CustomAgentFactory"]
__all__ = [
"OpenAIAgentFactory",
"OpenAIAssistantFactory",
"apply_agent_executor_adapter",
"CustomAgentFactory",
"AnthropicToolUserFactory",
"CustomRunnableAgentFactory",
]
@@ -0,0 +1,271 @@
"""Wrapper around the anthropic tool user SDK.
The anthropic tool user SDK is an alpha release so this code will likely be
changed or deleted in the future. It's here simply to make it easier to benchmark
the performance of the existing tool user SDK, to compare it with the performance
of other implementations.
"""
from importlib.util import find_spec
from typing import Any, Dict, List, Optional, Sequence
from langchain.tools import StructuredTool
from langchain_core.callbacks.manager import trace_as_chain_group
from langchain_core.runnables import Runnable, RunnableConfig, RunnableLambda
from langchain_benchmarks import rate_limiting
from langchain_benchmarks.schema import ToolUsageTask
from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
def convert_langchain_tool_to_tool_user_tool(lc_tool: StructuredTool) -> Any:
    """Wrap a LangChain structured tool as a tool for the Anthropic tool-user SDK.

    The wrapper is a ``BaseTool`` subclass whose ``use_tool`` forwards its keyword
    arguments, as a single dict, to ``lc_tool``. The parameter list handed to the
    SDK is derived from the ``properties`` section of the tool's argument schema.

    Args:
        lc_tool: The LangChain tool to expose.

    Returns:
        A ``DynamicTool`` instance built from the tool's name, description and
        one ``{"name", "type", "description"}`` entry per schema property.
    """
    # Imported lazily so the module can be imported without the SDK installed.
    from tool_use_package.tools.base_tool import BaseTool

    # Is this needed or is string OK?
    json_type_to_python_name = {
        "string": "str",  # str or string?
        "integer": "int",
        "number": "float",
        "boolean": "bool",
    }

    class DynamicTool(BaseTool):
        def use_tool(self, **kwargs):
            # The LangChain tool is called with the whole kwargs dict as its input.
            return lc_tool(kwargs)

    schema_properties = lc_tool.args_schema.schema()["properties"]
    parameters = [
        {
            "name": field_name,
            # Unrecognised schema types are passed through unchanged.
            "type": json_type_to_python_name.get(spec["type"], spec["type"]),
            "description": spec.get("description", ""),
        }
        for field_name, spec in schema_properties.items()
    ]
    return DynamicTool(lc_tool.name, lc_tool.description, parameters)
def _handle_tool_inputs(
    tool_inputs: List[Dict[str, Any]],
    tools: Sequence[StructuredTool],
    config: Optional[RunnableConfig] = None,
) -> Dict[str, Any]:
    """Run the requested tool calls and build the ``tool_outputs`` message.

    Tool calls are executed in order. Processing stops at the first failure,
    in which case no partial results are returned.

    Args:
        tool_inputs: Requested calls; each entry must provide ``"tool_name"``
            and ``"tool_arguments"``.
        tools: The tools that may be invoked, looked up by their ``name``.
        config: Optional runnable config forwarded to each ``tool.invoke`` call.

    Returns:
        A message dict with ``"role": "tool_outputs"``. On success
        ``"tool_outputs"`` is a list of ``{"tool_name", "tool_result"}`` dicts
        and ``"tool_error"`` is ``None``. On failure ``"tool_outputs"`` is
        ``None`` and ``"tool_error"`` holds the error text.
    """
    tool_by_name = {tool.name: tool for tool in tools}
    tool_error: Optional[str] = None
    tool_outputs: Optional[List[Dict[str, Any]]] = []
    for tool_input in tool_inputs:
        tool_name = tool_input["tool_name"]
        tool_arguments = tool_input["tool_arguments"]
        tool = tool_by_name.get(tool_name)
        if tool is None:
            # A request for a tool that does not exist is reported through
            # ``tool_error`` like any other tool failure, rather than escaping
            # as a KeyError and aborting the whole agent run.
            tool_error = (
                f"Unknown tool {tool_name!r}. "
                f"Available tools: {sorted(tool_by_name)}"
            )
            tool_outputs = None
            break
        try:
            tool_result = tool.invoke(tool_arguments, config=config)
        except Exception as e:  # Break on first error
            tool_error = str(e)
            tool_outputs = None
            break
        tool_outputs.append(
            {
                "tool_name": tool_name,
                "tool_result": tool_result,
            }
        )
    return {
        "role": "tool_outputs",
        "tool_outputs": tool_outputs,
        "tool_error": tool_error,
    }
def run_anthropic_agent_simple(
    tools: Sequence[StructuredTool],
    user_message: str,
    *,
    max_iterations: int = 30,
    config: Optional[RunnableConfig] = None,
    **kwargs,
) -> List[dict]:
    """Drive the Anthropic tool-user SDK in manual mode until it answers.

    Each iteration asks the SDK for its next message. When that message has the
    role ``"tool_inputs"`` the requested tools are executed locally and the
    resulting ``tool_outputs`` message is appended as well. The loop ends as soon
    as a message with the role ``"assistant"`` is produced.

    Args:
        tools: LangChain tools made available to the agent.
        user_message: The initial human message.
        max_iterations: Upper bound on the number of SDK round trips.
        config: Optional runnable config; only its ``"callbacks"`` entry is read.
        **kwargs: ``verbose`` (default ``False``) is popped and passed to
            ``use_tools``; everything else is forwarded to ``ToolUser``.

    Returns:
        The full message history, starting with the human message.

    Raises:
        ValueError: If no assistant message is produced within
            ``max_iterations`` iterations.
    """
    # Imported lazily so the module can be imported without the SDK installed.
    from tool_use_package.tool_user import ToolUser

    verbose = kwargs.pop("verbose", False)
    sdk_tools = [convert_langchain_tool_to_tool_user_tool(tool) for tool in tools]
    tool_user = ToolUser(sdk_tools, **kwargs)

    messages = [
        {
            "role": "human",
            "content": user_message,
            "tool_error": None,
            "tool_outputs": [],
            "tool_inputs": [],
        }
    ]

    if config:
        root_callbacks = config.get("callbacks", None)
    else:
        root_callbacks = None

    with trace_as_chain_group(
        "Anthropic Agent Run",
        inputs={"user_message": user_message},
        callback_manager=root_callbacks,
    ) as run_manager:
        for iteration in range(max_iterations):
            with trace_as_chain_group(
                f"Anthropic Agent Iteration {iteration}",
                inputs={"messages": messages},
                callback_manager=run_manager.parent_run_manager.get_child(),
            ) as iteration_manager:
                reply = tool_user.use_tools(
                    messages, execution_mode="manual", verbose=verbose
                )
                turn_messages = [reply]

                if reply["role"] == "tool_inputs":
                    # Run the requested tools under this iteration's trace.
                    turn_messages.append(
                        _handle_tool_inputs(
                            reply["tool_inputs"],
                            tools,
                            config={
                                "callbacks": iteration_manager.parent_run_manager.get_child(),
                            },
                        )
                    )

                iteration_manager.on_chain_end(outputs=turn_messages)

            messages.extend(turn_messages)

            # Finally break if the last message is from the assistant
            if reply["role"] == "assistant":
                break
        else:
            # The loop ran to completion without ever hitting ``break``.
            raise ValueError("Max iterations reached")

        run_manager.on_chain_end(outputs=messages)

    return messages
def convert_messages_to_finalized_output(
    messages: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Convert the history of messages into the expected output for eval.

    The returned dictionary has the following structure:

        {
            "output": "The content of the final assistant message",
            "actual_steps": ["add_x_y", ...],  # tool names, in request order
        }

    Args:
        messages: The message history. Every message must carry a "role" key,
            and the last message must have the role "assistant". Messages with
            the role "tool_inputs" must carry a "tool_inputs" list whose items
            have a "tool_name" key.

    Returns:
        A dictionary with the final assistant content under "output" and the
        names of all requested tools under "actual_steps".

    Raises:
        ValueError: If `messages` is empty, if any message lacks a "role" key,
            or if the last message is not from the assistant.
    """
    if not messages:
        raise ValueError("Expected at least one message")

    # Validate every message up front so that a malformed message (including
    # the last one) is reported as a ValueError instead of a bare KeyError.
    for message in messages:
        if "role" not in message:
            raise ValueError(f"Expected role in message {message}")

    last_message = messages[-1]
    if last_message["role"] != "assistant":
        raise ValueError(
            f"Expected the last message to be from the assistant. "
            f"Instead got {last_message}."
        )

    # Collect the name of every tool the agent asked to run, in order.
    actual_steps = [
        tool_input["tool_name"]
        for message in messages
        if message["role"] == "tool_inputs"
        for tool_input in message["tool_inputs"]
    ]
    return {
        "output": last_message["content"],
        "actual_steps": actual_steps,
    }
def create_agent(tools: Sequence[StructuredTool]) -> RunnableLambda:
    """Wrap the simple Anthropic agent loop over `tools` in a `RunnableLambda`."""

    def run_agent(
        input: dict, config: Optional[RunnableConfig] = None, **kwargs
    ) -> dict:
        """Run the agent on `input["input"]` and return the finalized output."""
        user_message = input["input"]
        history = run_anthropic_agent_simple(
            tools, user_message, config=config, **kwargs
        )
        finalized = convert_messages_to_finalized_output(history)
        return finalized

    return RunnableLambda(run_agent)
class AnthropicToolUserFactory:
    """Factory that builds runnables driving the Anthropic tool-user agent on a task."""

    def __init__(
        self,
        task: ToolUsageTask,
        *,
        rate_limiter: Optional[rate_limiting.RateLimiter] = None,
    ) -> None:
        """Create an Anthropic tool-user agent factory for the given task.

        Args:
            task: The task to create an agent factory for.
            rate_limiter: The rate limiter to use

        Raises:
            ImportError: If the `tool_use_package` module cannot be found.
        """
        self.task = task
        self.rate_limiter = rate_limiter
        package_spec = find_spec("tool_use_package")
        if not package_spec:
            raise ImportError(
                'Could not import "tool_use_package". Please '
                "follow instructions here to install "
                "https://github.com/anthropics/anthropic-tools/tree/main"
            )

    def __call__(self, **kwargs: Any) -> Runnable:
        """Build a runnable for a fresh environment of the task.

        Any keyword arguments are bound onto the agent before it is adapted.
        """
        env = self.task.create_environment()

        def _add_task_instructions(
            input: dict, config: Optional[RunnableConfig] = None, **kwargs
        ) -> dict:
            """Add task instructions to the question."""
            if not (isinstance(input, dict) and "question" in input):
                raise ValueError(
                    "Expected input to be a dict with key `question`. "
                    f"Found {type(input)}."
                )
            # Work on a copy so the caller's dict is left untouched.
            with_instructions = input.copy()
            original_question = with_instructions["question"]
            with_instructions["question"] = (
                f"{self.task.instructions}\nWrite down your answer, "
                f"but do not explain it. Input: `{original_question}`"
            )
            return with_instructions

        agent = create_agent(env.tools)  # type: ignore
        if kwargs:
            agent = agent.bind(**kwargs)

        # Returns `state` in the output if the environment has a state reader
        # makes sure that `output` is always in the output
        adapted_agent = apply_agent_executor_adapter(
            agent, state_reader=env.read_state
        )
        runnable = _add_task_instructions | adapted_agent

        if self.rate_limiter:  # Add a rate limiter
            runnable = rate_limiting.with_rate_limit(runnable, self.rate_limiter)
        return runnable
@@ -27,6 +27,7 @@ class CustomAgentFactory:
model: str,
*,
rate_limiter: Optional[RateLimiter] = None,
num_retries: int = 0,
) -> None:
"""Create an agent factory for the given tool usage task.
@@ -34,12 +35,14 @@ class CustomAgentFactory:
task: The task to create an agent factory for
model: model name (check model_registry)
rate_limiter: The rate limiter to use if provided
num_retries: The number of times to retry the agent if it fails
"""
if model not in model_registry:
raise ValueError(f"Unknown model: {model}")
self.task = task
self.model = model
self.rate_limiter = rate_limiter
self.num_retries = num_retries
def __call__(self) -> Runnable:
if isinstance(self.model, str):
@@ -74,6 +77,10 @@ class CustomAgentFactory:
GenericAgentParser(wrapping_xml_tag="tool", require_closing_xml_tag=False),
rate_limiter=self.rate_limiter,
)
if self.num_retries > 0:
agent = agent.with_retry(
stop_after_attempt=self.num_retries + 1,
)
executor = AgentExecutor(
agent=agent,
tools=env.tools,
@@ -0,0 +1,77 @@
"""Code for creating an assistant factory for evaluating tool usage tasks.
See: https://platform.openai.com/docs/assistants/how-it-works/creating-assistants
"""
from typing import Optional
from langchain.agents import AgentExecutor
from langchain.agents.openai_assistant.base import OpenAIAssistantRunnable
from langchain.schema.runnable import Runnable
from langchain_benchmarks import rate_limiting
from langchain_benchmarks.schema import ToolUsageTask
from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
class OpenAIAssistantFactory:
    """Factory that builds agent executors backed by an OpenAI assistant."""

    def __init__(
        self,
        task: ToolUsageTask,
        *,
        model: str,
        rate_limiter: Optional[rate_limiting.RateLimiter] = None,
        num_retries: int = 0,
    ) -> None:
        """Create an OpenAI assistant factory for the given task.

        Args:
            task: The task to create an agent factory for.
            model: The model to use -- this must be an open AI model.
            rate_limiter: The rate limiter to use
            num_retries: The number of times to retry the assistant if it fails

        Raises:
            ValueError: If `model` is not a string.
        """
        if not isinstance(model, str):
            raise ValueError(f"Expected str for model, got {type(model)}")
        self.task = task
        tools = task.create_environment().tools
        # Stateless, so we only need to create it once
        self.agent = OpenAIAssistantRunnable.create_assistant(
            name=f"{task.name} assistant",
            instructions=self.task.instructions,
            tools=tools,
            model=model,
            as_agent=True,
        )
        self.rate_limiter = rate_limiter
        self.num_retries = num_retries

    def __call__(self) -> Runnable:
        """Build an adapted `AgentExecutor` around the assistant.

        A fresh environment is created on every call; the assistant itself is
        the one created in `__init__`.
        """
        # Imported locally because the module only imports `Runnable`.
        from langchain.schema.runnable import RunnableLambda

        env = self.task.create_environment()

        agent = self.agent
        if self.rate_limiter is not None:
            # Rate limited model
            agent = rate_limiting.with_rate_limit(agent, self.rate_limiter)

        def _map_key(x: dict):
            # Assistant expects the 'content' key explicitly
            return {
                "content": x["input"],
                **{k: v for k, v in x.items() if k != "input"},
            }

        # Compose with the local `agent` (which carries the rate limiter when
        # one is configured). Composing with `self.agent` here would silently
        # discard the rate limiting applied above.
        agent = RunnableLambda(_map_key) | agent
        if self.num_retries > 0:
            # `num_retries` retries means `num_retries + 1` attempts in total.
            agent = agent.with_retry(
                stop_after_attempt=self.num_retries + 1,
            )
        runnable = AgentExecutor(
            agent=agent,
            tools=env.tools,
            handle_parsing_errors=True,
            return_intermediate_steps=True,
        )

        # Returns `state` in the output if the environment has a state reader
        # makes sure that `output` is always in the output
        return apply_agent_executor_adapter(runnable, state_reader=env.read_state)
@@ -1,28 +1,86 @@
"""Code for creating an agent factory for evaluating tool usage tasks."""
from typing import Optional
from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.agents.format_scratchpad.openai_tools import (
format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.runnable import Runnable
from langchain.tools.render import format_tool_to_openai_function
from langchain.tools.render import format_tool_to_openai_tool
from langchain_core.language_models import BaseChatModel, BaseLanguageModel
from langchain_core.language_models.base import LanguageModelInput
from langchain_core.messages import BaseMessage
from langchain_core.pydantic_v1 import BaseModel
from langchain_benchmarks import rate_limiting
from langchain_benchmarks import model_registry, rate_limiting
from langchain_benchmarks.model_registration import RegisteredModel
from langchain_benchmarks.schema import ToolUsageTask
from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
# PUBLIC API
def _bind_tools(
    llm: BaseChatModel,
    tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable]],
    tool_choice: Optional[str] = None,
    json_mode: bool = False,
    **kwargs: Any,
) -> Runnable[LanguageModelInput, BaseMessage]:
    """Bind tools (and other objects) to the given chat model.

    Args:
        llm: The chat model to bind the tools to.
        tools: A list of tool definitions to bind to this chat model.
            Each one is converted with `format_tool_to_openai_tool`.
        tool_choice: Which tool to require the model to call.
            Must be the name of one of the provided tools or
            "auto" to automatically determine which tool to call
            (if any). A tool that is literally named "auto" takes
            precedence over the keyword.
        json_mode: Whether to set JSON mode for the tool call
            (binds `response_format={"type": "json_object"}`).
        kwargs: Any additional parameters to pass to `llm.bind`.

    Returns:
        The result of `llm.bind` with the formatted tools and extra parameters.

    Raises:
        ValueError: If `tool_choice` is given without any tools, or if it is
            neither "auto" nor the name of a provided tool.
    """
    formatted_tools: List[Dict[str, Union[str, dict]]] = [
        format_tool_to_openai_tool(tool) for tool in tools
    ]
    if tool_choice is not None:
        if not formatted_tools:
            raise ValueError(
                "When specifying `tool_choice`, you must provide at least one tool."
            )
        tool_names = [tool["function"]["name"] for tool in formatted_tools]
        tool_choice_: Union[str, Dict[str, Any]]
        if tool_choice in tool_names:
            # Force the model to call this specific tool.
            tool_choice_ = {"type": "function", "function": {"name": tool_choice}}
        elif tool_choice == "auto":
            # Documented keyword: let the model decide; passed through as-is.
            tool_choice_ = "auto"
        else:
            raise ValueError(
                f"Tool choice {tool_choice} was specified, but the only "
                f"provided tools were {tool_names}."
            )
        kwargs = {**kwargs, "tool_choice": tool_choice_}
    if json_mode:
        kwargs = {**kwargs, "response_format": {"type": "json_object"}}
    return llm.bind(
        tools=formatted_tools,
        **kwargs,
    )
class OpenAIAgentFactory:
def __init__(
self,
task: ToolUsageTask,
*,
model: str = "gpt-3.5-turbo-16k",
model: Union[
str, RegisteredModel, BaseLanguageModel, BaseChatModel
] = "gpt-3.5-turbo-16k",
rate_limiter: Optional[rate_limiting.RateLimiter] = None,
num_retries: int = 0,
) -> None:
"""Create an OpenAI agent factory for the given task.
@@ -34,6 +92,26 @@ class OpenAIAgentFactory:
self.task = task
self.model = model
self.rate_limiter = rate_limiter
self.num_retries = num_retries
def _create_model(self) -> Union[BaseChatModel, BaseLanguageModel]:
    """Resolve `self.model` into a model instance.

    A `RegisteredModel` (given directly, or looked up by name in
    `model_registry`) is instantiated with temperature 0 and seed 0; an
    already-built model object is returned unchanged.

    Raises:
        ValueError: If `self.model` is a string that is not in the registry.
        TypeError: If `self.model` is of an unsupported type.
    """
    candidate = self.model
    if isinstance(candidate, RegisteredModel):
        registered = candidate
    elif isinstance(candidate, (BaseChatModel, BaseLanguageModel)):
        # Already a model object: use it as-is.
        return candidate
    elif isinstance(candidate, str):
        if candidate not in model_registry:
            raise ValueError(f"Unknown model: {self.model}")
        registered = model_registry.get_model(candidate)
    else:
        raise TypeError(f"Expected str or RegisteredModel, got {type(self.model)}")
    return registered.get_model(
        model_params={"temperature": 0, "model_kwargs": {"seed": 0}}
    )
def create(self) -> Runnable:
"""Agent Executor"""
@@ -41,18 +119,12 @@ class OpenAIAgentFactory:
return self()
def __call__(self) -> Runnable:
model = ChatOpenAI(
model=self.model,
temperature=0,
)
model = self._create_model()
env = self.task.create_environment()
model = model.bind(
functions=[format_tool_to_openai_function(t) for t in env.tools]
)
model = _bind_tools(model, env.tools)
if rate_limiting:
if self.rate_limiter is not None:
# Rate limited model
model = rate_limiting.with_rate_limit(model, self.rate_limiter)
@@ -70,15 +142,18 @@ class OpenAIAgentFactory:
runnable_agent = (
{
"input": lambda x: x["input"],
"agent_scratchpad": lambda x: format_to_openai_functions(
"agent_scratchpad": lambda x: format_to_openai_tool_messages(
x["intermediate_steps"]
),
}
| prompt
| model
| OpenAIFunctionsAgentOutputParser()
| OpenAIToolsAgentOutputParser()
)
if self.num_retries > 0:
runnable_agent = runnable_agent.with_retry(
stop_after_attempt=self.num_retries + 1,
)
runnable = AgentExecutor(
agent=runnable_agent,
tools=env.tools,
@@ -0,0 +1,49 @@
"""Factory for creating agents for the tool usage task."""
from typing import Union
from langchain.agents.agent import (
AgentExecutor,
BaseMultiActionAgent,
BaseSingleActionAgent,
)
from langchain_core.runnables import Runnable
from langchain_benchmarks.schema import ToolUsageTask
from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
class CustomRunnableAgentFactory:
    """A factory for creating tool using agents.

    Wraps a caller-supplied agent in an `AgentExecutor` bound to the tools of a
    freshly created task environment.
    """

    def __init__(
        self,
        task: ToolUsageTask,
        agent: Union[Runnable, BaseSingleActionAgent, BaseMultiActionAgent],
    ) -> None:
        """Create an agent factory for the given tool usage task.

        Note: The agent should not be stateful, as it will be reused across
        multiple runs.

        Args:
            task: The task to create an agent factory for
            agent: The agent to use
        """
        self.agent = agent
        self.task = task

    def __call__(self) -> Runnable:
        """Build an adapted executor around the agent for a new environment."""
        environment = self.task.create_environment()
        executor_options = {
            "agent": self.agent,
            "tools": environment.tools,
            "handle_parsing_errors": True,
            "return_intermediate_steps": True,
        }
        agent_executor = AgentExecutor(**executor_options)
        return apply_agent_executor_adapter(
            agent_executor, state_reader=environment.read_state
        )
+62 -9
View File
@@ -5,11 +5,13 @@ Requirements:
* Agents must output "intermediate_steps" in their run outputs.
* The dataset must have "expected_steps" in its outputs.
"""
from typing import Literal, Optional, Union
import re
from typing import Any, Literal, Optional, Union
from langchain.callbacks.manager import collect_runs
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain.evaluation import EvaluatorType, StringEvaluator, load_evaluator
from langchain.evaluation.schema import StringEvaluator
from langchain.smith import RunEvalConfig
from langchain_core.language_models import BaseChatModel, BaseLanguageModel
@@ -20,7 +22,49 @@ from langsmith.evaluation.evaluator import (
)
from langsmith.schemas import Example, Run
from langchain_benchmarks.tool_usage.prompts import QA_TEMPLATE_FOR_MULTIVERSE_MATH
from langchain_benchmarks.tool_usage.prompts import (
QA_TEMPLATE_FOR_MULTIVERSE_MATH,
QA_TEMPLATE_FOR_MULTIVERSE_MATH_WITHOUT_QUESTION,
)
OutputEvaluation = Literal["qa", "qa_math", "none", "qa_math_without_question"]
class QAMathEvaluator(StringEvaluator):
    """LLM-based evaluator that compares a prediction against a reference.

    The question itself is not shown to the grading model; only the reference
    answer and the prediction are.
    """

    def __init__(self, chat_model: BaseChatModel) -> None:
        """Initialize the evaluator.

        Args:
            chat_model: The chat model used to grade the prediction.
        """
        self.eval_chain = QA_TEMPLATE_FOR_MULTIVERSE_MATH_WITHOUT_QUESTION | chat_model

    @property
    def evaluation_name(self) -> str:
        """Return the name of the evaluator."""
        return "QAMathEvaluator"

    @property
    def requires_reference(self) -> bool:
        """A reference answer is needed to grade the prediction."""
        return True

    @property
    def requires_input(self) -> bool:
        """The original question is not used by this evaluator."""
        return False

    def _evaluate_strings(
        self,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate the prediction against the reference.

        Args:
            prediction: The output to grade.
            input: Unused.
            reference: The expected answer.
            **kwargs: Forwarded to the evaluation chain's `invoke`.

        Returns:
            `{"score": 1}` when the grader's reply starts with "CORRECT",
            otherwise `{"score": 0}`.
        """
        result = self.eval_chain.invoke(
            {"answer": reference, "result": prediction}, **kwargs
        )
        # Strip surrounding whitespace first: a reply such as " CORRECT" or
        # "\nCORRECT" must not be graded as incorrect. "INCORRECT" still does
        # not match the prefix check.
        verdict = result.content.strip()
        if verdict.startswith("CORRECT"):
            return {"score": 1}
        else:
            return {"score": 0}
def compare_outputs(
@@ -90,11 +134,17 @@ def compare_outputs(
if "output" in run_outputs and qa_evaluator:
output = run_outputs["output"]
with collect_runs() as cb:
qa_results = qa_evaluator.evaluate_strings(
prediction=output,
reference=example_outputs["reference"],
input=run_inputs["question"],
)
if isinstance(qa_evaluator, QAMathEvaluator):
qa_results = qa_evaluator.evaluate_strings(
prediction=output,
reference=example_outputs["reference"],
)
else:
qa_results = qa_evaluator.evaluate_strings(
prediction=output,
reference=example_outputs["reference"],
input=run_inputs["question"],
)
results.append(
EvaluationResult(
key="correctness",
@@ -137,6 +187,8 @@ class AgentTrajectoryEvaluator(RunEvaluator):
llm=eval_llm,
prompt=QA_TEMPLATE_FOR_MULTIVERSE_MATH,
)
elif output_evaluation == "qa_math_without_question":
qa_evaluator = QAMathEvaluator(eval_llm)
else:
raise ValueError(
f"output_evaluation must be one of 'qa' or 'none', "
@@ -144,6 +196,7 @@ class AgentTrajectoryEvaluator(RunEvaluator):
)
self.qa_evaluator = qa_evaluator
self.output_evaluation = output_evaluation
def evaluate_run(
self, run: Run, example: Optional[Example] = None
@@ -181,7 +234,7 @@ class AgentTrajectoryEvaluator(RunEvaluator):
def get_eval_config(
*,
eval_llm: Union[BaseLanguageModel, BaseChatModel, None] = None,
output_evaluation: Literal["qa", "qa_math", "none"] = "qa",
output_evaluation: OutputEvaluation = "qa",
) -> RunEvalConfig:
"""Get the default evaluator for the environment.
@@ -22,3 +22,26 @@ GRADE:"""
QA_TEMPLATE_FOR_MULTIVERSE_MATH = PromptTemplate(
input_variables=["result", "answer"], template=MATH_TEMPLATE
)
# Grading prompt that compares two inputs without showing the original
# question. `{answer}` is rendered as INPUT_A and `{result}` as INPUT_B; the
# grader is asked to reply with CORRECT or INCORRECT.
MATH_TEMPLATE_NO_QUESTION = """\
Compare the INPUT_A and INPUT_B and determine whether the numeric result in them is the same.
If the result is the same, reply with CORRECT. If the result is different, reply with INCORRECT.
Example Format:
INPUT_A: input_a here
INPUT_B: input_b here
COMPARISON: CORRECT or INCORRECT here
Ignore differences in punctuation and phrasing between the student answer and true answer.
Begin!
INPUT_A: {answer}
INPUT_B: {result}
COMPARISON:"""
# Version without the query: the prompt takes only `result` and `answer`.
QA_TEMPLATE_FOR_MULTIVERSE_MATH_WITHOUT_QUESTION = PromptTemplate(
input_variables=["result", "answer"], template=MATH_TEMPLATE_NO_QUESTION
)
@@ -127,38 +127,8 @@ def get_environment() -> ToolUsageEnvironment:
)
# Task definition for "Multiverse Math", graded with the "qa_math" output
# evaluation.
MULTIVERSE_MATH = ToolUsageTask(
name="Multiverse Math",
dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d",
create_environment=get_environment,
instructions=(
"You are requested to solve math questions in an alternate "
"mathematical universe. The operations have been altered to yield "
"different results than expected. Do not guess the answer or rely on your "
" innate knowledge of math. Use the provided tools to answer the question. "
"While associativity and commutativity apply, distributivity does not. Answer "
"the question using the fewest possible tools. Only include the numeric "
"response without any clarifications."
),
# NOTE(review): the description says distributivity is retained, while the
# instructions above say it does not apply -- confirm and reconcile.
description=(
"""\
An environment that contains a few basic math operations, but with altered results.
For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
The basic operations retain some basic properties, such as commutativity, \
associativity, and distributivity; however, the results are different than expected.
The objective of this task is to evaluate the ability to use the provided tools to \
solve simple math questions and ignore any innate knowledge about math.
"""
),
eval_params={
"output_evaluation": "qa_math",
},
)
# Source dataset used to create the public dataset in LangSmith
DATASET = [
DATASET_TINY = [
{
"question": "Add 2 and 3",
"answer": add(2, 3),
@@ -193,14 +163,14 @@ DATASET = [
"expected_steps": ["log", "multiply"],
},
{
"question": "calculate 101 to the power of 0.5 to 4 digits of precision",
"answer": round(power(101, 0.5), 4),
"expected_steps": ["power", "round"],
"question": "calculate 101 to the power of 0.5",
"answer": power(101, 0.5),
"expected_steps": ["power"],
},
{
"question": (
"ecoli divides every 20 minutes. How many cells will be "
"there after 2 hours if we start with 5 cells?"
"there after 2 hours (120 minutes) if we start with 5 cells?"
),
"answer": multiply(5, power(2, divide(120, 20))),
"expected_steps": ["divide", "power", "multiply"],
@@ -220,6 +190,125 @@ DATASET = [
},
]
# Full question set: everything in DATASET_TINY plus the entries below.
# Each entry computes its "answer" by calling the operation helpers referenced
# here (add, negate, pi, cos, divide), and lists in "expected_steps" the tool
# names expected to be used, in order.
DATASET = DATASET_TINY + [
{
"question": "evaluate negate(-131,778)",
"answer": negate(-131_778),
"expected_steps": ["negate"],
},
{
"question": "what is the value of pi?",
"answer": pi(),
"expected_steps": ["pi"],
},
{
"question": "what is cos(pi)?",
"answer": cos(pi()),
"expected_steps": ["pi", "cos"],
},
{
"question": "how much is 131,778 divided by 2?",
"answer": divide(131_778, 2),
"expected_steps": ["divide"],
},
{
"question": "131,778 + 22,312?",
"answer": add(131_778, 22_312),
"expected_steps": ["add"],
},
{
"question": "(1+2) + 5",
"answer": add(add(1, 2), 5),
"expected_steps": ["add", "add"],
},
{
"question": "-(1 + 1)",
"answer": negate(add(1, 1)),
"expected_steps": ["add", "negate"],
},
{
"question": "Evaluate 1 + 2 + 3 + 4 + 5 using only the add function",
"answer": add(add(add(add(1, 2), 3), 4), 5),
"expected_steps": ["add", "add", "add", "add"],
},
{
"question": "Evaluate the sum of the numbers 1 through 10 using only the add function",
"answer": add(
add(add(add(add(add(add(add(add(1, 2), 3), 4), 5), 6), 7), 8), 9), 10
),
# Summing ten numbers pairwise takes nine `add` calls.
"expected_steps": ["add"] * (10 - 1),
},
{
"question": "Calculate 5 divided by 5",
"answer": divide(5, 5),
"expected_steps": ["divide"],
},
]
# Tiny variant of the Multiverse Math task. Outputs are graded with the
# "qa_math_without_question" output evaluation.
MULTIVERSE_MATH_TINY = ToolUsageTask(
name="Multiverse Math (Tiny)",
dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d",
create_environment=get_environment,
instructions=(
"You are requested to solve math questions in an alternate "
"mathematical universe. The operations have been altered to yield "
"different results than expected. Do not guess the answer or rely on your "
" innate knowledge of math. Use the provided tools to answer the question. "
"While associativity and commutativity apply, distributivity does not. Answer "
"the question using the fewest possible tools. Only include the numeric "
"response without any clarifications."
),
# NOTE(review): the description says distributivity is retained, while the
# instructions above say it does not apply -- confirm and reconcile.
description=(
"""\
An environment that contains a few basic math operations, but with altered results.
For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
The basic operations retain some basic properties, such as commutativity, \
associativity, and distributivity; however, the results are different than expected.
The objective of this task is to evaluate the ability to use the provided tools to \
solve simple math questions and ignore any innate knowledge about math.
This is a tiny version of the Multiverse Math task, with 10 examples only.
"""
),
eval_params={
"output_evaluation": "qa_math_without_question",
},
)
# Multiverse Math task (its description states 20 test examples). Outputs are
# graded with the "qa_math_without_question" output evaluation.
MULTIVERSE_MATH = ToolUsageTask(
name="Multiverse Math",
dataset_id="https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d",
create_environment=get_environment,
instructions=(
"You are requested to solve math questions in an alternate "
"mathematical universe. The operations have been altered to yield "
"different results than expected. Do not guess the answer or rely on your "
" innate knowledge of math. Use the provided tools to answer the question. "
"While associativity and commutativity apply, distributivity does not. Answer "
"the question using the fewest possible tools. Only include the numeric "
"response without any clarifications."
),
# NOTE(review): the description says distributivity is retained, while the
# instructions above say it does not apply -- confirm and reconcile.
description=(
"""\
An environment that contains a few basic math operations, but with altered results.
For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. \
The basic operations retain some basic properties, such as commutativity, \
associativity, and distributivity; however, the results are different than expected.
The objective of this task is to evaluate the ability to use the provided tools to \
solve simple math questions and ignore any innate knowledge about math.
This task is associated with 20 test examples.
"""
),
eval_params={
"output_evaluation": "qa_math_without_question",
},
)
def _create_dataset() -> None:
"""Create a dataset with the langsmith client."""
+3
View File
@@ -0,0 +1,3 @@
from langchain_benchmarks.utils._langsmith import run_without_langsmith
__all__ = ["run_without_langsmith"]
+178 -2
View File
@@ -1,14 +1,26 @@
"""Copy the public dataset to your own langsmith tenant."""
import functools
import json
import logging
import threading
import urllib.parse
from pathlib import Path
from typing import Optional, Tuple, Union
from typing import Any, Callable, List, Optional, Tuple, Union, cast
from uuid import UUID
from langsmith import Client
from langchain.smith import RunEvalConfig
from langchain.smith.evaluation import runner_utils as eval_runner_utils
from langchain_core import runnables
from langchain_core.runnables import config as runnable_config
from langchain_core.tracers.root_listeners import RootListenersTracer
from langsmith import Client, EvaluationResult
from langsmith.evaluation.evaluator import EvaluationResults
from langsmith.schemas import DataType, Example, Run
from langsmith.utils import LangSmithNotFoundError
from tqdm import auto
logger = logging.getLogger(__name__)
API_URL = "https://api.smith.langchain.com/"
@@ -133,3 +145,167 @@ def exists_public_dataset(token_or_url: str, *, api_url: str = API_URL) -> bool:
finally:
del source_client
def _select_eval_results(
    results: Union[EvaluationResult, EvaluationResults],
) -> List[EvaluationResult]:
    """Normalize an evaluator's return value into a list of results.

    A single `EvaluationResult` is wrapped in a list; a dict with a "results"
    key yields the value stored under that key.

    Raises:
        TypeError: If `results` matches neither form.
    """
    if isinstance(results, EvaluationResult):
        return [results]
    if isinstance(results, dict) and "results" in results:
        return cast(List[EvaluationResult], results["results"])
    raise TypeError(
        f"Invalid evaluation result type {type(results)}."
        " Expected EvaluationResult or EvaluationResults."
    )
def _is_jupyter_environment() -> bool:
try:
from IPython import get_ipython
res = get_ipython()
return get_ipython() is not None and "zmqshell" in str(type(res))
except ImportError:
return False
def _display_aggregate_results(aggregate_results: Any) -> None:
    """Show the aggregate results, richly in Jupyter and as text elsewhere."""
    if not _is_jupyter_environment():
        # Plain-text rendering: two decimals, right-justified headers.
        table_text = aggregate_results.to_string(
            float_format=lambda x: f"{x:.2f}", justify="right"
        )
        print("\n Experiment Results:")
        print(table_text)
        return

    from IPython.display import HTML, display

    display(HTML("<h3>Experiment Results:</h3>"))
    display(aggregate_results)
def run_without_langsmith(
    path_or_token_id: Union[str, Path],
    llm_or_chain_factory: Union[
        Callable[[], runnables.Runnable], Callable[[dict], Any]
    ],
    *,
    evaluation: Optional[RunEvalConfig] = None,
    concurrency_level: int = 5,
    verbose: bool = True,
) -> eval_runner_utils.TestResult:
    """Run a public dataset without langsmith.

    Args:
        path_or_token_id: Either a path to a ``.json`` dataset file (``str`` or
            ``Path``), or a public dataset token / URL. For a token or URL the
            dataset is read from ``<token uuid>.json`` and downloaded to that
            path first if the file does not exist yet.
        llm_or_chain_factory: The model or chain factory to evaluate; it is
            wrapped with ``_wrap_in_chain_factory`` and invoked once per example.
        evaluation: Optional evaluation config used to set up run evaluators.
        concurrency_level: Passed as ``max_concurrency`` for the batch run.
        verbose: Whether to display the aggregate feedback at the end.

    Returns:
        A ``TestResult`` with project name "Local" holding, per example id, the
        run input, execution time, run id, output or error, and any feedback.

    Raises:
        ValueError: If the resolved dataset path does not end with ``.json``.
    """
    from langchain.smith.evaluation.runner_utils import (
        _setup_evaluation,
        _wrap_in_chain_factory,
    )

    if isinstance(path_or_token_id, Path) or path_or_token_id.endswith(".json"):
        # Normalize to ``str``: ``Path`` has no ``endswith``, so the suffix
        # check below would otherwise fail for ``Path`` arguments.
        dataset_path = str(path_or_token_id)
    else:
        _, token_uuid = _parse_token_or_url(path_or_token_id, API_URL)
        dataset_path = f"{token_uuid}.json"
        if not Path(dataset_path).exists():
            download_public_dataset(path_or_token_id, path=dataset_path)
    if not dataset_path.endswith(".json"):
        raise ValueError(f"Unrecognized dataset path: {path_or_token_id}")
    with open(dataset_path, encoding="utf-8") as f:
        example_dicts = json.load(f)
    examples = [Example(**example_dict) for example_dict in example_dicts]
    wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)
    run_evaluators = _setup_evaluation(
        llm_or_chain_factory=wrapped_model,
        examples=examples,
        evaluation=evaluation,
        data_type=DataType.kv,
    )
    all_eval_results = {}
    # Guards `all_eval_results` and the progress bar, which are touched from
    # the per-run listener callbacks.
    results_lock = threading.RLock()
    _progress_bar = iter(
        auto.tqdm(
            iterable=range(len(examples)),
            desc="Running Evaluation",
            unit="example",
            total=len(examples),
        )
    )

    def _evaluate_run(run: Run, example: Example):
        """Record the outcome of `run` for `example` and apply the evaluators."""
        with results_lock:
            next(_progress_bar)  # advance the progress bar by one example
            example_result = all_eval_results.setdefault(str(example.id), {}) or {}
            example_result.update(
                {
                    "input": run.inputs,
                    "execution_time": (
                        (run.end_time - run.start_time).total_seconds()
                        if run.end_time
                        else None
                    ),
                    "run_id": str(run.id),
                }
            )
            if run.error is not None:
                example_result["Error"] = run.error
            else:
                example_result["output"] = run.outputs
            all_eval_results[str(example.id)] = example_result
        if run_evaluators is None:
            return
        feedback = []
        for evaluator in run_evaluators:
            try:
                eval_results = evaluator.evaluate_run(run, example)
            except Exception as e:
                # A failing evaluator is logged and skipped; the others still run.
                logger.error(f"Failed to evaluate run {run.id}: {repr(e)}")
                continue
            flattened = _select_eval_results(eval_results)
            feedback.extend(flattened)
        with results_lock:
            example_result = all_eval_results.setdefault(str(example.id), {}) or {}
            example_result.update(
                {
                    "feedback": feedback,
                }
            )
            all_eval_results[str(example.id)] = example_result

    # One config per example, so each root run is reported together with the
    # example it belongs to (both on success and on error).
    configs = [
        runnable_config.RunnableConfig(
            callbacks=[
                RootListenersTracer(
                    config={},
                    on_start=None,
                    on_end=functools.partial(_evaluate_run, example=example),
                    on_error=functools.partial(_evaluate_run, example=example),
                ),
            ],
            max_concurrency=concurrency_level,
        )
        for example in examples
    ]

    def run_runnable(x: dict) -> Any:
        """Build a fresh model from the factory and invoke it on `x`."""
        model = wrapped_model()
        return model.invoke(x)

    runnables.RunnableLambda(run_runnable).batch(
        inputs=[example.inputs for example in examples],
        config=configs,
        return_exceptions=True,
    )
    results = eval_runner_utils.TestResult(
        project_name="Local",
        results=all_eval_results,
    )
    if verbose:
        try:
            agg_feedback = results.get_aggregate_feedback()
            _display_aggregate_results(agg_feedback)
        except Exception as e:
            # Displaying the summary is best-effort; the results are still returned.
            logger.debug(f"Failed to print aggregate feedback: {repr(e)}")
    return results
Generated
+4 -4
View File
@@ -1622,13 +1622,13 @@ tenacity = ">=8.1.0,<9.0.0"
[[package]]
name = "langsmith"
version = "0.0.67"
version = "0.0.70"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
optional = false
python-versions = ">=3.8.1,<4.0"
files = [
{file = "langsmith-0.0.67-py3-none-any.whl", hash = "sha256:66a257b97dabd43a7e62af271b2ddb7566167ce4e446fd7b7760e97d6ce84a5e"},
{file = "langsmith-0.0.67.tar.gz", hash = "sha256:cef00bac2e7455a5943f3afaea91c032db1a1f2adb83003159a71e884fb5a9a2"},
{file = "langsmith-0.0.70-py3-none-any.whl", hash = "sha256:a0d4cac3af94fe44c2ef3814c32b6740f92aebe267e395d62e62040bc5bad343"},
{file = "langsmith-0.0.70.tar.gz", hash = "sha256:3a546c45e67f6600d6669ef63f1f58b772e505703126338ad4f22fe0e2bbf677"},
]
[package.dependencies]
@@ -4083,4 +4083,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata]
lock-version = "2.0"
python-versions = "^3.8.1"
content-hash = "8ff4713dcb59c3e0d796659a7feaf21a5bacd9ce9995426f56cce4b5af9e5e1b"
content-hash = "91171e1e590780b3d7df5efcf5eaddddabbe2715294add5ccf14f52cd3fa3b6d"
+2 -2
View File
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-benchmarks"
version = "0.0.9"
version = "0.0.10"
description = "🦜💪 Flex those feathers!"
authors = ["LangChain AI"]
license = "MIT"
@@ -9,7 +9,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8.1"
langchain = ">=0.0.300"
langsmith = ">=0.0.66"
langsmith = ">=0.0.70"
tqdm = "^4"
ipywidgets = "^8"
tabulate = ">=0.8.0"