benchmarks[major]: bump core to 0.3 (#211)

- Drop support for Python 3.8
- Bump langchain-core to 0.3
- Update pydantic objects to v2
add faiss-cpu dependency (#209)
2026-07-01 01:37:54 -04:00 · 2024-10-21 16:47:14 -04:00 · 2024-08-07 07:53:45 -07:00 · 2024-08-06 15:28:43 -07:00 · 2024-08-06 15:24:23 -07:00 · 2024-08-06 15:15:08 -07:00
71 changed files with 5174 additions and 6460 deletions
@@ -1,94 +0,0 @@
-name: pydantic v1/v2 compatibility
-
-on:
-  workflow_call:
-    inputs:
-      working-directory:
-        required: true
-        type: string
-        description: "From which folder this pipeline executes"
-
-env:
-  POETRY_VERSION: "1.6.1"
-
-jobs:
-  build:
-    timeout-minutes: 5
-    defaults:
-      run:
-        working-directory: ${{ inputs.working-directory }}
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version:
-          - "3.8"
-          - "3.9"
-          - "3.10"
-          - "3.11"
-    name: Pydantic v1/v2 compatibility - Python ${{ matrix.python-version }}
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }}
-        uses: "./.github/actions/poetry_setup"
-        with:
-          python-version: ${{ matrix.python-version }}
-          poetry-version: ${{ env.POETRY_VERSION }}
-          working-directory: ${{ inputs.working-directory }}
-          cache-key: pydantic-cross-compat
-
-      - name: Install dependencies
-        shell: bash
-        run: poetry install --with test
-
-      - name: Install the opposite major version of pydantic
-        # If normal tests use pydantic v1, here we'll use v2, and vice versa.
-        shell: bash
-        run: |
-          # Determine the major part of pydantic version
-          REGULAR_VERSION=$(poetry run python -c "import pydantic; print(pydantic.__version__)" | cut -d. -f1)
-
-          if [[ "$REGULAR_VERSION" == "1" ]]; then
-            PYDANTIC_DEP=">=2.1,<3"
-            TEST_WITH_VERSION="2"
-          elif [[ "$REGULAR_VERSION" == "2" ]]; then
-            PYDANTIC_DEP="<2"
-            TEST_WITH_VERSION="1"
-          else
-            echo "Unexpected pydantic major version '$REGULAR_VERSION', cannot determine which version to use for cross-compatibility test."
-            exit 1
-          fi
-
-          # Install via `pip` instead of `poetry add` to avoid changing lockfile,
-          # which would prevent caching from working: the cache would get saved
-          # to a different key than where it gets loaded from.
-          poetry run pip install "pydantic${PYDANTIC_DEP}"
-
-          # Ensure that the correct pydantic is installed now.
-          echo "Checking pydantic version... Expecting ${TEST_WITH_VERSION}"
-
-          # Determine the major part of pydantic version
-          CURRENT_VERSION=$(poetry run python -c "import pydantic; print(pydantic.__version__)" | cut -d. -f1)
-
-          # Check that the major part of pydantic version is as expected, if not
-          # raise an error. TEST_WITH_VERSION is the expected major, CURRENT_VERSION is what was found.
-          if [[ "$CURRENT_VERSION" != "$TEST_WITH_VERSION" ]]; then
-            echo "Error: expected pydantic version ${TEST_WITH_VERSION} to have been installed, but found: ${CURRENT_VERSION}"
-            exit 1
-          fi
-          echo "Found pydantic version ${CURRENT_VERSION}, as expected"
-      - name: Run pydantic compatibility tests
-        shell: bash
-        run: make test
-
-      - name: Ensure the tests did not create any additional files
-        shell: bash
-        run: |
-          set -eu
-
-          STATUS="$(git status)"
-          echo "$STATUS"
-
-          # grep will exit non-zero if the target message isn't found,
-          # and `set -e` above will cause the step to fail.
-          echo "$STATUS" | grep 'nothing to commit, working tree clean'
@@ -31,12 +31,6 @@ jobs:
      working-directory: .
    secrets: inherit

-  pydantic-compatibility:
-    uses:
-      ./.github/workflows/_pydantic_compatibility.yml
-    with:
-      working-directory: .
-    secrets: inherit
  test:
    timeout-minutes: 5
    runs-on: ubuntu-latest
@@ -8,6 +8,7 @@ jobs:
  release:
    uses:
      ./.github/workflows/_release.yml
+    permissions: write-all
    with:
      working-directory: .
    secrets: inherit
@@ -0,0 +1,44 @@
+name: Weekly Tool Benchmarks
+
+on:
+  workflow_dispatch:  # Also allows the workflow to be started manually
+  schedule:
+    - cron: '0 0 * * 0'  # Runs at midnight (00:00) every Sunday (UTC time)
+
+env:  # Workflow-level variables: visible to every step below
+  POETRY_VERSION: "1.6.1"
+  LANGCHAIN_API_KEY: ${{ secrets.LANGCHAIN_API_KEY }}
+  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+
+jobs:
+  run_tool_benchmarks:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.12 + Poetry ${{ env.POETRY_VERSION }}
+        uses: "./.github/actions/poetry_setup"
+        with:
+          python-version: '3.12'
+          poetry-version: ${{ env.POETRY_VERSION }}
+          working-directory: .
+          cache-key: benchmarks-all
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          echo "Running tests, installing dependencies with poetry..."
+          poetry install --with test,lint,typing,docs
+
+      - name: Multiverse math benchmark
+        # Each benchmark script is invoked from inside the scripts/ directory.
+        run: |
+          cd scripts
+          poetry run python multiverse_math_benchmark.py
+
+      - name: Query analysis benchmark
+        run: |
+          cd scripts
+          poetry run python query_analysis_benchmark.py
@@ -22,6 +22,29 @@ We have several goals in open sourcing this:
 - Showing how we evaluate each task
 - Encouraging others to benchmark their solutions on these tasks (we are always looking for better ways of doing things!)

+## Benchmarking Results
+
+Read some of the articles about benchmarking results on our blog.
+
+* [Agent Tool Use](https://blog.langchain.dev/benchmarking-agent-tool-use/)
+* [Query Analysis in High Cardinality Situations](https://blog.langchain.dev/high-cardinality/)
+* [RAG on Tables](https://blog.langchain.dev/benchmarking-rag-on-tables/)
+* [Q&A over CSV data](https://blog.langchain.dev/benchmarking-question-answering-over-csv-data/)
+
+
+### Tool Usage (2024-04-18)
+
+See [tool usage docs](https://langchain-ai.github.io/langchain-benchmarks/notebooks/tool_usage/benchmark_all_tasks.html) to recreate!
+
+![download](https://github.com/langchain-ai/langchain-benchmarks/assets/3205522/0da33de8-e03f-49cf-bd48-e9ff945828a9)
+
+Explore Agent Traces on LangSmith:
+
+* [Relational Data](https://smith.langchain.com/public/22721064-dcf6-4e42-be65-e7c46e6835e7/d)
+* [Tool Usage (1-tool)](https://smith.langchain.com/public/ac23cb40-e392-471f-b129-a893a77b6f62/d)
+* [Tool Usage (26-tools)](https://smith.langchain.com/public/366bddca-62b3-4b6e-849b-a478abab73db/d)
+* [Multiverse Math](https://smith.langchain.com/public/983faff2-54b9-4875-9bf2-c16913e7d489/d)
+
 ## Installation

 To install the packages, run the following command:
@@ -3,12 +3,12 @@ from langchain.agents import AgentExecutor, OpenAIFunctionsAgent
 from langchain.agents.agent_toolkits.conversational_retrieval.tool import (
    create_retriever_tool,
 )
-from langchain.chat_models import ChatOpenAI
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain.smith import RunEvalConfig, run_on_dataset
 from langchain.tools import PythonAstREPLTool
 from langchain.vectorstores import FAISS
+from langchain_openai import ChatOpenAI
 from langsmith import Client
 from pydantic import BaseModel, Field

@@ -1,8 +1,8 @@
 import pandas as pd
 from langchain.agents.agent_toolkits import create_pandas_dataframe_agent
 from langchain.agents.agent_types import AgentType
-from langchain.chat_models import ChatOpenAI
 from langchain.smith import RunEvalConfig, run_on_dataset
+from langchain_openai import ChatOpenAI
 from langsmith import Client

 if __name__ == "__main__":
@@ -1,8 +1,8 @@
 import pandas as pd
 from langchain.agents.agent_toolkits import create_pandas_dataframe_agent
 from langchain.agents.agent_types import AgentType
-from langchain.chat_models import ChatOpenAI
 from langchain.smith import RunEvalConfig, run_on_dataset
+from langchain_openai import ChatOpenAI
 from langsmith import Client

 if __name__ == "__main__":
@@ -1,8 +1,8 @@
 import pandas as pd
-from langchain.chat_models import ChatOpenAI
 from langchain.prompts import ChatPromptTemplate
 from langchain.schema.output_parser import StrOutputParser
 from langchain.smith import RunEvalConfig, run_on_dataset
+from langchain_openai import ChatOpenAI
 from langsmith import Client
 from pandasai import PandasAI

@@ -2,7 +2,7 @@ import pandas as pd
 import streamlit as st
 from langchain.agents.agent_toolkits import create_pandas_dataframe_agent
 from langchain.agents.agent_types import AgentType
-from langchain.chat_models import ChatOpenAI
+from langchain_openai import ChatOpenAI

 df = pd.read_csv("titanic.csv")

@@ -1,6 +1,6 @@
 import streamlit as st
 from langchain.chains import create_extraction_chain
-from langchain.chat_models import ChatOpenAI
+from langchain_openai import ChatOpenAI
 from langsmith import Client

 st.set_page_config(page_title="🦜🔗 Text-to-graph extraction")
@@ -1,8 +1,8 @@
 from langchain.chat_models import ChatAnthropic
 from langchain.prompts import ChatPromptTemplate
-from langchain.pydantic_v1 import BaseModel
 from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnableLambda
+from pydantic import BaseModel

 from .prompts import answer_prompt
 from .retriever_agent import executor
@@ -3,13 +3,13 @@ from typing import List, Tuple
 from langchain.agents import AgentExecutor
 from langchain.agents.format_scratchpad import format_to_openai_functions
 from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
-from langchain.chat_models import ChatOpenAI
 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
-from langchain.pydantic_v1 import BaseModel, Field
 from langchain.schema.messages import AIMessage, HumanMessage
 from langchain.tools import tool
 from langchain.tools.render import format_tool_to_openai_function
 from langchain_docs_retriever.retriever import get_retriever
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel, Field

 # This is used to tell the model how to best use the retriever.

@@ -7,9 +7,9 @@ from typing import Callable, Optional

 from anthropic_iterative_search.chain import chain as anthropic_agent_chain
 from chat_langchain.chain import create_chain
-from langchain.chat_models import ChatOpenAI
 from langchain.schema.runnable import Runnable
 from langchain.smith import RunEvalConfig, run_on_dataset
+from langchain_openai import ChatOpenAI
 from langsmith import Client
 from oai_assistant.chain import agent_executor as openai_assistant_chain
 from openai_functions_agent import agent_executor as openai_functions_agent_chain
@@ -259,8 +259,8 @@
   },
   "outputs": [],
   "source": [
-    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n",
+    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "llm = ChatOpenAI(model=\"gpt-4-1106-preview\", temperature=0).bind_functions(\n",
    "    functions=[task.schema],\n",
@@ -232,8 +232,8 @@
   },
   "outputs": [],
   "source": [
-    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n",
+    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-16k\", temperature=0).bind_functions(\n",
    "    functions=[task.schema],\n",
@@ -97,7 +97,7 @@
   },
   "outputs": [],
   "source": [
-    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "from langchain_benchmarks.extraction import get_eval_config\n",
    "\n",
@@ -75,6 +75,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "7fb27b941602401d91542211134fc71a",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -728,12 +729,12 @@
    "from langchain.agents import AgentExecutor\n",
    "from langchain.agents.format_scratchpad import format_to_openai_functions\n",
    "from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n",
-    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
    "from langchain.pydantic_v1 import BaseModel, Field\n",
    "from langchain.schema.messages import AIMessage, HumanMessage\n",
    "from langchain.tools import tool\n",
    "from langchain.tools.render import format_tool_to_openai_function\n",
+    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "# This is used to tell the model how to best use the retriever.\n",
    "\n",
@@ -508,8 +508,8 @@
   },
   "outputs": [],
   "source": [
-    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.schema.messages import HumanMessage\n",
+    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "\n",
    "def image_summarize(img_base64, prompt):\n",
@@ -328,10 +328,10 @@
   },
   "outputs": [],
   "source": [
-    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.prompts import ChatPromptTemplate\n",
    "from langchain.schema.output_parser import StrOutputParser\n",
    "from langchain.schema.runnable import RunnablePassthrough\n",
+    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "\n",
    "def rag_chain(retriever):\n",
@@ -451,11 +451,11 @@
   "source": [
    "from operator import itemgetter\n",
    "\n",
-    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.prompts import ChatPromptTemplate\n",
    "from langchain.schema.document import Document\n",
    "from langchain.schema.output_parser import StrOutputParser\n",
    "from langchain.schema.runnable.passthrough import RunnableAssign\n",
+    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "# Prompt\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
@@ -126,7 +126,6 @@
   "source": [
    "import uuid\n",
    "\n",
-    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.document_loaders import PyPDFLoader\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.prompts import ChatPromptTemplate\n",
@@ -138,6 +137,7 @@
    "from langchain.storage import InMemoryStore\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.vectorstores import Chroma\n",
+    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "\n",
    "def prepare_documents(docs):\n",
@@ -93,7 +93,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
   "id": "3b9b82fc-b689-4a25-b718-99ecc2fc6867",
   "metadata": {
    "tags": []
@@ -136,19 +136,21 @@
       "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n",
       "\n",
       "Success is measured by the ability to answer the question correctly, and efficiently.              </td></tr>\n",
-       "<tr><td>Multiverse Math                   </td><td>ToolUsageTask</td><td><a href=\"https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d\" target=\"_blank\" rel=\"noopener\">594f9f60-30a0-49bf-b075-f44beabf546a</a></td><td>An environment that contains a few basic math operations, but with altered results.\n",
+       "<tr><td>Multiverse Math                   </td><td>ToolUsageTask</td><td><a href=\"https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d\" target=\"_blank\" rel=\"noopener\">47ed57bc-e852-4f84-a23e-cce4793864e9</a></td><td>An environment that contains a few basic math operations, but with altered results.\n",
       "\n",
       "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n",
       "\n",
-       "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.              </td></tr>\n",
+       "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\n",
+       "\n",
+       "This task is associated with 20 test examples.              </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ],
      "text/plain": [
-       "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x12778be20>, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x1277c0360>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=<function get_environment at 0x12778b920>, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. 
The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=<function get_environment at 0x12778b240>, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your  innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})])"
+       "Registry(tasks=[ToolUsageTask(name='Tool Usage - Typewriter (1 tool)', dataset_id='https://smith.langchain.com/public/59577193-8938-4ccf-92a7-e8a96bcf4f86/d', description=\"Environment with a single tool that accepts a single letter as input, and prints it on a piece of virtual paper.\\n\\nThe objective of this task is to evaluate the ability of the model to use the provided tools to repeat a given input string.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\", create_environment=<function get_environment at 0x7b3a9f5fad40>, instructions=\"Repeat the given string using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must print the letters 'a', 'b', and 'c' one at a time and in that order. \", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x7b3a9f5fb240>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. 
Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'}), ToolUsageTask(name='Tool Usage - Relational Data', dataset_id='https://smith.langchain.com/public/1d89f4b3-5f73-48cf-a127-2fdeb22f6d84/d', description='Environment with fake data about users and their locations and favorite foods.\\n\\nThe environment provides a set of tools that can be used to query the data.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to answer questions about relational data.\\n\\nThe dataset contains 21 examples of varying difficulty. The difficulty is measured by the number of tools that need to be used to answer the question.\\n\\nEach example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\\n\\nSuccess is measured by the ability to answer the question correctly, and efficiently.\\n', create_environment=<function get_environment at 0x7b3a9f5fa840>, instructions=\"Please answer the user's question by using the tools provided. Do not guess the answer. Keep in mind that entities like users,foods and locations have both a name and an ID, which are not the same.\", eval_params={}), ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/47ed57bc-e852-4f84-a23e-cce4793864e9/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. 
The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n\\nThis task is associated with 20 test examples.\\n', create_environment=<function get_environment at 0x7b3a9f5fa200>, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your  innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math_without_question'})])"
      ]
     },
-     "execution_count": 2,
+     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -169,7 +171,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "id": "7543739b-d212-4249-9b4a-fc406a58c9c7",
   "metadata": {
    "tags": []
@@ -198,10 +200,10 @@
       "</table>"
      ],
      "text/plain": [
-       "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x1277c0360>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})"
+       "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x7b3a9f5fb240>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})"
      ]
     },
-     "execution_count": 3,
+     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -248,7 +250,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "id": "f201dbbe-7d92-4bc7-b4b5-ea8901dd2970",
   "metadata": {
    "tags": []
@@ -257,13 +259,13 @@
    {
     "data": {
      "text/plain": [
-       "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=<class 'pydantic.v1.main.aSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x1277c18a0>),\n",
-       " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=<class 'pydantic.v1.main.bSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x1277c13a0>),\n",
-       " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=<class 'pydantic.v1.main.cSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x1277c19e0>),\n",
-       " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=<class 'pydantic.v1.main.dSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x1277c1800>)]"
+       "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=<class 'pydantic.v1.main.aSchema'>, func=<function _create_typing_func.<locals>.func at 0x7b3a9f62c9a0>),\n",
+       " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=<class 'pydantic.v1.main.bSchema'>, func=<function _create_typing_func.<locals>.func at 0x7b3a9f62c5e0>),\n",
+       " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=<class 'pydantic.v1.main.cSchema'>, func=<function _create_typing_func.<locals>.func at 0x7b3a9f62cae0>),\n",
+       " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=<class 'pydantic.v1.main.dSchema'>, func=<function _create_typing_func.<locals>.func at 0x7b3a9f62cb80>)]"
      ]
     },
-     "execution_count": 4,
+     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -275,7 +277,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "id": "b07957ee-ae52-47d4-a4ff-aa99d4d9bdaf",
   "metadata": {
    "tags": []
@@ -287,7 +289,7 @@
       "'OK'"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -300,7 +302,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
   "id": "40fbb9b6-00f6-4445-b480-00eed6b5b3aa",
   "metadata": {
    "tags": []
@@ -312,7 +314,7 @@
       "'aac'"
      ]
     },
-     "execution_count": 6,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -326,134 +328,118 @@
   "id": "8d39b9b3-d4da-49bc-b3db-8a4165b1db55",
   "metadata": {},
   "source": [
-    "## Creating an agent\n",
+    "## Create an Agent!\n",
    "\n",
-    "So now that you know how the test environment works, it's time to define an agent! \n",
+    "Now that you know how the test environment works, let's create an agent that we can test!\n",
    "\n",
-    "We will follow the example in the LangChain documentation to [define an OpenAI tool using agent](https://python.langchain.com/docs/modules/agents/). "
+    "Because an agent interacts with the environment via tools and can change the state of the environment during the course of an agent run, what we actually want is the ability to create a fresh agent and a fresh environment for each test run.\n",
+    "\n",
+    "We'll do this using a factory. A factory is just a fancy name in computer science for an object that can create other objects. In this case, we'll have an Agent Factory that we can call and it'll create a fresh agent for us on each call.\n",
+    "\n",
+    "We'll use the StandardAgentFactory, which under the hood creates a standard LangChain [tool calling agent](https://python.langchain.com/docs/modules/agents/agent_types/tool_calling/). It can be used with any [Chat Model that supports tool calling](https://python.langchain.com/docs/integrations/chat/)."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
-   "id": "8827186a-8ed3-43c7-956c-71342e0a7bf2",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 7,
+   "id": "db65c253-7710-4c7b-b968-0662ec089030",
+   "metadata": {},
   "outputs": [],
   "source": [
-    "from langchain.agents.format_scratchpad.openai_tools import (\n",
-    "    format_to_openai_tool_messages,\n",
-    ")\n",
-    "from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser\n",
-    "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
-    "from langchain.tools.render import (\n",
-    "    format_tool_to_openai_function,\n",
-    "    format_tool_to_openai_tool,\n",
-    ")\n",
-    "from langchain_community.chat_models import ChatOpenAI\n",
-    "from langchain_core.runnables import RunnableParallel\n",
+    "from langchain_anthropic.chat_models import ChatAnthropic\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
    "\n",
-    "tools = task.create_environment().tools\n",
-    "formatted_tools = [format_tool_to_openai_tool(t) for t in tools]\n",
-    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-1106\", temperature=0, model_kwargs={\"seed\": 42})\n",
-    "# Compose the llm call with the tools' JSON schemas\n",
-    "llm_with_tools = llm.bind(tools=formatted_tools)\n",
-    "format_inputs = RunnableParallel(\n",
-    "    {\n",
-    "        \"input\": lambda x: x[\"input\"],\n",
-    "        \"agent_scratchpad\": lambda x: format_to_openai_tool_messages(\n",
-    "            x[\"intermediate_steps\"]\n",
-    "        ),\n",
-    "    }\n",
-    ")\n",
+    "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n",
    "\n",
+    "model = ChatAnthropic(model=\"claude-3-opus-20240229\", temperature=0)\n",
    "prompt = ChatPromptTemplate.from_messages(\n",
    "    [\n",
+    "        (\"system\", \"{instructions}\"),  # Populated from task.instructions automatically\n",
    "        (\n",
-    "            \"system\",\n",
-    "            \"You are very powerful assistant, but bad at calculating lengths of words.\",\n",
-    "        ),\n",
-    "        (\"user\", \"{input}\"),\n",
-    "        MessagesPlaceholder(variable_name=\"agent_scratchpad\"),\n",
+    "            \"human\",\n",
+    "            \"{question}\",\n",
+    "        ),  # Each evaluation example is associated with a question\n",
+    "        (\"placeholder\", \"{agent_scratchpad}\"),  # Space for the agent to do work\n",
    "    ]\n",
    ")\n",
-    "agent_definition = (\n",
-    "    # Input to this pipeline is a dictionary with \"input\" and \"intermediate_steps\" keys\n",
-    "    format_inputs | prompt | llm_with_tools | OpenAIToolsAgentOutputParser()\n",
-    ")"
+    "\n",
+    "agent_factory = StandardAgentFactory(task, model, prompt)"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "7614ab73-dc66-4f2e-9eeb-ff1711c113d0",
+   "id": "5c99a9bd-fa3e-4401-9062-77dbcff30d5c",
   "metadata": {},
   "source": [
-    "### Agent Factory\n",
-    "\n",
-    "As discussed above, each test environment tracks state. We want to create a new environment for each data point to avoid cross-contamination between rows in the dataset.\n",
-    "\n",
-    "We do this by defining an agent factory. Below, we integrate our agent into a `CustomRunnableAgentFactory`, which helps create the environment and agent executor for each data point."
+    "Here are the instructions for the task:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
-   "id": "629416b3-b5d6-45ad-9bda-4f0642a0eb13",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "execution_count": 9,
+   "id": "8e1f0a3d-fed6-41f7-8825-08787a57ad98",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\""
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "from langchain_benchmarks.tool_usage.agents import CustomRunnableAgentFactory\n",
-    "\n",
-    "agent_factory = CustomRunnableAgentFactory(task, agent=agent_definition)"
+    "task.instructions"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "7f06cf25-6766-4ea5-a566-36af045bdcf4",
+   "id": "82c9de5d-185b-4776-9ee9-112a2db32139",
   "metadata": {},
   "source": [
-    "Let's check that the agent works"
+    "Let's test it out"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
-   "id": "755f7920-831b-4595-8c6d-cca22c935198",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "execution_count": 10,
+   "id": "ce67d619-fa99-4c15-bc53-3fb08b40a201",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
+      "\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `a` with `{}`\n",
+      "responded: [{'text': '<thinking>\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n</thinking>', 'type': 'text'}, {'id': 'toolu_01MQ6oTx2j2uNGCR5LBVeKui', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01AytT1jvNNR67VodMkhbq7r', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_015VkTYUV5hWcobtduqssi9k', 'input': {}, 'name': 'c', 'type': 'tool_use'}]\n",
+      "\n",
+      "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `b` with `{}`\n",
+      "responded: [{'text': '<thinking>\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n</thinking>', 'type': 'text'}, {'id': 'toolu_01MQ6oTx2j2uNGCR5LBVeKui', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01AytT1jvNNR67VodMkhbq7r', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_015VkTYUV5hWcobtduqssi9k', 'input': {}, 'name': 'c', 'type': 'tool_use'}]\n",
+      "\n",
+      "\u001b[0m\u001b[33;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `c` with `{}`\n",
+      "responded: [{'text': '<thinking>\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n</thinking>', 'type': 'text'}, {'id': 'toolu_01MQ6oTx2j2uNGCR5LBVeKui', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01AytT1jvNNR67VodMkhbq7r', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_015VkTYUV5hWcobtduqssi9k', 'input': {}, 'name': 'c', 'type': 'tool_use'}]\n",
+      "\n",
+      "\u001b[0m\u001b[38;5;200m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m[]\u001b[0m\n",
+      "\n",
+      "\u001b[1m> Finished chain.\u001b[0m\n"
+     ]
+    }
+   ],
   "source": [
    "from langchain import globals\n",
    "\n",
-    "globals.set_verbose(True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c2804eae-5b0b-4a38-9dff-363a4fe8f324",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
+    "globals.set_verbose(True)\n",
    "agent = agent_factory()\n",
-    "agent.invoke({\"question\": \"abc\"})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "2aa68a11-d268-4868-a862-309801201989",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
+    "agent.invoke({\"question\": \"abc\"})\n",
    "globals.set_verbose(False)"
   ]
  },
@@ -485,12 +471,12 @@
   "id": "5e9e5817-3b9d-4a1e-8ee8-692d39aa68ca",
   "metadata": {},
   "source": [
-    "This evaluator will be used below when we benchmark on all tasks!"
+    "Each task is associated with its own task specific evaluator!"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
   "id": "c88bd6e1-f77e-4668-a143-096929e897ee",
   "metadata": {
    "tags": []
@@ -499,10 +485,10 @@
    {
     "data": {
      "text/plain": [
-       "RunEvalConfig(evaluators=[], custom_evaluators=[<langchain_benchmarks.tool_usage.evaluators.AgentTrajectoryEvaluator object at 0x15699ed10>], reference_key=None, prediction_key=None, input_key=None, eval_llm=None)"
+       "RunEvalConfig(evaluators=[], custom_evaluators=[<langchain_benchmarks.tool_usage.evaluators.AgentTrajectoryEvaluator object at 0x7b3a9ea5b110>], batch_evaluators=None, reference_key=None, prediction_key=None, input_key=None, eval_llm=None)"
      ]
     },
-     "execution_count": 13,
+     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -522,14 +508,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
-   "id": "60466447-eb37-4204-a497-fe47e8d8dd70",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 11,
+   "id": "0770b442-f96a-4670-a4f7-3093f24fb64b",
+   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
+    "import uuid\n",
    "\n",
    "from langsmith.client import Client\n",
    "\n",
@@ -539,110 +524,89 @@
    "    model_registry,\n",
    "    registry,\n",
    ")\n",
-    "from langchain_benchmarks.rate_limiting import RateLimiter\n",
-    "from langchain_benchmarks.tool_usage.agents import (\n",
-    "    AnthropicToolUserFactory,\n",
-    "    CustomAgentFactory,\n",
-    "    CustomRunnableAgentFactory,\n",
-    "    OpenAIAgentFactory,\n",
-    "    OpenAIAssistantFactory,\n",
-    ")"
+    "from langchain_benchmarks.rate_limiting import RateLimiter"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15cbded4-5ab5-4b9b-9e88-77b24d3b750c",
+   "metadata": {},
+   "source": [
+    "Create an experiment ID. We'll use it to tag our runs, which we can later use to retrieve run data from LangSmith."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
-   "id": "c448d139-9923-4cf6-af49-cbf3dff46bdc",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 12,
+   "id": "c23208e3-01d1-4e83-9e4a-59544828f6f5",
+   "metadata": {},
   "outputs": [],
   "source": [
-    "import uuid\n",
-    "\n",
-    "experiment_uuid = uuid.uuid4().hex[:]"
+    "experiment_id = uuid.uuid4().hex[:]"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "200df769-4dd9-453b-8500-219c1d5305f6",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "cell_type": "markdown",
+   "id": "83050cfc-f50f-4c63-8257-07e7688a54c4",
+   "metadata": {},
   "source": [
-    "tests = [\n",
-    "    # 2-tuple of (architecture, model name)\n",
-    "    (\"openai_functions\", \"gpt-3.5-turbo-1106\"),  # Requires OpenAI Creds\n",
-    "    (\"openai_functions\", \"gpt-3.5-turbo-0613\"),\n",
-    "    (\"openai_functions\", \"gpt-4-1106-preview\"),\n",
-    "    (\"openai_functions\", \"gpt-4-0613\"),\n",
-    "    (\"openai_functions\", \"mistral-7b-instruct-v0.1\"),  # Requires AnyScale creds\n",
-    "    # Requires Anthropic Creds and Setting up Anthropics Tool Usage package.\n",
-    "    # (\n",
-    "    #     \"anthropic_tool_user\",\n",
-    "    #     \"claude-2.1\",\n",
-    "    # ),\n",
-    "]"
+    "Run evaluation against all tasks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "5ddf7355-7db9-4adc-bc1e-f04c3d0ec57d",
-   "metadata": {
-    "tags": []
-   },
+   "id": "b2a3463b-1c9f-494b-bcbd-1dc1760ebf19",
+   "metadata": {},
   "outputs": [],
   "source": [
    "client = Client()  # Launch langsmith client for cloning datasets\n",
    "today = datetime.date.today().isoformat()\n",
-    "rate_limiter = RateLimiter(requests_per_second=2)\n",
    "\n",
-    "for task in registry:\n",
+    "# You can use an optional rate limiter to rate limit your requests!\n",
+    "rate_limiter = RateLimiter(requests_per_second=1)\n",
+    "\n",
+    "\n",
+    "# Set up 2-tuples of (model name, model instance)\n",
+    "# You can update this list with any model that supports tool calling.\n",
+    "# See list here: https://python.langchain.com/docs/integrations/chat/\n",
+    "tests = [\n",
+    "    (\n",
+    "        \"claude-3-haiku-20240307\",\n",
+    "        ChatAnthropic(model=\"claude-3-haiku-20240307\", temperature=0),\n",
+    "    )\n",
+    "]\n",
+    "\n",
+    "\n",
+    "for task in registry.tasks:\n",
    "    if task.type != \"ToolUsageTask\":\n",
    "        continue\n",
    "\n",
-    "    dataset_name = task.name\n",
+    "    dataset_name = task.name + f\" ({today})\"\n",
    "    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)\n",
    "\n",
-    "    for arch, model in tests:\n",
+    "    for model_name, model in tests:\n",
    "        print()\n",
-    "        print(f\"Benchmarking {task.name} with model: {model} and arch: {arch}\")\n",
+    "        print(f\"Benchmarking {task.name} with model: {model_name}\")\n",
    "        eval_config = task.get_eval_config()\n",
    "\n",
-    "        if arch == \"openai_functions\":\n",
-    "            agent_factory = OpenAIAgentFactory(\n",
-    "                task, model=model, rate_limiter=rate_limiter\n",
-    "            )\n",
-    "        elif arch == \"custom_agent\":\n",
-    "            agent_factory = CustomAgentFactory(\n",
-    "                task, model=model, rate_limiter=rate_limiter\n",
-    "            )\n",
-    "        elif arch == \"custom_runnable_agent\":\n",
-    "            # For this, the model would have to be a runnable object\n",
-    "            agent_factory = CustomRunnableAgentFactory(task, agent=model)\n",
-    "        elif arch == \"anthropic_tool_user\":\n",
-    "            agent_factory = AnthropicToolUserFactory(task)\n",
-    "        else:\n",
-    "            raise ValueError()\n",
+    "        agent_factory = StandardAgentFactory(\n",
+    "            task, model, prompt, rate_limiter=rate_limiter\n",
+    "        )\n",
    "\n",
    "        client.run_on_dataset(\n",
    "            dataset_name=dataset_name,\n",
    "            llm_or_chain_factory=agent_factory,\n",
    "            evaluation=eval_config,\n",
    "            verbose=False,\n",
-    "            project_name=f\"{model}-{task.name}-{today}-{experiment_uuid}\",\n",
-    "            tags=[model],\n",
+    "            project_name=f\"{model_name}-{task.name}-{today}-{experiment_id}\",\n",
    "            concurrency_level=5,\n",
    "            project_metadata={\n",
-    "                \"model\": model,\n",
+    "                \"model\": model_name,\n",
    "                \"id\": experiment_uuid,\n",
    "                \"task\": task.name,\n",
    "                \"date\": today,\n",
    "                \"langchain_benchmarks_version\": __version__,\n",
-    "                \"arch\": arch,\n",
    "            },\n",
    "        )"
   ]
@@ -656,6 +620,8 @@
    "\n",
    "The following sections demonstrate slightly more \"advanced\" usage if you want to completely customize the agent runtime in a way that is compatible with our test runner.\n",
    "\n",
+    "We'll also apply an adapter to the agent which will capture its inputs and outputs (e.g., add information about the agent's environment at the end of the run) so that we can evaluate it.\n",
+    "\n",
    "### Custom Agent Factory\n",
    "\n",
    "If you want even more configurability beyond what the `CustomRunnableAgentFactory` provides, you can create your owne `AgentFactory` using the following pattern.\n",
@@ -666,33 +632,33 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
-   "id": "bca8ad69-9956-451c-b639-ea30c77d982f",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 16,
+   "id": "69351864-2e97-43df-81ae-5067cbf5e471",
+   "metadata": {},
   "outputs": [],
   "source": [
-    "from langchain.agents import AgentType, initialize_agent\n",
-    "from langchain.chat_models import ChatOpenAI\n",
+    "from typing import Optional\n",
+    "\n",
+    "from langchain.agents import AgentExecutor, create_tool_calling_agent\n",
+    "from langchain_anthropic import ChatAnthropic\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
    "\n",
    "from langchain_benchmarks.schema import ExtractionTask\n",
-    "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "44839ebe-48ea-4d5b-87b4-2ad72acacb71",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "class AgentFactory:\n",
-    "    def __init__(self, task: ExtractionTask, model: str) -> None:\n",
+    "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter\n",
+    "\n",
+    "\n",
+    "class CustomAgentFactory:\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        task: ExtractionTask,\n",
+    "        *,\n",
+    "        # It can be useful to add a rate-limiter\n",
+    "        # which will limit the number of requests per second\n",
+    "        # when running evaluation.\n",
+    "        rate_limiter: Optional[RateLimiter] = None,\n",
+    "    ) -> None:\n",
    "        self.task = task\n",
-    "        self.model = model\n",
+    "        self.rate_limiter = rate_limiter\n",
    "\n",
    "    def __call__(self):\n",
    "        # This factory creates a new environment for every agent run.\n",
@@ -701,63 +667,127 @@
    "        # At the end of the run, the environment state will be read.\n",
    "        env = task.create_environment()  # Create a new environment for every agent run!\n",
    "        tools = env.tools\n",
-    "        llm = ChatOpenAI(temperature=0, model=self.model)\n",
-    "        agent_executor = initialize_agent(\n",
-    "            tools,\n",
-    "            llm,\n",
-    "            agent=AgentType.OPENAI_FUNCTIONS,\n",
-    "            return_intermediate_steps=True,\n",
-    "            handle_parsing_errors=True,\n",
+    "        model = ChatAnthropic(model=\"claude-3-opus-20240229\", temperature=0)\n",
+    "        prompt = ChatPromptTemplate.from_messages(\n",
+    "            [\n",
+    "                (\"system\", self.task.instructions),\n",
+    "                (\n",
+    "                    \"human\",\n",
+    "                    \"{question}\",\n",
+    "                ),  # Each evaluation example is associated with a question\n",
+    "                (\"placeholder\", \"{agent_scratchpad}\"),\n",
+    "            ]\n",
    "        )\n",
+    "\n",
+    "        # This is the standard tool calling agent implementation\n",
+    "        # Feel free to replace it with any other implementation you want!\n",
+    "        # https://python.langchain.com/docs/modules/agents/how_to/custom_agent/\n",
+    "        agent = create_tool_calling_agent(model, env.tools, prompt)\n",
+    "\n",
+    "        if self.rate_limiter:\n",
+    "            agent = with_rate_limit(agent, self.rate_limiter)\n",
+    "\n",
+    "        executor = AgentExecutor(\n",
+    "            agent=agent,\n",
+    "            tools=env.tools,\n",
+    "            handle_parsing_errors=True,\n",
+    "            return_intermediate_steps=True,\n",
+    "        )\n",
+    "\n",
    "        # Apply the adapters so that inputs and outputs match dataset schema\n",
    "        # state_reader automatically adds the state of the environment at the end of the run.\n",
-    "        return apply_agent_executor_adapter(agent_executor, state_reader=env.read_state)"
+    "        return apply_agent_executor_adapter(executor, state_reader=env.read_state)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 24,
-   "id": "8b6108e4-c7cc-42e8-a23d-89c7b94fab6c",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 17,
+   "id": "18a96a6f-812b-4b0e-83c5-d001bf50851e",
+   "metadata": {},
   "outputs": [
    {
     "data": {
+      "text/html": [
+       "<table>\n",
+       "<tbody>\n",
+       "<tr><td>Name       </td><td>Tool Usage - Typewriter (26 tools)                                                                                                                         </td></tr>\n",
+       "<tr><td>Type       </td><td>ToolUsageTask                                                                                                                                              </td></tr>\n",
+       "<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d\" target=\"_blank\" rel=\"noopener\">128af05e-aa00-4e3b-a958-d166dd450581</a></td></tr>\n",
+       "<tr><td>Description</td><td>Environment with 26 tools each tool represents a letter of the alphabet.\n",
+       "\n",
+       "The objective of this task is to evaluate the model's ability the use tools\n",
+       "for a simple repetition task.\n",
+       "\n",
+       "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n",
+       "\n",
+       "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n",
+       "\n",
+       "This is a variation of the typer writer task, where 26 parameterless tools are\n",
+       "given instead of a single tool that takes a letter as an argument.                                                                                                                                                            </td></tr>\n",
+       "</tbody>\n",
+       "</table>"
+      ],
      "text/plain": [
-       "{'input': 'xypxy',\n",
-       " 'output': 'I have typed \"xypxy\" as you requested.',\n",
-       " 'intermediate_steps': [(AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n",
-       "   'OK'),\n",
-       "  (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n",
-       "   'OK'),\n",
-       "  (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'p'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'p'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"p\"\\n}', 'name': 'type_letter'}})]),\n",
-       "   'OK'),\n",
-       "  (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'x'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'x'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"x\"\\n}', 'name': 'type_letter'}})]),\n",
-       "   'OK'),\n",
-       "  (AgentActionMessageLog(tool='type_letter', tool_input={'letter': 'y'}, log=\"\\nInvoking: `type_letter` with `{'letter': 'y'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\\n  \"letter\": \"y\"\\n}', 'name': 'type_letter'}})]),\n",
-       "   'OK')],\n",
-       " 'state': 'xypxy'}"
+       "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x78972c6c3060>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})"
      ]
     },
-     "execution_count": 24,
+     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "agent_factory = AgentFactory(task, \"gpt-4\")\n",
-    "agent = agent_factory()\n",
-    "agent.invoke({\"question\": \"xypxy\"})"
+    "task"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "id": "9bdf9328-0103-48d3-8dfc-933423db9796",
+   "execution_count": 18,
+   "id": "a7bd4af3-c0f1-4308-abbf-330d7497b3e3",
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "custom_agent_factory = CustomAgentFactory(task)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "c5b69b7c-4294-47d1-85d7-47d718945898",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent = custom_agent_factory()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "1ac24ef5-d3ca-41aa-b888-7ebcd8a92ff4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'abc',\n",
+       " 'output': [],\n",
+       " 'intermediate_steps': [(ToolAgentAction(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\nresponded: [{\\'text\\': \\'<thinking>\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n</thinking>\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '<thinking>\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n</thinking>', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_016f6CZwwFmdz2h8KbdGRVjj'),\n",
+       "   'OK'),\n",
+       "  (ToolAgentAction(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\nresponded: [{\\'text\\': \\'<thinking>\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n</thinking>\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '<thinking>\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n</thinking>', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_01JvfeTpU3hEuS7PknFk5a8S'),\n",
+       "   'OK'),\n",
+       "  (ToolAgentAction(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\nresponded: [{\\'text\\': \\'<thinking>\\\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\\\n</thinking>\\', \\'type\\': \\'text\\'}, {\\'id\\': \\'toolu_016f6CZwwFmdz2h8KbdGRVjj\\', \\'input\\': {}, \\'name\\': \\'a\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01JvfeTpU3hEuS7PknFk5a8S\\', \\'input\\': {}, \\'name\\': \\'b\\', \\'type\\': \\'tool_use\\'}, {\\'id\\': \\'toolu_01NbBCY5Fg62RsyAAUd4n2g1\\', \\'input\\': {}, \\'name\\': \\'c\\', \\'type\\': \\'tool_use\\'}]\\n\\n', message_log=[AIMessageChunk(content=[{'text': '<thinking>\\nTo repeat the string \"abc\", I need to call the a(), b(), and c() functions in that order. No parameters are required for these functions.\\n</thinking>', 'type': 'text'}, {'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'input': {}, 'name': 'a', 'type': 'tool_use'}, {'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'input': {}, 'name': 'b', 'type': 'tool_use'}, {'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'input': {}, 'name': 'c', 'type': 'tool_use'}], id='run-42ea263e-e52a-4fc7-8aa3-71e16a9db42b', tool_calls=[{'name': 'a', 'args': {}, 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj'}, {'name': 'b', 'args': {}, 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S'}, {'name': 'c', 'args': {}, 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'toolu_016f6CZwwFmdz2h8KbdGRVjj', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'toolu_01JvfeTpU3hEuS7PknFk5a8S', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'toolu_01NbBCY5Fg62RsyAAUd4n2g1', 'index': 2}])], tool_call_id='toolu_01NbBCY5Fg62RsyAAUd4n2g1'),\n",
+       "   'OK')],\n",
+       " 'state': 'abc'}"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "agent.invoke({\"question\": \"abc\"})"
+   ]
  }
 ],
 "metadata": {
@@ -776,7 +806,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.2"
+   "version": "3.11.4"
  }
 },
 "nbformat": 4,
@@ -88,7 +88,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "id": "27b6b0fd-639d-43a7-a730-9acdc5b2f102",
   "metadata": {
    "tags": []
@@ -97,14 +97,14 @@
    {
     "data": {
      "text/plain": [
-       "[StructuredTool(name='get_user_name', description=\"get_user_name(user_id: int) -> str - Get the name of the user with the given user ID.\\n\\n        Args:\\n            user_id: The user's ID.\\n\\n        Returns:\\n            The user's name.\", args_schema=<class 'pydantic.v1.main.get_user_nameSchemaSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.get_user_name at 0x7fbb0e864f40>),\n",
-       " StructuredTool(name='list_user_ids', description='list_user_ids() -> List[str] - List all the user IDs.', args_schema=<class 'pydantic.v1.main.list_user_idsSchemaSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.list_user_ids at 0x7fbb0e864fe0>),\n",
-       " StructuredTool(name='find_users_by_name', description='find_users_by_name(name: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find users with the given name.\\n\\n        Args:\\n            name: The name to search for.\\n\\n        Returns:\\n            The list of matching users.', args_schema=<class 'pydantic.v1.main.find_users_by_nameSchemaSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.find_users_by_name at 0x7fbb0e865080>),\n",
-       " StructuredTool(name='find_locations_by_name', description='find_locations_by_name(city: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find locations with the given city name.', args_schema=<class 'pydantic.v1.main.find_locations_by_nameSchemaSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.find_locations_by_name at 0x7fbb0e865120>),\n",
-       " StructuredTool(name='find_foods_by_name', description='find_foods_by_name(food: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find foods with the given name.', args_schema=<class 'pydantic.v1.main.find_foods_by_nameSchemaSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.find_foods_by_name at 0x7fbb0e8651c0>)]"
+       "[StructuredTool(name='get_user_name', description=\"get_user_name(user_id: int) -> str - Get the name of the user with the given user ID.\\n\\n        Args:\\n            user_id: The user's ID.\\n\\n        Returns:\\n            The user's name.\", args_schema=<class 'pydantic.v1.main.get_user_nameSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.get_user_name at 0x78f30602fec0>),\n",
+       " StructuredTool(name='list_user_ids', description='list_user_ids() -> List[str] - List all the user IDs.', args_schema=<class 'pydantic.v1.main.list_user_idsSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.list_user_ids at 0x78f30602fe20>),\n",
+       " StructuredTool(name='find_users_by_name', description='find_users_by_name(name: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find users with the given name.\\n\\n        Args:\\n            name: The name to search for.\\n\\n        Returns:\\n            The list of matching users.', args_schema=<class 'pydantic.v1.main.find_users_by_nameSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.find_users_by_name at 0x78f306058040>),\n",
+       " StructuredTool(name='find_locations_by_name', description='find_locations_by_name(city: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find locations with the given city name.', args_schema=<class 'pydantic.v1.main.find_locations_by_nameSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.find_locations_by_name at 0x78f3060580e0>),\n",
+       " StructuredTool(name='find_foods_by_name', description='find_foods_by_name(food: str) -> List[langchain_benchmarks.tool_usage.tasks.relational_data.SearchHit] - Find foods with the given name.', args_schema=<class 'pydantic.v1.main.find_foods_by_nameSchema'>, handle_tool_error=True, func=<function get_available_functions.<locals>.find_foods_by_name at 0x78f306058180>)]"
      ]
     },
-     "execution_count": 4,
+     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -116,7 +116,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "id": "7f1c1242-449c-4536-863d-b62bf6d2dff1",
   "metadata": {
    "tags": []
@@ -128,7 +128,7 @@
       "'Bob'"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -139,7 +139,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
   "id": "854e139b-a120-4012-bdf4-6394e0b1c42d",
   "metadata": {
    "tags": []
@@ -155,7 +155,7 @@
       " {'id': 5, 'city': 'Miami'}]"
      ]
     },
-     "execution_count": 6,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -169,105 +169,46 @@
   "id": "b462f7b8-fd42-4613-ab5f-5f3cbbc37d28",
   "metadata": {},
   "source": [
-    "## Agent Factory\n",
+    "## Explore the task\n",
    "\n",
    "For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n",
    "\n",
-    "The `AgentExecutor` should accept `question` as an input and include the fields `output`, `intermediate_steps` and potentially `state` in its response -- for this we\n",
-    "will wrap the agent executor in an adapter (`apply_agent_executor_adapter`) that will help match the expected schema.\n",
-    "\n",
-    "Please reference the LangChain documentation to see how to [use and implement agents](https://python.langchain.com/docs/modules/agents/)"
+    "We'll use the `StandardAgentFactory` -- look at the `intro` for more information about what it does and/or how to create a custom one."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
-   "id": "1c2d80d2-4ddf-4b80-b6c5-331133a85314",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.agents import AgentType, initialize_agent\n",
-    "from langchain.chat_models import ChatOpenAI\n",
-    "\n",
-    "from langchain_benchmarks.schema import ExtractionTask\n",
-    "from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
   "id": "81c0e4a1-f56e-4117-8804-4161c642b068",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
-    "class AgentFactory:\n",
-    "    def __init__(self, task: ExtractionTask, model: str) -> None:\n",
-    "        self.task = task\n",
-    "        self.model = model\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
+    "from langchain_openai.chat_models import ChatOpenAI\n",
    "\n",
-    "    def __call__(self):\n",
-    "        # This factory creates a new environment for every agent run.\n",
-    "        # The reason is that the environment may be associated with an environment state (e.g., typewriter)\n",
-    "        # which is changed by the actions of the agent.\n",
-    "        # At the end of the run, the environment state will be read.\n",
-    "        env = task.create_environment()  # Create a new environment for every agent run!\n",
-    "        tools = env.tools\n",
-    "        llm = ChatOpenAI(temperature=0, model=self.model)\n",
-    "        agent_executor = initialize_agent(\n",
-    "            tools,\n",
-    "            llm,\n",
-    "            agent=AgentType.OPENAI_FUNCTIONS,\n",
-    "            return_intermediate_steps=True,\n",
-    "            handle_parsing_errors=True,\n",
-    "        )\n",
-    "        # Apply the adapters so that inputs and outputs match dataset schema\n",
-    "        # state_reader automatically adds the state of the environment at the end of the run.\n",
-    "        return apply_agent_executor_adapter(agent_executor, state_reader=env.read_state)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "0ae8c6be-899c-44a6-a89b-0fc04c2cb05c",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-32k-0613\"]\n",
-    "agent_factory = AgentFactory(task, models[0])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "87a64f76-65ae-4367-b43f-f2be3431e7af",
-   "metadata": {},
-   "source": [
-    "Let's test that our agent works"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "127a8aa5-839c-469c-a870-7b498f37c187",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain import globals\n",
+    "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n",
    "\n",
-    "globals.set_verbose(True)"
+    "model = ChatOpenAI(temperature=0)\n",
+    "prompt = ChatPromptTemplate.from_messages(\n",
+    "    [\n",
+    "        (\"system\", \"{instructions}\"),  # Populated from task.instructions automatically\n",
+    "        (\"human\", \"{question}\"),  # Populated from the test data\n",
+    "        (\n",
+    "            \"placeholder\",\n",
+    "            \"{agent_scratchpad}\",\n",
+    "        ),  # Scratchpad where the agent can do its work (e.g., call multiple tools)\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "agent_factory = StandardAgentFactory(task, model, prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
-   "id": "0e4896fa-3633-44a1-857f-80a263cf2e03",
+   "id": "382ff2f6-8099-415e-a58c-e659345f52fc",
   "metadata": {
    "tags": []
   },
@@ -280,11 +221,11 @@
      "\n",
      "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
      "\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `find_locations_by_name` with `{'city': 'Los Angeles'}`\n",
+      "Invoking: `find_locations_by_name` with `{'city': 'LA'}`\n",
      "\n",
      "\n",
-      "\u001b[0m\u001b[36;1m\u001b[1;3m[{'id': 2, 'city': 'Los Angeles'}, {'id': 4, 'city': 'Houston'}, {'id': 1, 'city': 'New York'}, {'id': 3, 'city': 'Chicago'}, {'id': 5, 'city': 'Miami'}]\u001b[0m\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `get_weather_at_location` with `{'location_id': 2}`\n",
+      "\u001b[0m\u001b[36;1m\u001b[1;3m[{'id': 2, 'city': 'Los Angeles'}, {'id': 1, 'city': 'New York'}, {'id': 3, 'city': 'Chicago'}, {'id': 4, 'city': 'Houston'}, {'id': 5, 'city': 'Miami'}]\u001b[0m\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `get_current_weather_for_location` with `{'location_id': 2}`\n",
      "\n",
      "\n",
      "\u001b[0m\u001b[36;1m\u001b[1;3mSunny, Temperature: 75°F\u001b[0m\u001b[32;1m\u001b[1;3mThe weather in Los Angeles is sunny with a temperature of 75°F.\u001b[0m\n",
@@ -295,15 +236,15 @@
    {
     "data": {
      "text/plain": [
-       "{'input': 'whats the weather in LA?',\n",
+       "{'question': 'what is the weather in LA',\n",
       " 'output': 'The weather in Los Angeles is sunny with a temperature of 75°F.',\n",
-       " 'intermediate_steps': [(AgentActionMessageLog(tool='find_locations_by_name', tool_input={'city': 'Los Angeles'}, log=\"\\nInvoking: `find_locations_by_name` with `{'city': 'Los Angeles'}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\"city\":\"Los Angeles\"}', 'name': 'find_locations_by_name'}})]),\n",
+       " 'intermediate_steps': [(ToolAgentAction(tool='find_locations_by_name', tool_input={'city': 'LA'}, log=\"\\nInvoking: `find_locations_by_name` with `{'city': 'LA'}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_hJrCZgP4eDgaj6s4RtCKXTOo', 'function': {'arguments': '{\"city\":\"LA\"}', 'name': 'find_locations_by_name'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-23ccffb0-3b17-46a4-b42e-5eaa3220b211', tool_calls=[{'name': 'find_locations_by_name', 'args': {'city': 'LA'}, 'id': 'call_hJrCZgP4eDgaj6s4RtCKXTOo'}], tool_call_chunks=[{'name': 'find_locations_by_name', 'args': '{\"city\":\"LA\"}', 'id': 'call_hJrCZgP4eDgaj6s4RtCKXTOo', 'index': 0}])], tool_call_id='call_hJrCZgP4eDgaj6s4RtCKXTOo'),\n",
       "   [{'id': 2, 'city': 'Los Angeles'},\n",
-       "    {'id': 4, 'city': 'Houston'},\n",
       "    {'id': 1, 'city': 'New York'},\n",
       "    {'id': 3, 'city': 'Chicago'},\n",
+       "    {'id': 4, 'city': 'Houston'},\n",
       "    {'id': 5, 'city': 'Miami'}]),\n",
-       "  (AgentActionMessageLog(tool='get_weather_at_location', tool_input={'location_id': 2}, log=\"\\nInvoking: `get_weather_at_location` with `{'location_id': 2}`\\n\\n\\n\", message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\"location_id\":2}', 'name': 'get_weather_at_location'}})]),\n",
+       "  (ToolAgentAction(tool='get_current_weather_for_location', tool_input={'location_id': 2}, log=\"\\nInvoking: `get_current_weather_for_location` with `{'location_id': 2}`\\n\\n\\n\", message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_lopYjo00MF9mZtnHtiisTqyp', 'function': {'arguments': '{\"location_id\":2}', 'name': 'get_current_weather_for_location'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9bba5827-d98b-464d-8028-25eb4a05d227', tool_calls=[{'name': 'get_current_weather_for_location', 'args': {'location_id': 2}, 'id': 'call_lopYjo00MF9mZtnHtiisTqyp'}], tool_call_chunks=[{'name': 'get_current_weather_for_location', 'args': '{\"location_id\":2}', 'id': 'call_lopYjo00MF9mZtnHtiisTqyp', 'index': 0}])], tool_call_id='call_lopYjo00MF9mZtnHtiisTqyp'),\n",
       "   'Sunny, Temperature: 75°F')]}"
      ]
     },
@@ -313,270 +254,31 @@
    }
   ],
   "source": [
+    "from langchain import globals\n",
+    "\n",
+    "globals.set_verbose(True)\n",
+    "\n",
    "agent = agent_factory()\n",
-    "agent.invoke({\"question\": \"whats the weather in LA?\"})"
+    "agent.invoke({\"question\": \"what is the weather in LA\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "142ac640-3ce0-4f38-89cd-8d24d65997e4",
+   "metadata": {},
+   "source": [
+    "## Benchmarking\n",
+    "\n",
+    "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just here to explain and explore the task."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
-   "id": "43edee23-109d-4f75-be68-d2b4b3240c9b",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": null,
+   "id": "e49455cc-13c5-4ea6-bb4b-e61c39ea0267",
+   "metadata": {},
   "outputs": [],
-   "source": [
-    "globals.set_verbose(False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
-   "metadata": {},
-   "source": [
-    "## Eval\n",
-    "\n",
-    "Let's evaluate an agent now"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "2e02fb65-eecf-43b8-bf76-1e86ca535da0",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "View the evaluation results for project 'tool-usage-relational-data-gpt-3.5-turbo-1106-8258' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/8aae8e36-720a-42c8-8540-5d5475e7181e?eval=true\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Relational Data at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n",
-      "[------------------------------------------------->] 21/21\n",
-      "View the evaluation results for project 'tool-usage-relational-data-gpt-3.5-turbo-0613-8258' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/d8773df1-b054-41e4-a947-7b256ca8738b?eval=true\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Relational Data at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n",
-      "[------------------------------------------------->] 21/21\n",
-      "View the evaluation results for project 'tool-usage-relational-data-gpt-4-0613-8258' at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/090fecae-923f-4281-93f7-2c5253a2a2a4?eval=true\n",
-      "\n",
-      "View all tests for Dataset Tool Usage - Relational Data at:\n",
-      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826\n",
-      "[------------------------------------------------->] 21/21"
-     ]
-    }
-   ],
-   "source": [
-    "import uuid\n",
-    "\n",
-    "from langsmith.client import Client\n",
-    "\n",
-    "from langchain_benchmarks import clone_public_dataset\n",
-    "\n",
-    "clone_public_dataset(task.dataset_id, dataset_name=task.name)  # Clone dataset\n",
-    "\n",
-    "experiment_uuid = uuid.uuid4().hex[:4]\n",
-    "\n",
-    "client = Client()\n",
-    "\n",
-    "models = [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-0613\", \"gpt-4-0613\"]\n",
-    "\n",
-    "for model in models:\n",
-    "    print()\n",
-    "    agent_factory = AgentFactory(task, model=model)\n",
-    "    test_run = client.run_on_dataset(\n",
-    "        dataset_name=task.name,\n",
-    "        llm_or_chain_factory=agent_factory,\n",
-    "        evaluation=task.get_eval_config(),\n",
-    "        verbose=False,\n",
-    "        project_name=f\"tool-usage-relational-data-{model}-{experiment_uuid}\",\n",
-    "        tags=[model],\n",
-    "        project_metadata={\n",
-    "            \"model\": model,\n",
-    "            \"arch\": \"openai-functions-agent\",\n",
-    "            \"id\": experiment_uuid,\n",
-    "        },\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1b039225-01cf-481a-87a6-4e880e9b1dcd",
-   "metadata": {},
-   "source": [
-    "## Inspect\n",
-    "\n",
-    "Here, we'll take a look at the underlying results a little bit."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "fe9b20c4-9da0-47a2-95a3-b5660a54855a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "from langsmith.client import Client\n",
-    "\n",
-    "client = Client()\n",
-    "projects = list(\n",
-    "    client.list_projects(reference_dataset_name=\"Tool Usage - Relational Data\")\n",
-    ")\n",
-    "\n",
-    "dfs = []\n",
-    "for project in projects:\n",
-    "    first_root_run = next(\n",
-    "        client.list_runs(project_name=project.name, execution_order=1)\n",
-    "    )\n",
-    "    # Temporary way to get tag information\n",
-    "    tags = first_root_run.tags\n",
-    "    test_results = client.get_test_results(project_name=project.name)\n",
-    "    test_results[\"model\"] = tags[0]\n",
-    "    dfs.append(test_results)\n",
-    "\n",
-    "\n",
-    "df = pd.concat(dfs)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "da6962a1-81f2-445f-8547-513a105a3847",
-   "metadata": {},
-   "source": [
-    "### Stats"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4b7d366a-8754-417a-a654-956528f134e2",
-   "metadata": {},
-   "source": [
-    "In terms of function usage, gpt-4 uses more calls than is strictly necessary (`feedback.# steps / # expected steps` is > 1). However, it's doing a pretty good job.\n",
-    "\n",
-    "The gpt-3.5 models do not use tools enough (`feedback.# steps / # expected steps` is < 1) and as a result do a worse job at the task.\n",
-    "\n",
-    "Note: The intermediate step correctness happens to have the same average for the 3 models -- this is just a coincidence you can confirm by inspecting underlying results."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "066551f2-eb30-4bc1-94fd-0ca0085103ad",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>feedback.correctness</th>\n",
-       "      <th>feedback.Intermediate steps correctness</th>\n",
-       "      <th>execution_time</th>\n",
-       "      <th>feedback.# steps / # expected steps</th>\n",
-       "      <th>n</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>model</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>gpt-3.5-turbo-0613</th>\n",
-       "      <td>0.714286</td>\n",
-       "      <td>0.714286</td>\n",
-       "      <td>4.829506</td>\n",
-       "      <td>0.825390</td>\n",
-       "      <td>21</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>gpt-3.5-turbo-1106</th>\n",
-       "      <td>0.857143</td>\n",
-       "      <td>0.714286</td>\n",
-       "      <td>5.464218</td>\n",
-       "      <td>0.965871</td>\n",
-       "      <td>21</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>gpt-4-0613</th>\n",
-       "      <td>0.952381</td>\n",
-       "      <td>0.714286</td>\n",
-       "      <td>8.544358</td>\n",
-       "      <td>1.037300</td>\n",
-       "      <td>21</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                    feedback.correctness  \\\n",
-       "model                                      \n",
-       "gpt-3.5-turbo-0613              0.714286   \n",
-       "gpt-3.5-turbo-1106              0.857143   \n",
-       "gpt-4-0613                      0.952381   \n",
-       "\n",
-       "                    feedback.Intermediate steps correctness  execution_time  \\\n",
-       "model                                                                         \n",
-       "gpt-3.5-turbo-0613                                 0.714286        4.829506   \n",
-       "gpt-3.5-turbo-1106                                 0.714286        5.464218   \n",
-       "gpt-4-0613                                         0.714286        8.544358   \n",
-       "\n",
-       "                    feedback.# steps / # expected steps   n  \n",
-       "model                                                        \n",
-       "gpt-3.5-turbo-0613                             0.825390  21  \n",
-       "gpt-3.5-turbo-1106                             0.965871  21  \n",
-       "gpt-4-0613                                     1.037300  21  "
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "count_df = df.groupby(\"model\").size().to_frame(\"n\")\n",
-    "df.groupby(\"model\")[\n",
-    "    [\n",
-    "        \"feedback.correctness\",\n",
-    "        \"feedback.Intermediate steps correctness\",\n",
-    "        \"execution_time\",\n",
-    "        \"feedback.# steps / # expected steps\",\n",
-    "    ]\n",
-    "].mean().join(count_df)"
-   ]
+   "source": []
  }
 ],
 "metadata": {
@@ -71,7 +71,7 @@
       "</table>"
      ],
      "text/plain": [
-       "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x7f1b23b13240>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})"
+       "ToolUsageTask(name='Tool Usage - Typewriter (26 tools)', dataset_id='https://smith.langchain.com/public/128af05e-aa00-4e3b-a958-d166dd450581/d', description=\"Environment with 26 tools each tool represents a letter of the alphabet.\\n\\nThe objective of this task is to evaluate the model's ability the use tools\\nfor a simple repetition task.\\n\\nFor example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\\n\\nThe dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\\n\\nThis is a variation of the typer writer task, where 26 parameterless tools are\\ngiven instead of a single tool that takes a letter as an argument.\\n\", create_environment=<function get_environment at 0x75aa9dec2d40>, instructions=\"Repeat the given string by using the provided tools. Do not write anything else or provide any explanations. For example, if the string is 'abc', you must invoke the tools 'a', 'b', and 'c' in that order. Please invoke the functions without any arguments.\", eval_params={'output_evaluation': 'none'})"
      ]
     },
     "execution_count": 2,
@@ -106,7 +106,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "id": "61535a75-24f6-4727-9549-f76c263e9153",
   "metadata": {
    "tags": []
@@ -118,7 +118,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "id": "f35a0a1d-5a1e-4de1-8d8c-c7c9a264a6c7",
   "metadata": {
    "tags": []
@@ -127,14 +127,14 @@
    {
     "data": {
      "text/plain": [
-       "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=<class 'pydantic.v1.main.aSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f6cd20e6520>),\n",
-       " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=<class 'pydantic.v1.main.bSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f6cd20e65c0>),\n",
-       " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=<class 'pydantic.v1.main.cSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f6cd20e6660>),\n",
-       " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=<class 'pydantic.v1.main.dSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f6cd20e6700>),\n",
-       " StructuredTool(name='e', description='e() -> str - Run to Type the letter \"e\".', args_schema=<class 'pydantic.v1.main.eSchemaSchema'>, func=<function _create_typing_func.<locals>.func at 0x7f6cd20e67a0>)]"
+       "[StructuredTool(name='a', description='a() -> str - Run to Type the letter \"a\".', args_schema=<class 'pydantic.v1.main.aSchema'>, func=<function _create_typing_func.<locals>.func at 0x75aa9defc180>),\n",
+       " StructuredTool(name='b', description='b() -> str - Run to Type the letter \"b\".', args_schema=<class 'pydantic.v1.main.bSchema'>, func=<function _create_typing_func.<locals>.func at 0x75aa9defc220>),\n",
+       " StructuredTool(name='c', description='c() -> str - Run to Type the letter \"c\".', args_schema=<class 'pydantic.v1.main.cSchema'>, func=<function _create_typing_func.<locals>.func at 0x75aa9defc2c0>),\n",
+       " StructuredTool(name='d', description='d() -> str - Run to Type the letter \"d\".', args_schema=<class 'pydantic.v1.main.dSchema'>, func=<function _create_typing_func.<locals>.func at 0x75aa9defc360>),\n",
+       " StructuredTool(name='e', description='e() -> str - Run to Type the letter \"e\".', args_schema=<class 'pydantic.v1.main.eSchema'>, func=<function _create_typing_func.<locals>.func at 0x75aa9defc400>)]"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -145,11 +145,34 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
   "id": "5bea0190-39ec-4f30-9a00-90136bc6bf0b",
   "metadata": {
    "tags": []
   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'OK'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "env.tools[0].invoke({})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "bf7444da-15a1-455a-b22e-639cbfff8432",
+   "metadata": {
+    "tags": []
+   },
   "outputs": [
    {
     "data": {
@@ -162,36 +185,13 @@
     "output_type": "execute_result"
    }
   ],
-   "source": [
-    "env.tools[0].invoke({})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "bf7444da-15a1-455a-b22e-639cbfff8432",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'OK'"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
   "source": [
    "env.tools[3].invoke({})"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
   "id": "d12bd710-5c01-4539-a4b9-afbf03164923",
   "metadata": {
    "tags": []
@@ -203,7 +203,7 @@
       "'ad'"
      ]
     },
-     "execution_count": 8,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -217,105 +217,110 @@
   "id": "f1d62a13-3771-460f-b131-4443f669ca3d",
   "metadata": {},
   "source": [
-    "## Agent Factory\n",
+    "## Explore the task\n",
    "\n",
    "For evaluation, we need an agent factory that will create a new instance of an agent executor for every evaluation run.\n",
    "\n",
-    "We'll use an `OpenAIAgentFactory` provided with LangChain Benchmarks -- look at the `intro` section to see how to define your own."
+    "We'll use the `StandardAgentFactory` -- look at the `intro` for more information about what it does and/or how to create a custom one."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
   "id": "6142cf4e-862c-47a3-aa75-81d7d3231308",
   "metadata": {
    "tags": []
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'input': 'hello',\n",
-       " 'output': 'hello\\nhello',\n",
-       " 'intermediate_steps': [(AgentActionMessageLog(tool='h', tool_input={}, log='\\nInvoking: `h` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'h', 'arguments': ''}})]),\n",
-       "   'OK'),\n",
-       "  (AgentActionMessageLog(tool='e', tool_input={}, log='\\nInvoking: `e` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'e', 'arguments': ''}})]),\n",
-       "   'OK'),\n",
-       "  (AgentActionMessageLog(tool='l', tool_input={}, log='\\nInvoking: `l` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'l', 'arguments': ''}})]),\n",
-       "   'OK'),\n",
-       "  (AgentActionMessageLog(tool='l', tool_input={}, log='\\nInvoking: `l` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'l', 'arguments': ''}})]),\n",
-       "   'OK'),\n",
-       "  (AgentActionMessageLog(tool='o', tool_input={}, log='\\nInvoking: `o` with `{}`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'name': 'o', 'arguments': ''}})]),\n",
-       "   'OK')],\n",
-       " 'state': 'hello'}"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from langchain_benchmarks.tool_usage import agents\n",
-    "\n",
-    "agent_factory = agents.OpenAIAgentFactory(task, model=\"gpt-3.5-turbo-16k\")\n",
-    "\n",
-    "# Let's test that our agent works\n",
-    "agent = agent_factory()\n",
-    "agent.invoke({\"question\": \"hello\"})"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3821e4b0-8e67-418a-840c-470fcde42df0",
-   "metadata": {},
-   "source": [
-    "## Eval\n",
-    "\n",
-    "Let's evaluate an agent now."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432",
-   "metadata": {
-    "tags": []
-   },
   "outputs": [],
   "source": [
-    "import uuid\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
+    "from langchain_openai.chat_models import ChatOpenAI\n",
    "\n",
-    "from langsmith.client import Client\n",
+    "from langchain_benchmarks.tool_usage.agents import StandardAgentFactory\n",
    "\n",
-    "from langchain_benchmarks import clone_public_dataset\n",
+    "model = ChatOpenAI(temperature=0)\n",
+    "prompt = ChatPromptTemplate.from_messages(\n",
+    "    [\n",
+    "        (\"system\", \"{instructions}\"),  # Populated from task.instructions automatically\n",
+    "        (\"human\", \"{question}\"),  # Populated from the test data\n",
+    "        (\n",
+    "            \"placeholder\",\n",
+    "            \"{agent_scratchpad}\",\n",
+    "        ),  # Workspace where the agent can do its work (e.g., call multiple tools)\n",
+    "    ]\n",
+    ")\n",
    "\n",
-    "# Clone the dataset\n",
-    "clone_public_dataset(task.dataset_id, dataset_name=task.name)\n",
+    "agent_factory = StandardAgentFactory(task, model, prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
+      "\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `a` with `{}`\n",
+      "\n",
+      "\n",
+      "\u001b[0m\u001b[36;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `b` with `{}`\n",
+      "\n",
+      "\n",
+      "\u001b[0m\u001b[33;1m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `c` with `{}`\n",
+      "\n",
+      "\n",
+      "\u001b[0m\u001b[38;5;200m\u001b[1;3mOK\u001b[0m\u001b[32;1m\u001b[1;3mabcabcabc\u001b[0m\n",
+      "\n",
+      "\u001b[1m> Finished chain.\u001b[0m\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'abc',\n",
+       " 'output': 'abcabcabc',\n",
+       " 'intermediate_steps': [(ToolAgentAction(tool='a', tool_input={}, log='\\nInvoking: `a` with `{}`\\n\\n\\n', message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'function': {'arguments': '{}', 'name': 'a'}, 'type': 'function'}, {'index': 1, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'function': {'arguments': '{}', 'name': 'b'}, 'type': 'function'}, {'index': 2, 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'function': {'arguments': '{}', 'name': 'c'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9a1af767-29e4-4759-ab28-5b29236e8f22', tool_calls=[{'name': 'a', 'args': {}, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI'}, {'name': 'b', 'args': {}, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW'}, {'name': 'c', 'args': {}, 'id': 'call_MRAOAgbi8vT445clqC8OybMR'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'index': 2}])], tool_call_id='call_OrpjShN5uNzw2Rsb1tWF6swI'),\n",
+       "   'OK'),\n",
+       "  (ToolAgentAction(tool='b', tool_input={}, log='\\nInvoking: `b` with `{}`\\n\\n\\n', message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'function': {'arguments': '{}', 'name': 'a'}, 'type': 'function'}, {'index': 1, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'function': {'arguments': '{}', 'name': 'b'}, 'type': 'function'}, {'index': 2, 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'function': {'arguments': '{}', 'name': 'c'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9a1af767-29e4-4759-ab28-5b29236e8f22', tool_calls=[{'name': 'a', 'args': {}, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI'}, {'name': 'b', 'args': {}, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW'}, {'name': 'c', 'args': {}, 'id': 'call_MRAOAgbi8vT445clqC8OybMR'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'index': 2}])], tool_call_id='call_2XO5RNgt9FjGvTXztgD0tKqW'),\n",
+       "   'OK'),\n",
+       "  (ToolAgentAction(tool='c', tool_input={}, log='\\nInvoking: `c` with `{}`\\n\\n\\n', message_log=[AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'function': {'arguments': '{}', 'name': 'a'}, 'type': 'function'}, {'index': 1, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'function': {'arguments': '{}', 'name': 'b'}, 'type': 'function'}, {'index': 2, 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'function': {'arguments': '{}', 'name': 'c'}, 'type': 'function'}]}, response_metadata={'finish_reason': 'tool_calls'}, id='run-9a1af767-29e4-4759-ab28-5b29236e8f22', tool_calls=[{'name': 'a', 'args': {}, 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI'}, {'name': 'b', 'args': {}, 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW'}, {'name': 'c', 'args': {}, 'id': 'call_MRAOAgbi8vT445clqC8OybMR'}], tool_call_chunks=[{'name': 'a', 'args': '{}', 'id': 'call_OrpjShN5uNzw2Rsb1tWF6swI', 'index': 0}, {'name': 'b', 'args': '{}', 'id': 'call_2XO5RNgt9FjGvTXztgD0tKqW', 'index': 1}, {'name': 'c', 'args': '{}', 'id': 'call_MRAOAgbi8vT445clqC8OybMR', 'index': 2}])], tool_call_id='call_MRAOAgbi8vT445clqC8OybMR'),\n",
+       "   'OK')],\n",
+       " 'state': 'abc'}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain import globals\n",
    "\n",
-    "experiment_uuid = uuid.uuid4().hex[:4]\n",
+    "globals.set_verbose(True)\n",
    "\n",
-    "client = Client()\n",
+    "agent = agent_factory()\n",
+    "agent.invoke({\"question\": \"abc\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "89124d06-41f7-4432-9f2e-542c0d85e2e5",
+   "metadata": {},
+   "source": [
+    "## Benchmarking\n",
    "\n",
-    "models = [\"gpt-3.5-turbo-16k\"]\n",
-    "\n",
-    "for model in models:\n",
-    "    print()\n",
-    "    agent_factory = agents.OpenAIAgentFactory(task, model=model)\n",
-    "    test_run = client.run_on_dataset(\n",
-    "        dataset_name=task.name,\n",
-    "        llm_or_chain_factory=agent_factory,\n",
-    "        evaluation=task.get_eval_config(),\n",
-    "        verbose=False,\n",
-    "        concurrency_level=1,\n",
-    "        project_name=f\"typewriter-26-{model}-{experiment_uuid}\",\n",
-    "        tags=[model],\n",
-    "        project_metadata={\n",
-    "            \"model\": model,\n",
-    "            \"arch\": \"openai-functions-agent\",\n",
-    "            \"id\": experiment_uuid,\n",
-    "        },\n",
-    "    )"
+    "See `introduction` and `benchmark all` for information on how to run benchmarks. This notebook is just here to explain and explore the task."
   ]
  }
 ],
@@ -1,8 +1,8 @@
 from typing import Optional

-from langchain.chat_models import ChatOpenAI
 from langchain.chat_models.base import BaseChatModel
 from langchain.smith import RunEvalConfig
+from langchain_openai import ChatOpenAI


 def get_eval_config(eval_llm: Optional[BaseChatModel] = None) -> RunEvalConfig:
@@ -2,10 +2,10 @@
 from typing import Any, Dict, List, Optional, Type

 from langchain.chains.openai_functions import convert_to_openai_function
-from langchain.chat_models import ChatOpenAI
 from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
 from langchain.prompts import ChatPromptTemplate
 from langchain.schema.runnable import Runnable
+from langchain_openai import ChatOpenAI
 from langsmith.client import Client
 from pydantic import BaseModel

@@ -1,7 +1,7 @@
 from enum import Enum
 from typing import List, Optional

-from langchain.pydantic_v1 import BaseModel, Field
+from pydantic import BaseModel, Field


 class QuestionCategory(str, Enum):
@@ -2,7 +2,7 @@ from enum import Enum
 from typing import List, Optional

 from langchain.prompts import ChatPromptTemplate
-from langchain.pydantic_v1 import BaseModel, Field
+from pydantic import BaseModel, Field

 from langchain_benchmarks.schema import ExtractionTask

@@ -1,7 +1,7 @@
 from langchain.smith import RunEvalConfig
-from langchain_core.pydantic_v1 import BaseModel, Field
 from langsmith.evaluation import EvaluationResult, run_evaluator
 from langsmith.schemas import Example, Run
+from pydantic import BaseModel, Field

 from langchain_benchmarks.schema import ExtractionTask

@@ -1,8 +1,8 @@
 from typing import Optional

-from langchain.chat_models import ChatOpenAI
 from langchain.evaluation import load_evaluator
 from langchain.smith import RunEvalConfig
+from langchain_openai import ChatOpenAI

 try:
    from langchain.schema.language_model import BaseLanguageModel
@@ -1,9 +1,9 @@
 from typing import Optional

 from langchain.base_language import BaseLanguageModel
-from langchain.chat_models import ChatOpenAI
 from langchain.schema.retriever import BaseRetriever
 from langchain.schema.runnable import Runnable
+from langchain_openai import ChatOpenAI

 from langchain_benchmarks.rag.tasks.langchain_docs.architectures.crqa import (
    create_response_chain,
@@ -3,11 +3,9 @@ import os
 from functools import partial
 from typing import Callable, Iterable, List, Optional

-from langchain.chat_models import ChatOpenAI
 from langchain.indexes import SQLRecordManager, index
 from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
 from langchain.prompts import ChatPromptTemplate
-from langchain.pydantic_v1 import BaseModel
 from langchain.retrievers.multi_vector import MultiVectorRetriever
 from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever
 from langchain.schema.document import Document
@@ -18,6 +16,8 @@ from langchain.schema.storage import BaseStore
 from langchain.schema.vectorstore import VectorStore
 from langchain.storage import InMemoryStore
 from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
 from tqdm.auto import tqdm

 logger = logging.getLogger(__name__)
@@ -25,7 +25,6 @@ registry = Registry(
        type_writer_26_funcs.TYPE_WRITER_26_FUNCS_TASK,
        relational_data.RELATIONAL_DATA_TASK,
        multiverse_math.MULTIVERSE_MATH,
-        multiverse_math.MULTIVERSE_MATH_TINY,
        email_task.EMAIL_EXTRACTION_TASK,
        chat_extraction.CHAT_EXTRACTION_TASK,
        LANGCHAIN_DOCS_TASK,
@@ -1,9 +1,15 @@
 """Package for helping to evaluate agent runs."""
-from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter
+from langchain_benchmarks.tool_usage.agents import (
+    CustomRunnableAgentFactory,
+    StandardAgentFactory,
+    apply_agent_executor_adapter,
+)
 from langchain_benchmarks.tool_usage.evaluators import get_eval_config

 # Please keep this list sorted!
 __all__ = [
    "apply_agent_executor_adapter",
+    "CustomRunnableAgentFactory",
    "get_eval_config",
+    "StandardAgentFactory",
 ]
@@ -1,25 +1,11 @@
 from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
-from langchain_benchmarks.tool_usage.agents.anthropic_tool_user import (
-    AnthropicToolUserFactory,
-)
-from langchain_benchmarks.tool_usage.agents.experimental.factory import (
-    CustomAgentFactory,
-)
-from langchain_benchmarks.tool_usage.agents.openai_assistant import (
-    OpenAIAssistantFactory,
-)
-from langchain_benchmarks.tool_usage.agents.openai_functions import OpenAIAgentFactory
 from langchain_benchmarks.tool_usage.agents.runnable_agent import (
    CustomRunnableAgentFactory,
 )
 from langchain_benchmarks.tool_usage.agents.tool_using_agent import StandardAgentFactory

 __all__ = [
-    "OpenAIAgentFactory",
-    "OpenAIAssistantFactory",
    "apply_agent_executor_adapter",
-    "CustomAgentFactory",
-    "AnthropicToolUserFactory",
    "CustomRunnableAgentFactory",
    "StandardAgentFactory",
 ]
@@ -41,27 +41,8 @@ def apply_agent_executor_adapter(
        else:
            return None

-    def _format_input(inputs: dict) -> dict:
-        """Make sure that the input is always called `input`."""
-
-        if "question" not in inputs:
-            raise ValueError(
-                "Expected 'question' to be in the inputs. Found only the following "
-                f"keys {sorted(inputs.keys())}."
-            )
-
-        inputs = inputs.copy()  # Because 'question' is popped below
-
-        if "input" not in inputs:
-            return {"input": inputs.pop("question"), **inputs}
-        return inputs
-
-    runnable = (
-        RunnableLambda(_format_input).with_config({"run_name": "Format Input"})
-        | agent_executor
-        | RunnableLambda(_ensure_output_exists).with_config(
-            {"run_name": "Ensure Output"}
-        )
+    runnable = agent_executor | RunnableLambda(_ensure_output_exists).with_config(
+        {"run_name": "Ensure Output"}
    )

    if state_reader is not None:
@@ -1,271 +0,0 @@
-"""Wrapper around the anthropic tool user SDK.
-
-The anthropic tool user SDK is an alpha release so this code will likely be
-changed or deleted in the future. It's here simply to make it easier to benchmark
-the performance of the existing tool user SDK, to compare it with the performance
-of other implementations.
-"""
-
-from importlib.util import find_spec
-from typing import Any, Dict, List, Optional, Sequence
-
-from langchain.tools import StructuredTool
-from langchain_core.callbacks.manager import trace_as_chain_group
-from langchain_core.runnables import Runnable, RunnableConfig, RunnableLambda
-
-from langchain_benchmarks import rate_limiting
-from langchain_benchmarks.schema import ToolUsageTask
-from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
-
-
-def convert_langchain_tool_to_tool_user_tool(lc_tool: StructuredTool) -> Any:
-    """Convert a langchain tool to a tool user tool."""
-    from tool_use_package.tools.base_tool import BaseTool
-
-    class DynamicTool(BaseTool):
-        def use_tool(self, **kwargs):
-            return lc_tool(kwargs)
-
-    schema = lc_tool.args_schema.schema()
-
-    properties = schema["properties"]
-    parameters = []
-    # Is this needed or is string OK?
-    type_adapter = {
-        "string": "str",  # str or string?
-        "integer": "int",
-        "number": "float",
-        "boolean": "bool",
-    }
-    for key, value in properties.items():
-        parameters.append(
-            {
-                "name": key,
-                "type": type_adapter.get(value["type"], value["type"]),
-                "description": value.get("description", ""),
-            }
-        )
-
-    return DynamicTool(lc_tool.name, lc_tool.description, parameters)
-
-
-def _handle_tool_inputs(
-    tool_inputs: List[Dict[str, Any]],
-    tools: Sequence[StructuredTool],
-    config: Optional[RunnableConfig] = None,
-) -> Dict[str, Any]:
-    """Handle tool inputs."""
-    tool_by_name = {tool.name: tool for tool in tools}
-    tool_error: Optional[str] = None
-    tool_outputs = []
-    for tool_input in tool_inputs:
-        tool_name = tool_input["tool_name"]
-        tool_arguments = tool_input["tool_arguments"]
-        tool = tool_by_name[tool_name]
-        try:
-            tool_result = tool.invoke(tool_arguments, config=config)
-        except Exception as e:  # Break on first error
-            tool_error = str(e)
-            tool_outputs = None
-            break
-        tool_outputs.append(
-            {
-                "tool_name": tool_name,
-                "tool_result": tool_result,
-            }
-        )
-    return {
-        "role": "tool_outputs",
-        "tool_outputs": tool_outputs,
-        "tool_error": tool_error,
-    }
-
-
-def run_anthropic_agent_simple(
-    tools: Sequence[StructuredTool],
-    user_message: str,
-    *,
-    max_iterations: int = 30,
-    config: Optional[RunnableConfig] = None,
-    **kwargs,
-) -> List[dict]:
-    """Make an anthropic agent."""
-    from tool_use_package.tool_user import ToolUser
-
-    verbose = kwargs.pop("verbose", False)
-
-    tool_user = ToolUser(
-        [convert_langchain_tool_to_tool_user_tool(tool) for tool in tools], **kwargs
-    )
-    messages = [
-        {
-            "role": "human",
-            "content": user_message,
-            "tool_error": None,
-            "tool_outputs": [],
-            "tool_inputs": [],
-        }
-    ]
-    with trace_as_chain_group(
-        "Anthropic Agent Run",
-        inputs={"user_message": user_message},
-        callback_manager=config.get("callbacks", None) if config else None,
-    ) as group_manager:
-        for num_iteration in range(max_iterations):
-            with trace_as_chain_group(
-                f"Anthropic Agent Iteration {num_iteration}",
-                inputs={"messages": messages},
-                callback_manager=group_manager.parent_run_manager.get_child(),
-            ) as iteration_manager:
-                last_message = tool_user.use_tools(
-                    messages, execution_mode="manual", verbose=verbose
-                )
-                new_messages = [last_message]
-
-                if last_message["role"] == "tool_inputs":
-                    tool_inputs = last_message["tool_inputs"]
-                    new_message = _handle_tool_inputs(
-                        tool_inputs,
-                        tools,
-                        config={
-                            "callbacks": iteration_manager.parent_run_manager.get_child(),
-                        },
-                    )
-                    new_messages.append(new_message)
-
-                iteration_manager.on_chain_end(outputs=new_messages)
-                messages.extend(new_messages)
-
-                # Finally break if the last message is from the assistant
-                if last_message["role"] == "assistant":
-                    break
-        else:
-            raise ValueError("Max iterations reached")
-        group_manager.on_chain_end(outputs=messages)
-    return messages
-
-
-def convert_messages_to_finalized_output(
-    messages: List[Dict[str, Any]],
-) -> Dict[str, Any]:
-    """Convert the history of messages into the expected output for eval.
-
-    This matches the agent executor output which has the following structure:
-
-    {
-        "output": "The output of the agent",
-        "intermediate_steps": [
-            (
-                AgentAction(
-                    tool="add_x_y",
-                    tool_input={"x": 2.0, "y": 5.0},
-                    log="Invoking tool `add_x_y` with `{'x': 2.0, 'y': 5.0}`",
-                ),
-                9.0,
-            )
-        ],
-        "state": Any, # Optional key for tasks that involve manipulation of an env.
-    }
-    """
-    if not messages:
-        raise ValueError("Expected at least one message")
-
-    last_message = messages[-1]
-
-    if last_message["role"] != "assistant":
-        raise ValueError(
-            f"Expected the last message to be from the assistant. "
-            f"Instead got {last_message}."
-        )
-
-    actual_steps = []
-
-    for message in messages:
-        if "role" not in message:
-            raise ValueError(f"Expected role in message {message}")
-        role = message["role"]
-
-        if role == "tool_inputs":
-            # Get the name of the tool used
-            for tool_input in message["tool_inputs"]:
-                actual_steps.append(tool_input["tool_name"])
-
-    return {
-        "output": last_message["content"],
-        "actual_steps": actual_steps,
-    }
-
-
-def create_agent(tools: Sequence[StructuredTool]) -> RunnableLambda:
-    """Create an agent."""
-
-    def run_agent(
-        input: dict, config: Optional[RunnableConfig] = None, **kwargs
-    ) -> dict:
-        """Run the agent."""
-        messages = run_anthropic_agent_simple(
-            tools, input["input"], config=config, **kwargs
-        )
-        return convert_messages_to_finalized_output(messages)
-
-    return RunnableLambda(run_agent)
-
-
-class AnthropicToolUserFactory:
-    def __init__(
-        self,
-        task: ToolUsageTask,
-        *,
-        rate_limiter: Optional[rate_limiting.RateLimiter] = None,
-    ) -> None:
-        """Create an OpenAI agent factory for the given task.
-
-
-        Args:
-            task: The task to create an agent factory for.
-            rate_limiter: The rate limiter to use
-        """
-        self.task = task
-        self.rate_limiter = rate_limiter
-        if not find_spec("tool_use_package"):
-            raise ImportError(
-                'Could not import "tool_use_package". Please '
-                "follow instructions here to install "
-                "https://github.com/anthropics/anthropic-tools/tree/main"
-            )
-
-    def __call__(self, **kwargs: Any) -> Runnable:
-        env = self.task.create_environment()
-
-        def _add_task_instructions(
-            input: dict, config: Optional[RunnableConfig] = None, **kwargs
-        ) -> dict:
-            """Add task instructions to the question."""
-            if not isinstance(input, dict) or "question" not in input:
-                raise ValueError(
-                    f"Expected input to be a dict with key `question`. "
-                    f"Found {type(input)}."
-                )
-
-            input = input.copy()
-            input["question"] = (
-                f"{self.task.instructions}\nWrite down your answer, "
-                f"but do not explain it. Input: `{input['question']}`"
-            )
-            return input
-
-        agent = create_agent(env.tools)  # type: ignore
-        # Returns `state` in the output if the environment has a state reader
-        # makes sure that `output` is always in the output
-
-        if kwargs:
-            agent = agent.bind(**kwargs)
-
-        runnable = _add_task_instructions | apply_agent_executor_adapter(
-            agent, state_reader=env.read_state
-        )
-
-        if self.rate_limiter:  # Add a rate limiter
-            runnable = rate_limiting.with_rate_limit(runnable, self.rate_limiter)
-
-        return runnable
@@ -0,0 +1,11 @@
+import abc
+
+from langchain_core.runnables import Runnable
+
+
+class AgentFactory(abc.ABC):
+    """Abstract class for agent factory"""
+
+    @abc.abstractmethod
+    def __call__(self) -> Runnable:
+        """Create a new agent"""
@@ -1,133 +0,0 @@
-from typing import List, Literal, Optional, Sequence, Tuple, Union
-
-from langchain.agents import AgentOutputParser
-from langchain.prompts.chat import ChatPromptTemplate
-from langchain.schema.runnable import Runnable
-from langchain.tools import StructuredTool
-from langchain_core.agents import AgentAction, AgentFinish
-from langchain_core.language_models import BaseChatModel, BaseLanguageModel
-from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
-from langchain_core.prompts import MessagesPlaceholder
-from typing_extensions import NotRequired, TypedDict
-
-from langchain_benchmarks import RateLimiter
-from langchain_benchmarks.rate_limiting import with_rate_limit
-from langchain_benchmarks.tool_usage.agents.experimental.encoder import (
-    AstPrinter,
-    FunctionResult,
-    TypeScriptEncoder,
-    XMLEncoder,
-)
-from langchain_benchmarks.tool_usage.agents.experimental.prompts import (
-    _AGENT_INSTRUCTIONS_BLOB_STYLE,
-)
-from langchain_benchmarks.tool_usage.agents.experimental.tool_utils import (
-    convert_tool_to_function_definition,
-)
-
-
-def format_steps_for_chat(
-    intermediate_steps: List[Tuple[AgentAction, str]],
-    ast_printer: AstPrinter,
-) -> List[BaseMessage]:
-    """Format the steps."""
-    messages = []
-    for action, observation in intermediate_steps:
-        # Action messages contains the tool invocation request from the LLM
-        # Now add the result of the tool invocation.
-
-        if action.tool == "_Exception":
-            messages.append(
-                AIMessage(
-                    content=action.log,
-                )
-            )
-            messages.append(
-                # Tool input is the error message for the exception
-                HumanMessage(content=action.tool_input)
-            )
-        else:
-            messages.extend(action.messages)
-            function_result: FunctionResult = {
-                "name": action.tool,
-                "error": None,
-                "result": observation,
-            }
-            messages.append(
-                HumanMessage(
-                    content=ast_printer.visit_function_result(function_result),
-                )
-            )
-
-    return messages
-
-
-# PUBLIC API
-
-
-class AgentInput(TypedDict):
-    """The input to the agent."""
-
-    input: str
-    """The input to the agent."""
-    intermediate_steps: List[Tuple[AgentAction, str]]
-    """The intermediate steps taken by the agent."""
-    examples: NotRequired[List[BaseMessage]]
-    """A list of messages that can be used to form example traces."""
-
-
-def create_agent(
-    model: Union[BaseChatModel, BaseLanguageModel],
-    tools: Sequence[StructuredTool],
-    parser: AgentOutputParser,
-    *,
-    ast_printer: Union[AstPrinter, Literal["xml"]] = "xml",
-    rate_limiter: Optional[RateLimiter] = None,
-) -> Runnable[AgentInput, Union[AgentAction, AgentFinish]]:
-    """Create an agent for a chat model."""
-    if isinstance(ast_printer, str):
-        if ast_printer == "xml":
-            ast_printer_ = XMLEncoder()
-        elif ast_printer == "typescript":
-            ast_printer_ = TypeScriptEncoder()
-        else:
-            raise ValueError(f"Unknown ast printer: {ast_printer}")
-    elif isinstance(ast_printer, AstPrinter):
-        ast_printer_ = ast_printer
-    else:
-        raise TypeError(
-            f"Expected AstPrinter or str, got {type(ast_printer)} for `ast_printer`"
-        )
-
-    function_definitions = [convert_tool_to_function_definition(tool) for tool in tools]
-    tool_description = ast_printer_.visit_function_definitions(function_definitions)
-
-    template = ChatPromptTemplate.from_messages(
-        [
-            ("system", _AGENT_INSTRUCTIONS_BLOB_STYLE),
-            MessagesPlaceholder("examples"),  # Can use to add example traces
-            ("human", "{input}"),
-            MessagesPlaceholder("history"),
-        ]
-    ).partial(tool_description=tool_description)
-
-    # For the time being, hard-coding the fact that we're using a <tool> tag.
-    model = model.bind(stop=["</tool>"])
-
-    if rate_limiter:
-        # Apply a rate limiter if it was provided
-        model = with_rate_limit(model, rate_limiter)
-
-    agent = (
-        {
-            "input": lambda x: x["input"],
-            "history": lambda x: format_steps_for_chat(
-                x["intermediate_steps"], ast_printer_
-            ),
-            "examples": lambda x: x.get("examples", []),
-        }
-        | template
-        | model
-        | parser
-    )
-    return agent
@@ -1,240 +0,0 @@
-"""Prototyping code for rendering function definitions, invocations, and results.
-
-Types are simplified for now to `str`.
-
-We should actually support something like pydantic or jsonschema for the types, so
-we can expand them recursively for nested types.
-"""
-import abc
-from typing import Any, List, Optional
-
-from typing_extensions import NotRequired, TypedDict
-
-
-class Parameter(TypedDict):
-    """Representation for a parameter."""
-
-    name: str
-    type: str
-    description: str
-
-
-class Arguments(TypedDict):
-    """Arguments are passed to a function during function invocation."""
-
-    name: Optional[str]
-    value: Any
-
-
-class ReturnValue(TypedDict):
-    """Representation for a return value of a function call."""
-
-    type: str
-    description: NotRequired[str]
-
-
-class FunctionDefinition(TypedDict):
-    """Representation for a function."""
-
-    name: str
-    description: str  # Function description
-    parameters: List[Parameter]
-    return_value: ReturnValue
-
-
-class FunctionInvocation(TypedDict):
-    """Representation for a function invocation."""
-
-    id: NotRequired[str]
-    name: str
-    arguments: List[Arguments]
-
-
-class FunctionResult(TypedDict):
-    """Representation for a function result."""
-
-    id: NotRequired[str]
-    name: str
-    result: Optional[str]
-    error: Optional[str]
-
-
-class Visitor(abc.ABC):
-    @abc.abstractmethod
-    def visit_function_definition(self, function_definition: FunctionDefinition) -> str:
-        """Render a function."""
-
-    @abc.abstractmethod
-    def visit_function_definitions(
-        self, function_definitions: List[FunctionDefinition]
-    ) -> str:
-        """Render a function."""
-
-    @abc.abstractmethod
-    def visit_function_invocation(self, function_invocation: FunctionInvocation) -> str:
-        """Render a function invocation."""
-
-    @abc.abstractmethod
-    def visit_function_result(self, function_result: FunctionResult) -> str:
-        """Render a function result."""
-
-
-class AstPrinter(Visitor):
-    """Print the AST."""
-
-
-class XMLEncoder(AstPrinter):
-    def visit_function_definition(self, function_definition: FunctionDefinition) -> str:
-        """Render a function."""
-        parameters_lines = []
-
-        for parameter in function_definition["parameters"]:
-            parameters_lines.extend(
-                [
-                    "<parameter>",
-                    f"<name>{parameter['name']}</name>",
-                    f"<type>{parameter['type']}</type>",
-                    f"<description>{parameter['description']}</description>",
-                    "</parameter>",
-                ]
-            )
-        lines = [
-            "<function>",
-            f"<function_name>{function_definition['name']}</function_name>",
-            "<description>",
-            f"{function_definition['description']}",
-            "</description>",
-            "<parameters>",
-            *parameters_lines,
-            "</parameters>",
-            "<return_value>",
-            f"<type>{function_definition['return_value']['type']}</type>",
-        ]
-        if function_definition["return_value"].get("description"):
-            lines.append(
-                f"<description>{function_definition['return_value']['description']}"
-                f"</description>"
-            )
-
-        lines.extend(["</return_value>", "</function>"])
-        return "\n".join(lines)
-
-    def visit_function_definitions(
-        self, function_definitions: List[FunctionDefinition]
-    ) -> str:
-        """Render a function."""
-        strs = [
-            self.visit_function_definition(function_definition)
-            for function_definition in function_definitions
-        ]
-        return "<functions>\n" + "\n".join(strs) + "\n</functions>"
-
-    def visit_function_invocation(self, invocation: FunctionInvocation) -> str:
-        """Render a function invocation."""
-        arguments_as_strings = [
-            "<argument>\n"
-            f"<name>{argument['name']}</name>\n"
-            f"<value>{argument['value']}</value>\n"
-            "</argument>\n"
-            for argument in invocation["arguments"]
-        ]
-        lines = ["<function_invocation>"]
-
-        if invocation.get("id"):
-            lines.append(f"<id>{invocation['id']}</id>")
-
-        lines.extend(
-            [
-                f"<function_name>{invocation['name']}</function_name>\n"
-                "<arguments>\n"
-                f"{''.join(arguments_as_strings)}"  # Already includes trailing newline
-                "</arguments>\n"
-                "</function_invocation>"
-            ]
-        )
-        return "\n".join(lines)
-
-    def visit_function_result(self, function_result: FunctionResult) -> str:
-        """Render a function result."""
-        lines = [
-            "<function_result>",
-        ]
-
-        if function_result.get("id"):
-            lines.append(f"<id>{function_result['id']}</id>")
-
-        lines.append(f"<function_name>{function_result['name']}</function_name>")
-
-        if function_result["error"]:
-            lines.extend(
-                [
-                    f"<error>{function_result['error']}</error>",
-                ]
-            )
-        else:
-            lines.append(
-                f"<result>{function_result['result']}</result>",
-            )
-
-        lines.append("</function_result>")
-
-        return "\n".join(lines)
-
-
-class TypeScriptEncoder(AstPrinter):
-    def visit_function_definition(self, function_definition: FunctionDefinition) -> str:
-        """Render a function."""
-        parameters_as_strings = [
-            f"{parameter['name']}: {parameter['type']}"
-            for parameter in function_definition["parameters"]
-        ]
-        # Let's use JSdoc style comments
-        # First the function description
-        lines = [
-            f"// {function_definition['description']}",
-            # Then the parameter descriptions
-            *[
-                f"// @param {parameter['name']} {parameter['description']}"
-                for parameter in function_definition["parameters"]
-            ],
-            # Then the return value description
-            f"// @returns {function_definition['return_value']['description']}",
-            # Then the function definition
-            f"function {function_definition['name']}("
-            f"{', '.join(parameters_as_strings)}): "
-            f"{function_definition['return_value']['type']};",
-        ]
-
-        # finally join
-        function = "\n".join(lines)
-        return function
-
-    def visit_function_definitions(
-        self, function_definitions: List[FunctionDefinition]
-    ) -> str:
-        """Render a function."""
-        strs = [
-            self.visit_function_definition(function_definition)
-            for function_definition in function_definitions
-        ]
-        return "\n\n".join(strs)
-
-    def visit_function_invocation(self, invocation: FunctionInvocation) -> str:
-        """Render a function invocation."""
-        arguments_as_strings = [
-            f"{argument['name']}: {argument['value']}"
-            for argument in invocation["arguments"]
-        ]
-        lines = [f"{invocation['name']}(" f"{', '.join(arguments_as_strings)});"]
-        return "\n".join(lines)
-
-    def visit_function_result(self, function_result: FunctionResult) -> str:
-        """Render a function result."""
-        lines = []
-        if function_result["error"]:
-            lines.append(f"ERROR: {function_result['error']}")
-        else:
-            lines.append(f"> {function_result['result']}")
-        if function_result.get("id"):
-            lines.append(f"// ID: {function_result['id']}")
-        return "\n".join(lines)
@@ -1,93 +0,0 @@
-"""Factory for creating agents for the tool usage task."""
-from typing import Optional
-
-from langchain.agents import AgentExecutor
-from langchain_core.runnables import Runnable, RunnableConfig
-
-from langchain_benchmarks import RateLimiter, model_registry
-from langchain_benchmarks.schema import ToolUsageTask
-from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
-from langchain_benchmarks.tool_usage.agents.experimental.agent import create_agent
-from langchain_benchmarks.tool_usage.agents.experimental.parser import (
-    GenericAgentParser,
-)
-
-
-class CustomAgentFactory:
-    """A factory for creating tool using agents.
-
-    A factory for agents that do not leverage any special JSON mode for
-    function usage; instead all function invocation behavior is implemented solely
-    through prompt engineering and parsing.
-    """
-
-    def __init__(
-        self,
-        task: ToolUsageTask,
-        model: str,
-        *,
-        rate_limiter: Optional[RateLimiter] = None,
-        num_retries: int = 0,
-    ) -> None:
-        """Create an agent factory for the given tool usage task.
-
-        Args:
-            task: The task to create an agent factory for
-            model: model name (check model_registry)
-            rate_limiter: The rate limiter to use if provided
-            num_retries: The number of times to retry the agent if it fails
-        """
-        if model not in model_registry:
-            raise ValueError(f"Unknown model: {model}")
-        self.task = task
-        self.model = model
-        self.rate_limiter = rate_limiter
-        self.num_retries = num_retries
-
-    def __call__(self) -> Runnable:
-        if isinstance(self.model, str):
-            registered_model = model_registry.get_model(self.model)
-            if registered_model is None:
-                raise ValueError(f"Unknown model: {self.model}")
-            model = registered_model.get_model(model_params={"temperature": 0})
-        else:
-            model = self.model
-
-        def _add_task_instructions(
-            input: dict, config: Optional[RunnableConfig] = None, **kwargs
-        ) -> dict:
-            """Add task instructions to the question."""
-            if not isinstance(input, dict):
-                raise ValueError(
-                    f"Expected input to be a dict with key `question`. "
-                    f"Found {type(input)}."
-                )
-            input = input.copy()
-            input["question"] = (
-                f"{self.task.instructions}\nWrite down your answer, "
-                f"but do not explain it. Input: `{input['question']}`"
-            )
-            return input
-
-        env = self.task.create_environment()
-
-        agent = create_agent(
-            model,
-            env.tools,
-            GenericAgentParser(wrapping_xml_tag="tool", require_closing_xml_tag=False),
-            rate_limiter=self.rate_limiter,
-        )
-        if self.num_retries > 0:
-            agent = agent.with_retry(
-                stop_after_attempt=self.num_retries + 1,
-            )
-        executor = AgentExecutor(
-            agent=agent,
-            tools=env.tools,
-            handle_parsing_errors=True,
-            return_intermediate_steps=True,
-        )
-
-        return _add_task_instructions | apply_agent_executor_adapter(
-            executor, state_reader=env.read_state
-        )
@@ -1,122 +0,0 @@
-import ast
-import re
-from typing import Dict, Optional, Union
-
-from langchain.agents import AgentOutputParser
-from langchain.pydantic_v1 import BaseModel, Field
-from langchain_core.agents import AgentAction, AgentActionMessageLog, AgentFinish
-from langchain_core.exceptions import OutputParserException
-from langchain_core.messages import AIMessage
-
-
-class _ToolInvocationRequest(BaseModel):
-    """Light-weight pydantic model for validating the raw tool invocation request.
-
-    The purpose of this model, is to make sure that whatever as parsed from
-    the raw llm output has `tool_name` and potential `arguments` fields, and
-    nothing else.
-    """
-
-    tool_name: str
-    # OK parameterless tools which do not take arguments
-    arguments: Optional[Dict] = Field(default_factory=dict)
-
-
-class GenericAgentParser(AgentOutputParser):
-    """A generalized parser that makes it easier to parameterize different parsing."""
-
-    wrapping_xml_tag: str
-    """The tag that wraps the function invocation request.
-    
-    For example, if "tool", then the function invocation request should be wrapped
-    in <tool>...</tool>.
-    """
-    require_closing_xml_tag: bool = False
-    """Whether we should require a closing tag for the wrapping_xml_tag.
-    
-    For example, if True, then the function invocation request should be wrapped
-    """
-
-    def parse(self, text: str) -> Union[AgentFinish, AgentAction]:
-        """Parse the output of the agent."""
-        open_tag = f"<{self.wrapping_xml_tag}>"
-        close_tag = f"</{self.wrapping_xml_tag}>"
-        if open_tag in text:
-            # This is a hack to make sure that </tool> is always present
-            # in the output if <tool>. </tool> may be a stop sequence for the
-            # language model, so depending on implementation
-            # the stop sequence may be cut off.
-            # There might be a better way to do this, but this works and
-            # is simple.
-            if not self.require_closing_xml_tag:
-                text += close_tag
-
-        pattern = rf"{open_tag}(?P<invocation>.*?){close_tag}"
-        match = re.search(pattern, text, re.DOTALL)
-        if match:
-            content = match.group("invocation").strip()
-            return parse_invocation(content, self.wrapping_xml_tag)
-
-        return AgentFinish(
-            log=text,
-            return_values={
-                "output": text,
-            },
-        )
-
-
-def parse_invocation(text: str, tag: str) -> AgentAction:
-    """Parse the content of the function invocation.
-
-    Args:
-        text: The text to parse.
-        tag: The tag that wraps the function invocation request.
-
-    Returns:
-        An AgentAction that corresponds to the function invocation.
-
-    Raises:
-        OutputParserException: If the parsing fails.
-
-        This exception is meant to be caught by the agent executor and
-        handled appropriately to provide feedback to the LLM.
-    """
-    ai_content = f"<{tag}>{text}</{tag}>\n"
-
-    try:
-        result = ast.literal_eval(text)
-    except BaseException as e:
-        # Convert this to something controllable by the user.
-        err_msg = (
-            f"ERROR: Please use the format "
-            f'<{tag}>{{"tool_name": $TOOL_NAME, "arguments": $ARGUMENTS}}</{tag}>\n'
-        )
-
-        raise OutputParserException(
-            error=e,
-            llm_output=ai_content,
-            observation=err_msg,
-            send_to_llm=True,
-        )
-
-    try:
-        request = _ToolInvocationRequest.validate(result)
-    except Exception as e:  # Using broad exception since it's not just ValidationError
-        # Can also raise DictError if result is not a dict.
-        err_msg = (
-            f"ERROR: Please use the format "
-            f'<{tag}>{{"tool_name": $TOOL_NAME, "arguments": $ARGUMENTS}}</{tag}>\n'
-        )
-        raise OutputParserException(
-            error=e,
-            llm_output=ai_content,
-            send_to_llm=True,
-            observation=err_msg,
-        )
-
-    return AgentActionMessageLog(
-        message_log=[AIMessage(content=ai_content)],
-        tool=request.tool_name,
-        tool_input=request.arguments,
-        log=f"\nInvoking {request.tool_name}: {request.arguments}\n\t",
-    )
@@ -1,42 +0,0 @@
-AGENT_INSTRUCTIONS_XML_FORMAT = """\
-In this environment you have access to a set of tools you can use to answer the user's question.
-
-You may call them like this:
-<function_calls>
-<invoke>
-<tool_name>$TOOL_NAME</tool_name>
-<parameters>
-<$PARAMETER_NAME>$PARAMETER_VALUE</$PARAMETER_NAME>
-...
-</parameters>
-</invoke>
-</function_calls>
-
-Here are the tools available:
-
-{tool_description}
-"""  # noqa: E501
-
-_AGENT_INSTRUCTIONS_BLOB_STYLE = """\
-In this environment you have access to a set of tools you can use to answer the user's question.
-
-Here are the tools available:
-
-{tool_description}
-
-You may call one tool at a time using a format that includes <tool> and </tool> tag. 
-
-Inside the tag the content is a python dictionary that uses python literals (e.g., numbers, strings, lists, dictionaries, etc.) to specify the tool invocation.
-
-It must match the schema of the function as described in the tool description.
-"arguments" is a dictionary of the arguments to the function.
-
-<tool>
-{{
-    "tool_name": $TOOL_NAME,
-    "arguments": $ARGUMENTS
-}}
-</tool>
-
-If you do not know the answer use more tools. You can only take a single action at a time.\
-"""  # noqa: E501
@@ -1,57 +0,0 @@
-"""Utilities to extract information from langchain tools for use in prompts."""
-import inspect
-from textwrap import dedent
-from typing import List
-
-from langchain.tools.base import StructuredTool
-
-from langchain_benchmarks.tool_usage.agents.experimental.encoder import (
-    FunctionDefinition,
-    Parameter,
-)
-
-# PUBLIC API
-
-
-def get_parameters_from_tool(tool: StructuredTool) -> List[Parameter]:
-    """Convert a langchain tool to a tool user tool."""
-    schema = tool.args_schema.schema()
-
-    properties = schema["properties"]
-    parameters = []
-    # Is this needed or is string OK?
-    type_adapter = {
-        "string": "str",  # str or string?
-        "integer": "int",
-        "number": "float",
-        "boolean": "bool",
-    }
-    for key, value in properties.items():
-        parameters.append(
-            {
-                "name": key,
-                "type": type_adapter.get(value["type"], value["type"]),
-                "description": value.get("description", ""),
-            }
-        )
-
-    return parameters
-
-
-#
-def convert_tool_to_function_definition(tool: StructuredTool) -> FunctionDefinition:
-    """Convert a langchain tool to a tool user tool."""
-    # Here we re-inspect the underlying function to get the doc-string
-    # since StructuredTool modifies it, but we want the raw one for maximum
-    # flexibility.
-    description = inspect.getdoc(tool.func)
-
-    parameters = get_parameters_from_tool(tool)
-    return {
-        "name": tool.name,
-        "description": dedent(description),
-        "parameters": parameters,
-        "return_value": {
-            "type": "Any",
-        },
-    }
@@ -1,77 +0,0 @@
-"""Code for creating an assistant factory for evaluating tool usage tasks.
-
-See: https://platform.openai.com/docs/assistants/how-it-works/creating-assistants
-"""
-from typing import Optional
-
-from langchain.agents import AgentExecutor
-from langchain.agents.openai_assistant.base import OpenAIAssistantRunnable
-from langchain.schema.runnable import Runnable
-
-from langchain_benchmarks import rate_limiting
-from langchain_benchmarks.schema import ToolUsageTask
-from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
-
-
-class OpenAIAssistantFactory:
-    def __init__(
-        self,
-        task: ToolUsageTask,
-        *,
-        model: str,
-        rate_limiter: Optional[rate_limiting.RateLimiter] = None,
-        num_retries: int = 0,
-    ) -> None:
-        """Create an OpenAI agent factory for the given task.
-
-        Args:
-            task: The task to create an agent factory for.
-            model: The model to use -- this must be an open AI model.
-            rate_limiter: The rate limiter to use
-            num_retries: The number of times to retry the assistant if it fails
-        """
-        if not isinstance(model, str):
-            raise ValueError(f"Expected str for model, got {type(model)}")
-        self.task = task
-        tools = task.create_environment().tools
-        # Stateless, so we only need to create it once
-        self.agent = OpenAIAssistantRunnable.create_assistant(
-            name=f"{task.name} assistant",
-            instructions=self.task.instructions,
-            tools=tools,
-            model=model,
-            as_agent=True,
-        )
-        self.rate_limiter = rate_limiter
-        self.num_retries = num_retries
-
-    def __call__(self) -> Runnable:
-        env = self.task.create_environment()
-
-        agent = self.agent
-        if self.rate_limiter is not None:
-            # Rate limited model
-            agent = rate_limiting.with_rate_limit(agent, self.rate_limiter)
-
-        def _map_key(x: dict):
-            # Assistant expects the 'content' key explicitly
-            return {
-                "content": x["input"],
-                **{k: v for k, v in x.items() if k != "input"},
-            }
-
-        agent = _map_key | self.agent
-        if self.num_retries > 0:
-            agent = agent.with_retry(
-                stop_after_attempt=self.num_retries + 1,
-            )
-        runnable = AgentExecutor(
-            agent=agent,
-            tools=env.tools,
-            handle_parsing_errors=True,
-            return_intermediate_steps=True,
-        )
-
-        # Returns `state` in the output if the environment has a state reader
-        # makes sure that `output` is always in the output
-        return apply_agent_executor_adapter(runnable, state_reader=env.read_state)
@@ -1,166 +0,0 @@
-"""Code for creating an agent factory for evaluating tool usage tasks."""
-from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union
-
-from langchain.agents import AgentExecutor
-from langchain.agents.format_scratchpad.openai_tools import (
-    format_to_openai_tool_messages,
-)
-from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
-from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
-from langchain.schema.runnable import Runnable
-from langchain.tools.render import format_tool_to_openai_tool
-from langchain_core.language_models import BaseChatModel, BaseLanguageModel
-from langchain_core.language_models.base import LanguageModelInput
-from langchain_core.messages import BaseMessage
-from langchain_core.pydantic_v1 import BaseModel
-
-from langchain_benchmarks import model_registry, rate_limiting
-from langchain_benchmarks.model_registration import RegisteredModel
-from langchain_benchmarks.schema import ToolUsageTask
-from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
-
-# PUBLIC API
-
-
-def _bind_tools(
-    llm: BaseChatModel,
-    tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable]],
-    tool_choice: Optional[str] = None,
-    json_mode: bool = False,
-    **kwargs: Any,
-) -> Runnable[LanguageModelInput, BaseMessage]:
-    """Bind tools (and other objects) to this chat model.
-
-    Args:
-        tools: A list of tool definitions to bind to this chat model.
-            Can be  a dictionary, pydantic model, or callable. Pydantic
-            models and callables will be automatically converted to
-            their schema dictionary representation.
-        tool_choice: Which tool to require the model to call.
-            Must be the name of the single provided tool or
-            "auto" to automatically determine which tool to call
-            (if any).
-        json_mode: Whether to set JSON mode for the tool call.
-            This guarantees the model will respond in valid JSON
-            (unless truncated).
-        kwargs: Any additional parameters to pass to the
-            :class:`~langchain.runnable.Runnable` constructor.
-
-    """
-    formatted_tools: List[Dict[str, Union[str, dict]]] = [
-        format_tool_to_openai_tool(tool) for tool in tools
-    ]
-    if tool_choice is not None:
-        if not formatted_tools:
-            raise ValueError(
-                "When specifying `tool_choice`, you must provide at least one " "tool."
-            )
-        tool_names = [tool["function"]["name"] for tool in formatted_tools]
-        if not any(tool_name == tool_choice for tool_name in tool_names):
-            raise ValueError(
-                f"Tool choice {tool_choice} was specified, but the only "
-                f"provided tools were {tool_names}."
-            )
-        tool_choice_ = {"type": "function", "function": {"name": tool_choice}}
-        kwargs = {**kwargs, "tool_choice": tool_choice_}
-    if json_mode:
-        kwargs = {**kwargs, "response_format": {"type": "json_object"}}
-    return llm.bind(
-        tools=formatted_tools,
-        **kwargs,
-    )
-
-
-class OpenAIAgentFactory:
-    def __init__(
-        self,
-        task: ToolUsageTask,
-        *,
-        model: Union[
-            str, RegisteredModel, BaseLanguageModel, BaseChatModel
-        ] = "gpt-3.5-turbo-16k",
-        rate_limiter: Optional[rate_limiting.RateLimiter] = None,
-        num_retries: int = 0,
-    ) -> None:
-        """Create an OpenAI agent factory for the given task.
-
-        Args:
-            task: The task to create an agent factory for.
-            model: The model to use -- this must be an open AI model.
-            rate_limiter: The rate limiter to use
-        """
-        self.task = task
-        self.model = model
-        self.rate_limiter = rate_limiter
-        self.num_retries = num_retries
-
-    def _create_model(self) -> Union[BaseChatModel, BaseLanguageModel]:
-        if isinstance(self.model, RegisteredModel):
-            return self.model.get_model(
-                model_params={"temperature": 0, "model_kwargs": {"seed": 0}}
-            )
-        elif isinstance(self.model, (BaseChatModel, BaseLanguageModel)):
-            return self.model
-        elif isinstance(self.model, str):
-            if self.model in model_registry:
-                registered_model = model_registry.get_model(self.model)
-                model_instance = registered_model.get_model(
-                    model_params={"temperature": 0, "model_kwargs": {"seed": 0}}
-                )
-                return model_instance
-            else:
-                raise ValueError(f"Unknown model: {self.model}")
-        else:
-            raise TypeError(f"Expected str or RegisteredModel, got {type(self.model)}")
-
-    def create(self) -> Runnable:
-        """Agent Executor"""
-        # For backwards compatibility
-        return self()
-
-    def __call__(self) -> Runnable:
-        model = self._create_model()
-        env = self.task.create_environment()
-
-        model = _bind_tools(model, env.tools)
-
-        if self.rate_limiter is not None:
-            # Rate limited model
-            model = rate_limiting.with_rate_limit(model, self.rate_limiter)
-
-        prompt = ChatPromptTemplate.from_messages(
-            [
-                (
-                    "system",
-                    self.task.instructions,
-                ),
-                ("user", "{input}"),
-                MessagesPlaceholder(variable_name="agent_scratchpad"),
-            ]
-        )
-
-        runnable_agent = (
-            {
-                "input": lambda x: x["input"],
-                "agent_scratchpad": lambda x: format_to_openai_tool_messages(
-                    x["intermediate_steps"]
-                ),
-            }
-            | prompt
-            | model
-            | OpenAIToolsAgentOutputParser()
-        )
-        if self.num_retries > 0:
-            runnable_agent = runnable_agent.with_retry(
-                stop_after_attempt=self.num_retries + 1,
-            )
-        runnable = AgentExecutor(
-            agent=runnable_agent,
-            tools=env.tools,
-            handle_parsing_errors=True,
-            return_intermediate_steps=True,
-        )
-
-        # Returns `state` in the output if the environment has a state reader
-        # makes sure that `output` is always in the output
-        return apply_agent_executor_adapter(runnable, state_reader=env.read_state)
@@ -10,9 +10,10 @@ from langchain_core.runnables import Runnable

 from langchain_benchmarks.schema import ToolUsageTask
 from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
+from langchain_benchmarks.tool_usage.agents.base import AgentFactory


-class CustomRunnableAgentFactory:
+class CustomRunnableAgentFactory(AgentFactory):
    """A factory for creating tool using agents.

    A factory for agents that do not leverage any special JSON mode for
@@ -46,4 +47,6 @@ class CustomRunnableAgentFactory:
            return_intermediate_steps=True,
        )

-        return apply_agent_executor_adapter(executor, state_reader=env.read_state)
+        return apply_agent_executor_adapter(
+            executor, state_reader=env.read_state
+        ).with_config({"run_name": "Agent", "metadata": {"task": self.task.name}})
@@ -4,7 +4,7 @@ This is useful for agents that follow the standard LangChain tool format.
 """
 from typing import Optional

-from langchain.agents import AgentExecutor
+from langchain.agents import AgentExecutor, create_tool_calling_agent
 from langchain_core.language_models import BaseChatModel
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import Runnable
@@ -12,9 +12,10 @@ from langchain_core.runnables import Runnable
 from langchain_benchmarks.rate_limiting import RateLimiter, with_rate_limit
 from langchain_benchmarks.schema import ToolUsageTask
 from langchain_benchmarks.tool_usage.agents.adapters import apply_agent_executor_adapter
+from langchain_benchmarks.tool_usage.agents.base import AgentFactory


-class StandardAgentFactory:
+class StandardAgentFactory(AgentFactory):
    """A standard agent factory.

    Use this factory with chat models that support the standard LangChain tool
@@ -55,8 +56,6 @@ class StandardAgentFactory:

    def __call__(self) -> Runnable:
        """Call the factory to create Runnable agent."""
-        # Temporarily import here until new langchain is released with create_tools_agent
-        from langchain.agents import create_tool_calling_agent

        env = self.task.create_environment()

@@ -10,11 +10,11 @@ from typing import Any, Literal, Optional, Union

 from langchain.callbacks.manager import collect_runs
 from langchain.chains import LLMChain
-from langchain.chat_models import ChatOpenAI
 from langchain.evaluation import EvaluatorType, StringEvaluator, load_evaluator
 from langchain.evaluation.schema import StringEvaluator
 from langchain.smith import RunEvalConfig
 from langchain_core.language_models import BaseChatModel, BaseLanguageModel
+from langchain_openai import ChatOpenAI
 from langsmith.evaluation.evaluator import (
    EvaluationResult,
    EvaluationResults,
@@ -175,7 +175,7 @@ class AgentTrajectoryEvaluator(RunEvaluator):
            eval_llm = eval_llm or ChatOpenAI(
                model="gpt-4",
                temperature=0,
-                model_kwargs={"seed": 42},
+                seed=42,
                max_retries=1,
                request_timeout=60,
            )
@@ -33,7 +33,9 @@ INPUT_A: input_a here
 INPUT_B: input_b here
 COMPARISON: CORRECT or INCORRECT here

-Ignore differences in punctuation and phrasing between the student answer and true answer. 
+Ignore differences in punctuation and phrasing between the student answer and true answer, please only compare the first 4 decimal digits.
+
+For instance if INPUT_A = 123.6751345 and INPUT_B = 123.6751456 you should return CORRECT, since the first 4 decimal points match.

 Begin!

@@ -245,6 +245,9 @@ DATASET = DATASET_TINY + [
    },
 ]

+# Provided here for backwards compatibility, but we do not register
+# it as a task in the task registry.
+# TINY is just the multiverse math task with 10 examples instead of full dataset.
 MULTIVERSE_MATH_TINY = ToolUsageTask(
    name="Multiverse Math (Tiny)",
    dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d",
@@ -0,0 +1,996 @@
+from datetime import datetime
+from typing import List, Literal, Union, cast
+
+from langchain.tools import BaseTool, tool
+from langchain_core.messages import HumanMessage
+from langsmith.client import Client
+from pydantic import BaseModel, Field
+
+from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask
+
+
+# Argument schema for a documentation search. get_environment() wraps this
+# class with `tool(...)` so it is offered to the agent as one of the tools.
+class DocQuery(BaseModel):
+    """Query against documentation"""
+
+    # Required free-text search query (no default).
+    query: str = Field(..., description="The question to answer")
+    # Required; the Literal restricts it to exactly these three product names.
+    source: Literal["langchain", "langsmith", "langgraph"] = Field(
+        ...,
+        description="The documentation source to search against. Should be one of 'langchain', 'langsmith', or "
+        "'langgraph' depending on which one product the user question pertains to",
+    )
+
+
+# Argument schema for a tweet search. get_environment() wraps this class with
+# `tool(...)` so it is offered to the agent as one of the tools.
+class TweetQuery(BaseModel):
+    """Query against tweets"""
+
+    # Required search subject (no default).
+    subject: str = Field(..., description="Subject to search for")
+    # Optional like-count bounds; None (the default) leaves that side unset.
+    min_likes: Union[int, None] = Field(
+        None, description="Minimum amount of likes on the tweet"
+    )
+    max_likes: Union[int, None] = Field(
+        None, description="Maximum amount of likes on the tweet"
+    )
+    # Optional date window; both ends default to None.
+    start_date: Union[datetime, None] = Field(
+        None, description="Earliest date to start pulling tweets from"
+    )
+    end_date: Union[datetime, None] = Field(
+        None,
+        description="Latest date to pull tweets from, None if pulling up to the present",
+    )
+    # Defaults to False.
+    has_link: bool = Field(
+        False, description="Whether to query for tweets that have a link."
+    )
+
+
+# Argument schema for a blog-post search. get_environment() wraps this class
+# with `tool(...)` so it is offered to the agent as one of the tools.
+class BlogQuery(BaseModel):
+    """Query against blog posts"""
+
+    # Required (no default), but None is an accepted value.
+    subject: Union[str, None] = Field(..., description="Subject to search for")
+    # Optional author filter. Annotated as nullable because the default is None
+    # and the reference tool calls in this module pass `"authors": None`; a
+    # plain `List[str]` annotation would reject that value on validation.
+    authors: Union[List[str], None] = Field(
+        None,
+        description="Authors to search for. None if not searching for a speific author,  list if searching for more than one.",
+    )
+    # Optional date window; both ends default to None.
+    start_date: Union[datetime, None] = Field(
+        None, description="Earliest date to start pulling blog posts from"
+    )
+    end_date: Union[datetime, None] = Field(
+        None, description="Latest date to pull blog posts from"
+    )
+
+
+def get_environment() -> ToolUsageEnvironment:
+    """Build the tool-usage environment used by the query analysis task.
+
+    Each query schema (TweetQuery, DocQuery, BlogQuery, in that order) is
+    wrapped with ``tool`` and the resulting list is handed to the environment.
+    No state reader is supplied (``read_state`` is None).
+    """
+    wrapped = []
+    for schema in (TweetQuery, DocQuery, BlogQuery):
+        wrapped.append(tool(schema))
+    # The cast only informs the type checker; it does nothing at runtime.
+    query_tools = cast(List[BaseTool], wrapped)
+    return ToolUsageEnvironment(tools=query_tools, read_state=None)
+
+
+DOC_DATASET = [
+    {
+        "question": [
+            HumanMessage(
+                "Can I use the send method to map-reduce the values of different branch points?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "send method map-reduce", "source": "langgraph"},
+            }
+        ],
+    },
+    {
+        "question": [HumanMessage("where is olllama function calling mentioned?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "ollama function calling", "source": "langchain"},
+            },
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "ollama function calling",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": None,
+                    "end_date": None,
+                    "has_link": False,
+                },
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "ollama function calling",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage("Are pairwise evals supported for different models?")
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "pairwise evals different models",
+                    "source": "langsmith",
+                },
+            }
+        ],
+    },
+    {
+        "question": [HumanMessage("Can a user update state during a run?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "user update state", "source": "langgraph"},
+            }
+        ],
+    },
+    {
+        "question": [HumanMessage("Can I change config after each AI response?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "update model config", "source": "langchain"},
+            }
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "How can I build my own run rules? Can I set up a schedule for them?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "custom run rules", "source": "langsmith"},
+            },
+            {
+                "name": "DocQuery",
+                "args": {"query": "run rules schedule", "source": "langsmith"},
+            },
+        ],
+    },
+    {
+        "question": [HumanMessage("Is there a page on routing functions?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "routing functions", "source": "langgraph"},
+            }
+        ],
+    },
+    {
+        "question": [
+            HumanMessage("Is there information on using Pinecone as a vectorstore?")
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "Pinecone vectorstore",
+                    "source": "langchain",
+                },
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "Pinecone vectorstore",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+    {
+        "question": [HumanMessage("is it possible to prevent exposing personal data?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "personal data privacy", "source": "langsmith"},
+            }
+        ],
+    },
+    {
+        "question": [HumanMessage("How do you use conditional entry?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "conditional entry", "source": "langgraph"},
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "How do I extract text from PDF data using PyPDF? Can I combine image and text in a prompt?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "PDF extraction using PyPDF", "source": "langchain"},
+            },
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "combine image and text in a prompt",
+                    "source": "langchain",
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "How do I setup automation rules for my chat model app? How do I view logs for those rules?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "automation rules for chat model app",
+                    "source": "langsmith",
+                },
+            },
+            {
+                "name": "DocQuery",
+                "args": {"query": "automation rules logs", "source": "langsmith"},
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage("where can I read about how use Chroma embeddings locally?")
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "local Chroma embeddings", "source": "langchain"},
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "local Chroma embeddings",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+    {
+        "question": [HumanMessage("how to index documents in a RAG app?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "index documents RAG app", "source": "langchain"},
+            },
+            {
+                "name": "DocQuery",
+                "args": {"query": "index documents RAG app", "source": "langgraph"},
+            },
+        ],
+    },
+]
+
+TWEET_DATASET = [
+    {
+        "question": [
+            HumanMessage(
+                "Did we have any announcements about agents with more than 1000 likes that also included a link?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "agents",
+                    "min_likes": 1000,
+                    "max_likes": None,
+                    "start_date": None,
+                    "end_date": None,
+                    "has_link": True,
+                },
+            }
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "Are there any posts about evaluators by langchain with less than 100 likes?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "evaluators",
+                    "min_likes": None,
+                    "max_likes": 100,
+                    "start_date": None,
+                    "end_date": None,
+                    "has_link": False,
+                },
+            }
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "Is there anywhere on socials where we link to the anthropic website in the last year?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "anthropic",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": datetime(2023, 1, 1),
+                    "end_date": None,
+                    "has_link": True,
+                },
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "anthropic",
+                    "authors": None,
+                    "start_date": datetime(2023, 1, 1),
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+    {
+        "question": [HumanMessage("In Q2 2023 what updates to LangSmith were made?")],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "LangSmith",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": datetime(2023, 4, 1),
+                    "end_date": datetime(2023, 6, 30),
+                    "has_link": False,
+                },
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "LangSmith",
+                    "authors": None,
+                    "start_date": datetime(2023, 4, 1),
+                    "end_date": datetime(2023, 6, 30),
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "Were there any social media posts with triple digit likes about few shot prompting?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "few shot prompting",
+                    "min_likes": 100,
+                    "max_likes": 999,
+                    "start_date": None,
+                    "end_date": None,
+                    "has_link": False,
+                },
+            }
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "Are there any posts about LangServe before June 2023 that have more than 2000 likes and include a link?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "LangServe",
+                    "min_likes": 2000,
+                    "max_likes": None,
+                    "start_date": None,
+                    "end_date": datetime(2023, 5, 31),
+                    "has_link": True,
+                },
+            }
+        ],
+    },
+]
+
+BLOG_DATASET = [
+    {
+        "question": [
+            HumanMessage("Have there been release notes in the past year about agents?")
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "agents",
+                    "authors": None,
+                    "start_date": datetime(2023, 1, 1),
+                    "end_date": None,
+                },
+            }
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "how many press releases mentioned chat-gpt in the month after October 2023?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "chat-gpt",
+                    "authors": None,
+                    "start_date": datetime(2023, 11, 1),
+                    "end_date": datetime(2023, 11, 30),
+                },
+            },
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "chat-gpt",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": datetime(2023, 11, 1),
+                    "end_date": datetime(2023, 11, 30),
+                    "has_link": False,
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage("what has been said about universal configurable models?")
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "universal configurable models",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "universal configurable models",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": None,
+                    "end_date": None,
+                    "has_link": False,
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "In the last week, Have Harrison or Bagatur written anything about passing in runnables as tools in LangChain?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "runnables as tools",
+                    "authors": ["Harrison", "Bagatur"],
+                    "start_date": datetime(2023, 12, 25),
+                    "end_date": None,
+                },
+            }
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "Are there any case studies of agents running on swe-benchmark?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "agents running on swe-benchmark",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            }
+        ],
+    },
+    {
+        "question": [HumanMessage("Why is using fewshot prompting helpful?")],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "fewshot prompting",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+            {
+                "name": "DocQuery",
+                "args": {"query": "few shot prompting", "source": "langchain"},
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "i need to implement similarity search with filtering in FAISS. how can i do that in my app?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "similarity search with FAISS",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            }
+        ],
+    },
+]  # Release notes/announcements + case studies
+
+AMBIGUOUS_DATASET = [
+    {
+        "question": [
+            HumanMessage(
+                "I want to migrate from agentexecutor to langgraph. What do I need to do?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "migrate agentexecutor", "source": "langchain"},
+            },
+            {
+                "name": "DocQuery",
+                "args": {"query": "migrate agentexecutor", "source": "langgraph"},
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "In the last month, what are the latest updates to the openai partner package?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "openai partner package",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": datetime(2023, 12, 1),
+                    "end_date": None,
+                    "has_link": False,
+                },
+            }
+        ],
+    },
+    # Ambiguous case: the reference expects both a docs query and a blog query.
+    {
+        "question": [
+            HumanMessage(
+                "What are best practices for setting up a document loader for a RAG chain?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "document loader for RAG chain",
+                    "source": "langchain",
+                },
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    # Spelling fixed ("practies" -> "practices") so the reference
+                    # matches what a correct answer would contain.
+                    "subject": "document loader best practices",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+    {
+        "question": [HumanMessage("case studies using langgraph last week?")],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "langgraph case studies",
+                    "authors": None,
+                    "start_date": datetime(2023, 12, 25),
+                    "end_date": None,
+                },
+            }
+        ],
+    },
+]
+
+DATASET = DOC_DATASET + TWEET_DATASET + BLOG_DATASET + AMBIGUOUS_DATASET
+
+# Task definition for the query-analysis benchmark: the agent receives the three
+# query tools built by get_environment() together with the instructions below.
+# NOTE(review): `name` is "Extraction Task" while the variable name and the
+# description are about query analysis -- confirm the intended task name.
+QUERY_ANALYSIS_TASK = ToolUsageTask(
+    name="Extraction Task",
+    # NOTE(review): this URL is identical to the dataset_id of the
+    # "Multiverse Math (Tiny)" task -- looks like a copy-paste placeholder;
+    # confirm the correct dataset id for this task.
+    dataset_id="https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d",
+    create_environment=get_environment,
+    # The prompt fixes "today" to 2024-01-01, which is what the datetime values
+    # in this module's reference tool calls are computed relative to.
+    # NOTE(review): the prompt says both "each question should be answered by a
+    # single query" and "you can return multiple queries to answer one
+    # question" -- confirm the intended wording.
+    instructions=(
+        """
+                    You are requested to generate queries for searching either through tweets, docs, or blog entries. 
+                    Inside the docs there are three different sources that you may wish to query for: LangGraph, LangSmith, or LangChain. 
+                    LangGraph is a library for building multi-actor applications with LLMs, used to create agent and multi-agent workflows. 
+                    LangSmith is an all-in-one developer platform for every step of the LLM-powered application lifecycle. 
+                    It helps you debug, evaluate, test, and monitor your LLM applications. LangChain is a framework to build with LLMs by chaining interoperable components.
+                    One last important thing to remember is that some queries will ask for date ranges, and you must remember that today is 2024-01-01. Also, remember that \
+                    each question should be answered by a single query. In addition, you can return multiple queries to answer one question. Do not generate text, just tool calls that \
+                    if executed would answer the users question. Do NOT pass the whole question as the query/subject, only extract key ideas/words.
+                 """
+    ),
+    description=(
+        """\
+An environment that contains three different mock query tools for searching through LangChain material.
+
+The three tools are for querying LangChain documentation, tweets, and blogs respectively.
+
+The objective of the task it to measure how well the agent can select the correct tool and \
+select the right parameters for the query. It is not a test of the actual querying process, \
+merely the process of constructing the query.
+"""
+    ),
+    # NOTE(review): "qa_math_without_question" looks math-specific -- confirm it
+    # is the evaluator wanted for comparing generated tool calls.
+    eval_params={
+        "output_evaluation": "qa_math_without_question",
+    },
+)
+
+FEW_SHOT_DATASET = [
+    {
+        "question": [
+            HumanMessage(
+                "What are good rules to follow when using multi modal chat models?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "multi modal chat models", "source": "langchain"},
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "multi modal chat models",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage("How do you build a RAG chain with a Postgres vectorstore?")
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "RAG chain with Postgres vectorstore",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "RAG chain with Postgres vectorstore",
+                    "source": "langchain",
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage("What case studies have we written about tool usage?")
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "tool usage case study",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+    {
+        "question": [HumanMessage("How do I migrate from run_on_dataset to evaluate?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "migrate run_on_dataset to evaluate",
+                    "source": "langchain",
+                },
+            },
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "migrate run_on_dataset to evaluate",
+                    "source": "langsmith",
+                },
+            },
+        ],
+    },
+    # Few-shot example: date window plus an upper bound on likes.
+    {
+        "question": [
+            HumanMessage(
+                "Do any of our posts in the last 2 months about Anthropic have less than 100 likes?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "Anthropic",
+                    "min_likes": None,
+                    "max_likes": 100,
+                    "start_date": datetime(2023, 11, 1),
+                    "end_date": None,
+                    # The question never mentions links, so use the schema
+                    # default (False), as the other link-agnostic examples do.
+                    "has_link": False,
+                },
+            }
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "Did we release any information about claude-3.5 in the last week?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "claude-3.5",
+                    "authors": None,
+                    "start_date": datetime(2023, 12, 25),
+                    "end_date": None,
+                },
+            },
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "claude-3.5",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": datetime(2023, 12, 25),
+                    "end_date": None,
+                    "has_link": False,
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "Do we have press statements about filtering traces by metadata before October 2023?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "filtering traces by metadata",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": datetime(2023, 9, 30),
+                },
+            },
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "filtering traces by metadata",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": None,
+                    "end_date": datetime(2023, 9, 30),
+                    "has_link": False,
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "What updates to mistral partner package were posted in the last year?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "mistral partner package",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": datetime(2023, 1, 1),
+                    "end_date": None,
+                    "has_link": False,
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "Have there been updates to the best practices for initializing chat models in the past month?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "best practices for initializing chat models",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": datetime(2023, 12, 1),
+                    "end_date": None,
+                    "has_link": False,
+                },
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "best practices for initializing chat models",
+                    "authors": None,
+                    "start_date": datetime(2023, 12, 1),
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "How can I learn about the differences between chat agents and graphs"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "differences between chat agents and graphs",
+                    "source": "langchain",
+                },
+            },
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "differences between chat agents and graphs",
+                    "source": "langgraph",
+                },
+            },
+        ],
+    },
+    {
+        "question": [
+            HumanMessage(
+                "What are good practices to follow for switching from legacy packages?"
+            )
+        ],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "switching from legacy packages",
+                    "source": "langchain",
+                },
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "switching from legacy packages",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+    {
+        "question": [HumanMessage("What data is exposed when I run custom evals?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {
+                    "query": "data exposed running custom evaluation",
+                    "source": "langsmith",
+                },
+            },
+        ],
+    },
+    {
+        "question": [HumanMessage("Where are document loaders talked about?")],
+        "tool_calls": [
+            {
+                "name": "DocQuery",
+                "args": {"query": "document loaders", "source": "langchain"},
+            },
+            {
+                "name": "TweetQuery",
+                "args": {
+                    "subject": "document loaders",
+                    "min_likes": None,
+                    "max_likes": None,
+                    "start_date": None,
+                    "end_date": None,
+                    "has_link": False,
+                },
+            },
+            {
+                "name": "BlogQuery",
+                "args": {
+                    "subject": "document loaders",
+                    "authors": None,
+                    "start_date": None,
+                    "end_date": None,
+                },
+            },
+        ],
+    },
+]
+
+
+def _create_dataset(examples: list, dataset_id: str) -> None:
+    """Upload each example to a dataset through the langsmith client.
+
+    Args:
+        examples: Items providing "question" and "tool_calls" keys.
+        dataset_id: Forwarded unchanged to ``Client.create_example``.
+    """
+    langsmith_client = Client()
+    for item in examples:
+        # The question goes in as the input; the expected tool calls are
+        # stored under the "reference" output key.
+        example_inputs = {"question": item["question"]}
+        example_outputs = {"reference": item["tool_calls"]}
+        langsmith_client.create_example(
+            inputs=example_inputs,
+            outputs=example_outputs,
+            dataset_id=dataset_id,
+        )
@@ -1,48 +1,33 @@
 [tool.poetry]
 name = "langchain-benchmarks"
-version = "0.0.11"
+version = "0.0.15"
 description = "🦜💪 Flex those feathers!"
 authors = ["LangChain AI"]
 license = "MIT"
 readme = "README.md"

 [tool.poetry.dependencies]
-python = "^3.8.1"
-langchain = "^0.1.0"
+python = "^3.9"
+langchain = "^0.3"
+langchain-community = "^0.3"
+langchain-core= "^0.3.12"
 langsmith = ">=0.0.70"
 tqdm = "^4"
 ipywidgets = "^8"
 tabulate = ">=0.8.0"
+langchain-openai = "^0.2"

 [tool.poetry.group.dev]
 optional = true

 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"
-langchain-core = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core"}
-langchain = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/langchain"}
-langchain-anthropic = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/anthropic"}
-langchain-fireworks = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/fireworks"}
-langchain-mistralai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/mistralai"}
-langchain-cohere = {git = "https://github.com/langchain-ai/langchain-cohere.git", subdirectory="libs/cohere"}
-langchain-groq = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/groq"}
-langchain-openai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/openai"}
-

 [tool.poetry.group.typing]
 optional = true

 [tool.poetry.group.typing.dependencies]
 mypy = "^1.7.0"
-langchain-core = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core"}
-langchain = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/langchain"}
-langchain-anthropic = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/anthropic"}
-langchain-fireworks = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/fireworks"}
-langchain-mistralai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/mistralai"}
-langchain-cohere = {git = "https://github.com/langchain-ai/langchain-cohere.git", subdirectory="libs/cohere"}
-langchain-groq = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/groq"}
-langchain-openai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/openai"}
-
 [tool.poetry.group.lint]
 optional = true

@@ -73,14 +58,12 @@ pytest-socket = "^0.6.0"
 pytest-watch = "^4.2.0"
 pytest-timeout = "^2.2.0"
 freezegun = "^1.3.1"
-langchain-core = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core"}
-langchain = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/langchain"}
-langchain-anthropic = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/anthropic"}
-langchain-fireworks = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/fireworks"}
-langchain-mistralai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/mistralai"}
-langchain-cohere = {git = "https://github.com/langchain-ai/langchain-cohere.git", subdirectory="libs/cohere"}
-langchain-groq = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/groq"}
-langchain-openai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/openai"}
+langchain-anthropic = "^0.2"
+langchain-fireworks = "^0.2"
+langchain-mistralai = "^0.2"
+langchain-groq = "^0.2"
+langchain-core = "^0.3.12"
+faiss-cpu = ">=1.8.0"

 [tool.ruff]
 select = [
@@ -0,0 +1,192 @@
+import datetime
+import sys
+import uuid
+
+from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
+from langchain_core.messages.utils import convert_to_messages
+from langsmith.client import Client
+
+from langchain_benchmarks import __version__
+
+sys.path.append("./../langchain_benchmarks")
+from langchain.agents import AgentExecutor, create_tool_calling_agent
+from langchain.chat_models import init_chat_model
+from langsmith.evaluation import evaluate
+from tool_usage.tasks.multiverse_math import *
+
+tests = [
+    (
+        "claude-3-haiku-20240307",
+        "anthropic",
+    ),
+    (
+        "claude-3-sonnet-20240229",
+        "anthropic",
+    ),
+    (
+        "claude-3-opus-20240229",
+        "anthropic",
+    ),
+    (
+        "claude-3-5-sonnet-20240620",
+        "anthropic",
+    ),
+    ("gpt-3.5-turbo-0125", "openai"),
+    (
+        "gpt-4o",
+        "openai",
+    ),
+    ("gpt-4o-mini", "openai"),
+]
+
+client = Client()  # Launch langsmith client for cloning datasets
+
+
+def get_few_shot_messages(task_name):
+    if task_name == "Multiverse Math":
+        uncleaned_examples = [
+            e
+            for e in client.list_examples(
+                dataset_name="multiverse-math-examples-for-few-shot"
+            )
+        ]
+        few_shot_messages = []
+        few_shot_three_messages = []
+        examples = []
+        for i in range(len(uncleaned_examples)):
+            converted_messages = convert_to_messages(
+                uncleaned_examples[i].outputs["output"]
+            )
+            examples.append(
+                # The message at index 1 is the human message asking the actual math question (0th message is system prompt)
+                {
+                    "question": converted_messages[1].content,
+                    "messages": [
+                        m
+                        for m in converted_messages
+                        if isinstance(m, SystemMessage) == False
+                    ],
+                }
+            )
+            few_shot_messages += converted_messages
+            if i < 3:
+                few_shot_three_messages += converted_messages
+
+        return (
+            examples,
+            [m for m in few_shot_messages if not isinstance(m, SystemMessage)],
+            [m for m in few_shot_three_messages if not isinstance(m, SystemMessage)],
+        )
+    else:
+        raise ValueError("Few shot messages not supported for this dataset")
+
+
+def turn_messages_to_str(few_shot_messages):
+    few_shot_str = ""
+    for m in few_shot_messages:
+        if isinstance(m.content, list):
+            few_shot_str += "<|im_start|>assistant"
+            for tool_use in m.content:
+                if "name" in tool_use:
+                    few_shot_str += f"Use tool {tool_use['name']}, input: {', '.join(f'{k}:{v}' for k,v in tool_use['input'].items())}"
+                else:
+                    few_shot_str += tool_use["text"]
+                few_shot_str += "\n"
+            few_shot_str += "\n<|im_end|>"
+        else:
+            if isinstance(m, HumanMessage):
+                few_shot_str += f"<|im_start|>user\n{m.content}\n<|im_end|>"
+            elif isinstance(m, ToolMessage):
+                few_shot_str += f"<|im_start|>tool\n{m.content}\n<|im_end|>"
+            else:
+                few_shot_str += f"<|im_start|>assistant\n{m.content}\n<|im_end|>"
+
+        few_shot_str += "\n"
+    return few_shot_str
+
+
+def get_few_shot_str_from_messages(few_shot_messages, few_shot_three_messages):
+    few_shot_str = turn_messages_to_str(few_shot_messages)
+    few_shot_three_str = turn_messages_to_str(few_shot_three_messages)
+    return few_shot_str, few_shot_three_str
+
+
+def get_prompts(task_name, **kwargs):
+    if task_name == "Multiverse Math":
+        return [
+            (
+                client.pull_prompt("langchain-ai/multiverse-math-no-few-shot"),
+                "no-few-shot",
+            ),
+            (
+                client.pull_prompt("langchain-ai/multiverse-math-few-shot-messages"),
+                "few-shot-messages",
+            ),
+            (
+                client.pull_prompt("langchain-ai/multiverse-math-few-shot-str"),
+                "few-shot-string",
+            ),
+            (
+                client.pull_prompt("langchain-ai/multiverse-math-few-shot-3-messages"),
+                "few-shot-three-messages",
+            ),
+            (
+                client.pull_prompt("langchain-ai/multiverse-math-few-shot-3-str"),
+                "few-shot-three-strings",
+            ),
+        ]
+
+
+def predict_from_callable(callable, instructions):
+    def predict(run):
+        return callable.invoke(
+            {"question": run["question"], "instructions": instructions}
+        )
+
+    return predict
+
+
+# Short random tag attached (as metadata "id") to every experiment launched by
+# this run of the script, plus the run date in ISO format.
+experiment_uuid = uuid.uuid4().hex[:4]
+today = datetime.date.today().isoformat()
+
+# The benchmark task; the evaluation dataset is looked up by the task's name.
+task = MULTIVERSE_MATH
+dataset_name = task.name
+examples, few_shot_messages, few_shot_three_messages = get_few_shot_messages(task.name)
+few_shot_str, few_shot_three_str = get_few_shot_str_from_messages(
+    few_shot_messages, few_shot_three_messages
+)
+
+# One (prompt, label) pair per few-shot strategy.
+prompts = get_prompts(
+    task.name,
+    examples=examples,
+    few_shot_three_messages=few_shot_three_messages,
+    few_shot_three_str=few_shot_three_str,
+)
+
+# Run every model against every prompt variant: one evaluate() call, and
+# therefore one experiment, per (model, prompt) combination.
+for model_name, model_provider in tests:
+    model = init_chat_model(model_name, model_provider=model_provider, temperature=0)
+
+    print(f"Benchmarking {task.name} with model: {model_name}")
+    eval_config = task.get_eval_config()
+
+    for prompt, prompt_name in prompts:
+        # A new environment (and tool set) is created for each prompt variant.
+        tools = task.create_environment().tools
+        agent = create_tool_calling_agent(model, tools, prompt)
+        agent_executor = AgentExecutor(
+            agent=agent, tools=tools, return_intermediate_steps=True
+        )
+
+        evaluate(
+            predict_from_callable(agent_executor, task.instructions),
+            data=dataset_name,
+            evaluators=eval_config.custom_evaluators,
+            max_concurrency=5,
+            metadata={
+                "model": model_name,
+                "id": experiment_uuid,
+                "task": task.name,
+                "date": today,
+                "langchain_benchmarks_version": __version__,
+            },
+            experiment_prefix=f"{model_name}-{task.name}-{prompt_name}",
+        )
@@ -0,0 +1,331 @@
+import uuid
+from collections import Counter
+from datetime import datetime
+from typing import Optional
+
+from langchain.chat_models import init_chat_model
+from langchain_community.vectorstores import FAISS
+from langchain_core.example_selectors import SemanticSimilarityExampleSelector
+from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import (
+    ChatPromptTemplate,
+    FewShotChatMessagePromptTemplate,
+    MessagesPlaceholder,
+)
+from langchain_openai import OpenAIEmbeddings
+from langsmith.client import Client
+from langsmith.evaluation import evaluate
+from langsmith.evaluation.evaluator import EvaluationResult, EvaluationResults
+from langsmith.schemas import Example, Run
+
+from langchain_benchmarks.tool_usage.tasks.query_analysis import (
+    QUERY_ANALYSIS_TASK,
+    BlogQuery,
+    DocQuery,
+    TweetQuery,
+)
+
+
+def calculate_recall(A, B):
+    # Count the occurrences of each element in A and B
+    count_A = Counter(A)
+    count_B = Counter(B)
+
+    # Calculate the number of true positives
+    true_positives = sum(min(count_A[elem], count_B.get(elem, 0)) for elem in count_A)
+
+    # Calculate recall
+    recall = true_positives / sum(count_A.values()) if count_A else 0
+
+    return recall
+
+
+client = Client()
+
+
+def is_iso_format(date_str):
+    if not isinstance(date_str, str):
+        return False
+    try:
+        # Try to parse the string with datetime.fromisoformat
+        datetime.fromisoformat(date_str)
+        return True
+    except ValueError:
+        return False
+
+
+# Chat model that grades whether a free-text argument produced by the model
+# under test matches the reference argument.
+llm_judge = init_chat_model("gpt-4o")
+
+# The judge is told to answer with exactly YES or NO; compare_outputs checks
+# the reply against the literal string "YES".
+judge_prompt = ChatPromptTemplate.from_messages(
+    [
+        (
+            "system",
+            "You are an llm tasked with determining if the subject extracted by another LLM is an accurate "
+            "representation of the correct answer. You are to check for general semantic similarity since the words might not "
+            "match up perfectly but the meaning might still be the same. Return YES if the answers match, and NO otherwise. "
+            "Never return anything other than YES or NO.",
+        ),
+        (
+            "human",
+            "Is this query: {run_query} somewhat similar to this reference query: {reference_query}",
+        ),
+    ]
+)
+
+# prompt -> judge model -> plain string reply.
+judge_chain = judge_prompt | llm_judge | StrOutputParser()
+
+# Tool schemas bound to each model under test (see predict_for_model).
+tools = [DocQuery, TweetQuery, BlogQuery]
+
+
+def compare_outputs(run_outputs: dict, example_outputs: dict) -> EvaluationResults:
+    if len(run_outputs["response"].tool_calls) == 0:
+        correct_tool_score, deterministic_score, nondeterministic_score = 0, 0, 0
+    else:
+        # Chose the correct tool
+        reference_tools = [tool["name"] for tool in example_outputs["reference"]]
+        outputted_tools = [tool["name"] for tool in run_outputs["response"].tool_calls]
+        correct_tool_score = calculate_recall(reference_tools, outputted_tools)
+
+        # Has the correct deterministic args
+        deterministic_score = 0
+        # Has the correct in-deterministic args
+        nondeterministic_score = 0
+
+        if correct_tool_score == 1:
+            deterministic_score, nondeterministic_score = 1, 1
+            for tool in example_outputs["reference"]:
+                corresponding_response_tool = [
+                    t
+                    for t in run_outputs["response"].tool_calls
+                    if t["name"] == tool["name"]
+                ][0]["args"]
+                for arg in tool["args"]:
+                    if arg in ["query", "subject"]:
+                        ans = judge_chain.invoke(
+                            {
+                                "run_query": corresponding_response_tool[arg],
+                                "reference_query": tool["args"][arg],
+                            }
+                        )
+                        nondeterministic_score = 1 if ans == "YES" else 0
+                    else:
+                        if (
+                            tool["args"][arg] and arg not in corresponding_response_tool
+                        ) or (
+                            tool["args"][arg]
+                            and not (
+                                tool["args"][arg] == corresponding_response_tool[arg]
+                            )
+                            and not (
+                                is_iso_format(tool["args"][arg])
+                                and is_iso_format(corresponding_response_tool[arg])
+                                and datetime.fromisoformat(
+                                    (corresponding_response_tool[arg])
+                                ).replace(tzinfo=None)
+                                == datetime.fromisoformat(tool["args"][arg])
+                            )
+                        ):
+                            deterministic_score = 0
+    # Overall correctness
+    overall_score = int(
+        correct_tool_score == 1
+        and bool(deterministic_score)
+        and bool(nondeterministic_score)
+    )
+    results = [
+        EvaluationResult(
+            key="Correct tool",
+            score=correct_tool_score,
+        ),
+        EvaluationResult(
+            key="Correct deterministic args",
+            score=deterministic_score,
+        ),
+        EvaluationResult(
+            key="Correct nondeterministic args",
+            score=nondeterministic_score,
+        ),
+        EvaluationResult(
+            key="Overall correctness",
+            score=overall_score,
+        ),
+    ]
+
+    return {"results": results}
+
+
+def evaluate_run(run: Run, example: Optional[Example] = None) -> EvaluationResults:
+    return compare_outputs(run.outputs, example.outputs)
+
+
+# Raw few-shot examples, fetched once at import time.
+uncleaned_examples = [
+    e for e in client.list_examples(dataset_name="Extraction Task Few Shot")
+]
+# Example positions used by the "few-shot-static-messages" strategy.
+static_indices = [0, 2, 5]
+# Three views of the same examples, filled in by the loop below:
+#   few_shot_messages            - every example's messages, flattened in order
+#   few_shot_str                 - the same conversations rendered as one string
+#   few_shot_messages_by_index   - example position -> that example's messages
+#   examples_for_semantic_search - {"question", "messages"} dicts for the selector
+few_shot_messages, few_shot_str = [], ""
+few_shot_messages_by_index = {}
+examples_for_semantic_search = []
+
+for j, example in enumerate(uncleaned_examples):
+    # Each example becomes: human question -> AI message carrying the
+    # reference tool calls -> one ToolMessage per tool call.
+    few_shot_messages_for_example = []
+    few_shot_messages_for_example.append(
+        HumanMessage(
+            name="example_human", content=example.inputs["question"][0]["content"]
+        )
+    )
+    few_shot_messages_for_example.append(
+        AIMessage(
+            name="example_assistant",
+            content="",
+            tool_calls=[
+                {
+                    "name": tc["name"],
+                    "args": tc["args"],
+                    "type": "tool_call",
+                    # 10*j+i is unique across examples only while each example
+                    # has fewer than 10 reference tool calls.
+                    "id": f"{10*j+i}",
+                }
+                for i, tc in enumerate(example.outputs["reference"])
+            ],
+        )
+    )
+    few_shot_str += (
+        f"<|im_start|>user\n{example.inputs['question'][0]['content']}\n<|im_end|>"
+    )
+    few_shot_str += "\n<|im_start|>assistant\n"
+    for i, tool_call in enumerate(example.outputs["reference"]):
+        # The tool_call_id matches the id given to the same call in the
+        # AIMessage above.
+        few_shot_messages_for_example.append(
+            ToolMessage(
+                "You have correctly called this tool",
+                name=tool_call["name"],
+                tool_call_id=f"{10*j+i}",
+            )
+        )
+        few_shot_str += f"Tool Call: Name: {tool_call['name']} Args: {{{', '.join(f'{k}: {v}' for k,v in tool_call['args'].items())}}}"
+        few_shot_str += "\n"
+    few_shot_str += "<|im_end|>"
+
+    few_shot_messages += few_shot_messages_for_example
+    few_shot_messages_by_index[j] = few_shot_messages_for_example
+    examples_for_semantic_search.append(
+        {
+            "question": example.inputs["question"][0]["content"],
+            "messages": few_shot_messages_for_example,
+        }
+    )
+
+# Prompt shared by all strategies: system instructions, then the (possibly
+# empty) few-shot messages, then the input being evaluated.
+prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", "{instructions}"),
+        MessagesPlaceholder("few_shot_message_list"),
+        ("human", "{input}"),
+    ]
+)
+
+
+def predict_for_model(model, instructions, few_shot_method, model_name):
+    few_shot_message_list = []
+    chain = prompt | model.bind_tools(tools).with_retry(stop_after_attempt=5)
+    if few_shot_method == "few-shot-string":
+        instructions += f"\n Here are some examples: \n {few_shot_str}"
+    elif few_shot_method == "few-shot-messages":
+        few_shot_message_list = few_shot_messages
+    elif few_shot_method == "few-shot-static-messages":
+        few_shot_message_list = [
+            message
+            for index in static_indices
+            for message in few_shot_messages_by_index[index]
+        ]
+    elif few_shot_method == "few-shot-dynamic-messages":
+
+        def predict(example: dict):
+            example_selector = SemanticSimilarityExampleSelector.from_examples(
+                examples_for_semantic_search,
+                OpenAIEmbeddings(model="text-embedding-3-large"),
+                FAISS,
+                k=3,
+                input_keys=["question"],
+                example_keys=["messages"],
+            )
+
+            few_shot_prompt = FewShotChatMessagePromptTemplate(
+                input_variables=[],
+                example_selector=example_selector,
+                example_prompt=MessagesPlaceholder("messages"),
+            )
+            return {
+                "response": chain.invoke(
+                    {
+                        "input": example["question"],
+                        "instructions": instructions,
+                        "few_shot_message_list": few_shot_prompt.invoke(
+                            {"question": example["question"][0]["content"]}
+                        ).messages,
+                    }
+                )
+            }
+
+        return predict
+
+    def predict(example: dict):
+        return {
+            "response": chain.invoke(
+                {
+                    "input": example["question"],
+                    "instructions": instructions,
+                    "few_shot_message_list": few_shot_message_list,
+                }
+            )
+        }
+
+    return predict
+
+
+models = [
+    (
+        "claude-3-haiku-20240307",
+        "anthropic",
+    ),
+    (
+        "claude-3-sonnet-20240229",
+        "anthropic",
+    ),
+    (
+        "claude-3-opus-20240229",
+        "anthropic",
+    ),
+    (
+        "claude-3-5-sonnet-20240620",
+        "anthropic",
+    ),
+    ("gpt-3.5-turbo-0125", "openai"),
+    ("gpt-4o", "openai"),
+    ("gpt-4o-mini", "openai"),
+]
+
+few_shot_methods = [
+    "no-few-shot",
+    "few-shot-string",
+    "few-shot-messages",
+    "few-shot-static-messages",
+    "few-shot-dynamic-messages",
+]
+
+from tqdm import tqdm
+
+# Short random tag attached (as metadata "id") to every experiment launched by
+# this run of the script.
+experiment_uuid = uuid.uuid4().hex[:4]
+# Three repetitions of the full models x few-shot-methods grid; each
+# combination is one evaluate() call.
+for i in tqdm(range(3)):
+    for model_name, model_provider in models:
+        model = init_chat_model(
+            model_name, model_provider=model_provider, temperature=0
+        )
+        for few_shot_method in few_shot_methods:
+            evaluate(
+                predict_for_model(
+                    model, QUERY_ANALYSIS_TASK.instructions, few_shot_method, model_name
+                ),
+                data=QUERY_ANALYSIS_TASK.name,
+                evaluators=[evaluate_run],
+                # NOTE(review): the repetition label is i+2, so runs are
+                # numbered TEST-2..TEST-4 - confirm the offset is intended.
+                experiment_prefix=f"{model_name}-TEST-{i+2}-{few_shot_method}",
+                metadata={"id": experiment_uuid},
+            )
@@ -0,0 +1,61 @@
+# Security Policy
+
+## Reporting OSS Vulnerabilities
+
+LangChain is partnered with [huntr by Protect AI](https://huntr.com/) to provide 
+a bounty program for our open source projects. 
+
+Please report security vulnerabilities associated with the LangChain 
+open source projects by visiting the following link:
+
+[https://huntr.com/bounties/disclose/](https://huntr.com/bounties/disclose/?target=https%3A%2F%2Fgithub.com%2Flangchain-ai%2Flangchain&validSearch=true)
+
+Before reporting a vulnerability, please review:
+
+1) In-Scope Targets and Out-of-Scope Targets below.
+2) The [langchain-ai/langchain](https://python.langchain.com/docs/contributing/repo_structure) monorepo structure.
+3) LangChain [security guidelines](https://python.langchain.com/docs/security) to
+   understand what we consider to be a security vulnerability vs. developer
+   responsibility.
+
+### In-Scope Targets
+
+The following packages and repositories are eligible for bug bounties:
+
+- langchain-core
+- langchain (see exceptions)
+- langchain-community (see exceptions)
+- langgraph
+- langserve
+
+### Out of Scope Targets
+
+All out of scope targets defined by huntr as well as:
+
+- **langchain-experimental**: This repository is for experimental code and is not
+  eligible for bug bounties. Bug reports to it will be marked as interesting or waste of
+  time and published with no bounty attached.
+- **tools**: Tools in either langchain or langchain-community are not eligible for bug
+  bounties. This includes the following directories:
+  - langchain/tools
+  - langchain-community/tools
+  - Please review our [security guidelines](https://python.langchain.com/docs/security)
+    for more details, but generally tools interact with the real world. Developers are
+    expected to understand the security implications of their code and are responsible
+    for the security of their tools.
+- Code documented with security notices. This will be decided on a case-by-
+  case basis, but likely will not be eligible for a bounty as the code is already
+  documented with guidelines for developers that should be followed for making their
+  application secure.
+- Any LangSmith-related repositories or APIs (see below).
+
+## Reporting LangSmith Vulnerabilities
+
+Please report security vulnerabilities associated with LangSmith by email to `security@langchain.dev`.
+
+- LangSmith site: https://smith.langchain.com
+- SDK client: https://github.com/langchain-ai/langsmith-sdk
+
+### Other Security Concerns
+
+For any other security concerns, please contact us at `security@langchain.dev`.
@@ -1,54 +0,0 @@
-import pytest
-from langchain_core.agents import AgentActionMessageLog, AgentFinish
-from langchain_core.exceptions import OutputParserException
-from langchain_core.messages import AIMessage
-
-from langchain_benchmarks.tool_usage.agents.experimental.parser import (
-    GenericAgentParser,
-)
-
-
-def test_parser() -> None:
-    """Test parser."""
-    parser = GenericAgentParser(require_closing_tag=False, wrapping_xml_tag="tool")
-
-    # If <tool> tag not found then it's an agent finish
-    assert isinstance(parser.invoke("goodbye"), AgentFinish)
-
-    with pytest.raises(OutputParserException):
-        # Invocation content is missing tool name and arguments
-        parser.invoke("<tool>'hello'</tool>")
-
-    with pytest.raises(OutputParserException):
-        parser.invoke("<tool>hello")
-
-    # Full invocation
-    text = (
-        '<tool>{\n    "tool_name": "type_letter",\n    '
-        '"arguments": {\n        '
-        '"letter": "h"\n    }\n}</tool>\n'
-    )
-
-    assert parser.invoke(text) == AgentActionMessageLog(
-        tool="type_letter",
-        tool_input={"letter": "h"},
-        log="\nInvoking type_letter: {'letter': 'h'}\n\t",
-        message_log=[AIMessage(content=text)],
-    )
-
-    # Test more cases
-    parsed = parser.invoke('<tool>{"tool_name": "hello"}</tool>')
-    assert parsed.tool == "hello"
-    # Assumes that it's a structured tool by default!
-    assert parsed.tool_input == {}
-
-    with pytest.raises(OutputParserException):
-        # Arguments need to be a dict
-        parser.invoke('<tool>{"tool_name": "hello", "arguments": [1, 2]}</tool>')
-
-    parsed = parser.invoke(
-        '<tool>{"tool_name": "hello", "arguments": {"a": "b"}}</tool>'
-    )
-    assert parsed.tool == "hello"
-    # Assumes that it's a structured tool by default!
-    assert parsed.tool_input == {"a": "b"}
@@ -1,25 +0,0 @@
-"""Test typescript encoding."""
-from langchain_benchmarks.tool_usage.agents.experimental.encoder import (
-    FunctionDefinition,
-    TypeScriptEncoder,
-)
-
-
-def test_function_definition() -> None:
-    """Test encoding a function definition."""
-    function_definition = FunctionDefinition(
-        name="test_function",
-        description="A test function",
-        parameters=[
-            {"name": "test_parameter", "type": "str", "description": "A test parameter"}
-        ],
-        return_value={"type": "str", "description": "A test return value"},
-    )
-    encoder = TypeScriptEncoder()
-    xml = encoder.visit_function_definition(function_definition)
-    assert xml == (
-        "// A test function\n"
-        "// @param test_parameter A test parameter\n"
-        "// @returns A test return value\n"
-        "function test_function(test_parameter: str): str;"
-    )
@@ -1,90 +0,0 @@
-"""Test XML encoding and decoding of function definitions, invocation, and results."""
-from langchain_benchmarks.tool_usage.agents.experimental.encoder import (
-    FunctionDefinition,
-    FunctionInvocation,
-    FunctionResult,
-    XMLEncoder,
-)
-
-
-def test_function_definition_encoding() -> None:
-    """Test encoding a function definition."""
-    function_definition = FunctionDefinition(
-        name="test_function",
-        description="A test function",
-        parameters=[
-            {"name": "test_parameter", "type": "str", "description": "A test parameter"}
-        ],
-        return_value={"type": "str", "description": "A test return value"},
-    )
-    encoder = XMLEncoder()
-    xml = encoder.visit_function_definition(function_definition)
-    assert xml == (
-        "<function>\n"
-        "<function_name>test_function</function_name>\n"
-        "<description>\n"
-        "A test function\n"
-        "</description>\n"
-        "<parameters>\n"
-        "<parameter>\n"
-        "<name>test_parameter</name>\n"
-        "<type>str</type>\n"
-        "<description>A test parameter</description>\n"
-        "</parameter>\n"
-        "</parameters>\n"
-        "<return_value>\n"
-        "<type>str</type>\n"
-        "<description>A test return value</description>\n"
-        "</return_value>\n"
-        "</function>"
-    )
-
-
-def test_function_result_encoding() -> None:
-    """Test encoding a function result."""
-    encoder = XMLEncoder()
-    function_result = FunctionResult(
-        name="test_function",
-        result="test_result",
-        error=None,
-    )
-    xml = encoder.visit_function_result(function_result)
-    assert xml == (
-        "<function_result>\n"
-        "<function_name>test_function</function_name>\n"
-        "<result>test_result</result>\n"
-        "</function_result>"
-    )
-
-    function_result = FunctionResult(
-        name="test_function",
-        error="error",
-    )
-    xml = encoder.visit_function_result(function_result)
-    assert xml == (
-        "<function_result>\n"
-        "<function_name>test_function</function_name>\n"
-        "<error>error</error>\n"
-        "</function_result>"
-    )
-
-
-def test_function_invocation() -> None:
-    """Test function invocation."""
-    function_invocation = FunctionInvocation(
-        name="test_function",
-        arguments=[{"name": "test_argument", "value": "test_value"}],
-    )
-    encoder = XMLEncoder()
-    xml = encoder.visit_function_invocation(function_invocation)
-    assert xml == (
-        "<function_invocation>\n"
-        "<function_name>test_function</function_name>\n"
-        "<arguments>\n"
-        "<argument>\n"
-        "<name>test_argument</name>\n"
-        "<value>test_value</value>\n"
-        "</argument>\n"
-        "</arguments>\n"
-        "</function_invocation>"
-    )
@@ -1,59 +0,0 @@
-import pytest
-from langchain.tools import tool
-
-from langchain_benchmarks.tool_usage.agents.experimental.tool_utils import (
-    convert_tool_to_function_definition,
-)
-
-
-@tool
-def get_hello() -> str:
-    """Get hello."""
-    return "hello"
-
-
-@tool
-def repeat(x: str) -> str:
-    """Repeat x.
-
-    Args:
-        x: The string to repeat.
-
-    Returns:
-        The repeated string.
-    """
-    return x
-
-
-def test_parameterless_function() -> None:
-    """Test foo."""
-    function_definition = convert_tool_to_function_definition(get_hello)
-    assert function_definition == {
-        "name": "get_hello",
-        "description": "Get hello.",
-        "parameters": [],
-        "return_value": {
-            "type": "Any",
-        },
-    }
-
-
-@pytest.mark.skip("Need to fix handling of leading whitespace")
-def test_function_with_parameters() -> None:
-    import textwrap
-
-    doc = textwrap.dedent(repeat.func.__doc__)
-    assert convert_tool_to_function_definition(repeat) == {
-        "name": "repeat",
-        "description": doc,
-        "parameters": [
-            {
-                "name": "x",
-                "type": "str",
-                "description": "",  # Need to fix this
-            }
-        ],
-        "return_value": {
-            "type": "Any",
-        },
-    }
@@ -6,5 +6,11 @@ def test_public_api() -> None:
    # This test will also fail if __all__ is not sorted.
    # Please keep it sorted!
    assert __all__ == sorted(
-        ["apply_agent_executor_adapter", "get_eval_config"], key=str.lower
+        [
+            "apply_agent_executor_adapter",
+            "get_eval_config",
+            "CustomRunnableAgentFactory",
+            "StandardAgentFactory",
+        ],
+        key=str.lower,
    )
Author	SHA1	Message	Date
ccurme	34cd281494	benchmarks[major]: bump core to 0.3 (#211 ) - Drop support for python 3.8 - Bump langchain-core to 0.3 - Update pydantic objects to v2	2024-10-21 16:47:14 -04:00
Isaac Francisco	99cf03a50a	add faiss-cpu dependency (#209 )	2024-08-07 07:53:45 -07:00
Isaac Francisco	b36a339a65	Isaac/realfixes (#208 )	2024-08-06 15:28:43 -07:00
Isaac Francisco	442cb47fc9	Isaac/realfixes (#207 )	2024-08-06 15:24:23 -07:00
Isaac Francisco	b7795c7df1	change wd (#206 )	2024-08-06 15:15:08 -07:00
Isaac Francisco	ac161de968	thanks erick (#205 )	2024-08-06 14:50:39 -07:00
Isaac Francisco	d91944bb07	test (#204 )	2024-08-06 14:45:48 -07:00
Isaac Francisco	8798bd3105	test (#203 )	2024-08-06 14:40:01 -07:00
Isaac Francisco	621eea5d93	Isaac/tryingpoetryagain (#202 )	2024-08-06 14:36:43 -07:00
Isaac Francisco	b6590a8745	Isaac/changepoetry (#201 )	2024-08-06 14:30:42 -07:00
Isaac Francisco	458ffa70ea	test (#200 )	2024-08-06 14:26:56 -07:00
Isaac Francisco	ebe5c117c2	test (#198 )	2024-08-06 14:14:39 -07:00
Ikko Eltociear Ashimine	adff80af11	docs: update README.md (#195 ) Mutiverse -> Multiverse	2024-07-24 11:13:42 -07:00
Bagatur	301837e303	Release 0.0.14 (#194 )	2024-07-24 08:00:17 -07:00
Bagatur	4f1d922a6e	minor: bump to langchain v2 (#191 )	2024-07-24 07:59:19 -07:00
Bagatur	e4e26a3b8e	infra: release permissions (#193 )	2024-07-24 07:56:47 -07:00
Bagatur	7f82761813	Release 0.0.13 (#192 )	2024-07-24 07:44:20 -07:00
Isaac Francisco	7e16b6daa6	tool benchmarking (#190 ) Co-authored-by: Bagatur <baskaryan@gmail.com>	2024-07-24 07:00:33 -07:00
Eugene Yurtsev	22d279a25c	Update README.md (#187 )	2024-04-19 10:19:19 -04:00
Eugene Yurtsev	357ada3867	Update README.md (#186 )	2024-04-18 19:58:54 -04:00
Eugene Yurtsev	ab2d93ac6d	Update README.md (#185 )	2024-04-18 13:48:51 -04:00
Eugene Yurtsev	53f727af64	Update README.md (#184 )	2024-04-18 13:47:49 -04:00
Eugene Yurtsev	820af98418	Release 0.0.12 (#183 )	2024-04-18 13:38:38 -04:00
Eugene Yurtsev	857f41882f	Update README.md (#182 )	2024-04-18 11:33:45 -04:00
Eugene Yurtsev	381ada5cbe	Update benchmarks all notebook to use {question} instead of {input} (#179 ) Update benchmarks all prompt	2024-04-18 11:28:21 -04:00
Eugene Yurtsev	32a532f269	Update README.md (#181 )	2024-04-18 11:28:09 -04:00
Eugene Yurtsev	d0acf0ee26	Add security policy (#180 ) Add security policy	2024-04-18 11:19:13 -04:00
Eugene Yurtsev	bec40d90ef	Remove old code (#176 ) Remove old code	2024-04-18 11:16:42 -04:00
Eugene Yurtsev	c80e959b05	Simplify all tool usage notebooks (#178 ) Simplify tool usage notebooks	2024-04-18 11:09:34 -04:00
Eugene Yurtsev	2007f68302	Update intro, remove adapter (#177 ) Remove confusing adapter for agents. Agent template should just take {question} as the input. Update intro and simplify it!	2024-04-18 10:47:46 -04:00
Eugene Yurtsev	aad9045bcb	remove tiny multiverse dataset from registry (#175 ) Keep it for backwards compatibility but do not expose in task registry. This dataset is probably more confusing to folks than helpful especially since it it completely overlaps with the existing multiverse math dataset. We should add another dataset that's later.	2024-04-18 09:31:03 -04:00
Eugene Yurtsev	3b86e9f0b5	Update benchmark all for agents (#174 )	2024-04-18 09:23:19 -04:00