Compare commits

...

19 Commits

Author SHA1 Message Date
isaac hershenson 12e9107edc draft 2024-07-22 12:25:13 -07:00
isaac hershenson a7879c10bb draft state 2024-07-22 07:51:59 -07:00
isaac hershenson e2882258b0 editing tool benchmarks 2024-07-15 21:13:52 -07:00
isaac hershenson be682101d8 hub prompts 2024-07-10 22:43:40 -07:00
isaac hershenson 611408d1d3 fmt 2024-07-10 14:14:28 -07:00
isaac hershenson b10c099c7a fmt 2024-07-10 14:12:49 -07:00
isaac hershenson 94506e4726 fmt 2024-07-10 14:02:15 -07:00
isaac hershenson ab16d66903 fmt 2024-07-10 14:00:57 -07:00
isaac hershenson 47382f82d9 Merge branch 'isaac/toolbenchmarks' of https://github.com/langchain-ai/langchain-benchmarks into isaac/toolbenchmarks 2024-07-10 13:58:31 -07:00
isaac hershenson 17c7794130 semantic search working (i think) 2024-07-10 13:57:10 -07:00
Bagatur 9378a8d9ed fmt 2024-07-09 15:25:59 -07:00
Bagatur 84cca0a2ee fmt 2024-07-09 15:19:47 -07:00
isaac hershenson a601b6de9c fmt 2024-07-09 14:20:50 -07:00
isaac hershenson ea10c07358 second draft 2024-07-09 14:17:57 -07:00
isaac hershenson 3aae5da769 first draft 2024-07-03 12:11:02 -07:00
Eugene Yurtsev 22d279a25c Update README.md (#187) 2024-04-19 10:19:19 -04:00
Eugene Yurtsev 357ada3867 Update README.md (#186) 2024-04-18 19:58:54 -04:00
Eugene Yurtsev ab2d93ac6d Update README.md (#185) 2024-04-18 13:48:51 -04:00
Eugene Yurtsev 53f727af64 Update README.md (#184) 2024-04-18 13:47:49 -04:00
30 changed files with 1121 additions and 1369 deletions
+30
View File
@@ -0,0 +1,30 @@
name: Weekly Tool Benchmarks
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * 0' # Runs at midnight (00:00) every Sunday (UTC time)
jobs:
run_tool_benchmarks:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.12 + Poetry ${{ env.POETRY_VERSION }}
uses: "./.github/actions/poetry_setup"
with:
python-version: '3.12'
poetry-version: ${{ env.POETRY_VERSION }}
working-directory: .
cache-key: benchmarks-all
- name: Install dependencies
shell: bash
run: |
echo "Running tests, installing dependencies with poetry..."
poetry install --with test,lint,typing,docs
- name: Execute Tool Benchmarks
run: python scripts/tool_benchmarks.py
+2
View File
@@ -10,6 +10,8 @@ lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=. --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint lint_diff:
# [ "$(PYTHON_FILES)" = "" ] || poetry run ruff check $(PYTHON_FILES)
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff check --select I $(PYTHON_FILES)
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
# [ "$(PYTHON_FILES)" = "" ] || poetry run mypy $(PYTHON_FILES)
+18 -4
View File
@@ -26,10 +26,24 @@ We have several goals in open sourcing this:
Read some of the articles about benchmarking results on our blog.
* Agent Tool Use: https://blog.langchain.dev/benchmarking-agent-tool-use/
* Query Analysis in High Cardinality Situations: https://blog.langchain.dev/high-cardinality/
* Rag on Tables: https://blog.langchain.dev/benchmarking-rag-on-tables/
* Q&A over CSV data: https://blog.langchain.dev/benchmarking-question-answering-over-csv-data/
* [Agent Tool Use](https://blog.langchain.dev/benchmarking-agent-tool-use/)
* [Query Analysis in High Cardinality Situations](https://blog.langchain.dev/high-cardinality/)
* [RAG on Tables](https://blog.langchain.dev/benchmarking-rag-on-tables/)
* [Q&A over CSV data](https://blog.langchain.dev/benchmarking-question-answering-over-csv-data/)
### Tool Usage (2024-04-18)
See [tool usage docs](https://langchain-ai.github.io/langchain-benchmarks/notebooks/tool_usage/benchmark_all_tasks.html) to recreate!
![download](https://github.com/langchain-ai/langchain-benchmarks/assets/3205522/0da33de8-e03f-49cf-bd48-e9ff945828a9)
Explore Agent Traces on LangSmith:
* [Relational Data](https://smith.langchain.com/public/22721064-dcf6-4e42-be65-e7c46e6835e7/d)
* [Tool Usage (1-tool)](https://smith.langchain.com/public/ac23cb40-e392-471f-b129-a893a77b6f62/d)
* [Tool Usage (26-tools)](https://smith.langchain.com/public/366bddca-62b3-4b6e-849b-a478abab73db/d)
* [Mutiverse Math](https://smith.langchain.com/public/983faff2-54b9-4875-9bf2-c16913e7d489/d)
## Installation
+1 -1
View File
@@ -3,12 +3,12 @@ from langchain.agents import AgentExecutor, OpenAIFunctionsAgent
from langchain.agents.agent_toolkits.conversational_retrieval.tool import (
create_retriever_tool,
)
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain.tools import PythonAstREPLTool
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langsmith import Client
from pydantic import BaseModel, Field
+1 -1
View File
@@ -1,8 +1,8 @@
import pandas as pd
from langchain.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain_openai import ChatOpenAI
from langsmith import Client
if __name__ == "__main__":
+1 -1
View File
@@ -1,8 +1,8 @@
import pandas as pd
from langchain.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain_openai import ChatOpenAI
from langsmith import Client
if __name__ == "__main__":
+1 -1
View File
@@ -1,8 +1,8 @@
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain_openai import ChatOpenAI
from langsmith import Client
from pandasai import PandasAI
+1 -1
View File
@@ -2,7 +2,7 @@ import pandas as pd
import streamlit as st
from langchain.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
df = pd.read_csv("titanic.csv")
+1 -1
View File
@@ -1,6 +1,6 @@
import streamlit as st
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langsmith import Client
st.set_page_config(page_title="🦜🔗 Text-to-graph extraction")
@@ -3,13 +3,13 @@ from typing import List, Tuple
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad import format_to_openai_functions
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.pydantic_v1 import BaseModel, Field
from langchain.schema.messages import AIMessage, HumanMessage
from langchain.tools import tool
from langchain.tools.render import format_tool_to_openai_function
from langchain_docs_retriever.retriever import get_retriever
from langchain_openai import ChatOpenAI
# This is used to tell the model how to best use the retriever.
@@ -7,9 +7,9 @@ from typing import Callable, Optional
from anthropic_iterative_search.chain import chain as anthropic_agent_chain
from chat_langchain.chain import create_chain
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import Runnable
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain_openai import ChatOpenAI
from langsmith import Client
from oai_assistant.chain import agent_executor as openai_assistant_chain
from openai_functions_agent import agent_executor as openai_functions_agent_chain
@@ -259,8 +259,8 @@
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI(model=\"gpt-4-1106-preview\", temperature=0).bind_functions(\n",
" functions=[task.schema],\n",
+1 -1
View File
@@ -232,8 +232,8 @@
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-16k\", temperature=0).bind_functions(\n",
" functions=[task.schema],\n",
+1 -1
View File
@@ -97,7 +97,7 @@
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"from langchain_benchmarks.extraction import get_eval_config\n",
"\n",
@@ -75,6 +75,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "7fb27b941602401d91542211134fc71a",
"metadata": {},
"outputs": [],
"source": [
@@ -728,12 +729,12 @@
"from langchain.agents import AgentExecutor\n",
"from langchain.agents.format_scratchpad import format_to_openai_functions\n",
"from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
"from langchain.pydantic_v1 import BaseModel, Field\n",
"from langchain.schema.messages import AIMessage, HumanMessage\n",
"from langchain.tools import tool\n",
"from langchain.tools.render import format_tool_to_openai_function\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"# This is used to tell the model how to best use the retriever.\n",
"\n",
@@ -508,8 +508,8 @@
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.schema.messages import HumanMessage\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"\n",
"def image_summarize(img_base64, prompt):\n",
@@ -328,10 +328,10 @@
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts import ChatPromptTemplate\n",
"from langchain.schema.output_parser import StrOutputParser\n",
"from langchain.schema.runnable import RunnablePassthrough\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"\n",
"def rag_chain(retriever):\n",
@@ -451,11 +451,11 @@
"source": [
"from operator import itemgetter\n",
"\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts import ChatPromptTemplate\n",
"from langchain.schema.document import Document\n",
"from langchain.schema.output_parser import StrOutputParser\n",
"from langchain.schema.runnable.passthrough import RunnableAssign\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"# Prompt\n",
"prompt = ChatPromptTemplate.from_messages(\n",
@@ -126,7 +126,6 @@
"source": [
"import uuid\n",
"\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.document_loaders import PyPDFLoader\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.prompts import ChatPromptTemplate\n",
@@ -138,6 +137,7 @@
"from langchain.storage import InMemoryStore\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.vectorstores import Chroma\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"\n",
"def prepare_documents(docs):\n",
@@ -1,8 +1,8 @@
from typing import Optional
from langchain.chat_models import ChatOpenAI
from langchain.chat_models.base import BaseChatModel
from langchain.smith import RunEvalConfig
from langchain_openai import ChatOpenAI
def get_eval_config(eval_llm: Optional[BaseChatModel] = None) -> RunEvalConfig:
@@ -2,10 +2,10 @@
from typing import Any, Dict, List, Optional, Type
from langchain.chains.openai_functions import convert_to_openai_function
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import Runnable
from langchain_openai import ChatOpenAI
from langsmith.client import Client
from pydantic import BaseModel
+1 -1
View File
@@ -1,8 +1,8 @@
from typing import Optional
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator
from langchain.smith import RunEvalConfig
from langchain_openai import ChatOpenAI
try:
from langchain.schema.language_model import BaseLanguageModel
@@ -1,9 +1,9 @@
from typing import Optional
from langchain.base_language import BaseLanguageModel
from langchain.chat_models import ChatOpenAI
from langchain.schema.retriever import BaseRetriever
from langchain.schema.runnable import Runnable
from langchain_openai import ChatOpenAI
from langchain_benchmarks.rag.tasks.langchain_docs.architectures.crqa import (
create_response_chain,
+1 -1
View File
@@ -3,7 +3,6 @@ import os
from functools import partial
from typing import Callable, Iterable, List, Optional
from langchain.chat_models import ChatOpenAI
from langchain.indexes import SQLRecordManager, index
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.prompts import ChatPromptTemplate
@@ -18,6 +17,7 @@ from langchain.schema.storage import BaseStore
from langchain.schema.vectorstore import VectorStore
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
from langchain_openai import ChatOpenAI
from tqdm.auto import tqdm
logger = logging.getLogger(__name__)
+2 -2
View File
@@ -300,8 +300,8 @@ def _get_default_path(provider: str, type_: ModelType) -> str:
"""Get the default path for a model."""
paths = {
("anthropic", "chat"): "langchain_anthropic.ChatAnthropic",
("anyscale", "chat"): "langchain.chat_models.anyscale.ChatAnyscale",
("anyscale", "llm"): "langchain.llms.anyscale.Anyscale",
("anyscale", "chat"): "langchain_community.chat_models.anyscale.ChatAnyscale",
("anyscale", "llm"): "langchain_community.llms.anyscale.Anyscale",
("fireworks", "chat"): "langchain_fireworks.ChatFireworks",
("fireworks", "llm"): "langchain_fireworks.Fireworks",
("openai", "chat"): "langchain_openai.ChatOpenAI",
@@ -10,11 +10,11 @@ from typing import Any, Literal, Optional, Union
from langchain.callbacks.manager import collect_runs
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, StringEvaluator, load_evaluator
from langchain.evaluation.schema import StringEvaluator
from langchain.smith import RunEvalConfig
from langchain_core.language_models import BaseChatModel, BaseLanguageModel
from langchain_openai import ChatOpenAI
from langsmith.evaluation.evaluator import (
EvaluationResult,
EvaluationResults,
@@ -47,7 +47,7 @@ from langchain_benchmarks.schema import ToolUsageEnvironment, ToolUsageTask
def multiply(a: float, b: float) -> float:
"""Multiply two numbers; a * b."""
return 1.1 * a * b
return round(1.1 * a * b, 5)
def divide(a: float, b: float) -> float:
@@ -140,7 +140,7 @@ DATASET_TINY = [
"expected_steps": ["subtract"],
},
{
"question": "What is -5 if evaluated using the negate function?",
"question": "what is the value of the negate function evaluated on the argument -5",
"answer": negate(-5),
"expected_steps": ["negate"],
},
Generated
+809 -1311
View File
File diff suppressed because it is too large Load Diff
+7 -28
View File
@@ -8,42 +8,25 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.8.1"
langchain = "^0.1.15"
langchain = "^0.2.7"
langsmith = ">=0.0.70"
tqdm = "^4"
ipywidgets = "^8"
tabulate = ">=0.8.0"
langchain-openai = "^0.1.14"
langchain-community = "^0.2.6"
[tool.poetry.group.dev]
optional = true
[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
langchain-core = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core"}
langchain = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/langchain"}
langchain-anthropic = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/anthropic"}
langchain-google-vertexai= {git = "https://github.com/langchain-ai/langchain-google.git", subdirectory = "libs/vertexai/"}
langchain-fireworks = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/fireworks"}
langchain-mistralai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/mistralai"}
langchain-cohere = {git = "https://github.com/langchain-ai/langchain-cohere.git", subdirectory="libs/cohere"}
langchain-groq = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/groq"}
langchain-openai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/openai"}
[tool.poetry.group.typing]
optional = true
[tool.poetry.group.typing.dependencies]
mypy = "^1.7.0"
langchain-core = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core"}
langchain = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/langchain"}
langchain-anthropic = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/anthropic"}
langchain-fireworks = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/fireworks"}
langchain-mistralai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/mistralai"}
langchain-cohere = {git = "https://github.com/langchain-ai/langchain-cohere.git", subdirectory="libs/cohere"}
langchain-groq = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/groq"}
langchain-openai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/openai"}
[tool.poetry.group.lint]
optional = true
@@ -74,14 +57,10 @@ pytest-socket = "^0.6.0"
pytest-watch = "^4.2.0"
pytest-timeout = "^2.2.0"
freezegun = "^1.3.1"
langchain-core = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/core"}
langchain = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/langchain"}
langchain-anthropic = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/anthropic"}
langchain-fireworks = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/fireworks"}
langchain-mistralai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/mistralai"}
langchain-cohere = {git = "https://github.com/langchain-ai/langchain-cohere.git", subdirectory="libs/cohere"}
langchain-groq = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/groq"}
langchain-openai = {git = "https://github.com/langchain-ai/langchain.git", subdirectory = "libs/partners/openai"}
langchain-anthropic = "^0.1.19"
langchain-fireworks = "^0.1.4"
langchain-mistralai = "^0.1.9"
langchain-groq = "^0.1.6"
[tool.ruff]
select = [
+228
View File
@@ -0,0 +1,228 @@
import datetime
import uuid
from langchain import hub
from langchain_community.vectorstores import FAISS
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.messages.utils import convert_to_messages
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.prompts.few_shot import FewShotChatMessagePromptTemplate
from langchain_openai import OpenAIEmbeddings
from langsmith.client import Client
#from langchain_benchmarks import __version__, registry
from langchain_benchmarks.rate_limiting import RateLimiter
from langchain_benchmarks.tool_usage.agents import StandardAgentFactory
import sys
sys.path.append("./..")
from langchain_benchmarks import __version__, registry
from langchain_benchmarks.tool_usage.tasks.multiverse_math import *
from langchain.chat_models import init_chat_model
from langsmith.evaluation import evaluate
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.tools import tool
tests = [
("claude-3-haiku-20240307","anthropic",),
("claude-3-sonnet-20240229","anthropic",),
("claude-3-opus-20240229","anthropic",),
("claude-3-5-sonnet-20240620","anthropic",),
("gpt-3.5-turbo-0125", "openai"),
("gpt-4o","openai",),
("gpt-4o-mini","openai"),
("llama3-groq-70b-8192-tool-use-preview","groq"),
("llama3-groq-8b-8192-tool-use-preview","groq"),
("gemini-1.5-pro","google_genai"),
("gemini-1.5-flash","google_genai")
]
client = Client() # Launch langsmith client for cloning datasets
def get_few_shot_messages(task_name):
if task_name == "Multiverse Math":
uncleaned_examples = [
e
for e in client.list_examples(
dataset_name="multiverse-math-examples-for-few-shot"
)
]
few_shot_messages = []
few_shot_three_messages = []
examples = []
for i in range(len(uncleaned_examples)):
converted_messages = convert_to_messages(
uncleaned_examples[i].outputs["output"]
)
examples.append(
# The message at index 1 is the human message asking the actual math question (0th message is system prompt)
{
"question": converted_messages[1].content,
"messages": [
m
for m in converted_messages
if isinstance(m, SystemMessage) == False
],
}
)
few_shot_messages += converted_messages
if i < 3:
few_shot_three_messages += converted_messages
return examples, [
m for m in few_shot_messages if not isinstance(m, SystemMessage)
], [m for m in few_shot_three_messages if not isinstance(m, SystemMessage)]
else:
raise ValueError("Few shot messages not supported for this dataset")
def turn_messages_to_str(few_shot_messages):
few_shot_str = ""
for m in few_shot_messages:
if isinstance(m.content, list):
few_shot_str += "AI message: "
for tool_use in m.content:
if "name" in tool_use:
few_shot_str += f"Use tool {tool_use['name']}, input: {', '.join(f'{k}:{v}' for k,v in tool_use['input'].items())}"
else:
few_shot_str += tool_use["text"]
few_shot_str += "\n"
else:
if isinstance(m, HumanMessage):
few_shot_str += f"Human message: {m.content}"
else:
few_shot_str += f"AI message: {m.content}"
few_shot_str += "\n"
return few_shot_str
def get_few_shot_str_from_messages(few_shot_messages, few_shot_three_messages):
few_shot_str = turn_messages_to_str(few_shot_messages)
few_shot_three_str = turn_messages_to_str(few_shot_three_messages)
return few_shot_str, few_shot_three_str
def get_prompts(task_name, **kwargs):
if task_name == "Multiverse Math":
example_selector = SemanticSimilarityExampleSelector.from_examples(
kwargs['examples'],
OpenAIEmbeddings(),
FAISS,
k=3,
input_keys=["question"],
example_keys=["messages"],
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
input_variables=[],
example_selector=example_selector,
example_prompt=MessagesPlaceholder("messages"),
)
return [
(
client.pull_prompt("langchain-ai/multiverse-math-no-few-shot"),
"no-few-shot",
),
(
client.pull_prompt("langchain-ai/multiverse-math-few-shot-messages"),
"few-shot-messages",
),
(
client.pull_prompt("langchain-ai/multiverse-math-few-shot-str"),
"few-shot-string",
),
(
client.pull_prompt("langchain-ai/multiverse-math-few-shot-3-messages"),
"few-shot-three-messages",
),
(
client.pull_prompt("langchain-ai/multiverse-math-few-shot-3-str"),
"few-shot-three-strings",
),
(
ChatPromptTemplate.from_messages(
[
(
"system",
"{instructions} Here are some example conversations of the user interacting with the AI until the correct answer is reached: ",
),
few_shot_prompt,
("human", "{question}"),
MessagesPlaceholder("agent_scratchpad"),
]
),
"few-shot-semantic-openai-embeddinga",
),
]
def predict_from_callable(callable,instructions):
def predict(run):
return callable.invoke({"question":run['question'],"instructions":instructions})
return predict
def pi(a: float) -> float:
"""Returns a precise value of PI for this alternate universe."""
return math.e
experiment_uuid = uuid.uuid4().hex[:4]
today = datetime.date.today().isoformat()
for task in registry.tasks:
if task.type != "ToolUsageTask" or task.name != "Multiverse Math":
continue
dataset_name = task.name
examples, few_shot_messages, few_shot_three_messages = get_few_shot_messages(task.name)
few_shot_str, few_shot_three_str = get_few_shot_str_from_messages(few_shot_messages,few_shot_three_messages)
prompts = get_prompts(task.name,examples=examples,few_shot_three_messages=few_shot_three_messages,few_shot_three_str=few_shot_three_str)
for model_name, model_provider in tests[9:]:
model = init_chat_model(model_name,model_provider=model_provider)
rate_limiter = RateLimiter(requests_per_second=1)
print(f"Benchmarking {task.name} with model: {model_name}")
eval_config = task.get_eval_config()
for prompt, prompt_name in prompts[:-1]:
tools = task.create_environment().tools
if "google" in model_provider:
tools[9] = tool(pi)
agent = create_tool_calling_agent(model, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, return_intermediate_steps=True)
'''
# Legacy way of running, migrate to evaluate
agent_factory = StandardAgentFactory(
task, model, prompt, rate_limiter=rate_limiter
)
client.run_on_dataset(
dataset_name=dataset_name,
llm_or_chain_factory=agent_factory,
evaluation=eval_config,
verbose=False,
project_name=f"{model_name}-{task.name}-{prompt_name}-{experiment_uuid}",
concurrency_level=5,
project_metadata={
"model": model_name,
"id": experiment_uuid,
"task": task.name,
"date": today,
"langchain_benchmarks_version": __version__,
},
)
'''
evaluate(
predict_from_callable(agent_executor,task.instructions),
data=dataset_name,
evaluators=eval_config.custom_evaluators,
max_concurrency=5,
metadata={
"model": model_name,
"id": experiment_uuid,
"task": task.name,
"date": today,
"langchain_benchmarks_version": __version__,
},
experiment_prefix=f"{model_name}-{task.name}-{prompt_name}"
)