mirror of
https://github.com/langchain-ai/langsmith-cookbook.git
synced 2026-07-01 08:12:02 -04:00
364 lines
13 KiB
Plaintext
364 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e537e271-4b08-491f-8cf7-c9be1f3fcf15",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Exact Match Evaluation\n",
|
|
"\n",
|
|
"The simplest evaluation type is direct string comparison. LangChain has a prebuilt [\"exact match\" evaluator](https://python.langchain.com/docs/guides/productionization/evaluation/string/exact_match) you can use, or you can do the same with a custom evaluator.\n",
|
|
"\n",
|
|
"You can check out the example results [here](https://smith.langchain.com/public/454c80b5-9809-4f4f-95ee-1f71d8e3ef53/d).\n",
|
|
"\n",
|
|
"[](https://smith.langchain.com/public/454c80b5-9809-4f4f-95ee-1f71d8e3ef53/d)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "121dcc53-70ec-48df-adac-cbd424c66adc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# %pip install -U --quiet langchain langchain_openai"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "3f610c6e-144b-47c8-9791-eaf4f42a8ccb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"\n",
|
|
"# Update with your API URL if using a self-hosted instance of LangSmith.\n",
|
|
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
|
|
"# Update with your API key\n",
|
|
"os.environ[\"LANGCHAIN_API_KEY\"] = \"YOUR API KEY\"\n",
|
|
"os.environ[\"OPENAI_API_KEY\"] = \"Your openai api key\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ff62061d-0fb9-4ba9-b185-ff8c9746fb72",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Create Dataset\n",
|
|
"\n",
|
|
"First, create a simple dataset of input and expected-output pairs."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "7e8ca802-e306-4632-afec-e9d655c84982",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import langsmith\n",
|
|
"\n",
|
|
"client = langsmith.Client()\n",
|
|
"dataset_name = \"Oracle of Exactness\"\n",
|
|
"# Create and populate the dataset only when it does not exist yet, so this cell can be re-run.\n",
|
|
"if not client.has_dataset(dataset_name=dataset_name):\n",
|
|
"    ds = client.create_dataset(dataset_name)\n",
|
|
"    client.create_examples(\n",
|
|
"        inputs=[\n",
|
|
"            {\n",
|
|
"                # Adjacent string literals are concatenated as-is, so the first one\n",
|
|
"                # ends with a space to keep the two sentences apart.\n",
|
|
"                \"prompt_template\": \"State the year of the declaration of independence. \"\n",
|
|
"                \"Respond with just the year in digits, nothing else\"\n",
|
|
"            },\n",
|
|
"            {\"prompt_template\": \"What's the average speed of an unladen swallow?\"},\n",
|
|
"        ],\n",
|
|
"        # Reference outputs, listed in the same order as the inputs above.\n",
|
|
"        outputs=[{\"output\": \"1776\"}, {\"output\": \"5\"}],\n",
|
|
"        dataset_id=ds.id,\n",
|
|
"    )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8d5ea231-7901-44b8-9d66-761d3aca140a",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Evaluate"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "aad3e9fc-72ac-4854-a67a-378ae0c8c91f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"View the evaluation results for project 'impressionable-crew-29' at:\n",
|
|
"https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/4f23ec54-3cf8-44fc-a729-ce08ad855bfd/compare?selectedSessions=a0672ba4-e513-4fef-84b8-bab439581721\n",
|
|
"\n",
|
|
"View all tests for Dataset Oracle of Exactness at:\n",
|
|
"https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/4f23ec54-3cf8-44fc-a729-ce08ad855bfd\n",
|
|
"[------------------------------------------------->] 2/2"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>feedback.matches_label</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>2.000000</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>2.000000</td>\n",
|
|
" <td>2</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>False</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2b4532af-445e-46aa-8170-d34c3af724a8</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.500000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.545045</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.707107</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.265404</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.357376</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>0.250000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.451211</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>0.500000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.545045</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>0.750000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.638880</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.732714</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match feedback.matches_label error execution_time \\\n",
|
|
"count 2.000000 2 0 2.000000 \n",
|
|
"unique NaN 2 0 NaN \n",
|
|
"top NaN False NaN NaN \n",
|
|
"freq NaN 1 NaN NaN \n",
|
|
"mean 0.500000 NaN NaN 0.545045 \n",
|
|
"std 0.707107 NaN NaN 0.265404 \n",
|
|
"min 0.000000 NaN NaN 0.357376 \n",
|
|
"25% 0.250000 NaN NaN 0.451211 \n",
|
|
"50% 0.500000 NaN NaN 0.545045 \n",
|
|
"75% 0.750000 NaN NaN 0.638880 \n",
|
|
"max 1.000000 NaN NaN 0.732714 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 2 \n",
|
|
"unique 2 \n",
|
|
"top 2b4532af-445e-46aa-8170-d34c3af724a8 \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'project_name': 'impressionable-crew-29',\n",
|
|
" 'results': {'893730f0-393d-4c40-92f9-16ce24aaec1f': {'input': {'prompt_template': \"What's the average speed of an unladen swallow?\"},\n",
|
|
" 'feedback': [EvaluationResult(key='exact_match', score=0, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('089a016a-d847-4a26-850c-afc0e78879d5'))}, source_run_id=None, target_run_id=None),\n",
|
|
" EvaluationResult(key='matches_label', score=False, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None)],\n",
|
|
" 'execution_time': 0.732714,\n",
|
|
" 'run_id': '2b4532af-445e-46aa-8170-d34c3af724a8',\n",
|
|
" 'output': {'output': 'The average speed of an unladen European swallow is approximately 20.1 miles per hour (32.4 km/h).'},\n",
|
|
" 'reference': {'output': '5'}},\n",
|
|
" 'ec9d8754-d264-4cec-802e-0c33513843d8': {'input': {'prompt_template': 'State the year of the declaration of independence.Respond with just the year in digits, nothign else'},\n",
|
|
" 'feedback': [EvaluationResult(key='exact_match', score=1, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('cd4c7ede-f367-4d9c-b424-577bf054bf21'))}, source_run_id=None, target_run_id=None),\n",
|
|
" EvaluationResult(key='matches_label', score=True, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None)],\n",
|
|
" 'execution_time': 0.357376,\n",
|
|
" 'run_id': '82b65c5c-bfbf-4d2b-9c05-3bbd1cd4e711',\n",
|
|
" 'output': {'output': '1776'},\n",
|
|
" 'reference': {'output': '1776'}}}}"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from langchain.smith import RunEvalConfig\n",
|
|
"from langchain_openai import ChatOpenAI\n",
|
|
"from langsmith.evaluation import EvaluationResult, run_evaluator\n",
|
|
"\n",
|
|
"model = \"gpt-3.5-turbo\"\n",
|
|
"\n",
|
|
"\n",
|
|
"# This is your model/system that you want to evaluate\n",
|
|
"def predict_result(input_: dict) -> dict:\n",
|
|
"    \"\"\"Send the example's prompt to the chat model and return its reply text.\n",
|
|
"\n",
|
|
"    Reads the \"prompt_template\" key of ``input_`` and returns a dict with a\n",
|
|
"    single \"output\" key holding the content of the model's response.\n",
|
|
"    \"\"\"\n",
|
|
"    llm = ChatOpenAI(model=model)\n",
|
|
"    reply = llm.invoke(input_[\"prompt_template\"])\n",
|
|
"    return {\"output\": reply.content}\n",
|
|
"\n",
|
|
"\n",
|
|
"@run_evaluator\n",
|
|
"def compare_label(run, example) -> EvaluationResult:\n",
|
|
"    \"\"\"Score a run by strict equality between its output and the reference label.\n",
|
|
"\n",
|
|
"    Custom evaluators let you define how \"exact\" the match ought to be and\n",
|
|
"    flexibly pick the fields to compare. Here the \"output\" field of the run is\n",
|
|
"    compared with the \"output\" field of the dataset example.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        run: The traced run; its ``outputs`` mapping is read.\n",
|
|
"        example: The dataset example; its ``outputs`` mapping is read.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        An EvaluationResult keyed \"matches_label\" whose score is True only when\n",
|
|
"        the prediction is non-empty and equal to the reference, else False.\n",
|
|
"    \"\"\"\n",
|
|
"    # ``outputs`` may be None (NOTE(review): e.g. for a failed run - confirm),\n",
|
|
"    # so fall back to an empty mapping instead of raising AttributeError.\n",
|
|
"    prediction = (run.outputs or {}).get(\"output\") or \"\"\n",
|
|
"    target = (example.outputs or {}).get(\"output\") or \"\"\n",
|
|
"    # bool(...) keeps the score a real boolean: `prediction and ...` alone\n",
|
|
"    # evaluates to \"\" (a string) when the prediction is empty.\n",
|
|
"    match = bool(prediction) and prediction == target\n",
|
|
"    return EvaluationResult(key=\"matches_label\", score=match)\n",
|
|
"\n",
|
|
"\n",
|
|
"# This defines how you generate metrics about the model's performance\n",
|
|
"eval_config = RunEvalConfig(\n",
|
|
" evaluators=[\"exact_match\"], # equivalent prebuilt evaluator\n",
|
|
" custom_evaluators=[compare_label],\n",
|
|
")\n",
|
|
"\n",
|
|
"client.run_on_dataset(\n",
|
|
" dataset_name=dataset_name,\n",
|
|
" llm_or_chain_factory=predict_result,\n",
|
|
" evaluation=eval_config,\n",
|
|
" verbose=True,\n",
|
|
" project_metadata={\"version\": \"1.0.0\", \"model\": model},\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e442400b-903d-441b-a6d7-58fe2abd253c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|