Files
2024-05-14 19:04:26 -07:00

364 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "e537e271-4b08-491f-8cf7-c9be1f3fcf15",
"metadata": {},
"source": [
"# Exact Match Evaluation\n",
"\n",
"The simplest evaluation type is direct string comparison. LangChain has a prebuilt [\"exact match\" evaluator](https://python.langchain.com/docs/guides/productionization/evaluation/string/exact_match) you can use, or you can do the same with a custom evaluator.\n",
"\n",
"You can check out the example results [here](https://smith.langchain.com/public/454c80b5-9809-4f4f-95ee-1f71d8e3ef53/d).\n",
"\n",
"[![Test graph](./img/result_example.png)](https://smith.langchain.com/public/454c80b5-9809-4f4f-95ee-1f71d8e3ef53/d)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "121dcc53-70ec-48df-adac-cbd424c66adc",
"metadata": {},
"outputs": [],
"source": [
"# %pip install -U --quiet langchain langchain_openai"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3f610c6e-144b-47c8-9791-eaf4f42a8ccb",
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"# Default API URL; an endpoint already set in the environment is kept, so\n",
"# export LANGCHAIN_ENDPOINT (or edit this line) if you use a hosted instance\n",
"# of LangSmith.\n",
"os.environ.setdefault(\"LANGCHAIN_ENDPOINT\", \"https://api.smith.langchain.com\")\n",
"\n",
"# Prompt for any key that is missing instead of hardcoding secrets in the\n",
"# notebook or overwriting real keys with placeholder text.\n",
"for _key in (\"LANGCHAIN_API_KEY\", \"OPENAI_API_KEY\"):\n",
"    if not os.environ.get(_key):\n",
"        os.environ[_key] = getpass.getpass(f\"{_key}: \")"
]
},
{
"cell_type": "markdown",
"id": "ff62061d-0fb9-4ba9-b185-ff8c9746fb72",
"metadata": {},
"source": [
"## Create Dataset\n",
"\n",
"First you create a simple dataset of input and expected output pairs."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7e8ca802-e306-4632-afec-e9d655c84982",
"metadata": {},
"outputs": [],
"source": [
"import langsmith\n",
"\n",
"client = langsmith.Client()\n",
"dataset_name = \"Oracle of Exactness\"\n",
"\n",
"# Create and populate the dataset only if it does not exist yet.\n",
"if not client.has_dataset(dataset_name=dataset_name):\n",
"    ds = client.create_dataset(dataset_name)\n",
"    client.create_examples(\n",
"        inputs=[\n",
"            {\n",
"                # Adjacent string literals are concatenated, so the first part\n",
"                # needs its trailing space to keep the two sentences apart.\n",
"                \"prompt_template\": \"State the year of the declaration of independence. \"\n",
"                \"Respond with just the year in digits, nothing else\"\n",
"            },\n",
"            {\"prompt_template\": \"What's the average speed of an unladen swallow?\"},\n",
"        ],\n",
"        # Reference outputs, listed in the same order as `inputs`.\n",
"        outputs=[{\"output\": \"1776\"}, {\"output\": \"5\"}],\n",
"        dataset_id=ds.id,\n",
"    )"
]
},
{
"cell_type": "markdown",
"id": "8d5ea231-7901-44b8-9d66-761d3aca140a",
"metadata": {},
"source": [
"## Evaluate"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "aad3e9fc-72ac-4854-a67a-378ae0c8c91f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"View the evaluation results for project 'impressionable-crew-29' at:\n",
"https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/4f23ec54-3cf8-44fc-a729-ce08ad855bfd/compare?selectedSessions=a0672ba4-e513-4fef-84b8-bab439581721\n",
"\n",
"View all tests for Dataset Oracle of Exactness at:\n",
"https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/4f23ec54-3cf8-44fc-a729-ce08ad855bfd\n",
"[------------------------------------------------->] 2/2"
]
},
{
"data": {
"text/html": [
"<h3>Experiment Results:</h3>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feedback.exact_match</th>\n",
" <th>feedback.matches_label</th>\n",
" <th>error</th>\n",
" <th>execution_time</th>\n",
" <th>run_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>2.000000</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>2.000000</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2b4532af-445e-46aa-8170-d34c3af724a8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.500000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.545045</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.707107</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.265404</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.357376</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.250000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.451211</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.500000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.545045</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.750000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.638880</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.732714</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" feedback.exact_match feedback.matches_label error execution_time \\\n",
"count 2.000000 2 0 2.000000 \n",
"unique NaN 2 0 NaN \n",
"top NaN False NaN NaN \n",
"freq NaN 1 NaN NaN \n",
"mean 0.500000 NaN NaN 0.545045 \n",
"std 0.707107 NaN NaN 0.265404 \n",
"min 0.000000 NaN NaN 0.357376 \n",
"25% 0.250000 NaN NaN 0.451211 \n",
"50% 0.500000 NaN NaN 0.545045 \n",
"75% 0.750000 NaN NaN 0.638880 \n",
"max 1.000000 NaN NaN 0.732714 \n",
"\n",
" run_id \n",
"count 2 \n",
"unique 2 \n",
"top 2b4532af-445e-46aa-8170-d34c3af724a8 \n",
"freq 1 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'project_name': 'impressionable-crew-29',\n",
" 'results': {'893730f0-393d-4c40-92f9-16ce24aaec1f': {'input': {'prompt_template': \"What's the average speed of an unladen swallow?\"},\n",
" 'feedback': [EvaluationResult(key='exact_match', score=0, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('089a016a-d847-4a26-850c-afc0e78879d5'))}, source_run_id=None, target_run_id=None),\n",
" EvaluationResult(key='matches_label', score=False, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None)],\n",
" 'execution_time': 0.732714,\n",
" 'run_id': '2b4532af-445e-46aa-8170-d34c3af724a8',\n",
" 'output': {'output': 'The average speed of an unladen European swallow is approximately 20.1 miles per hour (32.4 km/h).'},\n",
" 'reference': {'output': '5'}},\n",
" 'ec9d8754-d264-4cec-802e-0c33513843d8': {'input': {'prompt_template': 'State the year of the declaration of independence.Respond with just the year in digits, nothign else'},\n",
" 'feedback': [EvaluationResult(key='exact_match', score=1, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('cd4c7ede-f367-4d9c-b424-577bf054bf21'))}, source_run_id=None, target_run_id=None),\n",
" EvaluationResult(key='matches_label', score=True, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None)],\n",
" 'execution_time': 0.357376,\n",
" 'run_id': '82b65c5c-bfbf-4d2b-9c05-3bbd1cd4e711',\n",
" 'output': {'output': '1776'},\n",
" 'reference': {'output': '1776'}}}}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.smith import RunEvalConfig\n",
"from langchain_openai import ChatOpenAI\n",
"from langsmith.evaluation import EvaluationResult, run_evaluator\n",
"\n",
"model = \"gpt-3.5-turbo\"\n",
"\n",
"\n",
"def predict_result(input_: dict) -> dict:\n",
"    \"\"\"The model/system under evaluation: answer one dataset example.\n",
"\n",
"    Sends ``input_[\"prompt_template\"]`` to the chat model named by ``model``\n",
"    and returns the reply text under the ``\"output\"`` key.\n",
"    \"\"\"\n",
"    llm = ChatOpenAI(model=model)\n",
"    reply = llm.invoke(input_[\"prompt_template\"])\n",
"    return {\"output\": reply.content}\n",
"\n",
"\n",
"@run_evaluator\n",
"def compare_label(run, example) -> EvaluationResult:\n",
"    \"\"\"Score a run by strict equality between its output and the label.\n",
"\n",
"    Custom evaluators let you define how \"exact\" the match ought to be and\n",
"    flexibly pick which fields to compare. Here the run's ``\"output\"`` field\n",
"    is compared with the example's ``\"output\"`` field.\n",
"\n",
"    Returns an ``EvaluationResult`` keyed ``\"matches_label\"`` whose score is\n",
"    always a real ``bool``; a missing or empty prediction never matches.\n",
"    \"\"\"\n",
"    # ``outputs`` may be None (for instance when the run errored), so fall\n",
"    # back to an empty dict instead of raising AttributeError on ``.get``.\n",
"    prediction = (run.outputs or {}).get(\"output\") or \"\"\n",
"    target = (example.outputs or {}).get(\"output\") or \"\"\n",
"    # bool(...) keeps the score boolean: `\"\" and ...` would evaluate to \"\".\n",
"    match = bool(prediction) and prediction == target\n",
"    return EvaluationResult(key=\"matches_label\", score=match)\n",
"\n",
"\n",
"# Metrics configuration: the prebuilt \"exact_match\" evaluator runs next to\n",
"# the equivalent custom evaluator, compare_label.\n",
"eval_config = RunEvalConfig(\n",
"    custom_evaluators=[compare_label],\n",
"    evaluators=[\"exact_match\"],\n",
")\n",
"\n",
"# Run predict_result over the dataset and score the runs with eval_config;\n",
"# the project metadata records a version string and the model name.\n",
"client.run_on_dataset(\n",
"    dataset_name=dataset_name,\n",
"    evaluation=eval_config,\n",
"    llm_or_chain_factory=predict_result,\n",
"    project_metadata={\"version\": \"1.0.0\", \"model\": model},\n",
"    verbose=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e442400b-903d-441b-a6d7-58fe2abd253c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}