langsmith-cookbook/testing-examples/exact-match/exact_match.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e537e271-4b08-491f-8cf7-c9be1f3fcf15",
   "metadata": {},
   "source": [
    "# Exact Match Evaluation\n",
    "\n",
    "The simplest evaluation type is direct string comparison. LangChain has a prebuilt [\"exact match\" evaluator](https://python.langchain.com/docs/guides/productionization/evaluation/string/exact_match) you can use, or you can do the same with a custom evaluator.\n",
    "\n",
    "You can check out the example results [here](https://smith.langchain.com/public/454c80b5-9809-4f4f-95ee-1f71d8e3ef53/d).\n",
    "\n",
    "[![Test graph](./img/result_example.png)](https://smith.langchain.com/public/454c80b5-9809-4f4f-95ee-1f71d8e3ef53/d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "121dcc53-70ec-48df-adac-cbd424c66adc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip install -U --quiet langchain langchain_openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3f610c6e-144b-47c8-9791-eaf4f42a8ccb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Update with your API URL if using a hosted instance of Langsmith.\n",
    "os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
    "# Update with your API key\n",
    "os.environ[\"LANGCHAIN_API_KEY\"] = \"YOUR API KEY\"\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"Your openai api key\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff62061d-0fb9-4ba9-b185-ff8c9746fb72",
   "metadata": {},
   "source": [
    "## Create Dataset\n",
    "\n",
    "First, create a simple dataset that pairs each input with its expected output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7e8ca802-e306-4632-afec-e9d655c84982",
   "metadata": {},
   "outputs": [],
   "source": [
    "import langsmith\n",
    "\n",
    "client = langsmith.Client()\n",
    "dataset_name = \"Oracle of Exactness\"\n",
    "\n",
    "# Only create and populate the dataset when it does not exist yet, so that\n",
    "# re-running this cell does not try to create the same dataset twice.\n",
    "if not client.has_dataset(dataset_name=dataset_name):\n",
    "    ds = client.create_dataset(dataset_name)\n",
    "    client.create_examples(\n",
    "        # Inputs and expected outputs are listed in matching order.\n",
    "        inputs=[\n",
    "            {\n",
    "                # Adjacent string literals are concatenated as-is, so the first\n",
    "                # one must end with a space to keep the two sentences separate.\n",
    "                \"prompt_template\": \"State the year of the declaration of independence. \"\n",
    "                \"Respond with just the year in digits, nothing else\"\n",
    "            },\n",
    "            {\"prompt_template\": \"What's the average speed of an unladen swallow?\"},\n",
    "        ],\n",
    "        outputs=[{\"output\": \"1776\"}, {\"output\": \"5\"}],\n",
    "        dataset_id=ds.id,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8d5ea231-7901-44b8-9d66-761d3aca140a",
   "metadata": {},
   "source": [
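    "## Evaluate\n",
    "\n",
    "Next, run the model over the dataset and score each output with both the prebuilt `exact_match` evaluator and an equivalent custom evaluator."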
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "aad3e9fc-72ac-4854-a67a-378ae0c8c91f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "View the evaluation results for project 'impressionable-crew-29' at:\n",
      "https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/4f23ec54-3cf8-44fc-a729-ce08ad855bfd/compare?selectedSessions=a0672ba4-e513-4fef-84b8-bab439581721\n",
      "\n",
      "View all tests for Dataset Oracle of Exactness at:\n",
      "https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/4f23ec54-3cf8-44fc-a729-ce08ad855bfd\n",
      "[------------------------------------------------->] 2/2"
     ]
    },
    {
     "data": {
      "text/html": [
       "<h3>Experiment Results:</h3>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feedback.exact_match</th>\n",
       "      <th>feedback.matches_label</th>\n",
       "      <th>error</th>\n",
       "      <th>execution_time</th>\n",
       "      <th>run_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2b4532af-445e-46aa-8170-d34c3af724a8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.545045</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.707107</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.265404</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.357376</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.250000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.451211</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.500000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.545045</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.750000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.638880</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.732714</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        feedback.exact_match feedback.matches_label error  execution_time  \\\n",
       "count               2.000000                      2     0        2.000000   \n",
       "unique                   NaN                      2     0             NaN   \n",
       "top                      NaN                  False   NaN             NaN   \n",
       "freq                     NaN                      1   NaN             NaN   \n",
       "mean                0.500000                    NaN   NaN        0.545045   \n",
       "std                 0.707107                    NaN   NaN        0.265404   \n",
       "min                 0.000000                    NaN   NaN        0.357376   \n",
       "25%                 0.250000                    NaN   NaN        0.451211   \n",
       "50%                 0.500000                    NaN   NaN        0.545045   \n",
       "75%                 0.750000                    NaN   NaN        0.638880   \n",
       "max                 1.000000                    NaN   NaN        0.732714   \n",
       "\n",
       "                                      run_id  \n",
       "count                                      2  \n",
       "unique                                     2  \n",
       "top     2b4532af-445e-46aa-8170-d34c3af724a8  \n",
       "freq                                       1  \n",
       "mean                                     NaN  \n",
       "std                                      NaN  \n",
       "min                                      NaN  \n",
       "25%                                      NaN  \n",
       "50%                                      NaN  \n",
       "75%                                      NaN  \n",
       "max                                      NaN  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'project_name': 'impressionable-crew-29',\n",
       " 'results': {'893730f0-393d-4c40-92f9-16ce24aaec1f': {'input': {'prompt_template': \"What's the average speed of an unladen swallow?\"},\n",
       "   'feedback': [EvaluationResult(key='exact_match', score=0, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('089a016a-d847-4a26-850c-afc0e78879d5'))}, source_run_id=None, target_run_id=None),\n",
       "    EvaluationResult(key='matches_label', score=False, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None)],\n",
       "   'execution_time': 0.732714,\n",
       "   'run_id': '2b4532af-445e-46aa-8170-d34c3af724a8',\n",
       "   'output': {'output': 'The average speed of an unladen European swallow is approximately 20.1 miles per hour (32.4 km/h).'},\n",
       "   'reference': {'output': '5'}},\n",
       "  'ec9d8754-d264-4cec-802e-0c33513843d8': {'input': {'prompt_template': 'State the year of the declaration of independence.Respond with just the year in digits, nothign else'},\n",
       "   'feedback': [EvaluationResult(key='exact_match', score=1, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('cd4c7ede-f367-4d9c-b424-577bf054bf21'))}, source_run_id=None, target_run_id=None),\n",
       "    EvaluationResult(key='matches_label', score=True, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None)],\n",
       "   'execution_time': 0.357376,\n",
       "   'run_id': '82b65c5c-bfbf-4d2b-9c05-3bbd1cd4e711',\n",
       "   'output': {'output': '1776'},\n",
       "   'reference': {'output': '1776'}}}}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.smith import RunEvalConfig\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langsmith.evaluation import EvaluationResult, run_evaluator\n",
    "\n",
    "model = \"gpt-3.5-turbo\"  # chat model name passed to ChatOpenAI below\n",
    "\n",
    "\n",
    "def predict_result(input_: dict) -> dict:\n",
    "    \"\"\"Run the model/system being evaluated on a single dataset example.\n",
    "\n",
    "    Sends ``input_[\"prompt_template\"]`` to the chat model and returns the\n",
    "    text of its reply under the ``\"output\"`` key.\n",
    "    \"\"\"\n",
    "    llm = ChatOpenAI(model=model)\n",
    "    reply = llm.invoke(input_[\"prompt_template\"])\n",
    "    return {\"output\": reply.content}\n",
    "\n",
    "\n",
    "@run_evaluator\n",
    "def compare_label(run, example) -> EvaluationResult:\n",
    "    \"\"\"Score a run by exact equality between its output and the reference output.\n",
    "\n",
    "    Custom evaluators let you define how \"exact\" the match ought to be and\n",
    "    flexibly pick which fields to compare.\n",
    "\n",
    "    Args:\n",
    "        run: The model run; the prediction is read from its ``\"output\"`` field.\n",
    "        example: The dataset example; the target is read from its ``\"output\"`` field.\n",
    "\n",
    "    Returns:\n",
    "        An ``EvaluationResult`` with key ``\"matches_label\"`` whose score is ``True``\n",
    "        only when the prediction is non-empty and equal to the target.\n",
    "    \"\"\"\n",
    "    # Fall back to an empty dict so that absent outputs score False instead of\n",
    "    # raising AttributeError on ``None.get``.\n",
    "    prediction = (run.outputs or {}).get(\"output\") or \"\"\n",
    "    target = (example.outputs or {}).get(\"output\") or \"\"\n",
    "    # Coerce to bool: ``prediction and ...`` on its own evaluates to \"\" (a str,\n",
    "    # not a score) whenever the prediction is empty.\n",
    "    match = bool(prediction) and prediction == target\n",
    "    return EvaluationResult(key=\"matches_label\", score=match)\n",
    "\n",
    "\n",
    "# Metrics computed for each run: the prebuilt \"exact_match\" evaluator plus\n",
    "# the equivalent custom evaluator, compare_label.\n",
    "eval_config = RunEvalConfig(\n",
    "    custom_evaluators=[compare_label],\n",
    "    evaluators=[\"exact_match\"],\n",
    ")\n",
    "\n",
    "# Run predict_result over the dataset and apply the evaluators configured above.\n",
    "# The call is the last expression so its result is displayed as the cell output.\n",
    "client.run_on_dataset(\n",
    "    dataset_name=dataset_name,\n",
    "    llm_or_chain_factory=predict_result,\n",
    "    evaluation=eval_config,\n",
    "    project_metadata={\"version\": \"1.0.0\", \"model\": model},\n",
    "    verbose=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e442400b-903d-441b-a6d7-58fe2abd253c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}