mirror of
https://github.com/langchain-ai/langsmith-cookbook.git
synced 2026-07-01 08:12:02 -04:00
2634 lines
89 KiB
Plaintext
2634 lines
89 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c56f6b50-d708-43c5-acd2-ad948cdc1797",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Bootstrap Few-shot Prompting with LangSmith\n",
|
|
"\n",
|
|
"Prompt engineering is a pain. You can use _examples_ to optimize the prompt for you with the help of tools like LangSmith. Instead of guessing which examples will be the most impactful, you can use tried-and-true evaluation practices to curate and compile the right examples for your pipeline. The main steps are:\n",
|
|
"\n",
|
|
"1. Create a dataset\n",
|
|
"2. Pick a metric to improve\n",
|
|
"3. Create an initial system\n",
|
|
"4. Decide the update logic (few-shot examples vs. instruction teaching vs. other methods, how to format the examples, etc.)\n",
|
|
"5. Train!\n",
|
|
"\n",
|
|
"\n",
|
|
"Below is an example bootstrapping a gpt-3.5-turbo model on an entailment task using few-shot examples. This example inspired by Christopher Potts' [example](https://github.com/stanfordnlp/dspy/blob/main/examples/nli/scone/scone.ipynb) on the SCONE dataset.\n",
|
|
"\n",
|
|
"The task is natural language inference, where the LLM is required to predict whether the a statement can be logically concluded from a premise / grounding statement."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "191baa94-41b2-4aaf-b621-aaf8171566d7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%pip install -U langsmith langchain langchain_openai pandas"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b20f9596-dcd5-4928-a6f0-e4f75e1cf843",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"\n",
|
|
"# Update with your API URL if using a hosted instance of Langsmith.\n",
|
|
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
|
|
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
|
|
"os.environ[\"LANGCHAIN_API_KEY\"] = \"YOUR API KEY\"\n",
|
|
"os.environ[\"OPENAI_API_KEY\"] = \"YOUR API KEY\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "8d4cf23c-c99f-4206-99cc-2b2020fa5131",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# We can do the same thing with a SQLite cache\n",
|
|
"from langchain_core.globals import set_llm_cache\n",
|
|
"from langchain_community.cache import SQLiteCache\n",
|
|
"\n",
|
|
"set_llm_cache(SQLiteCache(database_path=\".langchain.db\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "461b3958-1b1a-47aa-a2f8-02c6119eb2cd",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"inputs {'context': 'A man who does not walk confidently dropping produce.', 'question': 'Can we logically conclude for sure that a man who does not walk confidently dropping kale?'}\n",
|
|
"outputs {'answer': 'No', 'category': 'one_not_scoped'}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from langsmith import Client\n",
|
|
"\n",
|
|
"client = Client()\n",
|
|
"\n",
|
|
"public_datasets = [\n",
|
|
" \"https://smith.langchain.com/public/1d065de2-56c1-496e-bc66-bdce308e6537/d\", # train\n",
|
|
" \"https://smith.langchain.com/public/3205fa05-bd78-4eaf-924f-96df0f577b1f/d\", # train2\n",
|
|
" \"https://smith.langchain.com/public/fdf16166-1edd-418f-b777-3af82034931d/d\", # dev\n",
|
|
" \"https://smith.langchain.com/public/aee61506-3c60-4ca8-95c4-0314c9719ca8/d\", # dev2\n",
|
|
" \"https://smith.langchain.com/public/8d40d210-f8e6-4def-a206-78c5080c5d53/d\", # test\n",
|
|
"]\n",
|
|
"for ds in public_datasets:\n",
|
|
" client.clone_public_dataset(ds)\n",
|
|
"train_name = \"scone-train2\"\n",
|
|
"dev_name = \"scone-dev2\"\n",
|
|
"test_name = \"scone-test-one-scoped\"\n",
|
|
"full_test_name = \"scone-test\"\n",
|
|
"\n",
|
|
"example = next(client.list_examples(dataset_name=train_name))\n",
|
|
"print(\"inputs\", example.inputs)\n",
|
|
"print(\"outputs\", example.outputs)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "bc278868-1fe8-4a24-845c-7095607f3a88",
|
|
"metadata": {},
|
|
"source": [
|
|
"Reviewing the values above, these examples can be tricky! "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c810e70c-9518-4bf4-a50f-bd52296b145d",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Evaluator\n",
|
|
"\n",
|
|
"Since we have ground-truth clasification labels, we can use an exact-match criterion as our evaluator."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "33084e53-ba37-4274-892c-f4d02ce3c79d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import sys\n",
|
|
"\n",
|
|
"from langsmith.evaluation import run_evaluator\n",
|
|
"\n",
|
|
"\n",
|
|
"@run_evaluator\n",
|
|
"def exact_match(run, example):\n",
|
|
" # Evaluate the exact match correctness of the NLI result\n",
|
|
" try:\n",
|
|
" predicted = run.outputs[\"is_entailed\"]\n",
|
|
" expected = example.outputs[\"answer\"]\n",
|
|
" score = expected.lower() == predicted.lower()\n",
|
|
" except Exception as e:\n",
|
|
" try:\n",
|
|
" expected = example.outputs[\"answer\"]\n",
|
|
" expected_bool = {\"no\": False, \"yes\": True}.get(expected.strip().lower())\n",
|
|
" score = run.outputs[\"output\"].is_entailed == expected_bool\n",
|
|
" except Exception as e2:\n",
|
|
" score = 0\n",
|
|
" return {\n",
|
|
" \"key\": \"exact_match\",\n",
|
|
" \"score\": int(score),\n",
|
|
" }"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "012b34ab-e6eb-4d76-a55f-24bbdcd97d15",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain_core.output_parsers import StrOutputParser\n",
|
|
"from langchain_core.prompts import PromptTemplate\n",
|
|
"from langchain_openai import ChatOpenAI\n",
|
|
"\n",
|
|
"# And we will create a placeholder in the template to add few-shot examples\n",
|
|
"prompt = PromptTemplate.from_template(\n",
|
|
" \"\"\"You are given some context (a premise) and a question (a hypothesis). You must indicate with Yes/No answer whether we can logically conclude the hypothesis from the premise.\n",
|
|
"\n",
|
|
"---\n",
|
|
"\n",
|
|
"Follow the following format.\n",
|
|
"\n",
|
|
"Context: ${{context}}\n",
|
|
"\n",
|
|
"Question: ${{question}}\n",
|
|
"\n",
|
|
"Reasoning: Let's think step by step in order to ${{produce the answer}}. We ...\n",
|
|
"\n",
|
|
"Answer: Yes or No\n",
|
|
"\n",
|
|
"---{examples}\n",
|
|
"\n",
|
|
"Context: {context}\n",
|
|
"\n",
|
|
"Question: {question}\n",
|
|
"\n",
|
|
"Reasoning: Let's think step by step in order to\"\"\"\n",
|
|
").partial(examples=\"\")\n",
|
|
"\n",
|
|
"\n",
|
|
"def parse(pred: str):\n",
|
|
" fnd = \"\\nAnswer:\"\n",
|
|
" idx = pred.find(fnd)\n",
|
|
" answer = pred[idx + len(fnd) :].strip()\n",
|
|
" return {\"is_entailed\": answer, \"reasoning\": pred[:idx].strip()}\n",
|
|
"\n",
|
|
"\n",
|
|
"chain = prompt | ChatOpenAI(model=\"gpt-3.5-turbo\") | StrOutputParser() | parse"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "073c51af-1a53-4530-885d-099f56bf27fa",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'is_entailed': 'No',\n",
|
|
" 'reasoning': 'produce the answer. We know that the man does not walk confidently and drops produce. However, dropping produce does not necessarily mean he drops kale specifically. He could be dropping any type of produce.'}"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"prediction = chain.invoke(example.inputs)\n",
|
|
"prediction"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b2dda2b0-e4f7-4394-9b65-6f4316399392",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Initial Evaluation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "1b596b83-29b5-4a2d-9e42-8b2a3e75114d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain.smith import RunEvalConfig\n",
|
|
"\n",
|
|
"eval_config = RunEvalConfig(\n",
|
|
" custom_evaluators=[exact_match],\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "a87a9b2d-a2fe-4e55-bc50-6e7bef4f98ba",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"View the evaluation results for project 'passionate-copy-48' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/f1b328a2-b4e8-473c-808f-e042d38f6ebd/compare?selectedSessions=bb3d33aa-53a1-4d63-8b79-3758df4b1fb7\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-test2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/f1b328a2-b4e8-473c-808f-e042d38f6ebd\n",
|
|
"[------------------------------------------------->] 200/200"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"res = client.run_on_dataset(\n",
|
|
" dataset_name=\"scone-test2\", # dev_name,\n",
|
|
" llm_or_chain_factory=chain,\n",
|
|
" evaluation=eval_config,\n",
|
|
" project_metadata={\"optimizer\": None},\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1ab9236d-a213-4cf6-815e-fd05a248105a",
|
|
"metadata": {},
|
|
"source": [
|
|
"Got about ~55% on it. Definitely room for improvement."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0192913d-e9c1-4f00-aa41-badc64c6b21d",
|
|
"metadata": {},
|
|
"source": [
|
|
"## ✨ Optimize ✨\n",
|
|
"\n",
|
|
"\n",
|
|
"This just means to \"use data to update the system\". At present, LangChain runnables don't natively support a \"backwards\" method (a la pytorch), but you can pretty easily define updates/mutations for key important components you'd want to update, (such as prompts or LLMs).\n",
|
|
"\n",
|
|
"For instance, component-wise, you could apply:\n",
|
|
"- Few shot prompting: add an additional string input or MessagesPlaceholder in the prompt template\n",
|
|
"- Updating the instructions: update the prompt template directly (likely the system prompt)\n",
|
|
"- LLM: do a backwards pass.\n",
|
|
"\n",
|
|
"We will focus on few-shot prompting to limit the search space. We will then apply a genetic/evolutionary algorithm to compare performance of different few-shot examples and pick the ones that provide the most \"lift\" of the provided metric.\n",
|
|
"\n",
|
|
"We'll first create a constructor for our chain that accepts the few-shot examples, letting us re-create the chain with each updated state."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "d50bf47b-0c46-422f-818e-a9d58bac3240",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# We will define how we want our few-shot examples to be formatted\n",
|
|
"import random\n",
|
|
"from typing import List, Optional\n",
|
|
"\n",
|
|
"from langchain_core.runnables import RunnableLambda\n",
|
|
"\n",
|
|
"\n",
|
|
"def format_example(example: dict):\n",
|
|
" inputs = example[\"input\"]\n",
|
|
" outputs = example[\"output\"]\n",
|
|
" return f\"\"\"\n",
|
|
"\n",
|
|
"Context: {inputs['context']}\n",
|
|
"\n",
|
|
"Question: {inputs['question']}\n",
|
|
"\n",
|
|
"Reasoning: {outputs['reasoning']}\n",
|
|
"\n",
|
|
"Answer: {outputs['is_entailed']}\n",
|
|
"\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"\n",
|
|
"def format_few_shot(input_: dict, examples: Optional[List[dict]] = None):\n",
|
|
" if examples:\n",
|
|
" # TODO: make this configurable / bound to the prompt template\n",
|
|
" input_[\"examples\"] = (\n",
|
|
" \"--\".join(format_example(e) for i, e in enumerate(examples)) + \"--\"\n",
|
|
" )\n",
|
|
" return input_\n",
|
|
"\n",
|
|
"\n",
|
|
"def create_chain(examples: Optional[List] = None, llm=None):\n",
|
|
" llm = llm or ChatOpenAI(model=\"gpt-3.5-turbo\")\n",
|
|
" chain = (\n",
|
|
" RunnableLambda(format_few_shot).bind(examples=examples)\n",
|
|
" | prompt\n",
|
|
" | llm\n",
|
|
" | StrOutputParser()\n",
|
|
" | parse\n",
|
|
" ).with_config(tags=[\"to_train\"])\n",
|
|
" return chain"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "04d1c37c-6f85-465f-adf9-3f0a8548be81",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Training\n",
|
|
"\n",
|
|
"Next, we'll define the training utilities."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "709d4bdf-e452-45d7-a97d-e2e8e1dd595f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain_core.tracers.context import collect_runs\n",
|
|
"\n",
|
|
"\n",
|
|
"def step(\n",
|
|
" construct_chain,\n",
|
|
" train_examples,\n",
|
|
" eval_config,\n",
|
|
" examples=None,\n",
|
|
" bootstrap_k: int = 8,\n",
|
|
"):\n",
|
|
" collected = examples.copy() if examples else []\n",
|
|
" random.shuffle(train_examples)\n",
|
|
" train_examples = train_examples.copy()\n",
|
|
" # TODO: Batching to speed it up\n",
|
|
" while train_examples:\n",
|
|
" if len(collected) >= bootstrap_k:\n",
|
|
" break\n",
|
|
" train_batch = [\n",
|
|
" train_examples.pop() for _ in range(bootstrap_k - len(collected))\n",
|
|
" ]\n",
|
|
" chain = construct_chain([e for e in collected if e[\"id\"] != example.id])\n",
|
|
" with collect_runs() as cb:\n",
|
|
" chain.batch([e.inputs for e in train_batch])\n",
|
|
" evaluator = eval_config.custom_evaluators[0]\n",
|
|
" for run, example in zip(cb.traced_runs, train_batch):\n",
|
|
" metric = evaluator.evaluate_run(run, example)\n",
|
|
" score = metric.score\n",
|
|
" # Check if success\n",
|
|
" if score:\n",
|
|
" collected.append(\n",
|
|
" {\n",
|
|
" \"input\": example.inputs,\n",
|
|
" \"output\": run.outputs,\n",
|
|
" \"id\": example.id,\n",
|
|
" }\n",
|
|
" )\n",
|
|
" return collected\n",
|
|
"\n",
|
|
"\n",
|
|
"def eval(eval_dataset, chain, eval_config, step_n) -> float:\n",
|
|
" \"\"\"Compute the metrics on the validation dataset.\"\"\"\n",
|
|
" dev_results = client.run_on_dataset(\n",
|
|
" dataset_name=eval_dataset,\n",
|
|
" llm_or_chain_factory=chain,\n",
|
|
" evaluation=eval_config,\n",
|
|
" verbose=True,\n",
|
|
" concurrency_level=8,\n",
|
|
" project_metadata={\n",
|
|
" \"step\": step_n,\n",
|
|
" },\n",
|
|
" )\n",
|
|
" df = dev_results.to_dataframe()\n",
|
|
" feedback_key = [c for c in df.columns if c.startswith(\"feedback.\")][0]\n",
|
|
" # Assume single metric rn ha\n",
|
|
" return df[feedback_key].mean()\n",
|
|
"\n",
|
|
"\n",
|
|
"def train(\n",
|
|
" chain_constructor,\n",
|
|
" train_dataset,\n",
|
|
" eval_dataset,\n",
|
|
" eval_config,\n",
|
|
" steps: int = 5,\n",
|
|
" k: int = 8,\n",
|
|
" bootstrap_k: int = 8,\n",
|
|
"):\n",
|
|
" \"\"\"Run the full training loop\"\"\"\n",
|
|
" best_score = eval(eval_dataset, chain_constructor(), eval_config, 0)\n",
|
|
" best_step = 0\n",
|
|
" scores = [(best_score, [])]\n",
|
|
" train_examples = list(client.list_examples(dataset_name=train_dataset))\n",
|
|
" for step_number in range(steps):\n",
|
|
" collected = step(\n",
|
|
" chain_constructor, train_examples, eval_config, bootstrap_k=bootstrap_k\n",
|
|
" )\n",
|
|
" if len(collected) < k:\n",
|
|
" # TODO: probably want some diversity of labels here lol\n",
|
|
" to_sample = min(k - len(collected), len(train_examples))\n",
|
|
" collected += random.sample(train_examples, to_sample)\n",
|
|
" selected_examples = collected\n",
|
|
" updated_chain = chain_constructor(examples=selected_examples)\n",
|
|
" updated_score = eval(eval_dataset, updated_chain, eval_config, step_number + 1)\n",
|
|
" scores.append((updated_score, selected_examples))\n",
|
|
"\n",
|
|
" if updated_score > best_score:\n",
|
|
" print(\n",
|
|
" f\"New best score {updated_score} > {best_score}. Updating selected examples.\"\n",
|
|
" )\n",
|
|
" best_score = updated_score\n",
|
|
" best_step = step_number + 1\n",
|
|
" else:\n",
|
|
" print(\"Underperformed. Continuing\")\n",
|
|
" print(\"Best overall score: \", best_score)\n",
|
|
" print(\"Best step: \", best_step)\n",
|
|
" return sorted(scores, key=lambda x: x[0], reverse=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0a7afc25-7fb5-47a6-b233-013b2e02bef9",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Train\n",
|
|
"\n",
|
|
"Now we can finally run the training loop!"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "0eb42676-8fb1-4eca-bf50-29f9d2225ea6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"View the evaluation results for project 'bold-show-44' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=0478dc12-5f1a-4d1b-84d6-95699f05bf77\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>50.00000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>e45cdb67-3ae6-48b6-9db1-6fe09e39e6a3</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.86000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.021456</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.35051</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.011425</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.007727</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.013763</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.019525</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.023224</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.059278</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 50.00000 0 50.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.86000 NaN 0.021456 \n",
|
|
"std 0.35051 NaN 0.011425 \n",
|
|
"min 0.00000 NaN 0.007727 \n",
|
|
"25% 1.00000 NaN 0.013763 \n",
|
|
"50% 1.00000 NaN 0.019525 \n",
|
|
"75% 1.00000 NaN 0.023224 \n",
|
|
"max 1.00000 NaN 0.059278 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top e45cdb67-3ae6-48b6-9db1-6fe09e39e6a3 \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"View the evaluation results for project 'giving-record-97' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=c181b376-6214-4130-8d6e-87ee7c0cfd5f\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>50.00000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>ef1483cc-1040-4ebb-a0b0-f770bc9411c5</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.86000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.071231</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.35051</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.016930</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.513033</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.605231</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.932223</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>10.160974</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>24.512853</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 50.00000 0 50.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.86000 NaN 9.071231 \n",
|
|
"std 0.35051 NaN 4.016930 \n",
|
|
"min 0.00000 NaN 4.513033 \n",
|
|
"25% 1.00000 NaN 6.605231 \n",
|
|
"50% 1.00000 NaN 7.932223 \n",
|
|
"75% 1.00000 NaN 10.160974 \n",
|
|
"max 1.00000 NaN 24.512853 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top ef1483cc-1040-4ebb-a0b0-f770bc9411c5 \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Underperformed. Continuing\n",
|
|
"View the evaluation results for project 'proper-man-52' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=13f9f137-b12b-41c8-bc51-fc65aed67594\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[-----------------------> ] 24/50"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': 'You requested a model that is not compatible with this engine. Please contact us through our help center at help.openai.com for further questions.', 'type': 'invalid_request_error', 'param': 'model', 'code': None}}\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>49.000000</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Error code: 400 - {'error': {'message': 'You r...</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>c3388800-20aa-4c72-8e1c-f96632355fcf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.836735</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>10.026921</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.373438</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.115617</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.559937</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.325939</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.343092</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>11.909372</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>24.057484</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match \\\n",
|
|
"count 49.000000 \n",
|
|
"unique NaN \n",
|
|
"top NaN \n",
|
|
"freq NaN \n",
|
|
"mean 0.836735 \n",
|
|
"std 0.373438 \n",
|
|
"min 0.000000 \n",
|
|
"25% 1.000000 \n",
|
|
"50% 1.000000 \n",
|
|
"75% 1.000000 \n",
|
|
"max 1.000000 \n",
|
|
"\n",
|
|
" error execution_time \\\n",
|
|
"count 1 50.000000 \n",
|
|
"unique 1 NaN \n",
|
|
"top Error code: 400 - {'error': {'message': 'You r... NaN \n",
|
|
"freq 1 NaN \n",
|
|
"mean NaN 10.026921 \n",
|
|
"std NaN 4.115617 \n",
|
|
"min NaN 0.559937 \n",
|
|
"25% NaN 7.325939 \n",
|
|
"50% NaN 9.343092 \n",
|
|
"75% NaN 11.909372 \n",
|
|
"max NaN 24.057484 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top c3388800-20aa-4c72-8e1c-f96632355fcf \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Underperformed. Continuing\n",
|
|
"View the evaluation results for project 'proper-quiet-36' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=c6f18469-7df3-41d5-bd70-10ee4a076182\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[----------------------------> ] 29/50"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Error Type: BadRequestError, Message: Error code: 400 - {'error': {'message': 'You requested a model that is not compatible with this engine. Please contact us through our help center at help.openai.com for further questions.', 'type': 'invalid_request_error', 'param': 'model', 'code': None}}\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>49.000000</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Error code: 400 - {'error': {'message': 'You r...</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>ac830a9d-4169-49b6-a843-0f4afe138865</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.897959</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.242384</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.305839</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.108956</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.525809</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.170674</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.969927</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.018508</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>12.737470</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match \\\n",
|
|
"count 49.000000 \n",
|
|
"unique NaN \n",
|
|
"top NaN \n",
|
|
"freq NaN \n",
|
|
"mean 0.897959 \n",
|
|
"std 0.305839 \n",
|
|
"min 0.000000 \n",
|
|
"25% 1.000000 \n",
|
|
"50% 1.000000 \n",
|
|
"75% 1.000000 \n",
|
|
"max 1.000000 \n",
|
|
"\n",
|
|
" error execution_time \\\n",
|
|
"count 1 50.000000 \n",
|
|
"unique 1 NaN \n",
|
|
"top Error code: 400 - {'error': {'message': 'You r... NaN \n",
|
|
"freq 1 NaN \n",
|
|
"mean NaN 7.242384 \n",
|
|
"std NaN 2.108956 \n",
|
|
"min NaN 0.525809 \n",
|
|
"25% NaN 6.170674 \n",
|
|
"50% NaN 6.969927 \n",
|
|
"75% NaN 8.018508 \n",
|
|
"max NaN 12.737470 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top ac830a9d-4169-49b6-a843-0f4afe138865 \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"New best score 0.8979591836734694 > 0.86. Updating selected examples.\n",
|
|
"View the evaluation results for project 'advanced-competition-88' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=31ece295-31c4-4c3c-b9f0-a1df3dd09adb\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>50.00000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>e2d59128-29e4-4562-bc11-93bb60738953</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.86000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.488865</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.35051</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.301064</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.736222</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.037187</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.998608</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.773248</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>26.641730</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 50.00000 0 50.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.86000 NaN 8.488865 \n",
|
|
"std 0.35051 NaN 4.301064 \n",
|
|
"min 0.00000 NaN 3.736222 \n",
|
|
"25% 1.00000 NaN 6.037187 \n",
|
|
"50% 1.00000 NaN 6.998608 \n",
|
|
"75% 1.00000 NaN 9.773248 \n",
|
|
"max 1.00000 NaN 26.641730 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top e2d59128-29e4-4562-bc11-93bb60738953 \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Underperformed. Continuing\n",
|
|
"View the evaluation results for project 'drab-print-47' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=70686baf-1859-4bcf-91b3-82c41843cd86\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1bd0827b-b405-4bdc-8eb0-ed3105d94e4d</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.900000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>10.443896</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.303046</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>13.421476</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.744148</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.975307</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.340018</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.440450</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>101.049986</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 50.000000 0 50.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.900000 NaN 10.443896 \n",
|
|
"std 0.303046 NaN 13.421476 \n",
|
|
"min 0.000000 NaN 4.744148 \n",
|
|
"25% 1.000000 NaN 6.975307 \n",
|
|
"50% 1.000000 NaN 8.340018 \n",
|
|
"75% 1.000000 NaN 9.440450 \n",
|
|
"max 1.000000 NaN 101.049986 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top 1bd0827b-b405-4bdc-8eb0-ed3105d94e4d \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"New best score 0.9 > 0.8979591836734694. Updating selected examples.\n",
|
|
"View the evaluation results for project 'impressionable-writer-19' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=1f31eff6-8ab8-4b16-baa5-6f3669f4dead\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>041fd757-fb44-4a79-8dcf-d0ab006622f1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.880000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.219473</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.328261</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.151543</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.604611</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>5.412153</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.344393</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.157682</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>13.777614</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 50.000000 0 50.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.880000 NaN 7.219473 \n",
|
|
"std 0.328261 NaN 2.151543 \n",
|
|
"min 0.000000 NaN 3.604611 \n",
|
|
"25% 1.000000 NaN 5.412153 \n",
|
|
"50% 1.000000 NaN 7.344393 \n",
|
|
"75% 1.000000 NaN 8.157682 \n",
|
|
"max 1.000000 NaN 13.777614 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top 041fd757-fb44-4a79-8dcf-d0ab006622f1 \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Underperformed. Continuing\n",
|
|
"View the evaluation results for project 'drab-map-24' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=aa3fb10d-f9a7-47ac-a90d-c385085339fc\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>e8f88ef2-8d1e-4323-ac51-0c7ba1c6b0fd</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.880000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.352010</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.328261</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.876893</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.442488</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>5.508052</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.563693</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.169192</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>17.694664</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 50.000000 0 50.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.880000 NaN 7.352010 \n",
|
|
"std 0.328261 NaN 2.876893 \n",
|
|
"min 0.000000 NaN 3.442488 \n",
|
|
"25% 1.000000 NaN 5.508052 \n",
|
|
"50% 1.000000 NaN 6.563693 \n",
|
|
"75% 1.000000 NaN 8.169192 \n",
|
|
"max 1.000000 NaN 17.694664 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top e8f88ef2-8d1e-4323-ac51-0c7ba1c6b0fd \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Underperformed. Continuing\n",
|
|
"View the evaluation results for project 'best-step-66' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=1d7c26de-3ae1-470e-8c51-9b2873a442c9\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>31e30bda-a245-4f68-8596-03183b8ffcc3</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.920000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.322146</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.274048</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2.587044</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>5.140714</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.780764</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.700001</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.086863</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>19.068444</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 50.000000 0 50.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.920000 NaN 8.322146 \n",
|
|
"std 0.274048 NaN 2.587044 \n",
|
|
"min 0.000000 NaN 5.140714 \n",
|
|
"25% 1.000000 NaN 6.780764 \n",
|
|
"50% 1.000000 NaN 7.700001 \n",
|
|
"75% 1.000000 NaN 9.086863 \n",
|
|
"max 1.000000 NaN 19.068444 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top 31e30bda-a245-4f68-8596-03183b8ffcc3 \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"New best score 0.92 > 0.9. Updating selected examples.\n",
|
|
"View the evaluation results for project 'brief-color-26' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=4b090fa5-87cf-4bab-8f90-d86d91102240\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>50.00000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>bd2fe2a3-cb39-4287-9c79-ba214bcdae40</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.86000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.189128</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.35051</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>5.716492</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.791341</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>6.648413</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.485603</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.478416</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>41.826824</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 50.00000 0 50.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.86000 NaN 9.189128 \n",
|
|
"std 0.35051 NaN 5.716492 \n",
|
|
"min 0.00000 NaN 4.791341 \n",
|
|
"25% 1.00000 NaN 6.648413 \n",
|
|
"50% 1.00000 NaN 7.485603 \n",
|
|
"75% 1.00000 NaN 9.478416 \n",
|
|
"max 1.00000 NaN 41.826824 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top bd2fe2a3-cb39-4287-9c79-ba214bcdae40 \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Underperformed. Continuing\n",
|
|
"View the evaluation results for project 'worthwhile-rabbit-93' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa/compare?selectedSessions=c8676b03-e009-4a3b-aa50-1f16a4476dbf\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-dev2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/0360d4ae-63e0-4e58-b4a7-97f8ef466aaa\n",
|
|
"[------------------------------------------------->] 50/50"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>50.000000</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>50</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>83776c8b-5772-4521-8b30-17b1cc5defca</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.880000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>8.748563</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.328261</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>4.640876</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>5.161556</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.018997</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>7.690480</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>9.327333</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>37.731715</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 50.000000 0 50.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.880000 NaN 8.748563 \n",
|
|
"std 0.328261 NaN 4.640876 \n",
|
|
"min 0.000000 NaN 5.161556 \n",
|
|
"25% 1.000000 NaN 7.018997 \n",
|
|
"50% 1.000000 NaN 7.690480 \n",
|
|
"75% 1.000000 NaN 9.327333 \n",
|
|
"max 1.000000 NaN 37.731715 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 50 \n",
|
|
"unique 50 \n",
|
|
"top 83776c8b-5772-4521-8b30-17b1cc5defca \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Underperformed. Continuing\n",
|
|
"Best overall score: 0.92\n",
|
|
"Best step: 8\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import functools\n",
|
|
"\n",
|
|
"# We will train with gpt-4-turbo\n",
|
|
"llm = ChatOpenAI(model=\"gpt-4-turbo-preview\")\n",
|
|
"all_scores = train(\n",
|
|
" functools.partial(create_chain, llm=llm),\n",
|
|
" train_name,\n",
|
|
" dev_name,\n",
|
|
" eval_config,\n",
|
|
" steps=10,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "816d3e2d-e719-4328-babf-0b230b37d49a",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Compare on held-out set\n",
|
|
"\n",
|
|
"It's easy to overfit a single benchmark if you explicitly choose your pipeline based on metrics on that benchmark.\n",
|
|
"\n",
|
|
"Let's compare the original chain against the one using the selected examples on an unseen test set to see whether those examples reliably improve performance."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "6e086d8c-154e-4c2d-bc6f-69f6fdb1a0eb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"best_score, best_examples = all_scores[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"id": "57a2d5a9-00c6-47df-bc82-53a18008e240",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"original_model = create_chain()\n",
|
|
"# This time we will apply gpt-3.5-turbo, but use the few-shot examples + reasoning trajectories\n",
|
|
"# from gpt-4 to help induce better performance\n",
|
|
"best_performing_model = create_chain(best_examples)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"id": "2e793f7a-0406-4a41-b12b-10d63450df0e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"View the evaluation results for project 'shiny-ship-82' at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/f1b328a2-b4e8-473c-808f-e042d38f6ebd/compare?selectedSessions=368a8216-6462-4d19-8261-9709fe301b19\n",
|
|
"\n",
|
|
"View all tests for Dataset scone-test2 at:\n",
|
|
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/f1b328a2-b4e8-473c-808f-e042d38f6ebd\n",
|
|
"[------------------------------------------------->] 200/200"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<h3>Experiment Results:</h3>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>feedback.exact_match</th>\n",
|
|
" <th>error</th>\n",
|
|
" <th>execution_time</th>\n",
|
|
" <th>run_id</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>200.000000</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>200.000000</td>\n",
|
|
" <td>200</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>200</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>2ab8873e-b142-4f3f-a970-0ca693ce12c2</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>0.870000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.772289</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>0.337147</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0.341076</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>0.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.205090</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.547561</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.718797</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>1.897174</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3.934606</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" feedback.exact_match error execution_time \\\n",
|
|
"count 200.000000 0 200.000000 \n",
|
|
"unique NaN 0 NaN \n",
|
|
"top NaN NaN NaN \n",
|
|
"freq NaN NaN NaN \n",
|
|
"mean 0.870000 NaN 1.772289 \n",
|
|
"std 0.337147 NaN 0.341076 \n",
|
|
"min 0.000000 NaN 1.205090 \n",
|
|
"25% 1.000000 NaN 1.547561 \n",
|
|
"50% 1.000000 NaN 1.718797 \n",
|
|
"75% 1.000000 NaN 1.897174 \n",
|
|
"max 1.000000 NaN 3.934606 \n",
|
|
"\n",
|
|
" run_id \n",
|
|
"count 200 \n",
|
|
"unique 200 \n",
|
|
"top 2ab8873e-b142-4f3f-a970-0ca693ce12c2 \n",
|
|
"freq 1 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"for model_name, model in [\n",
|
|
" (\"optimized\", best_performing_model),\n",
|
|
" # (\"original\", original_model),\n",
|
|
"]:\n",
|
|
" client.run_on_dataset(\n",
|
|
" dataset_name=test_name,\n",
|
|
" llm_or_chain_factory=model,\n",
|
|
" evaluation=eval_config,\n",
|
|
" verbose=True,\n",
|
|
" project_metadata={\n",
|
|
" \"model\": model_name,\n",
|
|
" },\n",
|
|
" )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4ae02ead-d88e-4ea5-b0bd-4cb6abbfa749",
|
|
"metadata": {},
|
|
"source": [
|
|
"Using the GPT-4 generated examples, we were able to boost the performance from ~0.54 to ~0.87: not bad!"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "11cf33e1-fa45-4f45-9f39-c01ac9eb644e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|