Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 72414b951c | |||
| 8899acc989 | |||
| c0e7f51626 | |||
| 9f827eaca5 | |||
| d9fc08b05c | |||
| 8a5ba6d575 | |||
| 8204930f2b | |||
| 013fe6a153 | |||
| 01ffffd04c | |||
| 4ddbbc0ff8 | |||
| 5ffdbb5c4c |
@@ -1,6 +1,4 @@
|
||||
🚧 Under Active Development 🚧
|
||||
|
||||
# 🦜💪 LangChain Benchmarks
|
||||
# 🦜💯 LangChain Benchmarks
|
||||
|
||||
[](https://github.com/langchain-ai/langchain-benchmarks/releases)
|
||||
[](https://github.com/langchain-ai/langchain-benchmarks/actions/workflows/ci.yml)
|
||||
@@ -35,7 +33,7 @@ pip install -U langchain-benchmarks
|
||||
All the benchmarks come with an associated benchmark dataset stored in [LangSmith](https://smith.langchain.com). To take advantage of the eval and debugging experience, [sign up](https://smith.langchain.com), and set your API key in your environment:
|
||||
|
||||
```bash
|
||||
export LANGCHAIN_API_KEY=sk-...
|
||||
export LANGCHAIN_API_KEY=ls-...
|
||||
```
|
||||
|
||||
## Repo Structure
|
||||
|
||||
|
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
|
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
|
Before Width: | Height: | Size: 9.7 KiB After Width: | Height: | Size: 9.7 KiB |
|
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
@@ -1 +1,3 @@
|
||||
chromadb/
|
||||
index.md
|
||||
Untitled.ipynb
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 1,
|
||||
"id": "86912590-a90a-4351-8ab4-89192cdee1e7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -26,19 +26,24 @@
|
||||
"<tr><th>Name </th><th>Type </th><th>Dataset ID </th><th>Description </th></tr>\n",
|
||||
"</thead>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>Email Extraction</td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d\" target=\"_blank\" rel=\"noopener\">36bdfe7d-3cd1-4b36-b957-d12d95810a2b</a></td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
|
||||
"<tr><td>Email Extraction</td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d\" target=\"_blank\" rel=\"noopener\">a1742786-bde5-4f51-a1d8-e148e5251ddb</a></td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
|
||||
"\n",
|
||||
"Some additional cleanup of the data was done by hand after the initial pass.\n",
|
||||
"\n",
|
||||
"See https://github.com/jacoblee93/oss-model-extraction-evals. </td></tr>\n",
|
||||
"<tr><td>Chat Extraction </td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d\" target=\"_blank\" rel=\"noopener\">00f4444c-9460-4a82-b87a-f50096f1cfef</a></td><td>A dataset meant to test the ability of an LLM to extract and infer\n",
|
||||
"structured information from a dialogue. The dialogue is between a user and a support\n",
|
||||
"engineer. Outputs should be structured as a JSON object and test both the ability\n",
|
||||
"of the LLM to correctly structure the information and its ability to perform simple \n",
|
||||
"classification tasks. </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"Registry(tasks=[ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['email'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['email'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{email}\\n```'))]))])"
|
||||
"Registry(tasks=[ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{input}\\n```'))])), ExtractionTask(name='Chat Extraction', dataset_id='https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d', description='A dataset meant to test the ability of an LLM to extract and infer\\nstructured information from a dialogue. The dialogue is between a user and a support\\nengineer. Outputs should be structured as a JSON object and test both the ability\\nof the LLM to correctly structure the information and its ability to perform simple \\nclassification tasks.', schema=<class 'langchain_benchmarks.extraction.tasks.chat_extraction.schema.GenerateTicket'>, instructions=ChatPromptTemplate(input_variables=['dialogue'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpdesk assistant responsible with extracting information and generating tickets. 
Dialogues are between a user and a support engineer.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['dialogue'], template='Generate a ticket for the following question-response pair:\\n<Dialogue>\\n{dialogue}\\n</Dialogue>'))]))])"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -85,9 +90,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 2,
|
||||
"id": "9c7865bd-8251-4579-85a3-f9085d96f497",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
@@ -115,7 +122,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -0,0 +1,464 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d9cb90d8-e6a1-4c89-9cde-0e6c0a28f5c0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Model Registry\n",
|
||||
"\n",
|
||||
"LangChain Benchmarks includes a model registry to make it easier to run benchmarks across different models.\n",
|
||||
"\n",
|
||||
"If you see a model that you want to use and it's missing, please open a PR to add it!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "31831289-51fb-4ee5-98f3-0476cf11b187",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_benchmarks import model_registry"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "aaed190d-fa4b-4445-9bfb-0e784e2a083b",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
"<thead>\n",
|
||||
"<tr><th>Name </th><th>Type </th><th>Provider </th><th>Description </th></tr>\n",
|
||||
"</thead>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>gpt-3.5-turbo-1106 </td><td>chat </td><td>openai </td><td>The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens.</td></tr>\n",
|
||||
"<tr><td>gpt-3.5-turbo </td><td>chat </td><td>openai </td><td>Currently points to gpt-3.5-turbo-0613. </td></tr>\n",
|
||||
"<tr><td>gpt-3.5-turbo-16k </td><td>chat </td><td>openai </td><td>Currently points to gpt-3.5-turbo-0613. </td></tr>\n",
|
||||
"<tr><td>gpt-3.5-turbo-instruct</td><td>llm </td><td>openai </td><td>Similar capabilities as text-davinci-003 but compatible with legacy Completions endpoint and not Chat Completions. </td></tr>\n",
|
||||
"<tr><td>gpt-3.5-turbo-0613 </td><td>chat </td><td>openai </td><td>Legacy Snapshot of gpt-3.5-turbo from June 13th 2023. Will be deprecated on June 13, 2024. </td></tr>\n",
|
||||
"<tr><td>gpt-3.5-turbo-16k-0613</td><td>chat </td><td>openai </td><td>Legacy Snapshot of gpt-3.5-16k-turbo from June 13th 2023. Will be deprecated on June 13, 2024. </td></tr>\n",
|
||||
"<tr><td>gpt-3.5-turbo-0301 </td><td>chat </td><td>openai </td><td>Legacy Snapshot of gpt-3.5-turbo from March 1st 2023. Will be deprecated on June 13th 2024. </td></tr>\n",
|
||||
"<tr><td>text-davinci-003 </td><td>llm </td><td>openai </td><td>Legacy Can do language tasks with better quality and consistency than the curie, babbage, or ada models. Will be deprecated on Jan 4th 2024. </td></tr>\n",
|
||||
"<tr><td>text-davinci-002 </td><td>llm </td><td>openai </td><td>Legacy Similar capabilities to text-davinci-003 but trained with supervised fine-tuning instead of reinforcement learning. Will be deprecated on Jan 4th 2024. </td></tr>\n",
|
||||
"<tr><td>code-davinci-002 </td><td>llm </td><td>openai </td><td>Legacy Optimized for code-completion tasks. Will be deprecated on Jan 4th 2024. </td></tr>\n",
|
||||
"<tr><td>llama-v2-7b-chat-fw </td><td>chat </td><td>fireworks </td><td>7b parameter LlamaChat model </td></tr>\n",
|
||||
"<tr><td>llama-v2-13b-chat-fw </td><td>chat </td><td>fireworks </td><td>13b parameter LlamaChat model </td></tr>\n",
|
||||
"<tr><td>llama-v2-70b-chat-fw </td><td>chat </td><td>fireworks </td><td>70b parameter LlamaChat model </td></tr>\n",
|
||||
"<tr><td>claude-2 </td><td>chat </td><td>anthropic </td><td>Superior performance on tasks that require complex reasoning </td></tr>\n",
|
||||
"<tr><td>claude-2.1 </td><td>chat </td><td>anthropic </td><td>Same performance as Claude 2, plus significant reduction in model hallucination rates </td></tr>\n",
|
||||
"<tr><td>claude-instant-1.2 </td><td>chat </td><td>anthropic </td><td>low-latency, high throughput. </td></tr>\n",
|
||||
"<tr><td>claude-instant-1 </td><td>chat </td><td>anthropic </td><td>low-latency, high throughput. </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"ModelRegistry(registered_models=[RegisteredModel(name='gpt-3.5-turbo-1106', provider='openai', description='The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens.', params={'model': 'gpt-3.5-turbo-1106'}, type='chat', path=None, url=None), RegisteredModel(name='gpt-3.5-turbo', provider='openai', description='Currently points to gpt-3.5-turbo-0613.', params={'model': 'gpt-3.5-turbo'}, type='chat', path=None, url=None), RegisteredModel(name='gpt-3.5-turbo-16k', provider='openai', description='Currently points to gpt-3.5-turbo-0613.', params={'model': 'gpt-3.5-turbo-16k'}, type='chat', path=None, url=None), RegisteredModel(name='gpt-3.5-turbo-instruct', provider='openai', description='Similar capabilities as text-davinci-003 but compatible with legacy Completions endpoint and not Chat Completions.', params={'model': 'gpt-3.5-turbo-instruct'}, type='llm', path=None, url=None), RegisteredModel(name='gpt-3.5-turbo-0613', provider='openai', description='Legacy Snapshot of gpt-3.5-turbo from June 13th 2023. Will be deprecated on June 13, 2024.', params={'model': 'gpt-3.5-turbo-0613'}, type='chat', path=None, url=None), RegisteredModel(name='gpt-3.5-turbo-16k-0613', provider='openai', description='Legacy Snapshot of gpt-3.5-16k-turbo from June 13th 2023. Will be deprecated on June 13, 2024.', params={'model': 'gpt-3.5-turbo-16k-0613'}, type='chat', path=None, url=None), RegisteredModel(name='gpt-3.5-turbo-0301', provider='openai', description='Legacy Snapshot of gpt-3.5-turbo from March 1st 2023. Will be deprecated on June 13th 2024.', params={'model': 'gpt-3.5-turbo-0301'}, type='chat', path=None, url=None), RegisteredModel(name='text-davinci-003', provider='openai', description='Legacy Can do language tasks with better quality and consistency than the curie, babbage, or ada models. 
Will be deprecated on Jan 4th 2024.', params={'model': 'text-davinci-003'}, type='llm', path=None, url=None), RegisteredModel(name='text-davinci-002', provider='openai', description='Legacy Similar capabilities to text-davinci-003 but trained with supervised fine-tuning instead of reinforcement learning. Will be deprecated on Jan 4th 2024.', params={'model': 'text-davinci-002'}, type='llm', path=None, url=None), RegisteredModel(name='code-davinci-002', provider='openai', description='Legacy Optimized for code-completion tasks. Will be deprecated on Jan 4th 2024.', params={'model': 'code-davinci-002'}, type='llm', path=None, url=None), RegisteredModel(name='llama-v2-7b-chat-fw', provider='fireworks', description='7b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-7b-chat'}, type='chat', path=None, url=None), RegisteredModel(name='llama-v2-13b-chat-fw', provider='fireworks', description='13b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-13b-chat'}, type='chat', path=None, url=None), RegisteredModel(name='llama-v2-70b-chat-fw', provider='fireworks', description='70b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-70b-chat'}, type='chat', path=None, url=None), RegisteredModel(name='claude-2', provider='anthropic', description='Superior performance on tasks that require complex reasoning', params={'model': 'claude-2'}, type='chat', path=None, url=None), RegisteredModel(name='claude-2.1', provider='anthropic', description='Same performance as Claude 2, plus significant reduction in model hallucination rates', params={'model': 'claude-2.1'}, type='chat', path=None, url=None), RegisteredModel(name='claude-instant-1.2', provider='anthropic', description='low-latency, high throughput.', params={'model': 'claude-instant-1.2'}, type='chat', path=None, url=None), RegisteredModel(name='claude-instant-1', provider='anthropic', description='low-latency, high throughput.', 
params={'model': 'claude-instant-1'}, type='chat', path=None, url=None)])"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model_registry"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2974b4f9-575c-4907-97eb-7334ef5f1d8e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Indexing\n",
|
||||
"\n",
|
||||
"The registry supports indexing by position. Note that the ordering may change as more models are added, so positional indices are not stable."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "64bfc631-1f1e-4cf4-8636-b8be7b46fef8",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>name </td><td>gpt-3.5-turbo-1106 </td></tr>\n",
|
||||
"<tr><td>type </td><td>chat </td></tr>\n",
|
||||
"<tr><td>provider </td><td>openai </td></tr>\n",
|
||||
"<tr><td>description</td><td>The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens.</td></tr>\n",
|
||||
"<tr><td>model_path </td><td>langchain.chat_models.openai.ChatOpenAI </td></tr>\n",
|
||||
"<tr><td>url </td><td><a href=\"langchain.chat_models.openai.ChatOpenAI\" target=\"_blank\" rel=\"noopener\">ModelPage</a> </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"RegisteredModel(name='gpt-3.5-turbo-1106', provider='openai', description='The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens.', params={'model': 'gpt-3.5-turbo-1106'}, type='chat', path=None, url=None)"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"registered_model = model_registry[0]\n",
|
||||
"registered_model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9150df8d-11a5-4e83-bc1c-b34119f75783",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also index by model name."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "267e746b-f13e-4484-bcbb-ed5dfbacae67",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>name </td><td>gpt-3.5-turbo </td></tr>\n",
|
||||
"<tr><td>type </td><td>chat </td></tr>\n",
|
||||
"<tr><td>provider </td><td>openai </td></tr>\n",
|
||||
"<tr><td>description</td><td>Currently points to gpt-3.5-turbo-0613. </td></tr>\n",
|
||||
"<tr><td>model_path </td><td>langchain.chat_models.openai.ChatOpenAI </td></tr>\n",
|
||||
"<tr><td>url </td><td><a href=\"langchain.chat_models.openai.ChatOpenAI\" target=\"_blank\" rel=\"noopener\">ModelPage</a></td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"RegisteredModel(name='gpt-3.5-turbo', provider='openai', description='Currently points to gpt-3.5-turbo-0613.', params={'model': 'gpt-3.5-turbo'}, type='chat', path=None, url=None)"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model_registry[\"gpt-3.5-turbo\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "26404672-0832-47be-bc7e-7f74116f6909",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Use the model\n",
|
||||
"\n",
|
||||
"To use the models, make sure that you have credentials set up. Most models either accept an API key as an initializer argument or read it from environment variables if they are set."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "3604d49e-afbe-48ad-ac10-1e538b1ad376",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = model_registry[\"gpt-3.5-turbo\"].get_model(model_params={\"temperature\": 0})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "bdece532-9843-427a-a10b-4545ed4ec151",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"Hello! I am an AI language model developed by OpenAI, and I don't have a personal name. You can simply refer to me as OpenAI Assistant. How can I assist you today?\")"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model.invoke(\"hello! what is your name?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "095ad4c7-a796-4d6d-bf1f-706799c1f743",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = model_registry[\"claude-2.1\"].get_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "d8c656d7-d17a-4dd9-bfdb-34da26f1ba57",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=' Hello! My name is Claude.')"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model.invoke(\"hello! what is your name?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "a193bca5-67d4-4e83-841b-7c28089d76c4",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = model_registry[\"llama-v2-7b-chat-fw\"].get_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "ac4c3216-d343-4346-b089-deceef91b334",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"Hello! *smiling* My name is Assistant, and I'm here to help you with any questions or tasks you may have. It's important to me to provide respectful and socially unbiased responses, and I'm glad you're here to chat with me! Is there something specific you'd like to talk about or ask? 😊\")"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model.invoke(\"hello! what is your name?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "5769e2a6-55ae-41b6-b0c9-2fce59a7a409",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'\\n\\nMy name is [Name].'"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = model_registry[\"text-davinci-003\"].get_model()\n",
|
||||
"model.invoke(\"hello! what is your name?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "34801c48-83ed-4ada-b85b-aa3b8cfce31b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Slicing\n",
|
||||
"\n",
|
||||
"The registry supports slice notation. Slicing returns a new registry containing only the selected models."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "db40d4da-dc70-4e6d-b7e8-61de1e15ed2e",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
"<thead>\n",
|
||||
"<tr><th>Name </th><th>Type </th><th>Provider </th><th>Description </th></tr>\n",
|
||||
"</thead>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>gpt-3.5-turbo-1106</td><td>chat </td><td>openai </td><td>The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens.</td></tr>\n",
|
||||
"<tr><td>gpt-3.5-turbo </td><td>chat </td><td>openai </td><td>Currently points to gpt-3.5-turbo-0613. </td></tr>\n",
|
||||
"<tr><td>gpt-3.5-turbo-16k </td><td>chat </td><td>openai </td><td>Currently points to gpt-3.5-turbo-0613. </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"ModelRegistry(registered_models=[RegisteredModel(name='gpt-3.5-turbo-1106', provider='openai', description='The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens.', params={'model': 'gpt-3.5-turbo-1106'}, type='chat', path=None, url=None), RegisteredModel(name='gpt-3.5-turbo', provider='openai', description='Currently points to gpt-3.5-turbo-0613.', params={'model': 'gpt-3.5-turbo'}, type='chat', path=None, url=None), RegisteredModel(name='gpt-3.5-turbo-16k', provider='openai', description='Currently points to gpt-3.5-turbo-0613.', params={'model': 'gpt-3.5-turbo-16k'}, type='chat', path=None, url=None)])"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model_registry[:3]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8d0260af-920f-4512-9273-6f7662369ec5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Filtering\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Use `filter` to select a subset of the registered models, for example by provider."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "9874846a-52f3-4921-b1ed-0858521bb9a9",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
"<thead>\n",
|
||||
"<tr><th>Name </th><th>Type </th><th>Provider </th><th>Description </th></tr>\n",
|
||||
"</thead>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>llama-v2-7b-chat-fw </td><td>chat </td><td>fireworks </td><td>7b parameter LlamaChat model </td></tr>\n",
|
||||
"<tr><td>llama-v2-13b-chat-fw</td><td>chat </td><td>fireworks </td><td>13b parameter LlamaChat model</td></tr>\n",
|
||||
"<tr><td>llama-v2-70b-chat-fw</td><td>chat </td><td>fireworks </td><td>70b parameter LlamaChat model</td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"ModelRegistry(registered_models=[RegisteredModel(name='llama-v2-7b-chat-fw', provider='fireworks', description='7b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-7b-chat'}, type='chat', path=None, url=None), RegisteredModel(name='llama-v2-13b-chat-fw', provider='fireworks', description='13b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-13b-chat'}, type='chat', path=None, url=None), RegisteredModel(name='llama-v2-70b-chat-fw', provider='fireworks', description='70b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-70b-chat'}, type='chat', path=None, url=None)])"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model_registry.filter(provider=\"fireworks\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c59baa6f-c3c1-4e37-919c-f9e70feb9101",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Iteration\n",
|
||||
"\n",
|
||||
"You can iterate over the registered models in the registry."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "eb531591-f46b-4745-ae67-4dfd6217ec5f",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"gpt-3.5-turbo-1106\n",
|
||||
"gpt-3.5-turbo\n",
|
||||
"gpt-3.5-turbo-16k\n",
|
||||
"gpt-3.5-turbo-instruct\n",
|
||||
"gpt-3.5-turbo-0613\n",
|
||||
"gpt-3.5-turbo-16k-0613\n",
|
||||
"gpt-3.5-turbo-0301\n",
|
||||
"text-davinci-003\n",
|
||||
"text-davinci-002\n",
|
||||
"code-davinci-002\n",
|
||||
"llama-v2-7b-chat-fw\n",
|
||||
"llama-v2-13b-chat-fw\n",
|
||||
"llama-v2-70b-chat-fw\n",
|
||||
"claude-2\n",
|
||||
"claude-2.1\n",
|
||||
"claude-instant-1.2\n",
|
||||
"claude-instant-1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for registered_model in model_registry:\n",
|
||||
" print(registered_model.name)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,610 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9fa3470d-9448-4792-9f65-6978fc58cf84",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Multi-modal eval: Baseline\n",
|
||||
"\n",
|
||||
"`Multi-modal slide decks` is a public dataset of question-answer pairs drawn from slide decks with visual content.\n",
|
||||
"\n",
|
||||
"The question-answer pairs are derived from the visual content in the decks, testing the ability of RAG to perform visual reasoning.\n",
|
||||
"\n",
|
||||
"As a baseline, we evaluate this dataset using a text-based RAG pipeline, below.\n",
|
||||
"\n",
|
||||
"This will not reason about visual content and will simply load the text from the slides. \n",
|
||||
"\n",
|
||||
"## Pre-requisites"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "47220461-d4e9-4f1d-9c57-672ca947ca0d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install -U langchain langsmith langchain_benchmarks\n",
|
||||
"# %pip install --quiet chromadb openai pypdf pandas"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "196de967-6de6-40da-aa75-e836923ab5e3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
|
||||
"env_vars = [\"LANGCHAIN_API_KEY\", \"OPENAI_API_KEY\"]\n",
|
||||
"for var in env_vars:\n",
|
||||
" if var not in os.environ:\n",
|
||||
" os.environ[var] = getpass.getpass(prompt=f\"Enter your {var}: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "10da8e11-6288-4131-bd60-d5aa86928acc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Dataset\n",
|
||||
"\n",
|
||||
"We can browse the available LangChain benchmark datasets for retrieval."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "2ff97905-14a6-413c-99be-58b7a9c8d4c1",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
"<thead>\n",
|
||||
"<tr><th>Name </th><th>Type </th><th>Dataset ID </th><th>Description </th></tr>\n",
|
||||
"</thead>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>LangChain Docs Q&A </td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d\" target=\"_blank\" rel=\"noopener\">452ccafc-18e1-4314-885b-edd735f17b9d</a></td><td>Questions and answers based on a snapshot of the LangChain python docs.\n",
|
||||
"\n",
|
||||
"The environment provides the documents and the retriever information.\n",
|
||||
"\n",
|
||||
"Each example is composed of a question and reference answer.\n",
|
||||
"\n",
|
||||
"Success is measured based on the accuracy of the answer relative to the reference answer.\n",
|
||||
"We also measure the faithfulness of the model's response relative to the retrieved documents (if any). </td></tr>\n",
|
||||
"<tr><td>Semi-structured Reports</td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d\" target=\"_blank\" rel=\"noopener\">c47d9617-ab99-4d6e-a6e6-92b8daf85a7d</a></td><td>Questions and answers based on PDFs containing tables and charts.\n",
|
||||
"\n",
|
||||
"The task provides the raw documents as well as factory methods to easily index them\n",
|
||||
"and create a retriever.\n",
|
||||
"\n",
|
||||
"Each example is composed of a question and reference answer.\n",
|
||||
"\n",
|
||||
"Success is measured based on the accuracy of the answer relative to the reference answer.\n",
|
||||
"We also measure the faithfulness of the model's response relative to the retrieved documents (if any). </td></tr>\n",
|
||||
"<tr><td>Multi-modal slide decks</td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d\" target=\"_blank\" rel=\"noopener\">40afc8e7-9d7e-44ed-8971-2cae1eb59731</a></td><td>This public dataset is a work-in-progress and will be extended over time.\n",
|
||||
" \n",
|
||||
"Questions and answers based on slide decks containing visual tables and charts.\n",
|
||||
"\n",
|
||||
"Each example is composed of a question and reference answer.\n",
|
||||
"\n",
|
||||
"Success is measured based on the accuracy of the answer relative to the reference answer. </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"Registry(tasks=[RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", get_docs=<function load_cached_docs at 0x104485800>, retriever_factories={'basic': <function _chroma_retriever_factory at 0x1360289a0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x136028a40>, 'hyde': <function _chroma_hyde_retriever_factory at 0x136028ae0>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x126ba2660>}), RetrievalTask(name='Semi-structured Reports', dataset_id='https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", get_docs=<function load_docs at 0x136029620>, retriever_factories={'basic': <function _chroma_retriever_factory at 0x1360296c0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x136029760>, 'hyde': <function _chroma_hyde_retriever_factory at 0x136029800>}, architecture_factories={}), RetrievalTask(name='Multi-modal slide decks', 
dataset_id='https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d', description='This public dataset is a work-in-progress and will be extended over time.\\n \\nQuestions and answers based on slide decks containing visual tables and charts.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\n', get_docs={}, retriever_factories={}, architecture_factories={})])"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain_benchmarks import clone_public_dataset, registry\n",
|
||||
"\n",
|
||||
"registry = registry.filter(Type=\"RetrievalTask\")\n",
|
||||
"registry"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2fb7dc3d-28f1-4c28-b0d0-3784d04b81ce",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"`Multi-modal slide decks` is the relevant dataset for our task."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "219a4141-4a5f-48e4-ae05-5a824e2193fd",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>Name </td><td>Multi-modal slide decks </td></tr>\n",
|
||||
"<tr><td>Type </td><td>RetrievalTask </td></tr>\n",
|
||||
"<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d\" target=\"_blank\" rel=\"noopener\">40afc8e7-9d7e-44ed-8971-2cae1eb59731</a></td></tr>\n",
|
||||
"<tr><td>Description </td><td>This public dataset is a work-in-progress and will be extended over time.\n",
|
||||
" \n",
|
||||
"Questions and answers based on slide decks containing visual tables and charts.\n",
|
||||
"\n",
|
||||
"Each example is composed of a question and reference answer.\n",
|
||||
"\n",
|
||||
"Success is measured based on the accuracy of the answer relative to the reference answer. </td></tr>\n",
|
||||
"<tr><td>Retriever Factories </td><td> </td></tr>\n",
|
||||
"<tr><td>Architecture Factories</td><td> </td></tr>\n",
|
||||
"<tr><td>get_docs </td><td>{} </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"RetrievalTask(name='Multi-modal slide decks', dataset_id='https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d', description='This public dataset is a work-in-progress and will be extended over time.\\n \\nQuestions and answers based on slide decks containing visual tables and charts.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\n', get_docs={}, retriever_factories={}, architecture_factories={})"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"task = registry[\"Multi-modal slide decks\"]\n",
|
||||
"task"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2d6569b5-e79a-41b7-9745-c2f8a1dd704e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Clone the dataset so that it's available in our LangSmith datasets."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "d2caa086-9549-4c74-bba9-ba80d5a7b218",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Dataset Multi-modal slide decks already exists. Skipping.\n",
|
||||
"You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08a29acb-5ad6-42ce-a482-574c9e2e5306.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bf350917-a1e5-46f4-81cd-c1678ab9220f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Fetch the associated PDFs from remote cache for the dataset so that we can perform ingestion."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "99ce6afb-2317-4bc1-9faf-4f828095ad91",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_benchmarks.rag.tasks.multi_modal_slide_decks import get_file_names\n",
|
||||
"\n",
|
||||
"file_names = list(get_file_names()) # PosixPath"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "848a4cdb-6c08-4c01-81ce-16ab83a7fdff",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load\n",
|
||||
"\n",
|
||||
"Load and split the files for indexing."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "6ce85810-98a7-406e-b44e-ce860ac35986",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"There are 98 text elements in DDOG_Q3_earnings_deck.pdf\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PyPDFLoader\n",
|
||||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def load_and_split(file):\n",
|
||||
" \"\"\"\n",
|
||||
" Load and split PDF files\n",
|
||||
" :param file: PosixPath path for pdf\n",
|
||||
" :return: A list of text chunks\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" loader = PyPDFLoader(str(file))\n",
|
||||
" pdf_pages = loader.load()\n",
|
||||
"\n",
|
||||
" text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
|
||||
" chunk_size=100, chunk_overlap=50\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Get chunks\n",
|
||||
" docs = text_splitter.split_documents(pdf_pages)\n",
|
||||
" texts = [d.page_content for d in docs]\n",
|
||||
" print(f\"There are {len(texts)} text elements in {file.name}\")\n",
|
||||
" return texts\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"texts = []\n",
|
||||
"for fi in file_names:\n",
|
||||
" texts.extend(load_and_split(fi))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eb01925d-b7d1-47a1-bd90-805178d3c4a9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Index\n",
|
||||
"\n",
|
||||
"Embed (OpenAIEmbeddings) and store splits in a vectorstore (Chroma)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "ceb31f71-45fb-4b12-bc1c-31981de334bb",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings import OpenAIEmbeddings\n",
|
||||
"from langchain.vectorstores import Chroma\n",
|
||||
"\n",
|
||||
"vectorstore_baseline = Chroma.from_texts(\n",
|
||||
" texts=texts, collection_name=\"baseline-multi-modal\", embedding=OpenAIEmbeddings()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"retriever_baseline = vectorstore_baseline.as_retriever()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e6dcbb01-f480-456d-b972-c732eb26c393",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## RAG\n",
|
||||
"\n",
|
||||
"Create a pipeline for retrieval of relevant chunks based on semantic similarity to the input question.\n",
|
||||
"\n",
|
||||
"Pass the retrieved text chunks to GPT-4 for answer synthesis."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "ea233664-e527-42f1-a820-0c2271e16c20",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.prompts import ChatPromptTemplate\n",
|
||||
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||
"from langchain.schema.runnable import RunnablePassthrough\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def rag_chain(retriever):\n",
|
||||
" \"\"\"\n",
|
||||
" RAG pipeline for the indexed presentations\n",
|
||||
" :param retriever: Retriever whose results supply the context for each question\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" # Prompt template\n",
|
||||
" template = \"\"\"Answer the question based only on the following context, which can include text and tables:\n",
|
||||
" {context}\n",
|
||||
" Question: {question}\n",
|
||||
" \"\"\"\n",
|
||||
" prompt = ChatPromptTemplate.from_template(template)\n",
|
||||
"\n",
|
||||
" # LLM\n",
|
||||
" model = ChatOpenAI(temperature=0, model=\"gpt-4\")\n",
|
||||
"\n",
|
||||
" # RAG pipeline\n",
|
||||
" chain = (\n",
|
||||
" {\n",
|
||||
" \"context\": retriever | (lambda x: \"\\n\\n\".join([i.page_content for i in x])),\n",
|
||||
" \"question\": RunnablePassthrough(),\n",
|
||||
" }\n",
|
||||
" | prompt\n",
|
||||
" | model\n",
|
||||
" | StrOutputParser()\n",
|
||||
" )\n",
|
||||
" return chain\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Create RAG chain\n",
|
||||
"chain = rag_chain(retriever_baseline)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "95df1446-143d-4f4c-a15b-2a379266d8bf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Eval\n",
|
||||
"\n",
|
||||
"Run evaluation on our dataset:\n",
|
||||
"\n",
|
||||
"* `task.name` is the dataset of QA pairs that we cloned\n",
|
||||
"* `eval_config` specifies the [LangSmith evaluator](https://docs.smith.langchain.com/evaluation/evaluator-implementations#correctness-qa-evaluation) for our dataset, which will use GPT-4 as a grader\n",
|
||||
"* The grader will evaluate the chain-generated answer to each question relative to ground truth"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "479ce09d-642e-4b3b-9e4e-e9c2b7f0e9ca",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"View the evaluation results for project '866f-baseline' at:\n",
|
||||
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08a29acb-5ad6-42ce-a482-574c9e2e5306/compare?selectedSessions=30199d47-50d7-4c5c-a55a-e74157e05951\n",
|
||||
"\n",
|
||||
"View all tests for Dataset Multi-modal slide decks at:\n",
|
||||
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08a29acb-5ad6-42ce-a482-574c9e2e5306\n",
|
||||
"[------------------------------------------------->] 10/10"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<h3>Experiment Results:</h3>"
|
||||
],
|
||||
"text/plain": [
|
||||
"<IPython.core.display.HTML object>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>output</th>\n",
|
||||
" <th>feedback.COT Contextual Accuracy</th>\n",
|
||||
" <th>error</th>\n",
|
||||
" <th>execution_time</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>10.000000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>10.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unique</th>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>top</th>\n",
|
||||
" <td>Datadog has 20 total customers.</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>freq</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.200000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>4.674478</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.421637</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.864273</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>3.307960</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>4.113816</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>4.700962</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>5.018359</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>max</th>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>6.188082</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" output feedback.COT Contextual Accuracy \\\n",
|
||||
"count 10 10.000000 \n",
|
||||
"unique 10 NaN \n",
|
||||
"top Datadog has 20 total customers. NaN \n",
|
||||
"freq 1 NaN \n",
|
||||
"mean NaN 0.200000 \n",
|
||||
"std NaN 0.421637 \n",
|
||||
"min NaN 0.000000 \n",
|
||||
"25% NaN 0.000000 \n",
|
||||
"50% NaN 0.000000 \n",
|
||||
"75% NaN 0.000000 \n",
|
||||
"max NaN 1.000000 \n",
|
||||
"\n",
|
||||
" error execution_time \n",
|
||||
"count 0 10.000000 \n",
|
||||
"unique 0 NaN \n",
|
||||
"top NaN NaN \n",
|
||||
"freq NaN NaN \n",
|
||||
"mean NaN 4.674478 \n",
|
||||
"std NaN 0.864273 \n",
|
||||
"min NaN 3.307960 \n",
|
||||
"25% NaN 4.113816 \n",
|
||||
"50% NaN 4.700962 \n",
|
||||
"75% NaN 5.018359 \n",
|
||||
"max NaN 6.188082 "
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import uuid\n",
|
||||
"\n",
|
||||
"from langchain.smith import RunEvalConfig\n",
|
||||
"from langsmith.client import Client\n",
|
||||
"\n",
|
||||
"# Evaluator configuration\n",
|
||||
"client = Client()\n",
|
||||
"eval_config = RunEvalConfig(\n",
|
||||
" evaluators=[\"cot_qa\"],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Experiments\n",
|
||||
"chain_map = {\n",
|
||||
" \"baseline\": chain,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Run evaluation\n",
|
||||
"run_id = uuid.uuid4().hex[:4]\n",
|
||||
"test_runs = {}\n",
|
||||
"for project_name, chain in chain_map.items():\n",
|
||||
" test_runs[project_name] = client.run_on_dataset(\n",
|
||||
" dataset_name=task.name,\n",
|
||||
" llm_or_chain_factory=lambda: (lambda x: x[\"Question\"]) | chain,\n",
|
||||
" evaluation=eval_config,\n",
|
||||
" verbose=True,\n",
|
||||
" project_name=f\"{run_id}-{project_name}\",\n",
|
||||
" project_metadata={\"chain\": project_name},\n",
|
||||
" )"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -3,6 +3,7 @@
|
||||
:caption: Introduction
|
||||
|
||||
./notebooks/getting_started
|
||||
./notebooks/models
|
||||
./notebooks/datasets
|
||||
```
|
||||
|
||||
@@ -24,6 +25,7 @@
|
||||
|
||||
./notebooks/extraction/intro
|
||||
./notebooks/extraction/email
|
||||
./notebooks/extraction/chat_extraction
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
@@ -33,5 +35,7 @@
|
||||
./notebooks/retrieval/intro
|
||||
./notebooks/retrieval/langchain_docs_qa
|
||||
./notebooks/retrieval/semi_structured
|
||||
./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval_baseline
|
||||
./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval
|
||||
./notebooks/retrieval/comparing_techniques
|
||||
```
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from langchain_benchmarks.model_registration import model_registry
|
||||
from langchain_benchmarks.registration import registry
|
||||
from langchain_benchmarks.utils._langsmith import (
|
||||
clone_public_dataset,
|
||||
@@ -5,4 +6,9 @@ from langchain_benchmarks.utils._langsmith import (
|
||||
)
|
||||
|
||||
# Public names exported by the package. Please keep this list sorted!
# (The previous single-line ``__all__`` was immediately overwritten by this
# list, so only the current definition is kept.)
__all__ = [
    "clone_public_dataset",
    "download_public_dataset",
    "model_registry",
    "registry",
]
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
from langchain.prompts import ChatPromptTemplate
|
||||
|
||||
from langchain_benchmarks.extraction.tasks.chat_extraction.evaluators import (
|
||||
get_eval_config,
|
||||
)
|
||||
from langchain_benchmarks.extraction.tasks.chat_extraction.schema import GenerateTicket
|
||||
from langchain_benchmarks.schema import ExtractionTask
|
||||
|
||||
# Default prompt for the chat extraction task; it works reasonably for
# OpenAI models.
_TICKET_SYSTEM_MESSAGE = (
    "You are a helpdesk assistant responsible with extracting information"
    " and generating tickets. Dialogues are between a user and"
    " a support engineer."
)

# The ``{dialogue}`` placeholder is filled in when the prompt is formatted.
_TICKET_USER_MESSAGE = (
    "Generate a ticket for the following question-response pair:\n"
    "<Dialogue>\n{dialogue}\n</Dialogue>"
)

DEFAULT_CHAT_MODEL_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", _TICKET_SYSTEM_MESSAGE),
        ("user", _TICKET_USER_MESSAGE),
    ]
)
|
||||
|
||||
|
||||
# Description text passed to the task definition below.
_CHAT_EXTRACTION_DESCRIPTION = """A dataset meant to test the ability of an LLM to extract and infer
structured information from a dialogue. The dialogue is between a user and a support
engineer. Outputs should be structured as a JSON object and test both the ability
of the LLM to correctly structure the information and its ability to perform simple
classification tasks."""

# Task definition tying together the public dataset, the output schema and
# the default prompt.
CHAT_EXTRACTION_TASK = ExtractionTask(
    name="Chat Extraction",
    dataset_id="https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d",
    schema=GenerateTicket,
    description=_CHAT_EXTRACTION_DESCRIPTION,
    instructions=DEFAULT_CHAT_MODEL_PROMPT,
)


__all__ = ["CHAT_EXTRACTION_TASK", "get_eval_config"]
|
||||
@@ -0,0 +1,180 @@
|
||||
from langchain.smith.evaluation.config import RunEvalConfig, SingleKeyEvalConfig
|
||||
from langsmith.evaluation.evaluator import (
|
||||
EvaluationResult,
|
||||
run_evaluator,
|
||||
)
|
||||
from langsmith.schemas import Example, Run
|
||||
|
||||
from langchain_benchmarks.extraction.tasks.chat_extraction.schema import GenerateTicket
|
||||
|
||||
|
||||
@run_evaluator
def json_schema(run: Run, example: Example) -> EvaluationResult:
    """Check that the run's output validates against ``GenerateTicket``.

    Scores 1 when ``GenerateTicket.parse_obj`` accepts
    ``run.outputs["output"]`` and 0 otherwise. On failure the ``repr`` of the
    raised exception is attached as the comment. ``example`` is not consulted.
    """
    try:
        GenerateTicket.parse_obj(run.outputs["output"])
    except Exception as exc:
        # Any failure (missing key, validation error, ...) is a schema miss.
        return EvaluationResult(key="json_schema", score=0, comment=repr(exc))
    return EvaluationResult(key="json_schema", score=1, comment=None)
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_toxicity_similarity(run: Run, example: Example) -> EvaluationResult:
    """Score how close the predicted toxicity is to the reference toxicity.

    The score is ``1 - |reference - prediction| / 5``, floored at 0 so that a
    prediction far outside the expected range cannot yield a negative score.
    A missing, mis-structured or non-numeric prediction scores 0, and the
    ``repr`` of the raised exception is attached as the comment.
    """
    gt = example.outputs["output"]["question"]["toxicity"]
    score, comment = None, None
    # Toxicity should be on a scale from 0 to 5
    try:
        pred = run.outputs["output"]["question"]["toxicity"]
        # Floor at 0: an out-of-range prediction would otherwise push the
        # similarity below zero and skew aggregate statistics.
        score = max(0.0, 1 - abs(gt - float(pred)) / 5)
    except Exception as e:
        comment = repr(e)
        # Forgot to predict / mis-structured
        score = 0
    return EvaluationResult(
        key="toxicity_similarity",
        score=score,
        comment=comment,
    )
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_sentiment_similarity(run: Run, example: Example) -> EvaluationResult:
    """Score how close the predicted sentiment is to the reference sentiment.

    Sentiments are mapped case-insensitively onto an ordinal scale
    (negative=0, neutral=1, positive=2) and the score is
    ``1 - |reference - prediction| / 2``. A missing or mis-structured
    prediction, or a label that is not on the scale, scores 0 and the
    ``repr`` of the raised exception is attached as the comment.
    """
    gt = example.outputs["output"]["question"]["sentiment"]
    ordinal_map = {
        "negative": 0,
        "neutral": 1,
        "positive": 2,
    }
    score, comment = None, None
    # Sentiment is an enum, "Negative", "Neutral", "Positive"
    try:
        pred = run.outputs["output"]["question"]["sentiment"]
        # Index rather than ``.get`` so that an unrecognised label raises a
        # KeyError naming that label, instead of an opaque TypeError on None.
        gt_score = ordinal_map[str(gt).lower()]
        pred_score = ordinal_map[str(pred).lower()]
        score = 1 - abs(gt_score - pred_score) / 2
    except Exception as e:
        comment = repr(e)
        # Forgot to predict / mis-structured / unknown label
        score = 0
    return EvaluationResult(
        key="sentiment_similarity",
        score=score,
        comment=comment,
    )
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_confidence_level_similarity(
    run: Run, example: Example
) -> EvaluationResult:
    """Score how close the predicted confidence level is to the reference.

    This is a graded comparison, not a binary one: the score is
    ``1 - |reference - prediction| / 5``, floored at 0 so that an out-of-range
    prediction cannot yield a negative score. A missing, mis-structured or
    non-numeric prediction scores 0, and the ``repr`` of the raised exception
    is attached as the comment.
    """
    gt = example.outputs["output"]["response"]["confidence_level"]
    score, comment = None, None
    # NOTE(review): the divisor presumably reflects a 0-5 confidence scale;
    # confirm against the schema.
    try:
        pred = run.outputs["output"]["response"]["confidence_level"]
        score = max(0.0, 1 - abs(gt - float(pred)) / 5)
    except Exception as e:
        comment = repr(e)
        # Forgot to predict / mis-structured
        score = 0
    return EvaluationResult(
        key="confidence_level_similarity",
        score=score,
        comment=comment,
    )
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_question_category_similarity(
    run: Run, example: Example
) -> EvaluationResult:
    """Compare the predicted question category with the reference one.

    Scores 1 on an exact match and 0 otherwise. If the prediction cannot be
    read from the run outputs, the score is 0 and the ``repr`` of the raised
    exception is attached as the comment.
    """
    expected = example.outputs["output"]["question"]["question_category"]

    try:
        predicted = run.outputs["output"]["question"]["question_category"]
        matched = int(expected == predicted)
        failure = None
    except Exception as exc:
        # Prediction missing or output mis-structured.
        matched = 0
        failure = repr(exc)

    return EvaluationResult(
        key="question_category",
        score=matched,
        comment=failure,
    )
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_off_topic(run: Run, example: Example) -> EvaluationResult:
    """Compare the predicted off-topic flag with the reference flag.

    Scores 1 when the two values are equal and 0 otherwise. The prediction is
    read with ``.get``, so an absent ``is_off_topic`` key is compared as
    ``None``. If the run outputs are otherwise mis-structured, the score is 0
    and the ``repr`` of the raised exception is attached as the comment.
    """
    expected = example.outputs["output"]["question"]["is_off_topic"]

    try:
        predicted = run.outputs["output"]["question"].get("is_off_topic")
        matched = int(expected == predicted)
        failure = None
    except Exception as exc:
        # Output mis-structured.
        matched = 0
        failure = repr(exc)

    return EvaluationResult(
        key="off_topic_similarity",
        score=matched,
        comment=failure,
    )
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_programming_language(run: Run, example: Example) -> EvaluationResult:
    """Compare the predicted programming language with the reference one.

    Scores 1 on an exact match and 0 otherwise. If the prediction cannot be
    read from the run outputs, the score is 0 and the ``repr`` of the raised
    exception is attached as the comment.
    """
    expected = example.outputs["output"]["question"]["programming_language"]

    try:
        predicted = run.outputs["output"]["question"]["programming_language"]
        matched = int(expected == predicted)
        failure = None
    except Exception as exc:
        # Prediction missing or output mis-structured.
        matched = 0
        failure = repr(exc)

    return EvaluationResult(
        key="programming_language_similarity",
        score=matched,
        comment=failure,
    )
|
||||
|
||||
|
||||
def get_eval_config() -> RunEvalConfig:
    """Build the evaluation configuration for the chat extraction task.

    Combines one built-in aggregate evaluator (JSON edit distance) with the
    per-field custom evaluators defined in this module.
    """
    # General aggregate score; the input key is ignored by this evaluator.
    aggregate_evaluator = SingleKeyEvalConfig(
        evaluator_type="json_edit_distance",
        input_key="question",
    )
    field_evaluators = [
        json_schema,
        evaluate_toxicity_similarity,
        evaluate_sentiment_similarity,
        evaluate_confidence_level_similarity,
        evaluate_question_category_similarity,
        evaluate_off_topic,
        evaluate_programming_language,
    ]
    return RunEvalConfig(
        evaluators=[aggregate_evaluator],
        custom_evaluators=field_evaluators,
    )
|
||||
@@ -0,0 +1,99 @@
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
from langchain.pydantic_v1 import BaseModel, Field
|
||||
|
||||
|
||||
# Closed set of category labels used by QuestionCategorization.question_category.
# Members are also ``str`` instances, so each compares equal to its label text.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic v1 appears to copy the docstring into the generated schema - confirm
# before adding one.
class QuestionCategory(str, Enum):
    IMPLEMENTATION_ISSUES = "Implementation Issues"  # about existing implementation
    FEATURE_REQUESTS = "Feature Requests"
    CONCEPT_EXPLANATIONS = "Concept Explanations"
    CODE_OPTIMIZATION = "Code Optimization"
    SECURITY_AND_PRIVACY_CONCERNS = "Security and Privacy Concerns"
    MODEL_TRAINING_AND_FINE_TUNING = "Model Training and Fine-tuning"
    DATA_HANDLING_AND_MANIPULATION = "Data Handling and Manipulation"
    USER_INTERACTION_FLOW = "User Interaction Flow"
    TECHNICAL_INTEGRATION = "Technical Integration"
    ERROR_HANDLING_AND_LOGGING = "Error Handling and Logging"
    CUSTOMIZATION_AND_CONFIGURATION = "Customization and Configuration"
    EXTERNAL_API_AND_DATA_SOURCE_INTEGRATION = (
        "External API and Data Source Integration"
    )
    LANGUAGE_AND_LOCALIZATION = "Language and Localization"
    STREAMING_AND_REAL_TIME_PROCESSING = "Streaming and Real-time Processing"
    TOOL_DEVELOPMENT = "Tool Development"
    FUNCTION_CALLING = "Function Calling"
    LLM_INTEGRATIONS = "LLM Integrations"
    # The label text is singular although the member name is plural; it is a
    # runtime value and is kept exactly as-is.
    GENERAL_AGENT_QUESTIONS = "General Agent Question"
    GENERAL_CHIT_CHAT = "General Chit Chat"
    MEMORY = "Memory"
    DEBUGGING_HELP = "Debugging Help"
    APPLICATION_DESIGN = "Application Design"
    PROMPT_TEMPLATES = "Prompt Templates"
    COST_TRACKING = "Cost Tracking"
    OTHER = "Other"
|
||||
|
||||
|
||||
# Three-way sentiment label used by QuestionCategorization.sentiment.
# Members are also ``str`` instances equal to their capitalised label text.
class Sentiment(str, Enum):
    NEGATIVE = "Negative"
    NEUTRAL = "Neutral"
    POSITIVE = "Positive"
|
||||
|
||||
|
||||
# Programming-language label used by QuestionCategorization.programming_language.
# Unlike the other enums in this module, the label text is lower-case.
class ProgrammingLanguage(str, Enum):
    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"
    UNKNOWN = "unknown"
    OTHER = "other"
|
||||
|
||||
|
||||
# Structured labels describing the user's question.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic v1 appears to copy the docstring into the generated schema - confirm
# before adding one. The description strings below are runtime schema text and
# are kept verbatim.
class QuestionCategorization(BaseModel):
    question_category: QuestionCategory
    # Optional free-text category; defaults to None.
    category_if_other: Optional[str] = Field(
        description="question category if the category above is 'other'",
        default=None,
    )
    is_off_topic: bool = Field(
        description=(
            "If the input is general chit chat or does not pertain to technical "
            "inqueries about LangChain or building/debugging applications with "
            "LLMs/AI, it is off topic. For context, LangChain is a library and "
            "framework designed to assist in building applications with LLMs. "
            "Questions may also be about similar packages like LangServe, "
            "LangSmith, OpenAI, Anthropic, vectorstores, agents, etc."
        )
    )
    # Integer constrained to 0 <= toxicity < 6.
    toxicity: int = Field(
        description="Whether or not the input question is toxic", ge=0, lt=6
    )
    sentiment: Sentiment
    programming_language: ProgrammingLanguage
|
||||
|
||||
|
||||
# resolve the issue, provide guidance, or ask for more information
|
||||
# Kind of response, used by ResponseCategorization.response_type.
# Members are also ``str`` instances equal to their lower-case label text.
class ResponseType(str, Enum):
    RESOLVE_ISSUE = "resolve issue"
    PROVIDE_GUIDANCE = "provide guidance"
    REQUEST_INFORMATION = "request information"
    GIVE_UP = "give up"
    NONE = "none"
    OTHER = "other"
|
||||
|
||||
|
||||
# Structured labels describing the assistant's response.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic v1 appears to copy the docstring into the generated schema - confirm
# before adding one.
class ResponseCategorization(BaseModel):
    response_type: ResponseType
    # Optional free-text response type; defaults to None.
    response_type_if_other: Optional[str] = None
    # Integer constrained to 0 <= confidence_level < 6.
    confidence_level: int = Field(
        description="The confidence of the assistant in its answer.", ge=0, lt=6
    )
    followup_actions: Optional[List[str]] = Field(
        description="Actions the assistant recommended the user take."
    )
|
||||
|
||||
|
||||
# Top-level extraction schema: a short summary plus the question and response
# categorizations. The docstring and field descriptions are runtime schema
# text and are kept verbatim.
class GenerateTicket(BaseModel):
    """Generate a ticket containing all the extracted information."""

    issue_summary: str = Field(
        description="short (<10 word) summary of the issue or question"
    )
    # Labels for the user's question.
    question: QuestionCategorization = Field(
        description="Information inferred from the the question."
    )
    # Labels for the assistant's response.
    response: ResponseCategorization = Field(
        description="Information inferred from the the response."
    )
|
||||
@@ -0,0 +1,303 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from langchain_benchmarks.schema import ModelRegistry, RegisteredModel
|
||||
|
||||
# OpenAI entries for the model registry. ``type`` is "chat" or "llm", and
# ``params`` holds the model identifier under the "model" key.
_OPEN_AI_MODELS = [
    # --- GPT-3.5 Turbo family ---
    RegisteredModel(
        provider="openai",
        name="gpt-3.5-turbo-1106",
        type="chat",
        description=(
            "The latest GPT-3.5 Turbo model with improved instruction following, "
            "JSON mode, reproducible outputs, parallel function calling, and more. "
            "Returns a maximum of 4,096 output tokens."
        ),
        params={"model": "gpt-3.5-turbo-1106"},
    ),
    RegisteredModel(
        provider="openai",
        name="gpt-3.5-turbo",
        type="chat",
        description="Currently points to gpt-3.5-turbo-0613.",
        params={"model": "gpt-3.5-turbo"},
    ),
    RegisteredModel(
        provider="openai",
        name="gpt-3.5-turbo-16k",
        type="chat",
        description="Currently points to gpt-3.5-turbo-0613.",
        params={"model": "gpt-3.5-turbo-16k"},
    ),
    RegisteredModel(
        provider="openai",
        name="gpt-3.5-turbo-instruct",
        type="llm",
        description=(
            "Similar capabilities as text-davinci-003 but compatible with legacy "
            "Completions endpoint and not Chat Completions."
        ),
        params={"model": "gpt-3.5-turbo-instruct"},
    ),
    # --- Dated GPT-3.5 snapshots ---
    RegisteredModel(
        provider="openai",
        name="gpt-3.5-turbo-0613",
        type="chat",
        description=(
            "Legacy Snapshot of gpt-3.5-turbo from June 13th 2023. "
            "Will be deprecated on June 13, 2024."
        ),
        params={"model": "gpt-3.5-turbo-0613"},
    ),
    RegisteredModel(
        provider="openai",
        name="gpt-3.5-turbo-16k-0613",
        type="chat",
        description=(
            "Legacy Snapshot of gpt-3.5-16k-turbo from June 13th 2023. "
            "Will be deprecated on June 13, 2024."
        ),
        params={"model": "gpt-3.5-turbo-16k-0613"},
    ),
    RegisteredModel(
        provider="openai",
        name="gpt-3.5-turbo-0301",
        type="chat",
        description=(
            "Legacy Snapshot of gpt-3.5-turbo from March 1st 2023. "
            "Will be deprecated on June 13th 2024."
        ),
        params={"model": "gpt-3.5-turbo-0301"},
    ),
    # --- Legacy completion models ---
    RegisteredModel(
        provider="openai",
        name="text-davinci-003",
        type="llm",
        description=(
            "Legacy Can do language tasks with better quality and consistency than "
            "the curie, babbage, or ada models. Will be deprecated on Jan 4th 2024."
        ),
        params={"model": "text-davinci-003"},
    ),
    RegisteredModel(
        provider="openai",
        name="text-davinci-002",
        type="llm",
        description=(
            "Legacy Similar capabilities to text-davinci-003 but trained with "
            "supervised fine-tuning instead of reinforcement learning. "
            "Will be deprecated on Jan 4th 2024."
        ),
        params={"model": "text-davinci-002"},
    ),
    RegisteredModel(
        provider="openai",
        name="code-davinci-002",
        type="llm",
        description=(
            "Legacy Optimized for code-completion tasks. Will be deprecated "
            "on Jan 4th 2024."
        ),
        params={"model": "code-davinci-002"},
    ),
]
|
||||
|
||||
_FIREWORKS_MODELS = [
|
||||
RegisteredModel(
|
||||
provider="fireworks",
|
||||
name="llama-v2-7b-chat-fw",
|
||||
type="chat",
|
||||
description="7b parameter LlamaChat model",
|
||||
params={
|
||||
"model": "accounts/fireworks/models/llama-v2-7b-chat",
|
||||
},
|
||||
),
|
||||
RegisteredModel(
|
||||
provider="fireworks",
|
||||
name="llama-v2-13b-chat-fw",
|
||||
type="chat",
|
||||
description="13b parameter LlamaChat model",
|
||||
params={
|
||||
"model": "accounts/fireworks/models/llama-v2-13b-chat",
|
||||
},
|
||||
),
|
||||
RegisteredModel(
|
||||
provider="fireworks",
|
||||
name="llama-v2-70b-chat-fw",
|
||||
type="chat",
|
||||
description="70b parameter LlamaChat model",
|
||||
params={
|
||||
"model": "accounts/fireworks/models/llama-v2-70b-chat",
|
||||
},
|
||||
),
|
||||
RegisteredModel(
|
||||
provider="fireworks",
|
||||
name='mixtral-moe-8x7b-chat-fw',
|
||||
type="chat",
|
||||
description="Mistral MoE model, unofficial implementation. Further fine-tuned for chat by Fireworks.",
|
||||
params={
|
||||
"model": "accounts/fireworks/models/mixtral-moe-8x7b-chat",
|
||||
},
|
||||
),
|
||||
RegisteredModel(
|
||||
provider="fireworks",
|
||||
name="llama-v2-7b-llm-fw",
|
||||
type="llm",
|
||||
)
|
||||
|
||||
{
|
||||
"name": "Mixtral MoE 8x7B",
|
||||
"description": "Mistral MoE model, unofficial implementation.",
|
||||
"type": "llm",
|
||||
},
|
||||
{
|
||||
"name": "Capybara 34B",
|
||||
"description": "34B chat model from NousResearch, based on Yi-34B-200k.",
|
||||
"type": "chat",
|
||||
},
|
||||
{
|
||||
"name": "Yi 34B 200k context window",
|
||||
"description": "34B LLM model from 01.ai, with context window 200k.",
|
||||
"params": {"model": "accounts/fireworks/models/yi-34b-200k-capybara"},
|
||||
"type": "llm",
|
||||
},
|
||||
{
|
||||
"name": "Yi 6B",
|
||||
"description": "6B LLM model from 01.ai.",
|
||||
"params": {"model": "accounts/fireworks/models/yi-6b"},
|
||||
"type": "llm",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
[
|
||||
{
|
||||
"name": "Mistral 7B Instruct",
|
||||
"description": "Mistral-7B model fine-tuned for conversations.",
|
||||
"params": {"model": "accounts/fireworks/models/mistral-7b-instruct"},
|
||||
"type": "llm",
|
||||
},
|
||||
{
|
||||
"name": "Llama 2 13B code instruct",
|
||||
"description": "Instruction-tuned version of Llama 2 13B, optimized for code generation.",
|
||||
"params": {"model": "accounts/fireworks/models/llama-2-13b-code-instruct"},
|
||||
"type": "llm",
|
||||
},
|
||||
{
|
||||
"name": "Llama 2 34B Code Llama instruct",
|
||||
"description": "Code Llama 34B, optimized for code generation.",
|
||||
"params": {"model": "accounts/fireworks/models/llama-2-34b-code-instruct"},
|
||||
"type": "llm",
|
||||
},
|
||||
{
|
||||
"name": "Llama 2 7B Chat",
|
||||
"description": "Fine-tuned version of Llama 2 7B, optimized for dialogue applications using RLHF, comparable to ChatGPT.",
|
||||
"params": {"model": "accounts/fireworks/models/llama-2-7b-chat"},
|
||||
"type": "chat",
|
||||
},
|
||||
{
|
||||
"name": "Llama 2 13B Chat",
|
||||
"description": "Fine-tuned version of Llama 2 13B, optimized for dialogue applications using RLHF, comparable to ChatGPT.",
|
||||
"params": {"model": "accounts/fireworks/models/llama-2-13b-chat"},
|
||||
"type": "chat",
|
||||
},
|
||||
{
|
||||
"name": "Llama 2 70B Chat",
|
||||
"description": "Fine-tuned version of Llama 2 70B, optimized for dialogue applications using RLHF, comparable to ChatGPT.",
|
||||
},
|
||||
{
|
||||
"name": "StarCoder 7B",
|
||||
"description": "7B parameter model trained on 80+ programming languages from The Stack (v1.2), using Multi Query Attention and Fill-in-the-Middle objective.",
|
||||
},
|
||||
{
|
||||
"name": "StarCoder 15.5B",
|
||||
"description": "15.5B parameter model trained on 80+ programming languages from The Stack (v1.2), using Multi Query Attention and Fill-in-the-Middle objective.",
|
||||
},
|
||||
{
|
||||
"name": "Traditional Chinese Llama2 QLoRa",
|
||||
"description": "Fine-tuned Llama 2 model on traditional Chinese Alpaca dataset.",
|
||||
},
|
||||
{
|
||||
"name": "Llama 2 13B French",
|
||||
"description": "Fine-tuned meta-llama/Llama-2-13b-chat-hf to answer French questions in French.",
|
||||
},
|
||||
{
|
||||
"name": "Chinese Llama 2 LoRA 7B",
|
||||
"description": "The LoRA version of Chinese-Llama-2 based on Llama-2-7b-hf.",
|
||||
},
|
||||
{
|
||||
"name": "Bleat",
|
||||
"description": "Enables function calling in LLaMA 2, similar to OpenAI's implementation for ChatGPT.",
|
||||
},
|
||||
{
|
||||
"name": "Llama2 13B Guanaco QLoRA GGML",
|
||||
"description": "Fine-tuned Llama 2 13B model using the Open Assist dataset.",
|
||||
},
|
||||
{
|
||||
"name": "Llama 7B Summarize",
|
||||
"description": "Summarizes articles and conversations.",
|
||||
},
|
||||
]
|
||||
|
||||
# Anthropic entries for the model registry; every entry is a "chat" model and
# ``params`` holds the model identifier under the "model" key.
_ANTHROPIC_MODELS = [
    RegisteredModel(
        provider="anthropic",
        name="claude-2",
        type="chat",
        description="Superior performance on tasks that require complex reasoning",
        params={"model": "claude-2"},
    ),
    RegisteredModel(
        provider="anthropic",
        name="claude-2.1",
        type="chat",
        description=(
            "Same performance as Claude 2, plus significant reduction in model "
            "hallucination rates"
        ),
        params={"model": "claude-2.1"},
    ),
    RegisteredModel(
        provider="anthropic",
        name="claude-instant-1.2",
        type="chat",
        description="low-latency, high throughput.",
        params={"model": "claude-instant-1.2"},
    ),
    RegisteredModel(
        provider="anthropic",
        name="claude-instant-1",
        type="chat",
        description="low-latency, high throughput.",
        params={"model": "claude-instant-1"},
    ),
]
|
||||
|
||||
# Module-level registry holding every provider's models, in the order
# OpenAI, Fireworks, Anthropic.
_ALL_REGISTERED_MODELS = [
    *_OPEN_AI_MODELS,
    *_FIREWORKS_MODELS,
    *_ANTHROPIC_MODELS,
]
model_registry = ModelRegistry(registered_models=_ALL_REGISTERED_MODELS)
|
||||
@@ -0,0 +1 @@
|
||||
pdfs/
|
||||
@@ -1,7 +1,14 @@
|
||||
from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK
|
||||
from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.task import (
|
||||
MULTI_MODAL_SLIDE_DECKS_TASK,
|
||||
)
|
||||
from langchain_benchmarks.rag.tasks.semi_structured_reports.task import (
|
||||
SEMI_STRUCTURED_REPORTS_TASK,
|
||||
)
|
||||
|
||||
# Please keep this sorted
|
||||
# Public task objects re-exported by this package. A single definition is
# kept (the earlier two-item assignment was immediately overwritten), and the
# names are in alphabetical order as the note above asks.
__all__ = [
    "LANGCHAIN_DOCS_TASK",
    "MULTI_MODAL_SLIDE_DECKS_TASK",
    "SEMI_STRUCTURED_REPORTS_TASK",
]
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.indexing.retriever_registry import (
|
||||
get_file_names,
|
||||
)
|
||||
|
||||
__all__ = ["get_file_names"]
|
||||