Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 01ffffd04c | |||
| 4ddbbc0ff8 | |||
| 5ffdbb5c4c |
@@ -35,7 +35,7 @@ pip install -U langchain-benchmarks
|
||||
All the benchmarks come with an associated benchmark dataset stored in [LangSmith](https://smith.langchain.com). To take advantage of the eval and debugging experience, [sign up](https://smith.langchain.com), and set your API key in your environment:
|
||||
|
||||
```bash
|
||||
export LANGCHAIN_API_KEY=sk-...
|
||||
export LANGCHAIN_API_KEY=ls-...
|
||||
```
|
||||
|
||||
## Repo Structure
|
||||
|
||||
|
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
|
Before Width: | Height: | Size: 12 KiB After Width: | Height: | Size: 12 KiB |
|
Before Width: | Height: | Size: 9.7 KiB After Width: | Height: | Size: 9.7 KiB |
|
Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB |
@@ -14,7 +14,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 1,
|
||||
"id": "86912590-a90a-4351-8ab4-89192cdee1e7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -26,19 +26,24 @@
|
||||
"<tr><th>Name </th><th>Type </th><th>Dataset ID </th><th>Description </th></tr>\n",
|
||||
"</thead>\n",
|
||||
"<tbody>\n",
|
||||
"<tr><td>Email Extraction</td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d\" target=\"_blank\" rel=\"noopener\">36bdfe7d-3cd1-4b36-b957-d12d95810a2b</a></td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
|
||||
"<tr><td>Email Extraction</td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d\" target=\"_blank\" rel=\"noopener\">a1742786-bde5-4f51-a1d8-e148e5251ddb</a></td><td>A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\n",
|
||||
"\n",
|
||||
"Some additional cleanup of the data was done by hand after the initial pass.\n",
|
||||
"\n",
|
||||
"See https://github.com/jacoblee93/oss-model-extraction-evals. </td></tr>\n",
|
||||
"<tr><td>Chat Extraction </td><td>ExtractionTask</td><td><a href=\"https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d\" target=\"_blank\" rel=\"noopener\">00f4444c-9460-4a82-b87a-f50096f1cfef</a></td><td>A dataset meant to test the ability of an LLM to extract and infer\n",
|
||||
"structured information from a dialogue. The dialogue is between a user and a support\n",
|
||||
"engineer. Outputs should be structured as a JSON object and test both the ability\n",
|
||||
"of the LLM to correctly structure the information and its ability to perform simple \n",
|
||||
"classification tasks. </td></tr>\n",
|
||||
"</tbody>\n",
|
||||
"</table>"
|
||||
],
|
||||
"text/plain": [
|
||||
"Registry(tasks=[ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/36bdfe7d-3cd1-4b36-b957-d12d95810a2b/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['email'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['email'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{email}\\n```'))]))])"
|
||||
"Registry(tasks=[ExtractionTask(name='Email Extraction', dataset_id='https://smith.langchain.com/public/a1742786-bde5-4f51-a1d8-e148e5251ddb/d', description='A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.\\n\\nSome additional cleanup of the data was done by hand after the initial pass.\\n\\nSee https://github.com/jacoblee93/oss-model-extraction-evals.\\n ', schema=<class 'langchain_benchmarks.extraction.tasks.email_task.Email'>, instructions=ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\\n ```\\n{input}\\n```'))])), ExtractionTask(name='Chat Extraction', dataset_id='https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d', description='A dataset meant to test the ability of an LLM to extract and infer\\nstructured information from a dialogue. The dialogue is between a user and a support\\nengineer. Outputs should be structured as a JSON object and test both the ability\\nof the LLM to correctly structure the information and its ability to perform simple \\nclassification tasks.', schema=<class 'langchain_benchmarks.extraction.tasks.chat_extraction.schema.GenerateTicket'>, instructions=ChatPromptTemplate(input_variables=['dialogue'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpdesk assistant responsible with extracting information and generating tickets. 
Dialogues are between a user and a support engineer.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['dialogue'], template='Generate a ticket for the following question-response pair:\\n<Dialogue>\\n{dialogue}\\n</Dialogue>'))]))])"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -85,9 +90,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 2,
|
||||
"id": "9c7865bd-8251-4579-85a3-f9085d96f497",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
@@ -115,7 +122,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
./notebooks/extraction/intro
|
||||
./notebooks/extraction/email
|
||||
./notebooks/extraction/chat_extraction
|
||||
```
|
||||
|
||||
```{toctree}
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
from langchain.prompts import ChatPromptTemplate
|
||||
|
||||
from langchain_benchmarks.extraction.tasks.chat_extraction.evaluators import (
|
||||
get_eval_config,
|
||||
)
|
||||
from langchain_benchmarks.extraction.tasks.chat_extraction.schema import GenerateTicket
|
||||
from langchain_benchmarks.schema import ExtractionTask
|
||||
|
||||
# Default two-message prompt (system + user); reported to work reasonably
# well for OpenAI chat models. The user message expects a ``dialogue``
# template variable.
DEFAULT_CHAT_MODEL_PROMPT = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpdesk assistant responsible with extracting "
            "information and generating tickets. "
            "Dialogues are between a user and a support engineer.",
        ),
        (
            "user",
            "Generate a ticket for the following question-response pair:\n"
            "<Dialogue>\n"
            "{dialogue}\n"
            "</Dialogue>",
        ),
    ]
)
|
||||
|
||||
|
||||
# Task definition for the "Chat Extraction" benchmark: ties together the
# public LangSmith dataset URL, the GenerateTicket output schema and the
# default prompt defined in this module.
# NOTE(review): the description below is a whitespace-sensitive multi-line
# literal; confirm any trailing spaces against the checked-in file before
# reformatting it.
CHAT_EXTRACTION_TASK = ExtractionTask(
    name="Chat Extraction",
    dataset_id="https://smith.langchain.com/public/00f4444c-9460-4a82-b87a-f50096f1cfef/d",
    schema=GenerateTicket,
    description="""A dataset meant to test the ability of an LLM to extract and infer
structured information from a dialogue. The dialogue is between a user and a support
engineer. Outputs should be structured as a JSON object and test both the ability
of the LLM to correctly structure the information and its ability to perform simple
classification tasks.""",
    instructions=DEFAULT_CHAT_MODEL_PROMPT,
)
|
||||
|
||||
|
||||
__all__ = ["CHAT_EXTRACTION_TASK", "get_eval_config"]
|
||||
@@ -0,0 +1,180 @@
|
||||
from langchain.smith.evaluation.config import RunEvalConfig, SingleKeyEvalConfig
|
||||
from langsmith.evaluation.evaluator import (
|
||||
EvaluationResult,
|
||||
run_evaluator,
|
||||
)
|
||||
from langsmith.schemas import Example, Run
|
||||
|
||||
from langchain_benchmarks.extraction.tasks.chat_extraction.schema import GenerateTicket
|
||||
|
||||
|
||||
@run_evaluator
def json_schema(run: Run, example: Example) -> EvaluationResult:
    """Check that the run output validates against ``GenerateTicket``.

    The score is 1 when ``GenerateTicket.parse_obj`` accepts
    ``run.outputs["output"]`` and 0 when anything goes wrong, in which case
    the ``repr`` of the raised exception is reported as the comment.
    ``example`` is not consulted.
    """
    try:
        GenerateTicket.parse_obj(run.outputs["output"])
    except Exception as err:
        verdict, note = 0, repr(err)
    else:
        verdict, note = 1, None

    return EvaluationResult(key="json_schema", score=verdict, comment=note)
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_toxicity_similarity(run: Run, example: Example) -> EvaluationResult:
    """Score how close the predicted toxicity is to the reference toxicity.

    Toxicity is rated on a scale from 0 to 5, so the absolute error is
    divided by 5 and subtracted from 1. The similarity is floored at 0 so
    that an out-of-range, infinite or NaN prediction cannot yield a negative
    or NaN score that would distort aggregate metrics.

    Args:
        run: Run whose ``outputs["output"]["question"]["toxicity"]`` holds
            the prediction.
        example: Dataset example whose
            ``outputs["output"]["question"]["toxicity"]`` holds the reference.

    Returns:
        An ``EvaluationResult`` keyed ``"toxicity_similarity"``. The score is
        0, with the exception ``repr`` as the comment, when the prediction is
        missing or cannot be converted to a float.
    """
    gt = example.outputs["output"]["question"]["toxicity"]
    score, comment = None, None
    try:
        pred = run.outputs["output"]["question"]["toxicity"]
        similarity = 1 - abs(gt - float(pred)) / 5
        # max(0.0, x) keeps 0.0 when x is NaN, so this also neutralises NaN.
        score = max(0.0, similarity)
    except Exception as e:
        comment = repr(e)
        # Forgot to predict / mis-structured
        score = 0
    return EvaluationResult(
        key="toxicity_similarity",
        score=score,
        comment=comment,
    )
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_sentiment_similarity(run: Run, example: Example) -> EvaluationResult:
    """Score how close the predicted sentiment is to the reference sentiment.

    Sentiment labels ("Negative", "Neutral", "Positive") are matched
    case-insensitively and mapped to the ordinals 0, 1 and 2. The score is
    ``1 - |reference - prediction| / 2``, i.e. 1.0 for an exact match, 0.5
    for an adjacent label and 0.0 for opposite labels.

    Args:
        run: Run whose ``outputs["output"]["question"]["sentiment"]`` holds
            the prediction.
        example: Dataset example whose
            ``outputs["output"]["question"]["sentiment"]`` holds the reference.

    Returns:
        An ``EvaluationResult`` keyed ``"sentiment_similarity"``. The score is
        0 with an explanatory comment when either label is not one of the
        three known sentiments, or when the prediction cannot be read.
    """
    ordinal_map = {
        "negative": 0,
        "neutral": 1,
        "positive": 2,
    }
    gt = example.outputs["output"]["question"]["sentiment"]
    gt_score = ordinal_map.get(str(gt).lower())
    score, comment = 0, None
    try:
        pred = run.outputs["output"]["question"]["sentiment"]
        pred_score = ordinal_map.get(str(pred).lower())
        # Report unknown labels explicitly instead of letting the arithmetic
        # below fail with an opaque TypeError on None.
        if gt_score is None:
            comment = f"Unrecognized reference sentiment: {gt!r}"
        elif pred_score is None:
            comment = f"Unrecognized predicted sentiment: {pred!r}"
        else:
            score = 1 - abs(gt_score - pred_score) / 2
    except Exception as e:
        # Forgot to predict / mis-structured
        comment = repr(e)
        score = 0
    return EvaluationResult(
        key="sentiment_similarity",
        score=score,
        comment=comment,
    )
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_confidence_level_similarity(
    run: Run, example: Example
) -> EvaluationResult:
    """Score how close the predicted confidence level is to the reference.

    The confidence level is a number on a 0 to 5 scale (not a true/false
    value), so the absolute error is divided by 5 and subtracted from 1. The
    similarity is floored at 0 so that an out-of-range, infinite or NaN
    prediction cannot yield a negative or NaN score.

    Args:
        run: Run whose ``outputs["output"]["response"]["confidence_level"]``
            holds the prediction.
        example: Dataset example whose
            ``outputs["output"]["response"]["confidence_level"]`` holds the
            reference.

    Returns:
        An ``EvaluationResult`` keyed ``"confidence_level_similarity"``. The
        score is 0, with the exception ``repr`` as the comment, when the
        prediction is missing or cannot be converted to a float.
    """
    gt = example.outputs["output"]["response"]["confidence_level"]
    score, comment = None, None
    try:
        pred = run.outputs["output"]["response"]["confidence_level"]
        similarity = 1 - (abs(gt - float(pred)) / 5)
        # max(0.0, x) keeps 0.0 when x is NaN, so this also neutralises NaN.
        score = max(0.0, similarity)
    except Exception as e:
        comment = repr(e)
        # Forgot to predict / mis-structured
        score = 0
    return EvaluationResult(
        key="confidence_level_similarity",
        score=score,
        comment=comment,
    )
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_question_category_similarity(
    run: Run, example: Example
) -> EvaluationResult:
    """Exact-match check of the predicted question category.

    Scores 1 when the run's ``question_category`` equals the reference value
    and 0 otherwise. If the prediction cannot be read from the run outputs,
    the score is 0 and the exception ``repr`` becomes the comment.
    """
    expected = example.outputs["output"]["question"]["question_category"]

    try:
        actual = run.outputs["output"]["question"]["question_category"]
        outcome = (int(expected == actual), None)
    except Exception as err:
        # Forgot to predict / mis-structured
        outcome = (0, repr(err))

    value, note = outcome
    return EvaluationResult(key="question_category", score=value, comment=note)
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_off_topic(run: Run, example: Example) -> EvaluationResult:
    """Exact-match check of the predicted ``is_off_topic`` flag.

    Scores 1 when the run's ``is_off_topic`` equals the reference value and 0
    otherwise. The prediction is read with ``.get``, so a missing key compares
    as ``None`` rather than raising. Any other failure to read the prediction
    scores 0, with the exception ``repr`` as the comment.
    """
    expected = example.outputs["output"]["question"]["is_off_topic"]

    try:
        actual = run.outputs["output"]["question"].get("is_off_topic")
        outcome = (int(expected == actual), None)
    except Exception as err:
        # Forgot to predict / mis-structured
        outcome = (0, repr(err))

    value, note = outcome
    return EvaluationResult(key="off_topic_similarity", score=value, comment=note)
|
||||
|
||||
|
||||
@run_evaluator
def evaluate_programming_language(run: Run, example: Example) -> EvaluationResult:
    """Exact-match check of the predicted programming language.

    Scores 1 when the run's ``programming_language`` equals the reference
    value and 0 otherwise. If the prediction cannot be read from the run
    outputs, the score is 0 and the exception ``repr`` becomes the comment.
    """
    expected = example.outputs["output"]["question"]["programming_language"]

    try:
        actual = run.outputs["output"]["question"]["programming_language"]
        outcome = (int(expected == actual), None)
    except Exception as err:
        # Forgot to predict / mis-structured
        outcome = (0, repr(err))

    value, note = outcome
    return EvaluationResult(
        key="programming_language_similarity", score=value, comment=note
    )
|
||||
|
||||
|
||||
def get_eval_config() -> RunEvalConfig:
    """Build the evaluation configuration for the chat extraction task.

    Combines one built-in ``json_edit_distance`` evaluator, used as a general
    aggregate score, with the per-field custom evaluators defined in this
    module.
    """
    # General aggregate score; the input key is ignored.
    aggregate_score = SingleKeyEvalConfig(
        evaluator_type="json_edit_distance",
        input_key="question",
    )
    per_field_checks = [
        json_schema,
        evaluate_toxicity_similarity,
        evaluate_sentiment_similarity,
        evaluate_confidence_level_similarity,
        evaluate_question_category_similarity,
        evaluate_off_topic,
        evaluate_programming_language,
    ]
    return RunEvalConfig(
        evaluators=[aggregate_score],
        custom_evaluators=per_field_checks,
    )
|
||||
@@ -0,0 +1,99 @@
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
from langchain.pydantic_v1 import BaseModel, Field
|
||||
|
||||
|
||||
# String-valued enum of the categories a user question can be assigned to.
# Used as the type of QuestionCategorization.question_category.
# NOTE(review): GENERAL_AGENT_QUESTIONS maps to the singular value
# "General Agent Question" — presumably this matches the dataset labels;
# confirm before changing.
class QuestionCategory(str, Enum):
    IMPLEMENTATION_ISSUES = "Implementation Issues"  # about existing implementation
    FEATURE_REQUESTS = "Feature Requests"
    CONCEPT_EXPLANATIONS = "Concept Explanations"
    CODE_OPTIMIZATION = "Code Optimization"
    SECURITY_AND_PRIVACY_CONCERNS = "Security and Privacy Concerns"
    MODEL_TRAINING_AND_FINE_TUNING = "Model Training and Fine-tuning"
    DATA_HANDLING_AND_MANIPULATION = "Data Handling and Manipulation"
    USER_INTERACTION_FLOW = "User Interaction Flow"
    TECHNICAL_INTEGRATION = "Technical Integration"
    ERROR_HANDLING_AND_LOGGING = "Error Handling and Logging"
    CUSTOMIZATION_AND_CONFIGURATION = "Customization and Configuration"
    EXTERNAL_API_AND_DATA_SOURCE_INTEGRATION = (
        "External API and Data Source Integration"
    )
    LANGUAGE_AND_LOCALIZATION = "Language and Localization"
    STREAMING_AND_REAL_TIME_PROCESSING = "Streaming and Real-time Processing"
    TOOL_DEVELOPMENT = "Tool Development"
    FUNCTION_CALLING = "Function Calling"
    LLM_INTEGRATIONS = "LLM Integrations"
    GENERAL_AGENT_QUESTIONS = "General Agent Question"
    GENERAL_CHIT_CHAT = "General Chit Chat"
    MEMORY = "Memory"
    DEBUGGING_HELP = "Debugging Help"
    APPLICATION_DESIGN = "Application Design"
    PROMPT_TEMPLATES = "Prompt Templates"
    COST_TRACKING = "Cost Tracking"
    # Catch-all; QuestionCategorization.category_if_other can carry the detail.
    OTHER = "Other"
|
||||
|
||||
|
||||
# String-valued enum of the three sentiment labels. The sentiment evaluator
# lower-cases labels before mapping them to the ordinals 0, 1 and 2.
class Sentiment(str, Enum):
    NEGATIVE = "Negative"
    NEUTRAL = "Neutral"
    POSITIVE = "Positive"
|
||||
|
||||
|
||||
# String-valued enum of the programming languages a question can be tagged
# with. Values are lower-case, unlike the title-case values of Sentiment and
# QuestionCategory.
class ProgrammingLanguage(str, Enum):
    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"
    UNKNOWN = "unknown"
    OTHER = "other"
|
||||
|
||||
|
||||
# Structured information inferred from the user's question. Nested under
# GenerateTicket.question.
# NOTE(review): the Field descriptions are runtime strings (presumably
# surfaced to the model through the generated schema), so the typo
# "inqueries" and the lower-case 'other' (the enum value is "Other") are left
# as-is here — confirm before editing them.
class QuestionCategorization(BaseModel):
    question_category: QuestionCategory
    # Free-text category, only meaningful when question_category is OTHER.
    category_if_other: Optional[str] = Field(
        default=None, description="question category if the category above is 'other'"
    )
    is_off_topic: bool = Field(
        description="If the input is general chit chat or does not pertain to technical inqueries about LangChain or building/debugging applications with LLMs/AI, it is off topic. For context, LangChain is a library and framework designed"
        " to assist in building applications with LLMs. Questions may also be about similar packages like LangServe, LangSmith, OpenAI, Anthropic, vectorstores, agents, etc."
    )
    # Integer rating; ge=0 and lt=6 restrict it to 0..5 inclusive.
    toxicity: int = Field(
        ge=0, lt=6, description="Whether or not the input question is toxic"
    )
    sentiment: Sentiment
    programming_language: ProgrammingLanguage
|
||||
|
||||
|
||||
# String-valued enum describing what the response did: resolve the issue,
# provide guidance, or ask for more information — plus "give up", "none" and
# a catch-all "other".
class ResponseType(str, Enum):
    RESOLVE_ISSUE = "resolve issue"
    PROVIDE_GUIDANCE = "provide guidance"
    REQUEST_INFORMATION = "request information"
    GIVE_UP = "give up"
    NONE = "none"
    # Catch-all; ResponseCategorization.response_type_if_other can carry detail.
    OTHER = "other"
|
||||
|
||||
|
||||
# Structured information inferred from the support engineer's response.
# Nested under GenerateTicket.response.
class ResponseCategorization(BaseModel):
    response_type: ResponseType
    # Free-text response type, only meaningful when response_type is OTHER.
    response_type_if_other: Optional[str] = None
    # Integer rating; ge=0 and lt=6 restrict it to 0..5 inclusive.
    confidence_level: int = Field(
        ge=0, lt=6, description="The confidence of the assistant in its answer."
    )
    # NOTE(review): no explicit default is given here, unlike
    # category_if_other in QuestionCategorization — presumably pydantic v1
    # treats an Optional field without a default as None; confirm.
    followup_actions: Optional[List[str]] = Field(
        description="Actions the assistant recommended the user take."
    )
|
||||
|
||||
|
||||
# Top-level output schema of the chat extraction task: a short summary plus
# the categorizations of the question and of the response. The JSON-schema
# evaluator validates run outputs against this model via parse_obj.
# NOTE(review): the descriptions below contain a doubled "the the"; they are
# runtime strings, so they are left untouched here — confirm before editing.
class GenerateTicket(BaseModel):
    """Generate a ticket containing all the extracted information."""

    issue_summary: str = Field(
        description="short (<10 word) summary of the issue or question"
    )
    question: QuestionCategorization = Field(
        description="Information inferred from the the question."
    )
    response: ResponseCategorization = Field(
        description="Information inferred from the the response."
    )
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Registry of environments for ease of access."""
|
||||
|
||||
from langchain_benchmarks.extraction.tasks import email_task
|
||||
from langchain_benchmarks.extraction.tasks import chat_extraction, email_task
|
||||
from langchain_benchmarks.rag.tasks import (
|
||||
LANGCHAIN_DOCS_TASK,
|
||||
SEMI_STRUCTURED_REPORTS_TASK,
|
||||
@@ -21,6 +21,7 @@ registry = Registry(
|
||||
relational_data.RELATIONAL_DATA_TASK,
|
||||
multiverse_math.MULTIVERSE_MATH,
|
||||
email_task.EMAIL_EXTRACTION_TASK,
|
||||
chat_extraction.CHAT_EXTRACTION_TASK,
|
||||
LANGCHAIN_DOCS_TASK,
|
||||
SEMI_STRUCTURED_REPORTS_TASK,
|
||||
]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "langchain-benchmarks"
|
||||
version = "0.0.5"
|
||||
version = "0.0.6"
|
||||
description = "🦜💪 Flex those feathers!"
|
||||
authors = ["LangChain AI"]
|
||||
license = "MIT"
|
||||
|
||||