Compare commits

...

25 Commits

Author SHA1 Message Date
Eugene Yurtsev a7545285c3 x 2023-12-08 11:36:16 -05:00
Eugene Yurtsev 295fb12f8c x 2023-12-08 11:36:02 -05:00
Eugene Yurtsev dd1f33ce02 x 2023-12-08 11:35:54 -05:00
Eugene Yurtsev 0ddb74036b q 2023-12-08 11:35:40 -05:00
Eugene Yurtsev cf10528b99 q 2023-12-08 11:35:35 -05:00
Eugene Yurtsev dbc254ebc0 x 2023-12-08 11:12:34 -05:00
Eugene Yurtsev 4c3de31e6f x 2023-12-08 10:52:12 -05:00
Eugene Yurtsev f8e5d32bc9 x 2023-12-08 10:51:57 -05:00
Eugene Yurtsev 40250950db x 2023-12-08 10:51:52 -05:00
Eugene Yurtsev 9adc5b96f5 x 2023-12-07 23:10:22 -05:00
Eugene Yurtsev 85c8d29342 x 2023-12-07 23:01:56 -05:00
Eugene Yurtsev a42c70f315 x 2023-12-07 22:54:29 -05:00
Eugene Yurtsev 9c6ec01219 x 2023-12-07 22:54:23 -05:00
Eugene Yurtsev d2defc95e3 x 2023-12-07 22:54:14 -05:00
Eugene Yurtsev 8b0e52fdc4 x 2023-12-07 22:54:09 -05:00
Eugene Yurtsev 22b90df0ad x 2023-12-07 22:36:33 -05:00
Eugene Yurtsev 8ba4340730 x 2023-12-07 22:27:08 -05:00
Eugene Yurtsev 2f0b4e9bed x 2023-12-07 22:27:01 -05:00
Eugene Yurtsev 910bd60832 x 2023-12-07 22:23:36 -05:00
Eugene Yurtsev 5b4672a33b x 2023-12-07 18:21:22 -05:00
Eugene Yurtsev 9f827eaca5 Update README.md (#108) 2023-12-07 13:38:25 -05:00
Eugene Yurtsev d9fc08b05c Update README.md (#107) 2023-12-07 13:34:55 -05:00
Lance Martin 8a5ba6d575 Minor cleanup to multi-modal embeddings docs (#105) 2023-12-05 13:40:42 -08:00
William FH 8204930f2b 0.0.7 (#104)
Adds the multimodal benchmark.
2023-12-05 13:14:44 -08:00
Lance Martin 013fe6a153 Multi modal RAG benchmark (#101)
* Example notebooks for eval of multi-modal RAG w/ mm-embd and
mv-retriever vs baseline top-k RAG

---------

Co-authored-by: William Fu-Hinthorn <13333726+hinthornw@users.noreply.github.com>
2023-12-05 12:04:01 -08:00
35 changed files with 3631 additions and 238 deletions
+1 -3
View File
@@ -1,6 +1,4 @@
🚧 Under Active Development 🚧
# 🦜💪 LangChain Benchmarks
# 🦜💯 LangChain Benchmarks
[![Release Notes](https://img.shields.io/github/release/langchain-ai/langchain-benchmarks)](https://github.com/langchain-ai/langchain-benchmarks/releases)
[![CI](https://github.com/langchain-ai/langchain-benchmarks/actions/workflows/ci.yml/badge.svg)](https://github.com/langchain-ai/langchain-benchmarks/actions/workflows/ci.yml)
View File
+52
View File
@@ -0,0 +1,52 @@
import inspect
from textwrap import dedent
from typing import List
from langchain.tools.base import StructuredTool
from agents.encoder import FunctionDefinition, Parameter
# This is temporary until we have a better way to represent parameters
def get_parameters_from_tool(tool: StructuredTool) -> List[Parameter]:
"""Convert a langchain tool to a tool user tool."""
schema = tool.args_schema.schema()
properties = schema["properties"]
parameters = []
# Is this needed or is string OK?
type_adapter = {
"string": "str", # str or string?
"integer": "int",
"number": "float",
"boolean": "bool",
}
for key, value in properties.items():
parameters.append(
{
"name": key,
"type": type_adapter.get(value["type"], value["type"]),
"description": value.get("description", ""),
}
)
return parameters
#
def convert_tool_to_function_definition(tool: StructuredTool) -> FunctionDefinition:
"""Convert a langchain tool to a tool user tool."""
# Here we re-inspect the underlying function to get the doc-string
# since StructuredTool modifies it, but we want the raw one for maximum
# flexibility.
description = inspect.getdoc(tool.func)
parameters = get_parameters_from_tool(tool)
return {
"name": tool.name,
"description": dedent(description),
"parameters": parameters,
"return_value": {
"type": "Any",
},
}
+105
View File
@@ -0,0 +1,105 @@
from typing import List, Literal, Sequence, Tuple, Union
from langchain.agents import AgentOutputParser
from langchain.prompts.chat import ChatPromptTemplate
from langchain.schema.messages import HumanMessage
from langchain.schema.runnable import Runnable
from langchain.tools import StructuredTool
from langchain_core.agents import AgentAction, AgentFinish
from langchain_core.language_models import BaseChatModel, BaseLanguageModel
from langchain_core.messages import BaseMessage
from langchain_core.prompts import MessagesPlaceholder
from typing_extensions import NotRequired, TypedDict
from agents.adapters import convert_tool_to_function_definition
from agents.encoder import AstPrinter, TypeScriptEncoder
from agents.prompts import AGENT_INSTRUCTIONS_BLOB_STYLE
def format_observation(tool_name: str, observation: str) -> BaseMessage:
"""Format the observation."""
result = (
"<tool_output>\n"
f"<tool_name>{tool_name}</tool_name>\n"
f"<output>{observation}</output>\n"
"</tool_output>"
)
return HumanMessage(content=result)
def format_steps_for_chat(
intermediate_steps: List[Tuple[AgentAction, str]]
) -> List[BaseMessage]:
"""Format the steps."""
messages = []
for action, observation in intermediate_steps:
if not isinstance(action, AgentAction):
if action.tool != "_Exception":
raise AssertionError(f"Unexpected step: {action}. type: {type(action)}")
messages.append(HumanMessage(content=observation))
messages.extend(action.messages)
messages.append(format_observation(action.tool, observation))
return messages
# PUBLIC API
class AgentInput(TypedDict):
"""The input to the agent."""
input: str
"""The input to the agent."""
intermediate_steps: List[Tuple[AgentAction, str]]
"""The intermediate steps taken by the agent."""
examples: NotRequired[List[BaseMessage]]
"""A list of messages that can be used to form example traces."""
def create_agent(
model: Union[BaseChatModel, BaseLanguageModel],
tools: Sequence[StructuredTool],
parser: AgentOutputParser,
*,
ast_printer: Union[AstPrinter, Literal["xml"]] = "xml",
) -> Runnable[AgentInput, Union[AgentAction, AgentFinish]]:
"""Create an agent for a chat model."""
if isinstance(ast_printer, str):
if ast_printer == "xml":
ast_printer = AstPrinter()
elif ast_printer == "typescript":
ast_printer = TypeScriptEncoder()
else:
raise ValueError(f"Unknown ast printer: {ast_printer}")
elif isinstance(ast_printer, AstPrinter):
pass
else:
raise TypeError(
f"Expected AstPrinter or str, got {type(ast_printer)} for `ast_printer`"
)
function_definitions = [convert_tool_to_function_definition(tool) for tool in tools]
tool_description = ast_printer.visit_function_definitions(function_definitions)
template = ChatPromptTemplate.from_messages(
[
("system", AGENT_INSTRUCTIONS_BLOB_STYLE),
MessagesPlaceholder("examples"), # Can use to add example traces
("human", "{input}"),
MessagesPlaceholder("history"),
]
).partial(tool_description=tool_description)
agent = (
{
"input": lambda x: x["input"],
"history": lambda x: format_steps_for_chat(x["intermediate_steps"]),
"examples": lambda x: x.get("examples", []),
}
| template
| model.bind(stop=["</tool>"])
| parser
)
return agent
+226
View File
@@ -0,0 +1,226 @@
"""Prototyping code for rendering function definitions, invocations, and results.
Types are simplified for now to `str`.
We should actually support something like pydantic or jsonschema for the types, so
we can expand them recursively for nested types.
"""
import abc
from typing import Any, List, Optional
from typing_extensions import NotRequired, TypedDict
class Parameter(TypedDict):
"""Representation for a parameter."""
name: str
type: str
description: str
class Arguments(TypedDict):
"""Arguments are passed to a function during function invocation."""
name: Optional[str]
value: Any
class ReturnValue(TypedDict):
"""Representation for a return value of a function call."""
type: str
description: NotRequired[str]
class FunctionDefinition(TypedDict):
"""Representation for a function."""
name: str
description: str # Function description
parameters: List[Parameter]
return_value: ReturnValue
class FunctionInvocation(TypedDict):
"""Representation for a function invocation."""
id: NotRequired[str]
name: str
arguments: List[Arguments]
class FunctionResult(TypedDict):
"""Representation for a function result."""
id: NotRequired[str]
name: str
result: Optional[str]
error: Optional[str]
class Visitor(abc.ABC):
def visit_function_definition(self, function_definition: FunctionDefinition) -> str:
"""Render a function."""
raise NotImplementedError()
def visit_function_definitions(
self, function_definitions: List[FunctionDefinition]
) -> str:
"""Render a function."""
raise NotImplementedError()
def visit_function_invocation(self, function_invocation: FunctionInvocation) -> str:
"""Render a function invocation."""
raise NotImplementedError()
def visit_function_result(self, function_result: FunctionResult) -> str:
"""Render a function result."""
raise NotImplementedError()
class AstPrinter(Visitor):
"""Print the AST."""
class XMLEncoder(AstPrinter):
def visit_function_definition(self, function_definition: FunctionDefinition) -> str:
"""Render a function."""
parameters_as_strings = [
"<parameter>\n"
f"<name>{parameter['name']}</name>\n"
f"<type>{parameter['type']}</type>\n"
f"<description>{parameter['description']}</description>\n"
"</parameter>\n"
for parameter in function_definition["parameters"]
]
function = (
"<function>\n"
f"<function_name>{function_definition['name']}</function_name>\n"
"<description>\n"
f"{function_definition['description']}\n"
"</description>\n"
"<parameters>\n"
f"{''.join(parameters_as_strings)}" # Already includes trailing newline
"</parameters>\n"
"<return_value>\n"
f"<type>{function_definition['return_value']['type']}</type>\n"
f"<description>{function_definition['return_value']['description']}</description>\n"
"</return_value>\n"
"</function>"
)
return function
def visit_function_definitions(
self, function_definitions: List[FunctionDefinition]
) -> str:
"""Render a function."""
strs = [
self.visit_function_definition(function_definition)
for function_definition in function_definitions
]
return "<functions>\n" + "\n".join(strs) + "\n</functions>"
def visit_function_invocation(self, invocation: FunctionInvocation) -> str:
"""Render a function invocation."""
arguments_as_strings = [
"<argument>\n"
f"<name>{argument['name']}</name>\n"
f"<value>{argument['value']}</value>\n"
"</argument>\n"
for argument in invocation["arguments"]
]
lines = ["<function_invocation>"]
if invocation.get("id"):
lines.append(f"<id>{invocation['id']}</id>")
lines.extend(
[
f"<function_name>{invocation['name']}</function_name>\n"
"<arguments>\n"
f"{''.join(arguments_as_strings)}" # Already includes trailing newline
"</arguments>\n"
"</function_invocation>"
]
)
return "\n".join(lines)
def visit_function_result(self, function_result: FunctionResult) -> str:
"""Render a function result."""
lines = [
"<function_result>",
]
if function_result.get("id"):
lines.append(f"<id>{function_result['id']}</id>")
lines.extend(
[
f"<function_name>{function_result['name']}</function_name>",
f"<result>{function_result['result']}</result>",
f"<error>{function_result['error']}</error>",
"</function_result>",
]
)
return "\n".join(lines)
class TypeScriptEncoder(AstPrinter):
def visit_function_definition(self, function_definition: FunctionDefinition) -> str:
"""Render a function."""
parameters_as_strings = [
f"{parameter['name']}: {parameter['type']}"
for parameter in function_definition["parameters"]
]
# Let's use JSdoc style comments
# First the function description
lines = [
f"// {function_definition['description']}",
# Then the parameter descriptions
*[
f"// @param {parameter['name']} {parameter['description']}"
for parameter in function_definition["parameters"]
],
# Then the return value description
f"// @returns {function_definition['return_value']['description']}",
# Then the function definition
f"function {function_definition['name']}("
f"{', '.join(parameters_as_strings)}): "
f"{function_definition['return_value']['type']};",
]
# finally join
function = "\n".join(lines)
return function
def visit_function_definitions(
self, function_definitions: List[FunctionDefinition]
) -> str:
"""Render a function."""
strs = [
self.visit_function_definition(function_definition)
for function_definition in function_definitions
]
return "\n\n".join(strs)
def visit_function_invocation(self, invocation: FunctionInvocation) -> str:
"""Render a function invocation."""
arguments_as_strings = [
f"{argument['name']}: {argument['value']}"
for argument in invocation["arguments"]
]
lines = [f"{invocation['name']}(" f"{', '.join(arguments_as_strings)});"]
return "\n".join(lines)
def visit_function_result(self, function_result: FunctionResult) -> str:
"""Render a function result."""
lines = []
if function_result["error"]:
lines.append(f"ERROR: {function_result['error']}")
else:
lines.append(f"> {function_result['result']}")
if function_result.get("id"):
lines.append(f"// ID: {function_result['id']}")
return "\n".join(lines)
+25
View File
@@ -0,0 +1,25 @@
# EXAMPLE_TRACE = [
# HumanMessage(content="type the letter 'o'"),
# AIMessage(
# content="""
# <tool>
# {
# "tool_name": "type_letter",
# "arguments": {
# "letter": "o"
# }
# }
# </tool>\
# """
# ),
# HumanMessage(
# content="""\
# <tool_outputs>
# <tool_name>type_letter</tool_name>
# <output>o</output>
# </tool_outputs>\
# """
# ),
# ]
#
#
+75
View File
@@ -0,0 +1,75 @@
from typing import List, Optional
from langchain.agents import AgentExecutor
from langchain.chat_models import ChatAnthropic, ChatFireworks
from langchain_core.runnables import Runnable, RunnableConfig
from agents.agent import create_agent
from agents.parser import ParameterizedAgentParser
from langchain_benchmarks.model_registration import FIREWORK_NAME_TO_MODEL
from langchain_benchmarks.schema import ToolUsageTask
from langchain_benchmarks.tool_usage import apply_agent_executor_adapter
class CustomAgentFactory:
def __init__(self, task: ToolUsageTask, model: str) -> None:
"""Create an OpenAI agent factory for the given task.
Args:
task: The task to create an agent factory for.
"""
if model not in self.list_models():
raise ValueError(f"Unknown model: {model}")
self.task = task
self.model = model
@staticmethod
def list_models() -> List[str]:
"""List all models."""
return sorted(
[
"claude-2.1",
"claude-2",
*FIREWORK_NAME_TO_MODEL.keys(),
]
)
def __call__(self) -> Runnable:
env = self.task.create_environment()
if self.model in {"claude-2.1", "claude-2"}:
model = ChatAnthropic(model=self.model, temperature=0)
elif self.model in FIREWORK_NAME_TO_MODEL:
model = ChatFireworks(
model=FIREWORK_NAME_TO_MODEL[self.model], temperature=0
)
else:
raise ValueError(f"Unknown model: {self.model}")
def _add_task_instructions(
input: dict, config: Optional[RunnableConfig] = None, **kwargs
) -> dict:
"""Add task instructions to the question."""
input = input.copy()
input["question"] = (
f"{self.task.instructions}\nWrite down your answer, "
f"but do not explain it. Input: `{input['question']}`"
)
return input
agent = create_agent(
model,
env.tools,
ParameterizedAgentParser(
wrapping_xml_tag="tool", require_closing_xml_tag=False
),
)
executor = AgentExecutor(
agent=agent,
tools=env.tools,
handle_parsing_errors=True,
return_intermediate_steps=True,
)
return _add_task_instructions | apply_agent_executor_adapter(
executor, state_reader=env.read_state
)
+120
View File
@@ -0,0 +1,120 @@
import ast
import re
from typing import Any, Union
from langchain.agents import AgentOutputParser
from langchain.pydantic_v1 import BaseModel, Field, ValidationError
from langchain_core.agents import AgentAction, AgentActionMessageLog, AgentFinish
from langchain_core.exceptions import OutputParserException
from langchain_core.messages import AIMessage
class _ToolInvocationRequest(BaseModel):
"""Light-weight pydantic model for validating the raw tool invocation request.
The purpose of this model, is to make sure that whatever as parsed from
the raw llm output has `tool_name` and potential `arguments` fields, and
nothing else.
"""
tool_name: str
# OK parameterless tools which do not take arguments
named_arguments: Any = Field(default_factory=dict)
class ParameterizedAgentParser(AgentOutputParser):
"""A generalized parser that makes it easier to parameterize different parsing."""
wrapping_xml_tag: str
"""The tag that wraps the function invocation request.
For example, if "tool", then the function invocation request should be wrapped
in <tool>...</tool>.
"""
require_closing_xml_tag: bool = False
"""Whether we should require a closing tag for the wrapping_xml_tag.
For example, if True, then the function invocation request should be wrapped
"""
def parse(self, text: str) -> Union[AgentFinish, AgentAction]:
"""Parse the output of the agent."""
open_tag = f"<{self.wrapping_xml_tag}>"
close_tag = f"</{self.wrapping_xml_tag}>"
if open_tag in text:
# This is a hack to make sure that </tool> is always present
# in the output if <tool>. </tool> may be a stop sequence for the
# language model, so depending on implementation
# the stop sequence may be cut off.
# There might be a better way to do this, but this works and
# is simple.
if not self.require_closing_xml_tag:
text += close_tag
pattern = rf"{open_tag}(?P<invocation>.*?){close_tag}"
match = re.search(pattern, text, re.DOTALL)
if match:
content = match.group("invocation").strip()
return parse_invocation(content, self.wrapping_xml_tag)
return AgentFinish(
log=text,
return_values={
"output": text,
},
)
def parse_invocation(text: str, tag: str) -> AgentAction:
"""Parse the content of the function invocation.
Args:
text: The text to parse.
tag: The tag that wraps the function invocation request.
Returns:
An AgentAction that corresponds to the function invocation.
Raises:
OutputParserException: If the parsing fails.
This exception is meant to be caught by the agent executor and
handled appropriately to provide feedback to the LLM.
"""
ai_content = f"<{tag}>{text}</{tag}>"
try:
result = ast.literal_eval(text)
except Exception as e:
# Convert this to something controllable by the user.
err_msg = (
f"ERROR: Please use the format "
f'<{tag}>{{"tool_name": $TOOL_NAME, "arguments": $ARGUMENTS}}</{tag}>'
)
raise OutputParserException(
error=e,
llm_output=ai_content,
observation=err_msg,
send_to_llm=True,
)
try:
request = _ToolInvocationRequest(**result)
except ValidationError as e:
err_msg = (
f"ERROR: Please use the format "
f'<{tag}>{{"tool_name": $TOOL_NAME, "arguments": $ARGUMENTS}}</{tag}>'
)
raise OutputParserException(
error=e,
llm_output=ai_content,
send_to_llm=True,
observation=err_msg,
)
return AgentActionMessageLog(
message_log=[AIMessage(content=ai_content)],
tool=request.tool_name,
tool_input=request.named_arguments,
log=f"\nInvoking {request.tool_name}: {request.named_arguments}\n\t",
)
+42
View File
@@ -0,0 +1,42 @@
AGENT_INSTRUCTIONS_XML_FORMAT = """\
In this environment you have access to a set of tools you can use to answer the user's question.
You may call them like this:
<function_calls>
<invoke>
<tool_name>$TOOL_NAME</tool_name>
<parameters>
<$PARAMETER_NAME>$PARAMETER_VALUE</$PARAMETER_NAME>
...
</parameters>
</invoke>
</function_calls>
Here are the tools available:
{tool_description}
""" # noqa: E501
AGENT_INSTRUCTIONS_BLOB_STYLE = """\
In this environment you have access to a set of tools you can use to answer the user's question.
Here are the tools available:
{tool_description}
You may call one tool at a time using a format that includes <tool> and </tool> tag.
Inside the tag the content is a python dictionary that uses python literals (e.g., numbers, strings, lists, dictionaries, etc.) to specify the tool invocation.
It must match the schema of the function as described in the tool description.
"arguments" is a dictionary of the arguments to the function.
<tool>
{{
"tool_name": $TOOL_NAME,
"arguments": $ARGUMENTS
}}
</tool>
If you do not know the answer use more tools. You can only take a single action at a time.\
""" # noqa: E501
View File
View File
@@ -0,0 +1,29 @@
"""Test XML encoding and decoding of function definitions, invocation, and results."""
from agents.encoder import (
FunctionDefinition,
TypeScriptEncoder,
)
def test_function_definition() -> None:
"""Test encoding a function definition."""
function_definition = FunctionDefinition(
name="test_function",
description="A test function",
parameters=[
{"name": "test_parameter", "type": "str", "description": "A test parameter"}
],
return_value={"type": "str", "description": "A test return value"},
)
encoder = TypeScriptEncoder()
xml = encoder.visit_function_definition(function_definition)
assert xml == (
"// A test function\n"
"// @param test_parameter A test parameter\n"
"// @returns A test return value\n"
"function test_function(test_parameter: str): str;"
)
# Not important to test other ones right now since we can't parse / interpret
# typescript anyway.
+79
View File
@@ -0,0 +1,79 @@
"""Test XML encoding and decoding of function definitions, invocation, and results."""
from agents.encoder import (
FunctionDefinition,
FunctionInvocation,
FunctionResult,
XMLEncoder,
)
def test_function_definition_encoding() -> None:
"""Test encoding a function definition."""
function_definition = FunctionDefinition(
name="test_function",
description="A test function",
parameters=[
{"name": "test_parameter", "type": "str", "description": "A test parameter"}
],
return_value={"type": "str", "description": "A test return value"},
)
encoder = XMLEncoder()
xml = encoder.visit_function_definition(function_definition)
assert xml == (
"<function>\n"
"<function_name>test_function</function_name>\n"
"<description>\n"
"A test function\n"
"</description>\n"
"<parameters>\n"
"<parameter>\n"
"<name>test_parameter</name>\n"
"<type>str</type>\n"
"<description>A test parameter</description>\n"
"</parameter>\n"
"</parameters>\n"
"<return_value>\n"
"<type>str</type>\n"
"<description>A test return value</description>\n"
"</return_value>\n"
"</function>"
)
def test_function_result_encoding() -> None:
"""Test encoding a function result."""
function_result = FunctionResult(
name="test_function",
result="test_result",
error="test_error",
)
encoder = XMLEncoder()
xml = encoder.visit_function_result(function_result)
assert xml == (
"<function_result>\n"
"<function_name>test_function</function_name>\n"
"<result>test_result</result>\n"
"<error>test_error</error>\n"
"</function_result>"
)
def test_function_invocation() -> None:
"""Test function invocation."""
function_invocation = FunctionInvocation(
name="test_function",
arguments=[{"name": "test_argument", "value": "test_value"}],
)
encoder = XMLEncoder()
xml = encoder.visit_function_invocation(function_invocation)
assert xml == (
"<function_invocation>\n"
"<function_name>test_function</function_name>\n"
"<arguments>\n"
"<argument>\n"
"<name>test_argument</name>\n"
"<value>test_value</value>\n"
"</argument>\n"
"</arguments>\n"
"</function_invocation>"
)
+58
View File
@@ -0,0 +1,58 @@
import pytest
from langchain.tools import tool
from agents.adapters import convert_tool_to_function_definition
from agents.encoder import XMLEncoder
@tool
def get_hello() -> str:
"""Get hello."""
return "hello"
@tool
def repeat(x: str) -> str:
"""Repeat x.
Args:
x: The string to repeat.
Returns:
The repeated string.
"""
return x
def test_parameterless_function() -> None:
"""Test foo."""
function_definition = convert_tool_to_function_definition(get_hello)
assert function_definition == {
"name": "get_hello",
"description": "Get hello.",
"parameters": [],
"return_value": {
"type": "Any",
},
}
@pytest.mark.skip("Need to fix handling of leading whitespace")
def test_function_with_parameters() -> None:
import textwrap
doc = textwrap.dedent(repeat.func.__doc__)
assert convert_tool_to_function_definition(repeat) == {
"name": "repeat",
"description": doc,
"parameters": [
{
"name": "x",
"type": "str",
"description": "", # Need to fix this
}
],
"return_value": {
"type": "Any",
},
}
+11
View File
@@ -0,0 +1,11 @@
from langchain_adapters.alternative import AgentOutputParser
from langchain_core.agents import AgentFinish
def test_parser() -> None:
"""Test parser."""
parser = AgentOutputParser(require_closing_tag=False, tag="tool")
assert isinstance(parser.invoke("goodbye"), AgentFinish)
assert parser.invoke("<tool>hello</tool>") == "hello"
assert parser.invoke("<tool>hello") == "hello"
# assert isinstance(parser.invoke("<tag>hello</tag>"), AgentAction)
+35
View File
@@ -0,0 +1,35 @@
"""Throttle using a token bucket."""
import threading
import time
class Throttle:
def __init__(self, rate: int) -> None:
"""Initialize the throttle."""
self.rate = rate
self.tokens = 0
self._consume_lock = threading.Lock()
self.last = None
def consume(self, amount: int = 0) -> int:
"""Consume the given amount of tokens."""
with self._consume_lock:
now = time.time()
# initialize on first call to avoid a burst
if self.last is None:
self.last = now
elapsed = now - self.last
if elapsed * self.rate > 1:
self.tokens += elapsed * self.rate
self.last = now
self.tokens = min(self.tokens, self.rate)
if self.tokens >= amount:
self.tokens -= amount
return amount
return 0
+2
View File
@@ -1 +1,3 @@
chromadb/
index.md
Untitled.ipynb
+223 -222
View File
@@ -1,225 +1,226 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "033684fb-65b2-4586-a959-68c614741ca2",
"metadata": {},
"source": [
"# Datasets\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain-benchmarks/blob/main/docs/source/notebooks/datasets.ipynb)\n",
"\n",
"Here, we'll see how to work with LangSmith datasets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -U langchain-benchmarks"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6d272fbf-710e-4a49-a0da-67e010541905",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks import clone_public_dataset, download_public_dataset"
]
},
{
"cell_type": "markdown",
"id": "18ee0f96-e5c4-4ae9-aebf-7d8b88c51662",
"metadata": {},
"source": [
"Let's first download the dataset to the local file system"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "58b94f6d-0c91-4361-9b22-f758ffaa150a",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching examples...\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a2fad8c0c3549ec96a3b38fe8a002b0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/21 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done fetching examples.\n"
]
}
],
"source": [
"download_public_dataset(\n",
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "841db832-b0d3-4fd1-8531-1154ec9b3caa",
"metadata": {},
"source": [
"we can take a look at the first two examples"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "664e90fc-af84-4c5f-a3dd-5d9ffe649650",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" {\n",
" \"created_at\": \"2023-11-15T15:26:53.511629\",\n",
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
" \"id\": \"0703a989-2693-4039-a1f6-7281fc1b4cb0\",\n",
" \"inputs\": {\n",
" \"question\": \"do bob and alice live in the same city?\"\n",
" },\n",
" \"modified_at\": \"2023-11-15T15:26:53.511629\",\n",
" \"outputs\": {\n",
" \"expected_steps\": [\n",
" \"find_users_by_name\",\n",
" \"get_user_location\",\n",
" \"get_city_for_location\",\n",
" \"get_user_location\",\n",
" \"get_city_for_location\"\n",
" ],\n",
" \"order_matters\": false,\n",
" \"reference\": \"no\"\n",
" },\n",
" \"runs\": []\n",
" },\n",
" {\n",
" \"created_at\": \"2023-11-15T15:26:53.491359\",\n",
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
" \"id\": \"b258b95a-9524-4da7-b758-c5481109322d\",\n",
" \"inputs\": {\n",
" \"question\": \"Is it likely that Donna is outside with an umbrella at this time?\"\n",
" },\n",
" \"modified_at\": \"2023-11-15T15:26:53.491359\",\n",
" \"outputs\": {\n",
" \"expected_steps\": [\n",
" \"find_users_by_name\",\n",
" \"get_user_location\",\n",
" \"get_current_time_for_location\",\n",
" \"get_current_weather_for_location\"\n",
" ],\n",
" \"order_matters\": false,\n",
" \"reference\": \"yes\"\n",
" },\n",
" \"runs\": []\n",
" }\n",
"]\n"
]
}
],
"source": [
"import json\n",
"\n",
"with open(\"./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json\", \"r\", encoding=\"utf-8\") as f:\n",
" print(json.dumps(json.load(f)[:2], indent=2, sort_keys=True))"
]
},
{
"cell_type": "markdown",
"id": "2c6cf01f-466b-406d-b4c7-2395747780fd",
"metadata": {},
"source": [
"We can also clone the dataset to our local tenant"
]
},
{
"cell_type": "markdown",
"id": "e4dea4df-2f1c-436b-a71c-49ffb2295ccc",
"metadata": {},
"source": [
"Executing this command will clone the dataset to your own LangSmith tenant. \n",
"For this to work you must have a [LangSmith account](https://smith.langchain.com/) set up."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Get from https://smith.langchain.com/settings\n",
"os.environ[\"LANGCHAIN_API_KEY\"] = \"ls_...\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18d0b905-2a6a-4752-a7cb-8653bd9049e3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"clone_public_dataset(\n",
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\",\n",
" dataset_name=\"Agent Dataset\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
"cells": [
{
"cell_type": "markdown",
"id": "033684fb-65b2-4586-a959-68c614741ca2",
"metadata": {},
"source": [
"# Datasets\n",
"\n",
"Here, we'll see how to work with LangSmith datasets."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9aa20db",
"metadata": {},
"outputs": [],
"source": [
"%pip install -U langchain-benchmarks"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6d272fbf-710e-4a49-a0da-67e010541905",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks import clone_public_dataset, download_public_dataset"
]
},
{
"cell_type": "markdown",
"id": "18ee0f96-e5c4-4ae9-aebf-7d8b88c51662",
"metadata": {},
"source": [
"Let's first download the dataset to the local file system"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "58b94f6d-0c91-4361-9b22-f758ffaa150a",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fetching examples...\n"
]
},
"nbformat": 4,
"nbformat_minor": 5
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a2fad8c0c3549ec96a3b38fe8a002b0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/21 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done fetching examples.\n"
]
}
],
"source": [
"download_public_dataset(\n",
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "841db832-b0d3-4fd1-8531-1154ec9b3caa",
"metadata": {},
"source": [
"we can take a look at the first two examples"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "664e90fc-af84-4c5f-a3dd-5d9ffe649650",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" {\n",
" \"created_at\": \"2023-11-15T15:26:53.511629\",\n",
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
" \"id\": \"0703a989-2693-4039-a1f6-7281fc1b4cb0\",\n",
" \"inputs\": {\n",
" \"question\": \"do bob and alice live in the same city?\"\n",
" },\n",
" \"modified_at\": \"2023-11-15T15:26:53.511629\",\n",
" \"outputs\": {\n",
" \"expected_steps\": [\n",
" \"find_users_by_name\",\n",
" \"get_user_location\",\n",
" \"get_city_for_location\",\n",
" \"get_user_location\",\n",
" \"get_city_for_location\"\n",
" ],\n",
" \"order_matters\": false,\n",
" \"reference\": \"no\"\n",
" },\n",
" \"runs\": []\n",
" },\n",
" {\n",
" \"created_at\": \"2023-11-15T15:26:53.491359\",\n",
" \"dataset_id\": \"9f73165c-d333-4d14-8f59-bd7eede5db08\",\n",
" \"id\": \"b258b95a-9524-4da7-b758-c5481109322d\",\n",
" \"inputs\": {\n",
" \"question\": \"Is it likely that Donna is outside with an umbrella at this time?\"\n",
" },\n",
" \"modified_at\": \"2023-11-15T15:26:53.491359\",\n",
" \"outputs\": {\n",
" \"expected_steps\": [\n",
" \"find_users_by_name\",\n",
" \"get_user_location\",\n",
" \"get_current_time_for_location\",\n",
" \"get_current_weather_for_location\"\n",
" ],\n",
" \"order_matters\": false,\n",
" \"reference\": \"yes\"\n",
" },\n",
" \"runs\": []\n",
" }\n",
"]\n"
]
}
],
"source": [
"import json\n",
"\n",
"with open(\"./e95d45da-aaa3-44b3-ba2b-7c15ff6e46f5.json\", \"r\", encoding=\"utf-8\") as f:\n",
" print(json.dumps(json.load(f)[:2], indent=2, sort_keys=True))"
]
},
{
"cell_type": "markdown",
"id": "2c6cf01f-466b-406d-b4c7-2395747780fd",
"metadata": {},
"source": [
"We can also clone the dataset to our local tenant"
]
},
{
"cell_type": "markdown",
"id": "e4dea4df-2f1c-436b-a71c-49ffb2295ccc",
"metadata": {},
"source": [
"Executing this command will clone the dataset to your own LangSmith tenant. \n",
"For this to work you must have a [LangSmith account](https://smith.langchain.com/) set up."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b20ba9a6",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Get from https://smith.langchain.com/settings\n",
"os.environ[\"LANGCHAIN_API_KEY\"] = \"ls_...\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18d0b905-2a6a-4752-a7cb-8653bd9049e3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"clone_public_dataset(\n",
" \"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/examples\",\n",
" dataset_name=\"Agent Dataset\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+267
View File
@@ -0,0 +1,267 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d9cb90d8-e6a1-4c89-9cde-0e6c0a28f5c0",
"metadata": {},
"source": [
"# Model Registry\n",
"\n",
"Here, we'll see how to access the model registry.\n",
"\n",
"If you see a model that you want to use and it's missing, please open a PR to add it!"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "31831289-51fb-4ee5-98f3-0476cf11b187",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks import model_registry"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "aaed190d-fa4b-4445-9bfb-0e784e2a083b",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<thead>\n",
"<tr><th>Name </th><th>Type </th><th>Provider </th><th>Description </th></tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr><td>gpt-3.5-turbo-1106 </td><td>chat </td><td>openai </td><td>The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens. Learn more.</td></tr>\n",
"<tr><td>gpt-3.5-turbo </td><td>chat </td><td>openai </td><td>Currently points to gpt-3.5-turbo-0613. </td></tr>\n",
"<tr><td>gpt-3.5-turbo-16k </td><td>chat </td><td>openai </td><td>Currently points to gpt-3.5-turbo-0613. </td></tr>\n",
"<tr><td>gpt-3.5-turbo-instruct</td><td>llm </td><td>openai </td><td>Similar capabilities as text-davinci-003 but compatible with legacy Completions endpoint and not Chat Completions. </td></tr>\n",
"<tr><td>gpt-3.5-turbo-0613 </td><td>chat </td><td>openai </td><td>Legacy Snapshot of gpt-3.5-turbo from June 13th 2023. Will be deprecated on June 13, 2024. </td></tr>\n",
"<tr><td>gpt-3.5-turbo-16k-0613</td><td>chat </td><td>openai </td><td>Legacy Snapshot of gpt-3.5-16k-turbo from June 13th 2023. Will be deprecated on June 13, 2024. </td></tr>\n",
"<tr><td>gpt-3.5-turbo-0301 </td><td>chat </td><td>openai </td><td>Legacy Snapshot of gpt-3.5-turbo from March 1st 2023. Will be deprecated on June 13th 2024. </td></tr>\n",
"<tr><td>text-davinci-003 </td><td>llm </td><td>openai </td><td>Legacy Can do language tasks with better quality and consistency than the curie, babbage, or ada models. Will be deprecated on Jan 4th 2024. </td></tr>\n",
"<tr><td>text-davinci-002 </td><td>llm </td><td>openai </td><td>Legacy Similar capabilities to text-davinci-003 but trained with supervised fine-tuning instead of reinforcement learning. Will be deprecated on Jan 4th 2024. </td></tr>\n",
"<tr><td>code-davinci-002 </td><td>llm </td><td>openai </td><td>Legacy Optimized for code-completion tasks. Will be deprecated on Jan 4th 2024. </td></tr>\n",
"<tr><td>llama-v2-7b-chat </td><td>chat </td><td>fireworks </td><td>7b parameter LlamaChat model </td></tr>\n",
"<tr><td>llama-v2-13b-chat </td><td>chat </td><td>fireworks </td><td>13b parameter LlamaChat model </td></tr>\n",
"<tr><td>llama-v2-70b-chat </td><td>chat </td><td>fireworks </td><td>70b parameter LlamaChat model </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"ModelRegistry(registered_models=[RegisteredModel(name='gpt-3.5-turbo-1106', provider='openai', description='The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens. Learn more.', params={'model': 'gpt-3.5-turbo-1106'}, type='chat', path=None), RegisteredModel(name='gpt-3.5-turbo', provider='openai', description='Currently points to gpt-3.5-turbo-0613.', params={'model': 'gpt-3.5-turbo'}, type='chat', path=None), RegisteredModel(name='gpt-3.5-turbo-16k', provider='openai', description='Currently points to gpt-3.5-turbo-0613.', params={'model': 'gpt-3.5-turbo-16k'}, type='chat', path=None), RegisteredModel(name='gpt-3.5-turbo-instruct', provider='openai', description='Similar capabilities as text-davinci-003 but compatible with legacy Completions endpoint and not Chat Completions.', params={'model': 'gpt-3.5-turbo-instruct'}, type='llm', path=None), RegisteredModel(name='gpt-3.5-turbo-0613', provider='openai', description='Legacy Snapshot of gpt-3.5-turbo from June 13th 2023. Will be deprecated on June 13, 2024.', params={'model': 'gpt-3.5-turbo-0613'}, type='chat', path=None), RegisteredModel(name='gpt-3.5-turbo-16k-0613', provider='openai', description='Legacy Snapshot of gpt-3.5-16k-turbo from June 13th 2023. Will be deprecated on June 13, 2024.', params={'model': 'gpt-3.5-turbo-16k-0613'}, type='chat', path=None), RegisteredModel(name='gpt-3.5-turbo-0301', provider='openai', description='Legacy Snapshot of gpt-3.5-turbo from March 1st 2023. Will be deprecated on June 13th 2024.', params={'model': 'gpt-3.5-turbo-0301'}, type='chat', path=None), RegisteredModel(name='text-davinci-003', provider='openai', description='Legacy Can do language tasks with better quality and consistency than the curie, babbage, or ada models. Will be deprecated on Jan 4th 2024.', params={'model': 'text-davinci-003'}, type='llm', path=None), RegisteredModel(name='text-davinci-002', provider='openai', description='Legacy Similar capabilities to text-davinci-003 but trained with supervised fine-tuning instead of reinforcement learning. Will be deprecated on Jan 4th 2024.', params={'model': 'text-davinci-002'}, type='llm', path=None), RegisteredModel(name='code-davinci-002', provider='openai', description='Legacy Optimized for code-completion tasks. Will be deprecated on Jan 4th 2024.', params={'model': 'code-davinci-002'}, type='llm', path=None), RegisteredModel(name='llama-v2-7b-chat', provider='fireworks', description='7b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-7b-chat'}, type='chat', path=None), RegisteredModel(name='llama-v2-13b-chat', provider='fireworks', description='13b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-13b-chat'}, type='chat', path=None), RegisteredModel(name='llama-v2-70b-chat', provider='fireworks', description='70b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-70b-chat'}, type='chat', path=None)])"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_registry"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "64bfc631-1f1e-4cf4-8636-b8be7b46fef8",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<tbody>\n",
"<tr><td>name </td><td>gpt-3.5-turbo-1106 </td></tr>\n",
"<tr><td>type </td><td>chat </td></tr>\n",
"<tr><td>provider </td><td>openai </td></tr>\n",
"<tr><td>description</td><td>The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens. Learn more.</td></tr>\n",
"<tr><td>model_path </td><td>langchain.chat_models.openai.ChatOpenAI </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"RegisteredModel(name='gpt-3.5-turbo-1106', provider='openai', description='The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens. Learn more.', params={'model': 'gpt-3.5-turbo-1106'}, type='chat', path=None)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"registered_model = model_registry[0]\n",
"registered_model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3604d49e-afbe-48ad-ac10-1e538b1ad376",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"model = registered_model.get_model()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bdece532-9843-427a-a10b-4545ed4ec151",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"AIMessage(content='Hello there! How can I assist you today?')"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.invoke('hello!')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "db40d4da-dc70-4e6d-b7e8-61de1e15ed2e",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<thead>\n",
"<tr><th>Name </th><th>Type </th><th>Provider </th><th>Description </th></tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr><td>gpt-3.5-turbo-1106</td><td>chat </td><td>openai </td><td>The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens. Learn more.</td></tr>\n",
"<tr><td>gpt-3.5-turbo </td><td>chat </td><td>openai </td><td>Currently points to gpt-3.5-turbo-0613. </td></tr>\n",
"<tr><td>gpt-3.5-turbo-16k </td><td>chat </td><td>openai </td><td>Currently points to gpt-3.5-turbo-0613. </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"ModelRegistry(registered_models=[RegisteredModel(name='gpt-3.5-turbo-1106', provider='openai', description='The latest GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens. Learn more.', params={'model': 'gpt-3.5-turbo-1106'}, type='chat', path=None), RegisteredModel(name='gpt-3.5-turbo', provider='openai', description='Currently points to gpt-3.5-turbo-0613.', params={'model': 'gpt-3.5-turbo'}, type='chat', path=None), RegisteredModel(name='gpt-3.5-turbo-16k', provider='openai', description='Currently points to gpt-3.5-turbo-0613.', params={'model': 'gpt-3.5-turbo-16k'}, type='chat', path=None)])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_registry[:3]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9874846a-52f3-4921-b1ed-0858521bb9a9",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<thead>\n",
"<tr><th>Name </th><th>Type </th><th>Provider </th><th>Description </th></tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr><td>llama-v2-7b-chat </td><td>chat </td><td>fireworks </td><td>7b parameter LlamaChat model </td></tr>\n",
"<tr><td>llama-v2-13b-chat</td><td>chat </td><td>fireworks </td><td>13b parameter LlamaChat model</td></tr>\n",
"<tr><td>llama-v2-70b-chat</td><td>chat </td><td>fireworks </td><td>70b parameter LlamaChat model</td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"ModelRegistry(registered_models=[RegisteredModel(name='llama-v2-7b-chat', provider='fireworks', description='7b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-7b-chat'}, type='chat', path=None), RegisteredModel(name='llama-v2-13b-chat', provider='fireworks', description='13b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-13b-chat'}, type='chat', path=None), RegisteredModel(name='llama-v2-70b-chat', provider='fireworks', description='70b parameter LlamaChat model', params={'model': 'accounts/fireworks/models/llama-v2-70b-chat'}, type='chat', path=None)])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_registry.filter(provider='fireworks')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "eb531591-f46b-4745-ae67-4dfd6217ec5f",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gpt-3.5-turbo-1106\n",
"gpt-3.5-turbo\n",
"gpt-3.5-turbo-16k\n",
"gpt-3.5-turbo-instruct\n",
"gpt-3.5-turbo-0613\n",
"gpt-3.5-turbo-16k-0613\n",
"gpt-3.5-turbo-0301\n",
"text-davinci-003\n",
"text-davinci-002\n",
"code-davinci-002\n",
"llama-v2-7b-chat\n",
"llama-v2-13b-chat\n",
"llama-v2-70b-chat\n"
]
}
],
"source": [
"for registered_model in model_registry:\n",
" print(registered_model.name)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
File diff suppressed because one or more lines are too long
@@ -0,0 +1,610 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9fa3470d-9448-4792-9f65-6978fc58cf84",
"metadata": {},
"source": [
"# Multi-modal eval: Baseline\n",
"\n",
"`Multi-modal slide decks` is a public dataset that contains a dataset of question-answer pairs from slide decks with visual content.\n",
"\n",
"The question-answer pairs are derived from the visual content in the decks, testing the ability of RAG to perform visual reasoning.\n",
"\n",
"As a baseline, we evaluate this dataset using text-based RAG pipeline, below.\n",
"\n",
"This will not reason about visual content and will simply load the text from the slides. \n",
"\n",
"## Pre-requisites"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47220461-d4e9-4f1d-9c57-672ca947ca0d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install -U langchain langsmith langchain_benchmarks\n",
"# %pip install --quiet chromadb openai pypdf pandas"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "196de967-6de6-40da-aa75-e836923ab5e3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
"env_vars = [\"LANGCHAIN_API_KEY\", \"OPENAI_API_KEY\"]\n",
"for var in env_vars:\n",
" if var not in os.environ:\n",
" os.environ[var] = getpass.getpass(prompt=f\"Enter your {var}: \")"
]
},
{
"cell_type": "markdown",
"id": "10da8e11-6288-4131-bd60-d5aa86928acc",
"metadata": {},
"source": [
"## Dataset\n",
"\n",
"We can browse the available LangChain benchmark datasets for retrieval."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2ff97905-14a6-413c-99be-58b7a9c8d4c1",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<thead>\n",
"<tr><th>Name </th><th>Type </th><th>Dataset ID </th><th>Description </th></tr>\n",
"</thead>\n",
"<tbody>\n",
"<tr><td>LangChain Docs Q&A </td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d\" target=\"_blank\" rel=\"noopener\">452ccafc-18e1-4314-885b-edd735f17b9d</a></td><td>Questions and answers based on a snapshot of the LangChain python docs.\n",
"\n",
"The environment provides the documents and the retriever information.\n",
"\n",
"Each example is composed of a question and reference answer.\n",
"\n",
"Success is measured based on the accuracy of the answer relative to the reference answer.\n",
"We also measure the faithfulness of the model's response relative to the retrieved documents (if any). </td></tr>\n",
"<tr><td>Semi-structured Reports</td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d\" target=\"_blank\" rel=\"noopener\">c47d9617-ab99-4d6e-a6e6-92b8daf85a7d</a></td><td>Questions and answers based on PDFs containing tables and charts.\n",
"\n",
"The task provides the raw documents as well as factory methods to easily index them\n",
"and create a retriever.\n",
"\n",
"Each example is composed of a question and reference answer.\n",
"\n",
"Success is measured based on the accuracy of the answer relative to the reference answer.\n",
"We also measure the faithfulness of the model's response relative to the retrieved documents (if any). </td></tr>\n",
"<tr><td>Multi-modal slide decks</td><td>RetrievalTask</td><td><a href=\"https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d\" target=\"_blank\" rel=\"noopener\">40afc8e7-9d7e-44ed-8971-2cae1eb59731</a></td><td>This public dataset is a work-in-progress and will be extended over time.\n",
" \n",
"Questions and answers based on slide decks containing visual tables and charts.\n",
"\n",
"Each example is composed of a question and reference answer.\n",
"\n",
"Success is measured based on the accuracy of the answer relative to the reference answer. </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"Registry(tasks=[RetrievalTask(name='LangChain Docs Q&A', dataset_id='https://smith.langchain.com/public/452ccafc-18e1-4314-885b-edd735f17b9d/d', description=\"Questions and answers based on a snapshot of the LangChain python docs.\\n\\nThe environment provides the documents and the retriever information.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", get_docs=<function load_cached_docs at 0x104485800>, retriever_factories={'basic': <function _chroma_retriever_factory at 0x1360289a0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x136028a40>, 'hyde': <function _chroma_hyde_retriever_factory at 0x136028ae0>}, architecture_factories={'conversational-retrieval-qa': <function default_response_chain at 0x126ba2660>}), RetrievalTask(name='Semi-structured Reports', dataset_id='https://smith.langchain.com/public/c47d9617-ab99-4d6e-a6e6-92b8daf85a7d/d', description=\"Questions and answers based on PDFs containing tables and charts.\\n\\nThe task provides the raw documents as well as factory methods to easily index them\\nand create a retriever.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\nWe also measure the faithfulness of the model's response relative to the retrieved documents (if any).\\n\", get_docs=<function load_docs at 0x136029620>, retriever_factories={'basic': <function _chroma_retriever_factory at 0x1360296c0>, 'parent-doc': <function _chroma_parent_document_retriever_factory at 0x136029760>, 'hyde': <function _chroma_hyde_retriever_factory at 0x136029800>}, architecture_factories={}), RetrievalTask(name='Multi-modal slide decks', dataset_id='https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d', description='This public dataset is a work-in-progress and will be extended over time.\\n \\nQuestions and answers based on slide decks containing visual tables and charts.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\n', get_docs={}, retriever_factories={}, architecture_factories={})])"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain_benchmarks import clone_public_dataset, registry\n",
"\n",
"registry = registry.filter(Type=\"RetrievalTask\")\n",
"registry"
]
},
{
"cell_type": "markdown",
"id": "2fb7dc3d-28f1-4c28-b0d0-3784d04b81ce",
"metadata": {},
"source": [
"`Multi-modal slide decks` is the relevant dataset for our task."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "219a4141-4a5f-48e4-ae05-5a824e2193fd",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<tbody>\n",
"<tr><td>Name </td><td>Multi-modal slide decks </td></tr>\n",
"<tr><td>Type </td><td>RetrievalTask </td></tr>\n",
"<tr><td>Dataset ID </td><td><a href=\"https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d\" target=\"_blank\" rel=\"noopener\">40afc8e7-9d7e-44ed-8971-2cae1eb59731</a></td></tr>\n",
"<tr><td>Description </td><td>This public dataset is a work-in-progress and will be extended over time.\n",
" \n",
"Questions and answers based on slide decks containing visual tables and charts.\n",
"\n",
"Each example is composed of a question and reference answer.\n",
"\n",
"Success is measured based on the accuracy of the answer relative to the reference answer. </td></tr>\n",
"<tr><td>Retriever Factories </td><td> </td></tr>\n",
"<tr><td>Architecture Factories</td><td> </td></tr>\n",
"<tr><td>get_docs </td><td>{} </td></tr>\n",
"</tbody>\n",
"</table>"
],
"text/plain": [
"RetrievalTask(name='Multi-modal slide decks', dataset_id='https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d', description='This public dataset is a work-in-progress and will be extended over time.\\n \\nQuestions and answers based on slide decks containing visual tables and charts.\\n\\nEach example is composed of a question and reference answer.\\n\\nSuccess is measured based on the accuracy of the answer relative to the reference answer.\\n', get_docs={}, retriever_factories={}, architecture_factories={})"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"task = registry[\"Multi-modal slide decks\"]\n",
"task"
]
},
{
"cell_type": "markdown",
"id": "2d6569b5-e79a-41b7-9745-c2f8a1dd704e",
"metadata": {},
"source": [
"Clone the dataset so that it's available in our LangSmith datasets."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d2caa086-9549-4c74-bba9-ba80d5a7b218",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset Multi-modal slide decks already exists. Skipping.\n",
"You can access the dataset at https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08a29acb-5ad6-42ce-a482-574c9e2e5306.\n"
]
}
],
"source": [
"clone_public_dataset(task.dataset_id, dataset_name=task.name)"
]
},
{
"cell_type": "markdown",
"id": "bf350917-a1e5-46f4-81cd-c1678ab9220f",
"metadata": {},
"source": [
"Fetch the associated PDFs from remote cache for the dataset so that we can perform ingestion."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "99ce6afb-2317-4bc1-9faf-4f828095ad91",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_benchmarks.rag.tasks.multi_modal_slide_decks import get_file_names\n",
"\n",
"file_names = list(get_file_names()) # PosixPath"
]
},
{
"cell_type": "markdown",
"id": "848a4cdb-6c08-4c01-81ce-16ab83a7fdff",
"metadata": {},
"source": [
"## Load\n",
"\n",
"Load and split the files for indexing."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6ce85810-98a7-406e-b44e-ce860ac35986",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are 98 text elements in DDOG_Q3_earnings_deck.pdf\n"
]
}
],
"source": [
"from langchain.document_loaders import PyPDFLoader\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"\n",
"\n",
"def load_and_split(file):\n",
" \"\"\"\n",
" Load and split PDF files\n",
" :param file: PosixPath path for pdf\n",
" :return: A list of text chunks\n",
" \"\"\"\n",
"\n",
" loader = PyPDFLoader(str(file))\n",
" pdf_pages = loader.load()\n",
"\n",
" text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
" chunk_size=100, chunk_overlap=50\n",
" )\n",
"\n",
" # Get chunks\n",
" docs = text_splitter.split_documents(pdf_pages)\n",
" texts = [d.page_content for d in docs]\n",
" print(f\"There are {len(texts)} text elements in {file.name}\")\n",
" return texts\n",
"\n",
"\n",
"texts = []\n",
"for fi in file_names:\n",
" texts.extend(load_and_split(fi))"
]
},
{
"cell_type": "markdown",
"id": "eb01925d-b7d1-47a1-bd90-805178d3c4a9",
"metadata": {},
"source": [
"## Index\n",
"\n",
"Embed (OpenAIEmbeddings) and store splits in a vectorstore (Chroma)."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ceb31f71-45fb-4b12-bc1c-31981de334bb",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores import Chroma\n",
"\n",
"vectorstore_baseline = Chroma.from_texts(\n",
" texts=texts, collection_name=\"baseline-multi-modal\", embedding=OpenAIEmbeddings()\n",
")\n",
"\n",
"retriever_baseline = vectorstore_baseline.as_retriever()"
]
},
{
"cell_type": "markdown",
"id": "e6dcbb01-f480-456d-b972-c732eb26c393",
"metadata": {},
"source": [
"## RAG\n",
"\n",
"Create a pipeline for retrieval of relevant chunks based on semantic similarity to the input question.\n",
"\n",
"Pass the images to GPT-4 for answer synthesis."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ea233664-e527-42f1-a820-0c2271e16c20",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts import ChatPromptTemplate\n",
"from langchain.schema.output_parser import StrOutputParser\n",
"from langchain.schema.runnable import RunnablePassthrough\n",
"\n",
"\n",
"def rag_chain(retriever):\n",
" \"\"\"\n",
" RAG pipeline for the indexed presentations\n",
" :param retriever: PosixPath path for pdf\n",
" \"\"\"\n",
"\n",
" # Prompt template\n",
" template = \"\"\"Answer the question based only on the following context, which can include text and tables:\n",
" {context}\n",
" Question: {question}\n",
" \"\"\"\n",
" prompt = ChatPromptTemplate.from_template(template)\n",
"\n",
" # LLM\n",
" model = ChatOpenAI(temperature=0, model=\"gpt-4\")\n",
"\n",
" # RAG pipeline\n",
" chain = (\n",
" {\n",
" \"context\": retriever | (lambda x: \"\\n\\n\".join([i.page_content for i in x])),\n",
" \"question\": RunnablePassthrough(),\n",
" }\n",
" | prompt\n",
" | model\n",
" | StrOutputParser()\n",
" )\n",
" return chain\n",
"\n",
"\n",
"# Create RAG chain\n",
"chain = rag_chain(retriever_baseline)"
]
},
{
"cell_type": "markdown",
"id": "95df1446-143d-4f4c-a15b-2a379266d8bf",
"metadata": {},
"source": [
"## Eval\n",
"\n",
"Run evaluation on our dataset:\n",
"\n",
"* `task.name` is the dataset of QA pairs that we cloned\n",
"* `eval_config` specifies the [LangSmith evaluator](https://docs.smith.langchain.com/evaluation/evaluator-implementations#correctness-qa-evaluation) for our dataset, which will use GPT-4 as a grader\n",
"* The grader will evaluate the chain-generated answer to each question relative to ground truth"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "479ce09d-642e-4b3b-9e4e-e9c2b7f0e9ca",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"View the evaluation results for project '866f-baseline' at:\n",
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08a29acb-5ad6-42ce-a482-574c9e2e5306/compare?selectedSessions=30199d47-50d7-4c5c-a55a-e74157e05951\n",
"\n",
"View all tests for Dataset Multi-modal slide decks at:\n",
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/08a29acb-5ad6-42ce-a482-574c9e2e5306\n",
"[------------------------------------------------->] 10/10"
]
},
{
"data": {
"text/html": [
"<h3>Experiment Results:</h3>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>output</th>\n",
" <th>feedback.COT Contextual Accuracy</th>\n",
" <th>error</th>\n",
" <th>execution_time</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>10</td>\n",
" <td>10.000000</td>\n",
" <td>0</td>\n",
" <td>10.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>10</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>Datadog has 20 total customers.</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>0.200000</td>\n",
" <td>NaN</td>\n",
" <td>4.674478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>0.421637</td>\n",
" <td>NaN</td>\n",
" <td>0.864273</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>3.307960</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>4.113816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>4.700962</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>5.018359</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>6.188082</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" output feedback.COT Contextual Accuracy \\\n",
"count 10 10.000000 \n",
"unique 10 NaN \n",
"top Datadog has 20 total customers. NaN \n",
"freq 1 NaN \n",
"mean NaN 0.200000 \n",
"std NaN 0.421637 \n",
"min NaN 0.000000 \n",
"25% NaN 0.000000 \n",
"50% NaN 0.000000 \n",
"75% NaN 0.000000 \n",
"max NaN 1.000000 \n",
"\n",
" error execution_time \n",
"count 0 10.000000 \n",
"unique 0 NaN \n",
"top NaN NaN \n",
"freq NaN NaN \n",
"mean NaN 4.674478 \n",
"std NaN 0.864273 \n",
"min NaN 3.307960 \n",
"25% NaN 4.113816 \n",
"50% NaN 4.700962 \n",
"75% NaN 5.018359 \n",
"max NaN 6.188082 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import uuid\n",
"\n",
"from langchain.smith import RunEvalConfig\n",
"from langsmith.client import Client\n",
"\n",
"# Evaluator configuration\n",
"client = Client()\n",
"eval_config = RunEvalConfig(\n",
" evaluators=[\"cot_qa\"],\n",
")\n",
"\n",
"# Experiments\n",
"chain_map = {\n",
" \"baseline\": chain,\n",
"}\n",
"\n",
"# Run evaluation\n",
"run_id = uuid.uuid4().hex[:4]\n",
"test_runs = {}\n",
"for project_name, chain in chain_map.items():\n",
" test_runs[project_name] = client.run_on_dataset(\n",
" dataset_name=task.name,\n",
" llm_or_chain_factory=lambda: (lambda x: x[\"Question\"]) | chain,\n",
" evaluation=eval_config,\n",
" verbose=True,\n",
" project_name=f\"{run_id}-{project_name}\",\n",
" project_metadata={\"chain\": project_name},\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+2
View File
@@ -34,5 +34,7 @@
./notebooks/retrieval/intro
./notebooks/retrieval/langchain_docs_qa
./notebooks/retrieval/semi_structured
./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval_baseline
./notebooks/retrieval/multi_modal_benchmarking/multi_modal_eval
./notebooks/retrieval/comparing_techniques
```
+8 -1
View File
@@ -1,3 +1,4 @@
from langchain_benchmarks.model_registration import model_registry
from langchain_benchmarks.registration import registry
from langchain_benchmarks.utils._langsmith import (
clone_public_dataset,
@@ -5,4 +6,10 @@ from langchain_benchmarks.utils._langsmith import (
)
# Please keep this list sorted!
__all__ = ["clone_public_dataset", "download_public_dataset", "registry"]
__all__ = [
"clone_public_dataset",
"download_public_dataset",
"registry",
"model_registry",
]
+152
View File
@@ -0,0 +1,152 @@
from __future__ import annotations
from langchain_benchmarks.schema import RegisteredModel, ModelRegistry
_OpenAIModels = [
RegisteredModel(
provider="openai",
name="gpt-3.5-turbo-1106",
type="chat",
description=(
"The latest GPT-3.5 Turbo model with improved instruction following, "
"JSON mode, reproducible outputs, parallel function calling, and more. "
"Returns a maximum of 4,096 output tokens. Learn more."
),
params={
"model": "gpt-3.5-turbo-1106",
},
),
RegisteredModel(
provider="openai",
name="gpt-3.5-turbo",
type="chat",
description="Currently points to gpt-3.5-turbo-0613.",
params={
"model": "gpt-3.5-turbo",
},
),
RegisteredModel(
provider="openai",
name="gpt-3.5-turbo-16k",
type="chat",
description="Currently points to gpt-3.5-turbo-0613.",
params={
"model": "gpt-3.5-turbo-16k",
},
),
RegisteredModel(
provider="openai",
name="gpt-3.5-turbo-instruct",
type="llm",
description=(
"Similar capabilities as text-davinci-003 but compatible with legacy "
"Completions endpoint and not Chat Completions."
),
params={
"model": "gpt-3.5-turbo-instruct",
},
),
RegisteredModel(
provider="openai",
name="gpt-3.5-turbo-0613",
type="chat",
description=(
"Legacy Snapshot of gpt-3.5-turbo from June 13th 2023. "
"Will be deprecated on June 13, 2024."
),
params={
"model": "gpt-3.5-turbo-0613",
},
),
RegisteredModel(
provider="openai",
name="gpt-3.5-turbo-16k-0613",
type="chat",
description=(
"Legacy Snapshot of gpt-3.5-16k-turbo from June 13th 2023. "
"Will be deprecated on June 13, 2024."
),
params={
"model": "gpt-3.5-turbo-16k-0613",
},
),
RegisteredModel(
provider="openai",
name="gpt-3.5-turbo-0301",
type="chat",
description=(
"Legacy Snapshot of gpt-3.5-turbo from March 1st 2023. "
"Will be deprecated on June 13th 2024."
),
params={
"model": "gpt-3.5-turbo-0301",
},
),
RegisteredModel(
provider="openai",
name="text-davinci-003",
type="llm",
description=(
"Legacy Can do language tasks with better quality and consistency than "
"the curie, babbage, or ada models. Will be deprecated on Jan 4th 2024."
),
params={
"model": "text-davinci-003",
},
),
RegisteredModel(
provider="openai",
name="text-davinci-002",
type="llm",
description=(
"Legacy Similar capabilities to text-davinci-003 but trained with "
"supervised fine-tuning instead of reinforcement learning. "
"Will be deprecated on Jan 4th 2024."
),
params={
"model": "text-davinci-002",
},
),
RegisteredModel(
provider="openai",
name="code-davinci-002",
type="llm",
description="Legacy Optimized for code-completion tasks. Will be deprecated "
"on Jan 4th 2024.",
params={
"model": "code-davinci-002",
},
),
]
_FireworksModels = [
RegisteredModel(
provider="fireworks",
name="llama-v2-7b-chat",
type="chat",
description="7b parameter LlamaChat model",
params={
"model": "accounts/fireworks/models/llama-v2-7b-chat",
},
),
RegisteredModel(
provider="fireworks",
name="llama-v2-13b-chat",
type="chat",
description="13b parameter LlamaChat model",
params={
"model": "accounts/fireworks/models/llama-v2-13b-chat",
},
),
RegisteredModel(
provider="fireworks",
name="llama-v2-70b-chat",
type="chat",
description="70b parameter LlamaChat model",
params={
"model": "accounts/fireworks/models/llama-v2-70b-chat",
},
),
]
model_registry = ModelRegistry(registered_models=_OpenAIModels + _FireworksModels)
@@ -0,0 +1 @@
pdfs/
+8 -1
View File
@@ -1,7 +1,14 @@
from langchain_benchmarks.rag.tasks.langchain_docs.task import LANGCHAIN_DOCS_TASK
from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.task import (
MULTI_MODAL_SLIDE_DECKS_TASK,
)
from langchain_benchmarks.rag.tasks.semi_structured_reports.task import (
SEMI_STRUCTURED_REPORTS_TASK,
)
# Please keep this sorted
__all__ = ["LANGCHAIN_DOCS_TASK", "SEMI_STRUCTURED_REPORTS_TASK"]
__all__ = [
"LANGCHAIN_DOCS_TASK",
"SEMI_STRUCTURED_REPORTS_TASK",
"MULTI_MODAL_SLIDE_DECKS_TASK",
]
@@ -0,0 +1,5 @@
from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.indexing.retriever_registry import (
get_file_names,
)
__all__ = ["get_file_names"]
@@ -0,0 +1,5 @@
from langchain_benchmarks.rag.tasks.multi_modal_slide_decks.indexing.retriever_registry import (
get_file_names,
)
__all__ = ["get_file_names"]
@@ -0,0 +1,39 @@
import logging
import os
import zipfile
from pathlib import Path
from typing import Iterable, Optional
from langchain_benchmarks.rag.utils._downloading import (
fetch_remote_file,
is_folder_populated,
)
logger = logging.getLogger(__name__)
_DIRECTORY = Path(os.path.abspath(__file__)).parent
# Stores the zipped pdfs for this dataset
REMOTE_DOCS_FILE = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/multi_modal_slide_decks.zip"
DOCS_DIR = _DIRECTORY / "pdfs"
def fetch_raw_docs(
filename: Optional[str] = None, docs_dir: Optional[str] = None
) -> None:
filename = filename or _DIRECTORY / Path(REMOTE_DOCS_FILE).name
docs_dir = docs_dir or DOCS_DIR
if not is_folder_populated(docs_dir):
fetch_remote_file(REMOTE_DOCS_FILE, filename)
with zipfile.ZipFile(filename, "r") as zip_ref:
zip_ref.extractall(docs_dir)
os.remove(filename)
def get_file_names() -> Iterable[Path]:
fetch_raw_docs()
# Traverse the directory and partition the pdfs
for path in DOCS_DIR.rglob("*.pdf"):
# Ignore __MACOSX
if "__MACOSX" in str(path):
continue
yield path
@@ -0,0 +1,23 @@
from langchain_benchmarks.schema import RetrievalTask
# ID of public Multi Modal Slide Decks dataset
DATASET_ID = "https://smith.langchain.com/public/40afc8e7-9d7e-44ed-8971-2cae1eb59731/d"
MULTI_MODAL_SLIDE_DECKS_TASK = RetrievalTask(
name="Multi-modal slide decks",
dataset_id=DATASET_ID,
retriever_factories={},
architecture_factories={},
get_docs={},
description=(
"""\
This public dataset is a work-in-progress and will be extended over time.
Questions and answers based on slide decks containing visual tables and charts.
Each example is composed of a question and reference answer.
Success is measured based on the accuracy of the answer relative to the reference answer.
""" # noqa: E501
),
)
@@ -24,7 +24,6 @@ _DIRECTORY = Path(os.path.abspath(__file__)).parent
# Stores the zipped pdfs for this dataset
REMOTE_DOCS_FILE = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/semi_structured_earnings.zip"
DOCS_DIR = _DIRECTORY / "pdfs"
LOCAL_FILE = _DIRECTORY / "chroma_db.zip"
_DEFAULT_SEARCH_KWARGS = {"k": 6}
@@ -32,17 +31,17 @@ _DEFAULT_SEARCH_KWARGS = {"k": 6}
def fetch_raw_docs(
filename: Optional[str] = None, docs_dir: Optional[str] = None
) -> None:
filename = filename or LOCAL_FILE
filename = filename or _DIRECTORY / Path(REMOTE_DOCS_FILE).name
docs_dir = docs_dir or DOCS_DIR
if not is_folder_populated(docs_dir):
fetch_remote_file(REMOTE_DOCS_FILE, filename)
with zipfile.ZipFile(filename, "r") as zip_ref:
zip_ref.extractall(docs_dir)
os.remove(LOCAL_FILE)
os.remove(filename)
def get_file_names():
def get_file_names() -> Iterable[Path]:
fetch_raw_docs()
# Traverse the directory and partition the pdfs
for path in DOCS_DIR.glob("*.pdf"):
+2
View File
@@ -3,6 +3,7 @@
from langchain_benchmarks.extraction.tasks import chat_extraction, email_task
from langchain_benchmarks.rag.tasks import (
LANGCHAIN_DOCS_TASK,
MULTI_MODAL_SLIDE_DECKS_TASK,
SEMI_STRUCTURED_REPORTS_TASK,
)
from langchain_benchmarks.schema import Registry
@@ -24,5 +25,6 @@ registry = Registry(
chat_extraction.CHAT_EXTRACTION_TASK,
LANGCHAIN_DOCS_TASK,
SEMI_STRUCTURED_REPORTS_TASK,
MULTI_MODAL_SLIDE_DECKS_TASK,
]
)
+202 -6
View File
@@ -2,8 +2,12 @@
from __future__ import annotations
import dataclasses
import importlib
import urllib
from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union, Sequence
from langchain_core.language_models import BaseChatModel, BaseLanguageModel
from typing_extensions import Literal
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseRetriever
@@ -109,12 +113,16 @@ class ExtractionTask(BaseTask):
@dataclasses.dataclass(frozen=True)
class RetrievalTask(BaseTask):
retriever_factories: Dict[str, Callable[[Embeddings], BaseRetriever]] # noqa: F821
"""Factories that index the docs using the specified strategy."""
architecture_factories: Dict[str, Callable[[Embeddings], BaseRetriever]] # noqa: F821
"""Factories methods that help build some off-the-shelf architectures。"""
get_docs: Callable[..., Iterable[Document]]
get_docs: Optional[Callable[..., Iterable[Document]]] = None
"""A function that returns the documents to be indexed."""
retriever_factories: Dict[
str, Callable[[Embeddings], BaseRetriever]
] = dataclasses.field(default_factory=dict) # noqa: F821
"""Factories that index the docs using the specified strategy."""
architecture_factories: Dict[
str, Callable[[Embeddings], BaseRetriever]
] = dataclasses.field(default_factory=dict) # noqa: F821
"""Factories methods that help build some off-the-shelf architectures。"""
@property
def _table(self) -> List[List[str]]:
@@ -149,6 +157,7 @@ class Registry:
raise ValueError(
f"Duplicate task name {task.name}. " f"Task names must be unique."
)
seen_names.add(task.name)
def _repr_html_(self) -> str:
"""Return an HTML representation of the registry."""
@@ -206,3 +215,190 @@ class Registry:
if not isinstance(task, BaseTask):
raise TypeError("Only tasks can be added to the registry.")
self.tasks.append(task)
Provider = Literal["fireworks", "openai"]
ModelType = Literal["chat", "llm"]
AUTHORIZED_NAMESPACES = {"langchain"}
def _get_model_class_from_path(
path: str
) -> Union[Type[BaseChatModel], Type[BaseLanguageModel]]:
"""Get the class of the model."""
module_name, attribute_name = path.rsplit(".", 1)
top_namespace = path.split(".")[0]
if top_namespace not in AUTHORIZED_NAMESPACES:
raise ValueError(
f"Unauthorized namespace {top_namespace}. "
f"Authorized namespaces are: {AUTHORIZED_NAMESPACES}"
)
# Import the module dynamically
module = importlib.import_module(module_name)
model_class = getattr(module, attribute_name)
if not issubclass(model_class, (BaseLanguageModel, BaseChatModel)):
raise ValueError(
f"Model class {model_class} is not a subclass of BaseLanguageModel"
)
return model_class
def _get_default_path(provider: str, type_: ModelType) -> str:
"""Get the default path for a model."""
paths = {
("fireworks", "chat"): "langchain.chat_models.fireworks.ChatFireworks",
("fireworks", "llm"): "langchain.language_models.fireworks.Fireworks",
("openai", "chat"): "langchain.chat_models.openai.ChatOpenAI",
("openai", "llm"): "langchain.language_models.openai.OpenAI",
}
if (provider, type_) not in paths:
raise ValueError(f"Unknown provider {provider} and type {type_}")
return paths[(provider, type_)]
@dataclasses.dataclass(frozen=True)
class RegisteredModel:
"""Descriptive information about a model.
This information can be used to instantiate the underlying model.
"""
name: str
provider: Provider
description: str
params: Dict[str, Any]
type: ModelType
# Path to the model class.
# For example, "langchain.chat_models.anthropic import ChatAnthropicModel"
path: Optional[str] = None # If not provided, will use default path
def get_model(
self, *, model_params: Optional[Dict[str, Any]] = None
) -> Union[BaseChatModel, BaseLanguageModel]:
"""Get the class of the model."""
all_params = {**self.params, **(model_params or {})}
model_class = _get_model_class_from_path(self.model_path)
return model_class(**all_params)
@property
def model_path(self) -> str:
"""Get the path of the model."""
return self.path or _get_default_path(self.provider, self.type)
@property
def _table(self) -> List[List[str]]:
"""Return a table representation of the environment."""
return [
["name", self.name],
["type", self.type],
["provider", self.provider],
["description", self.description],
["model_path", self.model_path],
]
def _repr_html_(self) -> str:
"""Return an HTML representation of the environment."""
return tabulate(
self._table,
tablefmt="unsafehtml",
)
StrFilter = Union[None, str, Sequence[str]]
def _is_in_filter(actual_value: str, filter_value: StrFilter) -> bool:
"""Filter for a string attribute."""
if filter_value is None:
return True
if isinstance(filter_value, str):
return actual_value == filter_value
return actual_value in filter_value
@dataclasses.dataclass(frozen=False)
class ModelRegistry:
registered_models: Sequence[RegisteredModel]
def __post_init__(self) -> None:
"""Validate that all the tasks have unique names and IDs."""
seen_names = set()
for model in self.registered_models:
if model.name in seen_names:
raise ValueError(
f"Duplicate model name {model.name}. " f"Task names must be unique."
)
seen_names.add(model.name)
def get_model(self, name: str) -> Optional[RegisteredModel]:
"""Get model info."""
return next(model for model in self.registered_models if model.name == name)
def filter(
self,
*,
type: StrFilter = None,
name: StrFilter = None,
provider: StrFilter = None,
) -> ModelRegistry:
"""Filter the tasks in the registry."""
models = self.registered_models
selected_models = []
for model in models:
if not _is_in_filter(model.type, type):
continue
if not _is_in_filter(model.name, name):
continue
if not _is_in_filter(model.provider, provider):
continue
selected_models.append(model)
return ModelRegistry(registered_models=selected_models)
def _repr_html_(self) -> str:
"""Return an HTML representation of the registry."""
headers = [
"Name",
"Type",
"Provider",
"Description",
]
table = [
[
model.name,
model.type,
model.provider,
model.description,
]
for model in self.registered_models
]
return tabulate(table, headers=headers, tablefmt="unsafehtml")
def __len__(self) -> int:
"""Return the number of tasks in the registry."""
return len(self.registered_models)
def __iter__(self) -> Iterable[RegisteredModel]:
"""Iterate over the tasks in the registry."""
return iter(self.registered_models)
def __getitem__(
self, key: Union[int, str]
) -> Union[RegisteredModel, ModelRegistry]:
"""Get an environment from the registry."""
if isinstance(key, slice):
return ModelRegistry(registered_models=self.registered_models[key])
elif isinstance(key, (int, str)):
# If key is an integer, return the corresponding environment
if isinstance(key, str):
return self.get_model(key)
else:
return self.registered_models[key]
else:
raise TypeError("Key must be an integer or a slice.")
+1 -1
View File
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-benchmarks"
version = "0.0.6"
version = "0.0.7"
description = "🦜💪 Flex those feathers!"
authors = ["LangChain AI"]
license = "MIT"
+68
View File
@@ -0,0 +1,68 @@
import pytest
from langchain_benchmarks.schema import RegisteredModel, ModelRegistry
# Create some sample RegisteredModel instances for testing
SAMPLE_MODELS = [
RegisteredModel(
"model1", "fireworks", "Description 1", {"param1": "value1"}, "chat"
),
RegisteredModel("model2", "openai", "Description 2", {"param2": "value2"}, "llm"),
]
@pytest.fixture
def sample_registry() -> ModelRegistry:
return ModelRegistry(SAMPLE_MODELS)
def test_init() -> None:
# Test the constructor of ModelRegistry
registry = ModelRegistry(SAMPLE_MODELS)
assert len(registry.registered_models) == 2
def test_get_model(sample_registry: ModelRegistry) -> None:
# Test the get_model method
model = sample_registry.get_model("model1")
assert model.name == "model1"
def test_filter(sample_registry: ModelRegistry) -> None:
# Test the filter method
filtered_registry = sample_registry.filter(type="chat")
assert len(filtered_registry.registered_models) == 1
assert filtered_registry.registered_models[0].type == "chat"
def test_repr_html(sample_registry: ModelRegistry) -> None:
# Test the _repr_html_ method
html_representation = sample_registry._repr_html_()
assert "<table>" in html_representation
def test_len(sample_registry: ModelRegistry) -> None:
# Test the __len__ method
assert len(sample_registry) == 2
def test_iter(sample_registry: ModelRegistry) -> None:
# Test the __iter__ method
models = list(iter(sample_registry))
assert len(models) == 2
assert isinstance(models[0], RegisteredModel)
def test_getitem(sample_registry: ModelRegistry) -> None:
# Test the __getitem__ method for integer and string keys
model = sample_registry[0]
assert model.name == "model1"
model = sample_registry["model2"]
assert model.name == "model2"
def test_getitem_slice(sample_registry: ModelRegistry) -> None:
# Test the __getitem__ method for slices
sliced_registry = sample_registry[:1]
assert len(sliced_registry.registered_models) == 1
assert sliced_registry.registered_models[0].name == "model1"