mirror of
https://github.com/langchain-ai/agent-evals.git
synced 2026-07-01 20:35:18 -04:00
split out datasets (#4)
This commit is contained in:
@@ -1,292 +0,0 @@
|
||||
## Company Data Enrichment
|
||||
|
||||
This directory contains the evaluation script for company data enrichment agents.
|
||||
|
||||
## Agent schema
|
||||
|
||||
High level, company data enrichment agents are expected to take a company and a JSON schema that describes the attributes to extract, and output extracted information in a JSON object. Here is an example:
|
||||
|
||||
- Input:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "object",
|
||||
"title": "company_data_enrichment_input",
|
||||
"required": [
|
||||
"company"
|
||||
],
|
||||
"properties": {
|
||||
"company": {
|
||||
"type": "string",
|
||||
"title": "Company"
|
||||
},
|
||||
"extraction_schema": {
|
||||
"type": "object",
|
||||
"title": "Extraction Schema"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- Output:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "object",
|
||||
"title": "company_data_enrichment_output",
|
||||
"required": [
|
||||
"info"
|
||||
],
|
||||
"properties": {
|
||||
"info": {
|
||||
"type": "object",
|
||||
"title": "Info"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
## Datasets
|
||||
|
||||
There are two public datasets available for evaluation in LangSmith:
|
||||
|
||||
- [Public companies](https://smith.langchain.com/public/640df79c-1831-494e-8824-d7300205dc8e/d). This dataset has a list of publicly traded companies to extract the following fields for:
|
||||
- `name`
|
||||
- `description`
|
||||
- `website`
|
||||
- `linkedin_profile`
|
||||
- `headquarters`
|
||||
- `employee_count`
|
||||
- `ceo`
|
||||
|
||||
Example input:
|
||||
```json
|
||||
{
|
||||
"company": "Nvidia",
|
||||
"extraction_schema": {
|
||||
"type": "object",
|
||||
"title": "company_info",
|
||||
"required": [
|
||||
"name",
|
||||
"description",
|
||||
"website",
|
||||
"linked_profile",
|
||||
"headquarters",
|
||||
"employee_count",
|
||||
"ceo"
|
||||
],
|
||||
"properties": {
|
||||
"ceo": {
|
||||
"type": "string",
|
||||
"description": "Name of the company's CEO"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Official company name"
|
||||
},
|
||||
"website": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "Company's official website URL"
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "Brief description of the company and its activities"
|
||||
},
|
||||
"headquarters": {
|
||||
"type": "string",
|
||||
"description": "Location of company headquarters, formatted as <city>, <state code> (e.g. San Francisco, CA)"
|
||||
},
|
||||
"employee_count": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Number of employees in the company"
|
||||
},
|
||||
"linkedin_profile": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "Company's LinkedIn profile URL"
|
||||
}
|
||||
},
|
||||
"description": "Company information"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```json
|
||||
{
|
||||
"info": {
|
||||
"ceo": "Jensen Huang",
|
||||
"name": "Nvidia Corporation",
|
||||
"website": "https://www.nvidia.com",
|
||||
"description": "Nvidia Corporation is a multinational technology company specializing in the design and manufacture of graphics processing units (GPUs) for gaming, professional visualization, data centers, and automotive markets. The company is a leader in artificial intelligence (AI) computing, providing platforms and solutions that power AI applications across various industries.",
|
||||
"headquarters": "Santa Clara, CA",
|
||||
"employee_count": 29600,
|
||||
"linkedin_profile": "https://www.linkedin.com/company/nvidia"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- [Startups](https://smith.langchain.com/public/afabd12a-62fa-4c09-b083-6b1742b4cc3a/d). This dataset has a list of AI startups to extract the following fields for:
|
||||
|
||||
- `name`
|
||||
- `description`
|
||||
- `website`
|
||||
- `crunchbase_profile`
|
||||
- `year_founded`
|
||||
- `ceo`
|
||||
- `total_funding_mm_usd`
|
||||
- `latest_round`
|
||||
- `latest_round_date`
|
||||
- `latest_round_amount_mm_usd`
|
||||
|
||||
Example input:
|
||||
|
||||
```json
|
||||
{
|
||||
"company": "LangChain",
|
||||
"extraction_schema": {
|
||||
"type": "object",
|
||||
"title": "company_info",
|
||||
"required": [
|
||||
"name",
|
||||
"description",
|
||||
"website",
|
||||
"crunchbase_profile",
|
||||
"year_founded",
|
||||
"ceo",
|
||||
"total_funding_mm_usd",
|
||||
"latest_round",
|
||||
"latest_round_date",
|
||||
"latest_round_amount_mm_usd"
|
||||
],
|
||||
"properties": {
|
||||
"ceo": {
|
||||
"type": "string",
|
||||
"description": "Name of the company's CEO"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Official company name"
|
||||
},
|
||||
"website": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "Company's official website URL"
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "Brief description of the company and its activities"
|
||||
},
|
||||
"latest_round": {
|
||||
"type": "string",
|
||||
"description": "Type of the most recent funding round (e.g., Series A, Seed, etc.)"
|
||||
},
|
||||
"year_founded": {
|
||||
"type": "integer",
|
||||
"minimum": 1800,
|
||||
"description": "Year when the company was founded"
|
||||
},
|
||||
"latest_round_date": {
|
||||
"type": "string",
|
||||
"format": "date",
|
||||
"description": "Date of the most recent funding round (YYYY-MM-DD)"
|
||||
},
|
||||
"crunchbase_profile": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "Company's Crunchbase profile URL"
|
||||
},
|
||||
"total_funding_mm_usd": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"description": "Total funding raised in millions of USD"
|
||||
},
|
||||
"latest_round_amount_mm_usd": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"description": "Amount raised in the most recent funding round in millions of USD"
|
||||
}
|
||||
},
|
||||
"description": "Company information"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```json
|
||||
{
|
||||
"info": {
|
||||
"ceo": "Harrison Chase",
|
||||
"name": "LangChain, Inc.",
|
||||
"website": "https://www.langchain.com",
|
||||
"description": "LangChain helps developers to build applications powered by large language models (LLMs). It provides tools and frameworks to integrate LLMs with external data sources and APIs, facilitating the creation of advanced AI applications.",
|
||||
"latest_round": "Series A",
|
||||
"year_founded": 2022,
|
||||
"latest_round_date": "2024-02-15",
|
||||
"crunchbase_profile": "https://www.crunchbase.com/organization/langchain",
|
||||
"total_funding_mm_usd": 35,
|
||||
"latest_round_amount_mm_usd": 25
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Evaluation Metric
|
||||
|
||||
Currently there is a single evaluation metric: fraction of the fields that were correctly extracted (per company). Correctness is defined differently depending on the field type:
|
||||
|
||||
- exact matches for fields like `year_founded` / `website`
|
||||
- fuzzy matches for fields like `name` / `ceo`
|
||||
- embedding similarity for fields like `description`
|
||||
- checking within a certain tolerance (+/- 10%) for fields like `employee_count` / `total_funding_mm_usd`
|
||||
|
||||
These can be adjusted in the `run_eval.py` script if you're adapting this to your own dataset.
|
||||
|
||||
## Running evals
|
||||
|
||||
To evaluate the agent, you can run `company_data_enrichment/run_eval.py` script. This will create new experiments in LangSmith for the two [datasets](#datasets) mentioned above.
|
||||
|
||||
**Basic usage:**
|
||||
|
||||
```shell
|
||||
python company_data_enrichment/run_eval.py
|
||||
```
|
||||
|
||||
By default this will use the `Public companies` dataset & `Company mAIstro` agent by LangChain.
|
||||
|
||||
**Advanced usage:**
|
||||
|
||||
You can pass the following parameters to customize the evaluation:
|
||||
|
||||
- `--dataset-name`: Name of the dataset to evaluate against. Defaults to `Public Company Data Enrichment` dataset.
|
||||
- `--graph-id`: graph ID of the agent to evaluate. Defaults to `company_maistro`.
|
||||
- `--agent-url`: URL of the deployed agent to evaluate. Defaults to `Company mAIstro` deployment.
|
||||
- `--experiment-prefix`: Prefix for the experiment name.
|
||||
- `--min-score`: Minimum acceptable score for evaluation. If specified, the script will raise an assertion error if the average score is below this threshold.
|
||||
|
||||
```shell
|
||||
python company_data_enrichment/run_eval.py --experiment-prefix "My custom prefix" --min-score 0.9
|
||||
```
|
||||
|
||||
### Using different schema
|
||||
|
||||
If your agent uses a schema that's different from the [example one above](#agent-schema), you can modify `make_agent_runner` in `run_eval.py` in the following way:
|
||||
|
||||
```python
|
||||
def make_agent_runner(agent_id: str, agent_url: str):
|
||||
agent_graph = RemoteGraph(agent_id, url=agent_url)
|
||||
|
||||
def run_agent(inputs: dict):
|
||||
# transform the inputs (single LangSmith dataset record) to match the agent's schema
|
||||
transformed_inputs = {"my_agent_key": inputs["company"], ...}
|
||||
response = agent_graph.invoke(transformed_inputs)
|
||||
# transform the agent outputs to match expected eval schema
|
||||
transformed_outputs = {"info": response["my_agent_output_key"]}
|
||||
return transformed_outputs
|
||||
|
||||
return run_agent
|
||||
```
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
## Math
|
||||
# Math
|
||||
|
||||
This directory contains evaluation script for the math agents.
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
## People Data Enrichment
|
||||
# People Data Enrichment
|
||||
|
||||
This directory contains evaluation script for the people data enrichment agents.
|
||||
|
||||
|
||||
@@ -0,0 +1,172 @@
|
||||
# Public Company Data Enrichment
|
||||
|
||||
This directory contains the evaluation script for measuring how well an agent researches information about a public company.
|
||||
|
||||
## Dataset
|
||||
|
||||
The dataset used can be found [here](https://smith.langchain.com/public/640df79c-1831-494e-8824-d7300205dc8e/d). This dataset has a list of publicly traded companies to extract the following fields for:
|
||||
- `name`
|
||||
- `description`
|
||||
- `website`
|
||||
- `linkedin_profile`
|
||||
- `headquarters`
|
||||
- `employee_count`
|
||||
- `ceo`
|
||||
|
||||
<details>
|
||||
<summary>Example input</summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"company": "Nvidia",
|
||||
"extraction_schema": {
|
||||
"type": "object",
|
||||
"title": "company_info",
|
||||
"required": [
|
||||
"name",
|
||||
"description",
|
||||
"website",
|
||||
"linked_profile",
|
||||
"headquarters",
|
||||
"employee_count",
|
||||
"ceo"
|
||||
],
|
||||
"properties": {
|
||||
"ceo": {
|
||||
"type": "string",
|
||||
"description": "Name of the company's CEO"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Official company name"
|
||||
},
|
||||
"website": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "Company's official website URL"
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "Brief description of the company and its activities"
|
||||
},
|
||||
"headquarters": {
|
||||
"type": "string",
|
||||
"description": "Location of company headquarters, formatted as <city>, <state code> (e.g. San Francisco, CA)"
|
||||
},
|
||||
"employee_count": {
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
"description": "Number of employees in the company"
|
||||
},
|
||||
"linkedin_profile": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "Company's LinkedIn profile URL"
|
||||
}
|
||||
},
|
||||
"description": "Company information"
|
||||
}
|
||||
}
|
||||
```
|
||||
</details>
|
||||
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Example output</summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"info": {
|
||||
"ceo": "Jensen Huang",
|
||||
"name": "Nvidia Corporation",
|
||||
"website": "https://www.nvidia.com",
|
||||
"description": "Nvidia Corporation is a multinational technology company specializing in the design and manufacture of graphics processing units (GPUs) for gaming, professional visualization, data centers, and automotive markets. The company is a leader in artificial intelligence (AI) computing, providing platforms and solutions that power AI applications across various industries.",
|
||||
"headquarters": "Santa Clara, CA",
|
||||
"employee_count": 29600,
|
||||
"linkedin_profile": "https://www.linkedin.com/company/nvidia"
|
||||
}
|
||||
}
|
||||
```
|
||||
</details>
|
||||
|
||||
## Evaluation Metric
|
||||
|
||||
Currently there is a single evaluation metric: fraction of the fields that were correctly extracted (per company). Correctness is defined differently depending on the field type:
|
||||
|
||||
- exact matches for fields like `website`
|
||||
- fuzzy matches for fields like `name` / `ceo`
|
||||
- embedding similarity for fields like `description`
|
||||
- checking within a certain tolerance (+/- 10%) for fields like `employee_count`
|
||||
|
||||
These can be adjusted in the `run_eval.py` script if you're adapting this to your own dataset.
|
||||
|
||||
## Invoking the agent
|
||||
|
||||
The agent is invoked using a `RemoteGraph`:
|
||||
|
||||
```python
|
||||
from langgraph.pregel.remote import RemoteGraph
|
||||
|
||||
agent_graph = RemoteGraph(agent_id, url=agent_url)
|
||||
agent_graph.invoke(inputs)
|
||||
```
|
||||
|
||||
### Using different agent schema
|
||||
|
||||
Your agent might be using a custom input/output schema that doesn't match the dataset schema. To handle this, you can modify `transform_dataset_inputs` and `transform_agent_outputs` in `run_eval.py` in the following way:
|
||||
|
||||
```python
|
||||
|
||||
def transform_dataset_inputs(inputs: dict) -> dict:
|
||||
"""Transform LangSmith dataset inputs to match the agent's input schema before invoking the agent."""
|
||||
# see the `Example input` for reference on what `inputs` dict will look like
|
||||
return {"my_agent_key": inputs["company"], ...}
|
||||
|
||||
|
||||
def transform_agent_outputs(outputs: dict) -> dict:
|
||||
"""Transform agent outputs to match the LangSmith dataset output schema."""
|
||||
# see the `Example output` for reference on what the output from `run_agent` would look like
|
||||
return {"info": outputs["my_agent_output_key"]}
|
||||
```
|
||||
|
||||
`transform_dataset_inputs` will be applied to LangSmith dataset inputs before invoking the agent, and `transform_agent_outputs` will be applied to the agent's response before it's compared to the expected output in the LangSmith eval dataset.
|
||||
|
||||
## Running evals
|
||||
|
||||
To evaluate the agent, you can run the `public_company_data_enrichment/run_eval.py` script. This will create a new experiment in LangSmith for the [dataset](#dataset) described above.
|
||||
|
||||
By default this will use the `Public Company Data Enrichment` dataset & `Company mAIstro` agent by LangChain.
|
||||
|
||||
```shell
|
||||
python public_company_data_enrichment/run_eval.py --experiment-prefix "My custom prefix"
|
||||
```
|
||||
|
||||
You can pass the following parameters to customize the evaluation:
|
||||
|
||||
- `--dataset-name`: Name of the dataset to evaluate against. Defaults to `Public Company Data Enrichment` dataset.
|
||||
- `--graph-id`: graph ID of the agent to evaluate. Defaults to `company_maistro`.
|
||||
- `--agent-url`: URL of the deployed agent to evaluate. Defaults to `Company mAIstro` deployment.
|
||||
- `--experiment-prefix`: Prefix for the experiment name.
|
||||
|
||||
### Testing the agent locally
|
||||
|
||||
#### Import agent
|
||||
|
||||
You can import the compiled LangGraph graph object corresponding to your agent and use that as `agent_graph` in `run_eval.py` instead of `RemoteGraph`. Then you can run the evaluation script as usual - the `--graph-id` and `--agent-url` params will be ignored.
|
||||
|
||||
#### Run local LangGraph server
|
||||
|
||||
You can test the agent locally by using [LangGraph CLI](https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/#launch-langgraph-server). From the directory that contains the `langgraph.json` configuration file, run
|
||||
|
||||
```shell
|
||||
langgraph dev
|
||||
```
|
||||
|
||||
This will start a local server that you can interact with using `RemoteGraph`.
|
||||
|
||||
Then simply pass the local URL as the `--agent-url` parameter and run the evaluation script as before:
|
||||
|
||||
```shell
|
||||
python public_company_data_enrichment/run_eval.py --experiment-prefix "My custom prefix" --agent-url http://localhost:8123
|
||||
```
|
||||
@@ -0,0 +1,200 @@
|
||||
from typing import Any, Optional
|
||||
|
||||
from Levenshtein import ratio
|
||||
from langsmith import Client, evaluate
|
||||
from langsmith.evaluation import LangChainStringEvaluator, EvaluationResults
|
||||
|
||||
from langgraph.pregel.remote import RemoteGraph
|
||||
|
||||
|
||||
# Module-level LangSmith client used by `run_eval` to look up the dataset.
client = Client()

# Relative tolerance for numeric fields: a prediction counts as correct when
# prediction / reference falls within [1 - TOLERANCE, 1 + TOLERANCE].
TOLERANCE = 0.10  # should match within 10%

# Field groups; each group is scored by its own `evaluate_*_fields` helper.
NUMERIC_FIELDS = ("employee_count",)
EXACT_MATCH_FIELDS = ("website", "linkedin_profile", "headquarters")
FUZZY_MATCH_FIELDS = ("name", "ceo")
LONG_TEXT_FIELDS = ("description",)

# Defaults used by `run_eval` and by the command-line interface.
DEFAULT_DATASET_NAME = "Public Company Data Enrichment"
DEFAULT_GRAPH_ID = "company_maistro"
DEFAULT_AGENT_URL = "https://langr.ph/marketplace/f7dcd212-1bd9-4596-a630-acc6ac4ff2f6"
|
||||
|
||||
|
||||
# evaluation helpers for different types of fields
|
||||
|
||||
|
||||
def evaluate_numeric_fields(outputs: dict, reference_outputs: dict) -> dict[str, float]:
    """Score numeric fields by relative closeness to the reference value.

    For every field in ``NUMERIC_FIELDS`` that is present in
    ``reference_outputs``, the score is 1.0 when ``prediction / reference``
    lies within ``[1 - TOLERANCE, 1 + TOLERANCE]`` and 0.0 otherwise.

    Args:
        outputs: Fields extracted by the agent. A missing field is treated as 0.
        reference_outputs: Expected field values from the dataset.

    Returns:
        Mapping of field name to 0.0 / 1.0. Fields absent from
        ``reference_outputs`` are not scored and do not appear in the result.
    """
    lower_bound = 1 - TOLERANCE
    upper_bound = 1 + TOLERANCE
    field_to_score = {}
    for k in NUMERIC_FIELDS:
        if k not in reference_outputs:
            continue

        reference_value = reference_outputs[k]
        try:
            predicted_value = int(outputs.get(k, 0))
            if reference_value == 0:
                # a relative comparison is undefined for a zero reference;
                # fall back to requiring an exact zero instead of dividing
                score = float(predicted_value == 0)
            else:
                score = float(
                    lower_bound <= predicted_value / reference_value <= upper_bound
                )
        except (TypeError, ValueError, OverflowError):
            # the prediction (e.g. None, "n/a", inf) or the reference is not
            # usable as a number: count the field as wrong rather than
            # aborting the whole evaluation
            score = 0.0

        field_to_score[k] = score
    return field_to_score
|
||||
|
||||
|
||||
def _preprocess_value(value: Any) -> Any:
|
||||
if isinstance(value, str):
|
||||
# for urls
|
||||
return value.rstrip("/")
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def evaluate_exact_match_fields(
    outputs: dict, reference_outputs: dict
) -> dict[str, float]:
    """Score fields that must match the reference exactly.

    Both sides are passed through ``_preprocess_value`` before being compared.
    Only fields from ``EXACT_MATCH_FIELDS`` that appear in
    ``reference_outputs`` are scored; a field missing from ``outputs`` is
    compared as ``None``.

    Returns:
        Mapping of field name to 1.0 (equal) or 0.0 (different).
    """
    field_to_score = {}
    for field in EXACT_MATCH_FIELDS:
        if field not in reference_outputs:
            continue
        predicted = _preprocess_value(outputs.get(field))
        expected = _preprocess_value(reference_outputs[field])
        field_to_score[field] = float(predicted == expected)
    return field_to_score
|
||||
|
||||
|
||||
def evaluate_long_text_fields(outputs: dict, reference_outputs: dict):
    """Score long free-text fields by embedding similarity.

    Uses LangChain's ``embedding_distance`` string evaluator configured for
    cosine distance and reports ``1 - distance`` for each field of
    ``LONG_TEXT_FIELDS`` that is present in ``reference_outputs``. A field
    missing from ``outputs`` is evaluated as an empty string.

    Returns:
        Mapping of field name to its similarity score.
    """
    emb_distance_evaluator = LangChainStringEvaluator(
        "embedding_distance", config={"distance": "cosine"}
    )

    field_to_score = {}
    for field in LONG_TEXT_FIELDS:
        if field not in reference_outputs:
            continue
        evaluation = emb_distance_evaluator.evaluator.invoke(
            {"prediction": outputs.get(field, ""), "reference": reference_outputs[field]}
        )
        # the evaluator reports a distance; flip it so that higher is better
        field_to_score[field] = 1 - evaluation["score"]
    return field_to_score
|
||||
|
||||
|
||||
def evaluate_fuzzy_match_fields(outputs: dict, reference_outputs: dict) -> dict[str, float]:
    """Score fields by case-insensitive Levenshtein similarity.

    For every field in ``FUZZY_MATCH_FIELDS`` that is present in
    ``reference_outputs``, the score is ``Levenshtein.ratio`` of the
    lower-cased prediction and the lower-cased reference.

    Args:
        outputs: Fields extracted by the agent. A missing field is compared
            as an empty string.
        reference_outputs: Expected field values from the dataset.

    Returns:
        Mapping of field name to similarity score. A prediction that is not a
        string (for example ``None``) scores 0.0.
    """
    field_to_score = {}
    for k in FUZZY_MATCH_FIELDS:
        if k not in reference_outputs:
            continue

        predicted = outputs.get(k, "")
        if not isinstance(predicted, str):
            # the key can be present with None or a non-string value; calling
            # .lower() on it would raise and abort the whole evaluation
            field_to_score[k] = 0.0
            continue

        field_to_score[k] = ratio(predicted.lower(), reference_outputs[k].lower())
    return field_to_score
|
||||
|
||||
|
||||
# effectively fraction of matching fields
|
||||
def evaluate_agent(outputs: dict, reference_outputs: dict) -> float:
    """Return the average per-field score for a single dataset example.

    Combines the numeric, exact-match and fuzzy-match field scores and
    averages them, which is effectively the fraction of matching fields.
    ``evaluate_long_text_fields`` is not part of the average.

    Args:
        outputs: Agent outputs; expected to hold a dict under ``"info"``.
        reference_outputs: Dataset reference outputs with the expected fields
            under ``"info"``.

    Returns:
        Average score. 0.0 when ``outputs`` has no dict under ``"info"`` or
        when the reference contains none of the scored fields.
    """
    if "info" not in outputs or not isinstance(outputs["info"], dict):
        return 0.0

    actual_company_info = outputs["info"]
    expected_company_info = reference_outputs["info"]

    results = {
        **evaluate_numeric_fields(actual_company_info, expected_company_info),
        **evaluate_exact_match_fields(actual_company_info, expected_company_info),
        **evaluate_fuzzy_match_fields(actual_company_info, expected_company_info),
    }
    if not results:
        # nothing was scored (the reference has none of the known fields);
        # avoid dividing by zero
        return 0.0
    return sum(results.values()) / len(results)
|
||||
|
||||
|
||||
def get_agent_metadata(graph_id: str, agent_url: str) -> dict[str, str]:
    """Build the experiment metadata describing the evaluated agent.

    Args:
        graph_id: ID of the evaluated graph.
        agent_url: URL of the agent deployment.

    Returns:
        ``{"graph_id": ...}``, plus ``"project_id"`` (the last path segment of
        the URL) when the URL contains ``"marketplace"``.
    """
    if "marketplace" in agent_url:
        # drop trailing slashes first, otherwise the last segment is empty
        project_id = agent_url.rstrip("/").split("/")[-1]
        return {"project_id": project_id, "graph_id": graph_id}
    return {"graph_id": graph_id}
|
||||
|
||||
|
||||
# PUBLIC API
|
||||
|
||||
|
||||
def transform_dataset_inputs(inputs: dict) -> dict:
    """Adapt a LangSmith dataset record to the agent's input schema.

    Called on every dataset input right before the agent is invoked.
    """
    # The README's `Example input` shows what `inputs` looks like.
    # Dataset inputs already have the shape the agent expects, so they are
    # passed through untouched; put custom mapping logic here if needed.
    return inputs
|
||||
|
||||
|
||||
def transform_agent_outputs(outputs: dict) -> dict:
    """Adapt the agent's response to the LangSmith dataset output schema.

    Called on every agent response before it is handed back for evaluation.
    """
    # The README's `Example output` shows the shape the evaluators expect.
    # Agent outputs already match the dataset output schema, so they are
    # passed through untouched; put custom mapping logic here if needed.
    return outputs
|
||||
|
||||
|
||||
def make_agent_runner(graph_id: str, agent_url: str):
    """Create the callable that runs the agent on one dataset record.

    The returned function applies ``transform_dataset_inputs``, invokes the
    ``RemoteGraph`` identified by ``graph_id`` at ``agent_url``, and applies
    ``transform_agent_outputs`` to the response.
    """
    agent_graph = RemoteGraph(graph_id, url=agent_url)

    def run_agent(inputs: dict) -> dict:
        """Invoke the agent on one LangSmith dataset record and return outputs in the dataset output schema."""
        agent_response = agent_graph.invoke(transform_dataset_inputs(inputs))
        return transform_agent_outputs(agent_response)

    return run_agent
|
||||
|
||||
|
||||
def run_eval(
    *,
    dataset_name: str,
    graph_id: str = DEFAULT_GRAPH_ID,
    agent_url: str = DEFAULT_AGENT_URL,
    experiment_prefix: Optional[str] = None,
) -> EvaluationResults:
    """Run the agent over a LangSmith dataset and score it with ``evaluate_agent``.

    Args:
        dataset_name: Name of the LangSmith dataset to evaluate against.
        graph_id: ID of the graph to evaluate.
        agent_url: URL of the deployed agent.
        experiment_prefix: Optional prefix for the experiment name.

    Returns:
        The results object returned by ``langsmith.evaluate``.
    """
    # look the dataset up first, then build the agent runner
    dataset = client.read_dataset(dataset_name=dataset_name)
    run_agent = make_agent_runner(graph_id, agent_url)
    agent_metadata = get_agent_metadata(graph_id, agent_url)
    return evaluate(
        run_agent,
        data=dataset,
        evaluators=[evaluate_agent],
        experiment_prefix=experiment_prefix,
        metadata=agent_metadata,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line entry point: every option is a string flag that is
    # forwarded to `run_eval` unchanged.
    cli_options = (
        (
            "--dataset-name",
            {
                "default": DEFAULT_DATASET_NAME,
                "help": "Name of the dataset to evaluate against",
            },
        ),
        (
            "--graph-id",
            {
                "default": DEFAULT_GRAPH_ID,
                "help": "ID of the graph to evaluate",
            },
        ),
        (
            "--agent-url",
            {
                "default": DEFAULT_AGENT_URL,
                "help": "URL of the deployed agent to evaluate",
            },
        ),
        (
            # no default: stays None unless given on the command line
            "--experiment-prefix",
            {"help": "Experiment prefix for the evaluation"},
        ),
    )

    parser = argparse.ArgumentParser()
    for flag, extra_kwargs in cli_options:
        parser.add_argument(flag, type=str, **extra_kwargs)
    args = parser.parse_args()

    run_eval(
        dataset_name=args.dataset_name,
        graph_id=args.graph_id,
        agent_url=args.agent_url,
        experiment_prefix=args.experiment_prefix,
    )
|
||||
@@ -0,0 +1,184 @@
|
||||
# Startup Data Enrichment
|
||||
|
||||
This directory contains the evaluation script for measuring how well an agent researches information about a startup.
|
||||
|
||||
## Dataset
|
||||
|
||||
The dataset used can be found [here](https://smith.langchain.com/public/afabd12a-62fa-4c09-b083-6b1742b4cc3a/d). This dataset has a list of AI startups to extract the following fields for:
|
||||
- `name`
|
||||
- `description`
|
||||
- `website`
|
||||
- `crunchbase_profile`
|
||||
- `year_founded`
|
||||
- `ceo`
|
||||
- `total_funding_mm_usd`
|
||||
- `latest_round`
|
||||
- `latest_round_date`
|
||||
- `latest_round_amount_mm_usd`
|
||||
|
||||
<details>
|
||||
<summary>Example input</summary>
|
||||
|
||||
|
||||
```json
|
||||
{
|
||||
"company": "LangChain",
|
||||
"extraction_schema": {
|
||||
"type": "object",
|
||||
"title": "company_info",
|
||||
"required": [
|
||||
"name",
|
||||
"description",
|
||||
"website",
|
||||
"crunchbase_profile",
|
||||
"year_founded",
|
||||
"ceo",
|
||||
"total_funding_mm_usd",
|
||||
"latest_round",
|
||||
"latest_round_date",
|
||||
"latest_round_amount_mm_usd"
|
||||
],
|
||||
"properties": {
|
||||
"ceo": {
|
||||
"type": "string",
|
||||
"description": "Name of the company's CEO"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Official company name"
|
||||
},
|
||||
"website": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "Company's official website URL"
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "Brief description of the company and its activities"
|
||||
},
|
||||
"latest_round": {
|
||||
"type": "string",
|
||||
"description": "Type of the most recent funding round (e.g., Series A, Seed, etc.)"
|
||||
},
|
||||
"year_founded": {
|
||||
"type": "integer",
|
||||
"minimum": 1800,
|
||||
"description": "Year when the company was founded"
|
||||
},
|
||||
"latest_round_date": {
|
||||
"type": "string",
|
||||
"format": "date",
|
||||
"description": "Date of the most recent funding round (YYYY-MM-DD)"
|
||||
},
|
||||
"crunchbase_profile": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "Company's Crunchbase profile URL"
|
||||
},
|
||||
"total_funding_mm_usd": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"description": "Total funding raised in millions of USD"
|
||||
},
|
||||
"latest_round_amount_mm_usd": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"description": "Amount raised in the most recent funding round in millions of USD"
|
||||
}
|
||||
},
|
||||
"description": "Company information"
|
||||
}
|
||||
}
|
||||
```
|
||||
</details>
|
||||
<br>
|
||||
<details>
|
||||
<summary>Example output</summary>
|
||||
|
||||
```json
|
||||
{
|
||||
"info": {
|
||||
"ceo": "Harrison Chase",
|
||||
"name": "LangChain, Inc.",
|
||||
"website": "https://www.langchain.com",
|
||||
"description": "LangChain helps developers to build applications powered by large language models (LLMs). It provides tools and frameworks to integrate LLMs with external data sources and APIs, facilitating the creation of advanced AI applications.",
|
||||
"latest_round": "Series A",
|
||||
"year_founded": 2022,
|
||||
"latest_round_date": "2024-02-15",
|
||||
"crunchbase_profile": "https://www.crunchbase.com/organization/langchain",
|
||||
"total_funding_mm_usd": 35,
|
||||
"latest_round_amount_mm_usd": 25
|
||||
}
|
||||
}
|
||||
```
|
||||
</details>
|
||||
|
||||
## Invoking the agent
|
||||
|
||||
The agent is invoked using a `RemoteGraph`:
|
||||
|
||||
```python
|
||||
from langgraph.pregel.remote import RemoteGraph
|
||||
|
||||
agent_graph = RemoteGraph(agent_id, url=agent_url)
|
||||
agent_graph.invoke(inputs)
|
||||
```
|
||||
|
||||
### Using different agent schema
|
||||
|
||||
Your agent might be using a custom input/output schema that doesn't match the dataset schema. To handle this, you can modify `transform_dataset_inputs` and `transform_agent_outputs` in `run_eval.py` in the following way:
|
||||
|
||||
```python
|
||||
|
||||
def transform_dataset_inputs(inputs: dict) -> dict:
|
||||
"""Transform LangSmith dataset inputs to match the agent's input schema before invoking the agent."""
|
||||
# see the `Example input` for reference on what `inputs` dict will look like
|
||||
return {"my_agent_key": inputs["company"], ...}
|
||||
|
||||
|
||||
def transform_agent_outputs(outputs: dict) -> dict:
|
||||
"""Transform agent outputs to match the LangSmith dataset output schema."""
|
||||
# see the `Example output` for reference on what the output from `run_agent` would look like
|
||||
return {"info": outputs["my_agent_output_key"]}
|
||||
```
|
||||
|
||||
`transform_dataset_inputs` will be applied to LangSmith dataset inputs before invoking the agent, and `transform_agent_outputs` will be applied to the agent's response before it's compared to the expected output in the LangSmith eval dataset.
|
||||
|
||||
## Running evals
|
||||
|
||||
To evaluate the agent, you can run the `startup_data_enrichment/run_eval.py` script. This will create a new experiment in LangSmith for the [dataset](#dataset) described above.
|
||||
|
||||
By default this will use the `Startup Data Enrichment` dataset & `Company mAIstro` agent by LangChain.
|
||||
|
||||
```shell
|
||||
python startup_data_enrichment/run_eval.py --experiment-prefix "My custom prefix"
|
||||
```
|
||||
|
||||
You can pass the following parameters to customize the evaluation:
|
||||
|
||||
- `--dataset-name`: Name of the dataset to evaluate against. Defaults to `Startup Data Enrichment` dataset.
|
||||
- `--graph-id`: graph ID of the agent to evaluate. Defaults to `company_maistro`.
|
||||
- `--agent-url`: URL of the deployed agent to evaluate. Defaults to `Company mAIstro` deployment.
|
||||
- `--experiment-prefix`: Prefix for the experiment name.
|
||||
|
||||
### Testing the agent locally
|
||||
|
||||
#### Import agent
|
||||
|
||||
You can import the compiled LangGraph graph object corresponding to your agent and use that as `agent_graph` in `run_eval.py` instead of `RemoteGraph`. Then you can run the evaluation script as usual - the `--graph-id` and `--agent-url` params will be ignored.
|
||||
|
||||
#### Run local LangGraph server
|
||||
|
||||
You can test the agent locally by using [LangGraph CLI](https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/#launch-langgraph-server). From the directory that contains the `langgraph.json` configuration file, run
|
||||
|
||||
```shell
|
||||
langgraph dev
|
||||
```
|
||||
|
||||
This will start a local server that you can interact with using `RemoteGraph`.
|
||||
|
||||
Then simply pass the local URL as the `--agent-url` parameter and run the evaluation script as before:
|
||||
|
||||
```shell
|
||||
python startup_data_enrichment/run_eval.py --experiment-prefix "My custom prefix" --agent-url http://localhost:8123
|
||||
```
|
||||
@@ -11,14 +11,12 @@ client = Client()
|
||||
|
||||
TOLERANCE = 0.10 # should match within 10%
|
||||
NUMERIC_FIELDS = (
|
||||
"employee_count",
|
||||
"total_funding_mm_usd",
|
||||
"latest_round_amount_mm_usd",
|
||||
)
|
||||
EXACT_MATCH_FIELDS = (
|
||||
"website",
|
||||
"crunchbase_profile",
|
||||
"linkedin_profile",
|
||||
"headquarters",
|
||||
"year_founded",
|
||||
"latest_round",
|
||||
@@ -27,7 +25,7 @@ EXACT_MATCH_FIELDS = (
|
||||
FUZZY_MATCH_FIELDS = ("name", "ceo")
|
||||
LONG_TEXT_FIELDS = ("description",)
|
||||
|
||||
DEFAULT_DATASET_NAME = "Public Company Data Enrichment"
|
||||
DEFAULT_DATASET_NAME = "Startup Data Enrichment"
|
||||
DEFAULT_GRAPH_ID = "company_maistro"
|
||||
DEFAULT_AGENT_URL = "https://langr.ph/marketplace/f7dcd212-1bd9-4596-a630-acc6ac4ff2f6"
|
||||
|
||||
@@ -115,16 +113,6 @@ def evaluate_agent(outputs: dict, reference_outputs: dict):
|
||||
return sum(results.values()) / len(results)
|
||||
|
||||
|
||||
def make_agent_runner(graph_id: str, agent_url: str):
    """Create the callable that runs the remote agent on one dataset record.

    The returned function invokes the ``RemoteGraph`` identified by
    ``graph_id`` at ``agent_url`` and keeps only the ``"info"`` entry of the
    response.
    """
    agent_graph = RemoteGraph(graph_id, url=agent_url)

    def run_agent(inputs: dict):
        agent_response = agent_graph.invoke(inputs)
        extracted_info = agent_response["info"]
        return {"info": extracted_info}

    return run_agent
|
||||
|
||||
|
||||
def get_agent_metadata(graph_id: str, agent_url: str):
|
||||
if "marketplace" in agent_url:
|
||||
project_id = agent_url.split("/")[-1]
|
||||
@@ -132,13 +120,42 @@ def get_agent_metadata(graph_id: str, agent_url: str):
|
||||
return {"graph_id": graph_id}
|
||||
|
||||
|
||||
# PUBLIC API
|
||||
|
||||
|
||||
def transform_dataset_inputs(inputs: dict) -> dict:
    """Adapt a LangSmith dataset record to the agent's input schema.

    Called on every dataset input right before the agent is invoked.
    """
    # The README's `Example input` shows what `inputs` looks like.
    # Dataset inputs already have the shape the agent expects, so they are
    # passed through untouched; put custom mapping logic here if needed.
    return inputs
|
||||
|
||||
|
||||
def transform_agent_outputs(outputs: dict) -> dict:
    """Adapt the agent's response to the LangSmith dataset output schema.

    Called on every agent response before it is handed back for evaluation.
    """
    # The README's `Example output` shows the shape the evaluators expect.
    # Agent outputs already match the dataset output schema, so they are
    # passed through untouched; put custom mapping logic here if needed.
    return outputs
|
||||
|
||||
|
||||
def make_agent_runner(graph_id: str, agent_url: str):
    """Create the callable that runs the agent on one dataset record.

    The returned function applies ``transform_dataset_inputs``, invokes the
    ``RemoteGraph`` identified by ``graph_id`` at ``agent_url``, and applies
    ``transform_agent_outputs`` to the response.
    """
    agent_graph = RemoteGraph(graph_id, url=agent_url)

    def run_agent(inputs: dict) -> dict:
        """Invoke the agent on one LangSmith dataset record and return outputs in the dataset output schema."""
        agent_response = agent_graph.invoke(transform_dataset_inputs(inputs))
        return transform_agent_outputs(agent_response)

    return run_agent
|
||||
|
||||
|
||||
def run_eval(
|
||||
*,
|
||||
dataset_name: str,
|
||||
graph_id: str = DEFAULT_GRAPH_ID,
|
||||
agent_url: str = DEFAULT_AGENT_URL,
|
||||
experiment_prefix: Optional[str] = None,
|
||||
min_score: Optional[float] = None,
|
||||
) -> EvaluationResults:
|
||||
dataset = client.read_dataset(dataset_name=dataset_name)
|
||||
run_agent = make_agent_runner(graph_id, agent_url)
|
||||
@@ -149,15 +166,6 @@ def run_eval(
|
||||
experiment_prefix=experiment_prefix,
|
||||
metadata=get_agent_metadata(graph_id, agent_url),
|
||||
)
|
||||
|
||||
if min_score is not None:
|
||||
results_df = results.to_pandas()
|
||||
score = results_df["feedback.evaluate_agent"].mean()
|
||||
if score < min_score:
|
||||
raise AssertionError(
|
||||
f"Average fraction of correctly extracted fields ({score}) is less than min expected score of {min_score}"
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@@ -188,11 +196,6 @@ if __name__ == "__main__":
|
||||
type=str,
|
||||
help="Experiment prefix for the evaluation",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min-score",
|
||||
type=float,
|
||||
help="Minimum acceptable score for evaluation",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
run_eval(
|
||||
@@ -200,5 +203,4 @@ if __name__ == "__main__":
|
||||
graph_id=args.graph_id,
|
||||
agent_url=args.agent_url,
|
||||
experiment_prefix=args.experiment_prefix,
|
||||
min_score=args.min_score,
|
||||
)
|
||||
Reference in New Issue
Block a user