add create dataset for people

This commit is contained in:
vbarda
2024-12-03 12:43:32 -05:00
parent 90b9781a8a
commit 20a1790cab
4 changed files with 381 additions and 39 deletions
+1 -1
View File
@@ -17,4 +17,4 @@ Below is the list of currently available evals:
| [Math](./math) | [Math Problems](https://smith.langchain.com/public/e0993f2f-c055-4446-afc2-e52da6a4dda0/d) | Solve math problems and return numerical answers | `{"Question": "Find the second derivative of f(x)=ln(x) and evaluate it at x=0.5."}` | `{"Answer": "-4"}` |
| [Public Company Data Enrichment](./public_company_data_enrichment) | [Public Companies](https://smith.langchain.com/public/640df79c-1831-494e-8824-d7300205dc8e/d) | Extract structured company information like CEO, headquarters, employee count etc. | `{"company": "Nvidia", "extraction_schema": {...}}` | `{"info": {"ceo": "Jensen Huang", "name": "Nvidia Corporation", ...}}` |
| [Startup Data Enrichment](./startup_data_enrichment) | [Startups](https://smith.langchain.com/public/afabd12a-62fa-4c09-b083-6b1742b4cc3a/d) | Extract structured company information like latest round, total funding, year founded etc. | `{"company": "LangChain", "extraction_schema": {...}}` | `{"info": {"latest_round": "Series A", ...}}` |
| [People Data Enrichment](./people_data_enrichment) | [People Dataset](https://smith.langchain.com/public/2af89d2a-93f6-4c84-80ac-70defcfd14c8/d) | Extract structured information about people like work experience, role, company etc. | `{"person": {"name": "Erick Friis", "email": "erick@langchain.dev", ...}, "extraction_schema": {...}}` | `{"extracted_information": {"Years-Experience": 10, "Company": "LangChain", ...}}` |
| [People Data Enrichment](./people_data_enrichment) | [People Dataset](https://smith.langchain.com/public/3384cc3a-722c-4eb1-8e41-dff56fea05b8/d) | Extract structured information about people like work experience, role, company etc. | `{"person": {"name": "Erick Friis", "email": "erick@langchain.dev", ...}, "extraction_schema": {...}}` | `{"extracted_information": {"years_experience": 10, "current_company": "LangChain", ...}}` |
+45 -23
View File
@@ -4,11 +4,11 @@ This directory contains evaluation script for the people data enrichment agents.
## Dataset
The dataset used can be found [here](https://smith.langchain.com/public/2af89d2a-93f6-4c84-80ac-70defcfd14c8/d). This dataset has a list of people to do research on and extract the following fields for:
- `Years-Experience`
- `Company`
- `Role`
- `Prior-Companies`
The dataset used can be found [here](https://smith.langchain.com/public/3384cc3a-722c-4eb1-8e41-dff56fea05b8/d). This dataset has a list of people to do research on and extract the following fields for:
- `years_experience`
- `current_company`
- `role`
- `prior_companies`
<details>
@@ -27,28 +27,28 @@ The dataset used can be found [here](https://smith.langchain.com/public/2af89d2a
"type": "object",
"title": "Person-Schema",
"required": [
"Years-Experience",
"Company",
"Role",
"Prior-Companies"
"years_experience",
"current_company",
"role",
"prior_companies"
],
"properties": {
"Role": {
"role": {
"type": "string",
"description": "Current role of the person."
},
"Company": {
"current_company": {
"type": "string",
"description": "The name of the current company the person works at."
},
"Prior-Companies": {
"prior_companies": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of previous companies where the person has worked"
},
"Years-Experience": {
"years_experience": {
"type": "number",
"description": "How many years of full time work experience (excluding internships) does this person have."
}
@@ -65,26 +65,48 @@ The dataset used can be found [here](https://smith.langchain.com/public/2af89d2a
```json
{
"extracted_information": {
"Role": "Exploring new ideas and building out next project",
"Company": "South Park Commons",
"Prior-Companies": [
"Instabase",
"Chestnut",
"MIT"
"role": "Exploring new ideas and building out next project",
"current_company": "South Park Commons",
"prior_companies": [
"Instabase",
"Chestnut",
"MIT"
],
"Years-Experience": 5
"years_experience": 5
}
}
```
</details>
### Using the dataset
To use the data from this dataset in your own project, you can:
(1) clone the dataset using the LangSmith SDK:
```python
from langsmith import Client
client = Client()
cloned_dataset = client.clone_public_dataset(
"https://smith.langchain.com/public/3384cc3a-722c-4eb1-8e41-dff56fea05b8/d",
dataset_name="People Data Enrichment"
)
```
(2) create a new dataset with the same examples using the following command:
```shell
python people_data_enrichment/create_dataset.py
```
## Evaluation Metric
Currently there is a single evaluation metric: fraction of the fields that were correctly extracted (per person). Correctness is defined differently depending on the field type:
- fuzzy matching for list of string fields such as `Prior-Companies`
- fuzzy matches for fields like `Role` / `Company`
- checking within a certain tolerance (+/- 15%) for `Years-Experience` field
- fuzzy matching for list of string fields such as `prior_companies`
- fuzzy matches for fields like `role` / `current_company`
- checking within a certain tolerance (+/- 15%) for `years_experience` field
## Invoking the agent
+320
View File
@@ -0,0 +1,320 @@
# Seed records for the "People Data Enrichment" dataset.
#
# Each record mixes the fields used as example inputs (`name`,
# `linkedin_profile`, `work_email`) with the fields used as reference outputs
# (`role`, `current_company`, `prior_companies`, `years_experience`).
#
# Conventions visible in the data below:
# - `work_email` is the literal string "N/A" when no work email is listed.
# - `prior_companies` is a single comma-separated string, not a list. Note that
#   some company names contain a comma themselves (e.g. "Loomis, Sayles &
#   Company").
EXAMPLES = [
    {
        "name": "Harrison Chase",
        "linkedin_profile": "https://www.linkedin.com/in/harrison-chase-961287118/",
        "work_email": "harrison@langchain.dev",
        "role": "CEO",
        "current_company": "LangChain",
        "prior_companies": "Kensho Technologies, Robust Intelligence",
        "years_experience": 7,
    },
    {
        "name": "Jake Rachleff",
        "linkedin_profile": "https://www.linkedin.com/in/jakerachleff/",
        "work_email": "jake@langchain.dev",
        "role": "Software Engineer",
        "current_company": "LangChain",
        "prior_companies": "Databricks",
        "years_experience": 6,
    },
    {
        "name": "Nuno Campos",
        "linkedin_profile": "https://www.linkedin.com/in/nuno-f-campos/",
        "work_email": "nuno@langchain.dev",
        "role": "Founding Engineer",
        "current_company": "LangChain",
        "prior_companies": "James, YLD, Boringbits",
        "years_experience": 9,
    },
    {
        "name": "Vince Signori",
        "linkedin_profile": "https://www.linkedin.com/in/vincesignori/",
        "work_email": "vince@langchain.dev",
        "role": "GTM",
        "current_company": "LangChain",
        "prior_companies": "iPass, Imperva, Zendesk, Trifacta, HashiCorp",
        "years_experience": 12,
    },
    {
        "name": "Nick Huang",
        "linkedin_profile": "https://www.linkedin.com/in/ncchuang/",
        "work_email": "nick@langchain.dev",
        "role": "Deployed Engineer",
        "current_company": "LangChain",
        "prior_companies": "Intros AI, Palantir",
        "years_experience": 2,
    },
    {
        "name": "Will Fu-Hinthorn",
        "linkedin_profile": "https://www.linkedin.com/in/williamfuhinthorn/",
        "work_email": "will@langchain.dev",
        "role": "Founding Engineer",
        "current_company": "LangChain",
        "prior_companies": "Microsoft, Robust Intelligence",
        "years_experience": 7,
    },
    {
        "name": "Jonathan Hodges",
        "linkedin_profile": "https://www.linkedin.com/in/jonathanhodges/",
        "work_email": "N/A",
        "role": "Chief Architect, AI & ML",
        "current_company": "Bolt",
        "prior_companies": "NBC Entertainment Digital, Pearson North America, Ascend Learning, Workiva, GHX, Atypical AI, Userpilot",
        "years_experience": 16,
    },
    {
        "name": "Jessica Ou",
        "linkedin_profile": "https://www.linkedin.com/in/jessicaou/",
        "work_email": "jess@langchain.dev",
        "role": "Business Operations & Finance",
        "current_company": "LangChain",
        "prior_companies": "Morgan Stanley, NEA, Sapienne",
        "years_experience": 8,
    },
    {
        "name": "Vadym Barda",
        "linkedin_profile": "https://www.linkedin.com/in/vadymbarda/",
        "work_email": "vadym@langchain.dev",
        "role": "SWE/ML Engineer",
        "current_company": "LangChain",
        "prior_companies": "Kensho Technologies, Tunum",
        "years_experience": 9,
    },
    {
        "name": "Adam D'Abbracci",
        "linkedin_profile": "https://www.linkedin.com/in/adam-d-abbracci-25390a22/",
        "work_email": "N/A",
        "role": "Founder",
        "current_company": "Proteus",
        "prior_companies": "TwentyTwenty Productions, Redcircle, Bags Inc., The Walt Disney Company, The New York Times",
        "years_experience": 11,
    },
    {
        "name": "Charles Bernoskie",
        "linkedin_profile": "https://www.linkedin.com/in/charlesbernoskie/",
        "work_email": "charles@langchain.dev",
        "role": "GTM",
        "current_company": "LangChain",
        "prior_companies": "BuyWithMe, Stack Overflow, Elastic, materialize",
        "years_experience": 18,
    },
    {
        "name": "Bagatur Askaryan",
        "linkedin_profile": "https://www.linkedin.com/in/bagatur-askaryan/",
        "work_email": "bagatur@langchain.dev",
        "role": "Founding Engineer",
        "current_company": "LangChain",
        "prior_companies": "Robust Intelligence",
        "years_experience": 4,
    },
    {
        "name": "Cameron Vetter",
        "linkedin_profile": "https://www.linkedin.com/in/cameronvetter/",
        "work_email": "N/A",
        "role": "AI Practice Director",
        "current_company": "New Resources Consulting",
        "prior_companies": "Edstrom Industries, Centare Group, GE Healthcare, Direct Supply, Runzheimer International, Safenet Consulting, Octavion Technology Group, Zecil Software",
        "years_experience": 24,
    },
    {
        "name": "Alex Kira",
        "linkedin_profile": "https://www.linkedin.com/in/alexkira/",
        "work_email": "alex@langchain.dev",
        "role": "Engineer",
        "current_company": "LangChain",
        "prior_companies": "Air2Web, CheckFree, Optimus Solutions, PromoterBee, Oracle, LoLo, Apple, Telmate, Uber, Netflix, Ramp",
        "years_experience": 24,
    },
    {
        "name": "Chester Curme",
        "linkedin_profile": "https://www.linkedin.com/in/chestercurme/",
        "work_email": "chester@langchain.dev",
        "role": "Machine Learning Engineer",
        "current_company": "LangChain",
        "prior_companies": "Loomis, Sayles & Company, Kensho Technologies, Evisort, Microsoft",
        "years_experience": 9,
    },
    {
        "name": "Ankush Gola",
        "linkedin_profile": "https://www.linkedin.com/in/ankush-gola-77255866/",
        "work_email": "ankush@langchain.dev",
        "role": "Co-Founder",
        "current_company": "LangChain",
        "prior_companies": "Facebook, Robust Intelligence, Unfold",
        "years_experience": 9,
    },
    {
        "name": "Jacob Lee",
        "linkedin_profile": "https://www.linkedin.com/in/jacoblee93/",
        "work_email": "jacob@langchain.dev",
        "role": "Founding Engineer",
        "current_company": "LangChain",
        "prior_companies": "Google, Autocode, Remora Software",
        "years_experience": 9,
    },
    {
        "name": "Lance Martin",
        "linkedin_profile": "https://www.linkedin.com/in/lance-martin-64a33b5/",
        "work_email": "lance@langchain.dev",
        "role": "Software/ML",
        "current_company": "LangChain",
        "prior_companies": "Uber, Ike, Nuro",
        "years_experience": 9,
    },
    {
        "name": "David Duong",
        "linkedin_profile": "https://www.linkedin.com/in/duongtat",
        "work_email": "david@langchain.dev",
        "role": "Founding Engineer",
        "current_company": "LangChain",
        "prior_companies": "AstrumQ Interactive, Moravio, Spendee, Fitify, Cleevio",
        "years_experience": 9,
    },
    {
        "name": "Kevin Swiber",
        "linkedin_profile": "https://www.linkedin.com/in/kevinswiber",
        "work_email": "N/A",
        "role": "API Strategy Lead",
        "current_company": "Postman",
        "prior_companies": "Ford Motor Company, EHIM, Quicken Loans, Apigee, LunchBadger, NodeSource",
        "years_experience": 18,
    },
    {
        "name": "Andrew Nguonly",
        "linkedin_profile": "https://www.linkedin.com/in/andrewnguonly/",
        "work_email": "andrew@langchain.dev",
        "role": "Software Engineer",
        "current_company": "LangChain",
        "prior_companies": "j2 Global, CGI, Omaze, Honey, Netflix, Carta, South Park Commons",
        "years_experience": 13,
    },
    {
        "name": "Nazar Borovets",
        "linkedin_profile": "https://www.linkedin.com/in/nazareka/",
        "work_email": "N/A",
        "role": "Backend & LLM Python Developer",
        "current_company": "Veido",
        "prior_companies": "HexOcean, Wooden Borovets Products, Gart Technology",
        "years_experience": 1,
    },
    {
        "name": "Eric Han",
        "linkedin_profile": "https://www.linkedin.com/in/eric-han27/",
        "work_email": "eric@langchain.dev",
        "role": "Software Engineer",
        "current_company": "LangChain",
        "prior_companies": "Hotwire, Bottles Waiting Inc., Shopagon, Overlay Gaming Corporation, Instabase",
        "years_experience": 12,
    },
    {
        "name": "Greg Asquith",
        "linkedin_profile": "https://www.linkedin.com/in/gregasquith/",
        "work_email": "N/A",
        "role": "Technology Consultant",
        "current_company": "gregasquith.com",
        "prior_companies": "Renault, Essence, Adcessible, Smartly Video and Display",
        "years_experience": 9,
    },
    {
        "name": "Erick Friis",
        "linkedin_profile": "https://www.linkedin.com/in/efriis/",
        "work_email": "erick@langchain.dev",
        "role": "Founding Engineer",
        "current_company": "LangChain",
        "prior_companies": "Instabase, Chestnut, South Park Commons",
        "years_experience": 5,
    },
    {
        # NOTE(review): the LinkedIn slug below spells the first name
        # "olivier" while `name` says "Oliver" -- confirm the intended spelling.
        "name": "Oliver Dupuis",
        "linkedin_profile": "https://www.linkedin.com/in/olivierdupuis/",
        "work_email": "N/A",
        "role": "Data Product Builder",
        "current_company": "RepublicOfData.io",
        "prior_companies": "University of Ottawa, Lantrns Analytics, Rittman Analytics",
        "years_experience": 19,
    },
    {
        "name": "Eugene Yurtsev",
        "linkedin_profile": "https://www.linkedin.com/in/eugene-yurtsev-797a3b1b/",
        "work_email": "eugene@langchain.dev",
        "role": "SWE/ML Engineer",
        "current_company": "LangChain",
        "prior_companies": "Kensho Technologies, Yurtsev",
        "years_experience": 9,
    },
    {
        "name": "Brian Vander Schaaf",
        "linkedin_profile": "https://www.linkedin.com/in/brianvanderschaaf/",
        "work_email": "brian@langchain.dev",
        "role": "Software Engineer",
        "current_company": "LangChain",
        "prior_companies": "Yodle, Two Sigma IQ, Frontrunner",
        "years_experience": 8,
    },
    {
        "name": "Julia Schottenstein",
        "linkedin_profile": "https://www.linkedin.com/in/julia-schottenstein-25424318/",
        "work_email": "julia@langchain.dev",
        "role": "Building",
        "current_company": "LangChain",
        "prior_companies": "Qatalyst Partners, NEA, dbt Labs",
        "years_experience": 10,
    },
    {
        "name": "Wei Wong",
        "linkedin_profile": "https://www.linkedin.com/in/weijianwong/",
        "work_email": "wei@langchain.dev",
        "role": "Account Executive",
        "current_company": "LangChain",
        "prior_companies": "Deutsche Bank, SingleStore, Snowflake",
        "years_experience": 7,
    },
]
if __name__ == "__main__":
    # Script entry point: create the "People Data Enrichment" dataset in
    # LangSmith from EXAMPLES, unless a dataset with that name already exists.
    from langsmith import Client
    from langsmith.utils import LangSmithNotFoundError

    # Keys of each EXAMPLES record that become the example inputs and the
    # reference outputs, respectively.
    input_keys = ("name", "work_email", "linkedin_profile")
    output_keys = ("years_experience", "current_company", "role", "prior_companies")

    client = Client()
    dataset_name = "People Data Enrichment"

    # Storing inputs in a dataset lets us
    # run chains and LLMs over a shared set of examples.
    try:
        existing_dataset = client.read_dataset(dataset_name=dataset_name)
    except LangSmithNotFoundError:
        # No dataset with this name yet, so fall through and create it.
        pass
    else:
        print(f"Dataset '{dataset_name}' already exists.")
        print("You can access the dataset via the URL: ", existing_dataset.url)
        # Raise SystemExit directly instead of calling the `exit()` helper:
        # that helper is injected by the `site` module for interactive use and
        # is not guaranteed to exist (e.g. under `python -S`).
        raise SystemExit(1)

    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Evaluate ability to research information about people (e.g., name, email, linkedin)",
    )

    # Split every record into its input part and its reference-output part;
    # the two lists are index-aligned for bulk creation.
    inputs = [
        {k: v for k, v in record.items() if k in input_keys}
        for record in EXAMPLES
    ]
    outputs = [
        {k: v for k, v in record.items() if k in output_keys}
        for record in EXAMPLES
    ]

    client.create_examples(
        inputs=inputs,
        outputs=outputs,
        dataset_id=dataset.id,
    )

    print(f"Dataset '{dataset_name}' created with {len(EXAMPLES)} examples.")
    print("You can access the dataset via the URL: ", dataset.url)
+15 -15
View File
@@ -8,9 +8,9 @@ import argparse
# Defaults
EXPERIMENT_PREFIX = "People mAIstro "
TOLERANCE = 0.15 # should match within 15%
NUMERIC_FIELDS = ("Years-Experience",)
FUZZY_MATCH_FIELDS = ("Role", "Company")
LIST_OF_STRING_FIELDS = ("Prior-Companies",)
NUMERIC_FIELDS = ("years_experience",)
FUZZY_MATCH_FIELDS = ("role", "current_company")
LIST_OF_STRING_FIELDS = ("prior_companies",)
DEFAULT_DATASET_NAME = "People Data Enrichment"
DEFAULT_GRAPH_ID = "people_maistro"
DEFAULT_AGENT_URL = "https://langr.ph/marketplace/62bf5890-28fa-4dd1-b469-4751fe7ecdf3"
@@ -20,29 +20,29 @@ client = Client()
extraction_schema = {
"type": "object",
"required": [
"Years-Experience",
"Company",
"Role",
"Prior-Companies",
"years_experience",
"current_company",
"role",
"prior_companies",
],
"properties": {
"Role": {"type": "string", "description": "Current role of the person."},
"Years-Experience": {
"role": {"type": "string", "description": "Current role of the person."},
"years_experience": {
"type": "number",
"description": "How many years of full time work experience (excluding internships) does this person have.",
},
"Company": {
"current_company": {
"type": "string",
"description": "The name of the current company the person works at.",
},
"Prior-Companies": {
"prior_companies": {
"type": "array",
"items": {"type": "string"},
"description": "List of previous companies where the person has worked",
},
},
"description": "Person information",
"title": "Person-Schema",
"title": "Person",
}
@@ -132,9 +132,9 @@ def transform_dataset_inputs(inputs: dict) -> dict:
# see the `Example input` in the README for reference on what `inputs` dict should look like
return {
"person": {
"name": inputs["Person"],
"email": inputs["Work-Email"],
"linkedin": inputs["Linkedin"],
"name": inputs["name"],
"email": inputs["work_email"],
"linkedin": inputs["linkedin_profile"],
},
"extraction_schema": extraction_schema,
}