add create dataset for people

2026-07-01 20:35:18 -04:00 · 2024-12-03 12:43:32 -05:00
parent 90b9781a8a
commit 20a1790cab
4 changed files with 381 additions and 39 deletions
@@ -17,4 +17,4 @@ Below is the list of currently available evals:
 | [Math](./math) | [Math Problems](https://smith.langchain.com/public/e0993f2f-c055-4446-afc2-e52da6a4dda0/d) | Solve math problems and return numerical answers | `{"Question": "Find the second derivative of f(x)=ln(x) and evaluate it at x=0.5."}` | `{"Answer": "-4"}` |
 | [Public Company Data Enrichment](./public_company_data_enrichment) | [Public Companies](https://smith.langchain.com/public/640df79c-1831-494e-8824-d7300205dc8e/d) | Extract structured company information like CEO, headquarters, employee count etc. | `{"company": "Nvidia", "extraction_schema": {...}}` | `{"info": {"ceo": "Jensen Huang", "name": "Nvidia Corporation", ...}}` |
 | [Startup Data Enrichment](./startup_data_enrichment) | [Startups](https://smith.langchain.com/public/afabd12a-62fa-4c09-b083-6b1742b4cc3a/d) | Extract structured company information like latest round, total funding, year founded etc. | `{"company": "LangChain", "extraction_schema": {...}}` | `{"info": {"latest_round": "Series A", ...}}` |
-| [People Data Enrichment](./people_data_enrichment) | [People Dataset](https://smith.langchain.com/public/2af89d2a-93f6-4c84-80ac-70defcfd14c8/d) | Extract structured information about people like work experience, role, company etc. | `{"person": {"name": "Erick Friis", "email": "erick@langchain.dev", ...}, "extraction_schema": {...}}` | `{"extracted_information": {"Years-Experience": 10, "Company": "LangChain", ...}}` |
+| [People Data Enrichment](./people_data_enrichment) | [People Dataset](https://smith.langchain.com/public/3384cc3a-722c-4eb1-8e41-dff56fea05b8/d) | Extract structured information about people like work experience, role, company etc. | `{"person": {"name": "Erick Friis", "email": "erick@langchain.dev", ...}, "extraction_schema": {...}}` | `{"extracted_information": {"years_experience": 5, "current_company": "LangChain", ...}}` |
@@ -4,11 +4,11 @@ This directory contains evaluation script for the people data enrichment agents.

 ## Dataset

-The dataset used can be found [here](https://smith.langchain.com/public/2af89d2a-93f6-4c84-80ac-70defcfd14c8/d). This dataset has a list of people to do research on and extract the following fields for:
-  - `Years-Experience`
-  - `Company`
-  - `Role`
-  - `Prior-Companies`
+The dataset used can be found [here](https://smith.langchain.com/public/3384cc3a-722c-4eb1-8e41-dff56fea05b8/d). It contains a list of people to research; for each person, the following fields are to be extracted:
+  - `years_experience`
+  - `current_company`
+  - `role`
+  - `prior_companies`


 <details>
@@ -27,28 +27,28 @@ The dataset used can be found [here](https://smith.langchain.com/public/2af89d2a
    "type": "object",
    "title": "Person-Schema",
    "required": [
-      "Years-Experience",
-      "Company",
-      "Role",
-      "Prior-Companies"
+      "years_experience",
+      "current_company",
+      "role",
+      "prior_companies"
    ],
    "properties": {
-      "Role": {
+      "role": {
        "type": "string",
        "description": "Current role of the person."
      },
-      "Company": {
+      "current_company": {
        "type": "string",
        "description": "The name of the current company the person works at."
      },
-      "Prior-Companies": {
+      "prior_companies": {
        "type": "array",
        "items": {
          "type": "string"
        },
        "description": "List of previous companies where the person has worked"
      },
-      "Years-Experience": {
+      "years_experience": {
        "type": "number",
        "description": "How many years of full time work experience (excluding internships) does this person have."
      }
@@ -65,26 +65,48 @@ The dataset used can be found [here](https://smith.langchain.com/public/2af89d2a
 ```json
 {
  "extracted_information": {
-      "Role": "Exploring new ideas and building out next project",
-      "Company": "South Park Commons",
-      "Prior-Companies": [
-      "Instabase",
-      "Chestnut",
-      "MIT"
+      "role": "Exploring new ideas and building out next project",
+      "current_company": "South Park Commons",
+      "prior_companies": [
+        "Instabase",
+        "Chestnut",
+        "MIT"
      ],
-      "Years-Experience": 5
+      "years_experience": 5
  }
 }
 ```
 </details>

+### Using the dataset
+
+To use the data from this dataset in your own project, you can:
+
+(1) clone the dataset using the LangSmith SDK:
+
+```python
+from langsmith import Client
+client = Client()
+
+cloned_dataset = client.clone_public_dataset(
+    "https://smith.langchain.com/public/3384cc3a-722c-4eb1-8e41-dff56fea05b8/d",
+    dataset_name="People Data Enrichment"
+)
+```
+
+(2) create a new dataset with the same examples using the following command:
+
+```shell
+python people_data_enrichment/create_dataset.py
+```
+
 ## Evaluation Metric

 Currently there is a single evaluation metric: fraction of the fields that were correctly extracted (per person). Correctness is defined differently depending on the field type:

- fuzzy matching for list of string fields such as `Prior-Companies`
- fuzzy matches for fields like `Role` / `Company`
- checking within a certain tolerance (+/- 15%) for `Years-Experience` field
+- fuzzy matching for list of string fields such as `prior_companies`
+- fuzzy matches for fields like `role` / `current_company`
+- checking within a certain tolerance (+/- 15%) for `years_experience` field

 ## Invoking the agent

@@ -0,0 +1,320 @@
+EXAMPLES = [
+    {
+        "name": "Harrison Chase",
+        "linkedin_profile": "https://www.linkedin.com/in/harrison-chase-961287118/",
+        "work_email": "harrison@langchain.dev",
+        "role": "CEO",
+        "current_company": "LangChain",
+        "prior_companies": "Kensho Technologies, Robust Intelligence",
+        "years_experience": 7,
+    },
+    {
+        "name": "Jake Rachleff",
+        "linkedin_profile": "https://www.linkedin.com/in/jakerachleff/",
+        "work_email": "jake@langchain.dev",
+        "role": "Software Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Databricks",
+        "years_experience": 6,
+    },
+    {
+        "name": "Nuno Campos",
+        "linkedin_profile": "https://www.linkedin.com/in/nuno-f-campos/",
+        "work_email": "nuno@langchain.dev",
+        "role": "Founding Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "James, YLD, Boringbits",
+        "years_experience": 9,
+    },
+    {
+        "name": "Vince Signori",
+        "linkedin_profile": "https://www.linkedin.com/in/vincesignori/",
+        "work_email": "vince@langchain.dev",
+        "role": "GTM",
+        "current_company": "LangChain",
+        "prior_companies": "iPass, Imperva, Zendesk, Trifacta, HashiCorp",
+        "years_experience": 12,
+    },
+    {
+        "name": "Nick Huang",
+        "linkedin_profile": "https://www.linkedin.com/in/ncchuang/",
+        "work_email": "nick@langchain.dev",
+        "role": "Deployed Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Intros AI, Palantir",
+        "years_experience": 2,
+    },
+    {
+        "name": "Will Fu-Hinthorn",
+        "linkedin_profile": "https://www.linkedin.com/in/williamfuhinthorn/",
+        "work_email": "will@langchain.dev",
+        "role": "Foudning Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Microsoft, Robust Intelligence",
+        "years_experience": 7,
+    },
+    {
+        "name": "Jonathan Hodges",
+        "linkedin_profile": "https://www.linkedin.com/in/jonathanhodges/",
+        "work_email": "N/A",
+        "role": "Chief Architect, AI & ML",
+        "current_company": "Bolt",
+        "prior_companies": "NBC Entertainment Digital, Pearson North America, Ascend Learning, Workiva, GHX, Atypical AI, Userpilot",
+        "years_experience": 16,
+    },
+    {
+        "name": "Jessica Ou",
+        "linkedin_profile": "https://www.linkedin.com/in/jessicaou/",
+        "work_email": "jess@langchain.dev",
+        "role": "Business Operations & Finance",
+        "current_company": "LangChain",
+        "prior_companies": "Morgan Stanley, NEA, Sapienne",
+        "years_experience": 8,
+    },
+    {
+        "name": "Vadym Barda",
+        "linkedin_profile": "https://www.linkedin.com/in/vadymbarda/",
+        "work_email": "vadym@langchain.dev",
+        "role": "SWE/ML Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Kensho Technologies, Tunum",
+        "years_experience": 9,
+    },
+    {
+        "name": "Adam D'Abbracci",
+        "linkedin_profile": "https://www.linkedin.com/in/adam-d-abbracci-25390a22/",
+        "work_email": "N/A",
+        "role": "Founder",
+        "current_company": "Proteus",
+        "prior_companies": "TwentyTwenty Productions, Redcircle, Bags Inc., The Walt Disney Company, The New York Times",
+        "years_experience": 11,
+    },
+    {
+        "name": "Charles Bernoskie",
+        "linkedin_profile": "https://www.linkedin.com/in/charlesbernoskie/",
+        "work_email": "charles@langchain.dev",
+        "role": "GTM",
+        "current_company": "LangChain",
+        "prior_companies": "BuyWithMe, Stack Overflow, Elastic, materialize",
+        "years_experience": 18,
+    },
+    {
+        "name": "Bagatur Askaryan",
+        "linkedin_profile": "https://www.linkedin.com/in/bagatur-askaryan/",
+        "work_email": "bagatur@langchain.dev",
+        "role": "Founding Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Robust Intelligence",
+        "years_experience": 4,
+    },
+    {
+        "name": "Cameron Vetter",
+        "linkedin_profile": "https://www.linkedin.com/in/cameronvetter/",
+        "work_email": "N/A",
+        "role": "AI Practice Director",
+        "current_company": "New Resources Consulting",
+        "prior_companies": "Edstrom Industries, Centare Group, GE Healthcare, Direct Supply, Runzheimer International, Safenet Consulting, Octavion Technology Group, Zecil Software",
+        "years_experience": 24,
+    },
+    {
+        "name": "Alex Kira",
+        "linkedin_profile": "https://www.linkedin.com/in/alexkira/",
+        "work_email": "alex@langchain.dev",
+        "role": "Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Air2Web, CheckFree, Optimus Solutions, PromoterBee, Oracle, LoLo, Apple, Telmate, Uber, Netflix, Ramp",
+        "years_experience": 24,
+    },
+    {
+        "name": "Chester Curme",
+        "linkedin_profile": "https://www.linkedin.com/in/chestercurme/",
+        "work_email": "chester@langchain.dev",
+        "role": "Machine Learning Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Loomis, Sayles & Company, Kensho Technologies, Evisort, Microsoft",
+        "years_experience": 9,
+    },
+    {
+        "name": "Ankush Gola",
+        "linkedin_profile": "https://www.linkedin.com/in/ankush-gola-77255866/",
+        "work_email": "ankush@langchain.dev",
+        "role": "Co-Founder",
+        "current_company": "LangChain",
+        "prior_companies": "Facebook, Robust Intelligence, Unfold",
+        "years_experience": 9,
+    },
+    {
+        "name": "Jacob Lee",
+        "linkedin_profile": "https://www.linkedin.com/in/jacoblee93/",
+        "work_email": "jacob@langchain.dev",
+        "role": "Founding Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Google, Autocode, Remora Software",
+        "years_experience": 9,
+    },
+    {
+        "name": "Lance Martin",
+        "linkedin_profile": "https://www.linkedin.com/in/lance-martin-64a33b5/",
+        "work_email": "lance@langchain.dev",
+        "role": "Software/ML",
+        "current_company": "LangChain",
+        "prior_companies": "Uber, Ike, Nuro",
+        "years_experience": 9,
+    },
+    {
+        "name": "David Duong",
+        "linkedin_profile": "https://www.linkedin.com/in/duongtat",
+        "work_email": "david@langchain.dev",
+        "role": "Founding Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "AstrumQ Interactive, Moravio, Spendee, Fitify, Cleevio",
+        "years_experience": 9,
+    },
+    {
+        "name": "Kevin Swiber",
+        "linkedin_profile": "https://www.linkedin.com/in/kevinswiber",
+        "work_email": "N/A",
+        "role": "API Strategy Lead",
+        "current_company": "Postman",
+        "prior_companies": "Ford Motor Company, EHIM, Quicken Loans, Apigee, LunchBadger, NodeSource",
+        "years_experience": 18,
+    },
+    {
+        "name": "Andrew Nguonly",
+        "linkedin_profile": "https://www.linkedin.com/in/andrewnguonly/",
+        "work_email": "andrew@langchain.dev",
+        "role": "Software Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "j2 Global, CGI, Omaze, Honey, Netflix, Carta, South Park Commons",
+        "years_experience": 13,
+    },
+    {
+        "name": "Nazar Borovets",
+        "linkedin_profile": "https://www.linkedin.com/in/nazareka/",
+        "work_email": "N/A",
+        "role": "Backend & LLM Python Developer",
+        "current_company": "Veido",
+        "prior_companies": "HexOcean, Wooden Borovets Products, Gart Technology",
+        "years_experience": 1,
+    },
+    {
+        "name": "Eric Han",
+        "linkedin_profile": "https://www.linkedin.com/in/eric-han27/",
+        "work_email": "eric@langchain.dev",
+        "role": "Software Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Hotwire, Bottles Waiting Inc., Shopagon, Overlay Gaming Corporation, Instabase",
+        "years_experience": 12,
+    },
+    {
+        "name": "Greg Asquith",
+        "linkedin_profile": "https://www.linkedin.com/in/gregasquith/",
+        "work_email": "N/A",
+        "role": "Technology Consultant",
+        "current_company": "gregasquith.com",
+        "prior_companies": "Renault, Essence, Adcessible, Smartly Video and Display",
+        "years_experience": 9,
+    },
+    {
+        "name": "Erick Friis",
+        "linkedin_profile": "https://www.linkedin.com/in/efriis/",
+        "work_email": "erick@langchain.dev",
+        "role": "Founding Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Instabase, Chestnut, South Park Commons",
+        "years_experience": 5,
+    },
+    {
+        "name": "Oliver Dupuis",
+        "linkedin_profile": "https://www.linkedin.com/in/olivierdupuis/",
+        "work_email": "N/A",
+        "role": "Data Product Builder",
+        "current_company": "RepublicOfData.io",
+        "prior_companies": "University of Ottawa, Lantrns Analytics, Rittman Analytics",
+        "years_experience": 19,
+    },
+    {
+        "name": "Eugene Yurtsev",
+        "linkedin_profile": "https://www.linkedin.com/in/eugene-yurtsev-797a3b1b/",
+        "work_email": "eugene@langchain.dev",
+        "role": "SWE/ML Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Kensho Technologies, Yurtsev",
+        "years_experience": 9,
+    },
+    {
+        "name": "Brian Vander Schaaf",
+        "linkedin_profile": "https://www.linkedin.com/in/brianvanderschaaf/",
+        "work_email": "brian@langchain.dev",
+        "role": "Software Engineer",
+        "current_company": "LangChain",
+        "prior_companies": "Yodle, Two Sigma IQ, Frontrunner",
+        "years_experience": 8,
+    },
+    {
+        "name": "Julia Schottenstein",
+        "linkedin_profile": "https://www.linkedin.com/in/julia-schottenstein-25424318/",
+        "work_email": "julia@langchain.dev",
+        "role": "Building",
+        "current_company": "LangChain",
+        "prior_companies": "Qatalyst Partners, NEA, dbt Labs",
+        "years_experience": 10,
+    },
+    {
+        "name": "Wei Wong",
+        "linkedin_profile": "https://www.linkedin.com/in/weijianwong/",
+        "work_email": "wei@langchain.dev",
+        "role": "Account Executive",
+        "current_company": "LangChain",
+        "prior_companies": "Deutsche Bank, SingleStore, Snowflake",
+        "years_experience": 7,
+    },
+]
+
+if __name__ == "__main__":
+    from langsmith import Client
+    from langsmith.utils import LangSmithNotFoundError
+
+    client = Client()
+    dataset_name = "People Data Enrichment"
+
+    # Storing inputs in a dataset lets us
+    # run chains and LLMs over a shared set of examples.
+    try:
+        exists_dataset = client.read_dataset(dataset_name=dataset_name)
+        print(f"Dataset '{dataset_name}' already exists.")
+        print("You can access the dataset via the URL: ", exists_dataset.url)
+        exit(1)
+    except LangSmithNotFoundError:
+        # Then let's create the dataset if it doesn't exist
+        pass
+
+    dataset = client.create_dataset(
+        dataset_name=dataset_name,
+        description="Evaluate ability to research information about people (e.g., name, email, linkedin)",
+    )
+
+    # Prepare inputs, outputs, and metadata for bulk creation
+    inputs = [
+        {
+            k: v
+            for k, v in record.items()
+            if k in ["name", "work_email", "linkedin_profile"]
+        }
+        for record in EXAMPLES
+    ]
+    outputs = [
+        {
+            k: v
+            for k, v in record.items()
+            if k in ["years_experience", "current_company", "role", "prior_companies"]
+        }
+        for record in EXAMPLES
+    ]
+    client.create_examples(
+        inputs=inputs,
+        outputs=outputs,
+        dataset_id=dataset.id,
+    )
+    print(f"Dataset '{dataset_name}' created with {len(EXAMPLES)} examples.")
+    print("You can access the dataset via the URL: ", dataset.url)
@@ -8,9 +8,9 @@ import argparse
 # Defaults
 EXPERIMENT_PREFIX = "People mAIstro "
 TOLERANCE = 0.15  # should match within 15%
-NUMERIC_FIELDS = ("Years-Experience",)
-FUZZY_MATCH_FIELDS = ("Role", "Company")
-LIST_OF_STRING_FIELDS = ("Prior-Companies",)
+NUMERIC_FIELDS = ("years_experience",)
+FUZZY_MATCH_FIELDS = ("role", "current_company")
+LIST_OF_STRING_FIELDS = ("prior_companies",)
 DEFAULT_DATASET_NAME = "People Data Enrichment"
 DEFAULT_GRAPH_ID = "people_maistro"
 DEFAULT_AGENT_URL = "https://langr.ph/marketplace/62bf5890-28fa-4dd1-b469-4751fe7ecdf3"
@@ -20,29 +20,29 @@ client = Client()
 extraction_schema = {
    "type": "object",
    "required": [
-        "Years-Experience",
-        "Company",
-        "Role",
-        "Prior-Companies",
+        "years_experience",
+        "current_company",
+        "role",
+        "prior_companies",
    ],
    "properties": {
-        "Role": {"type": "string", "description": "Current role of the person."},
-        "Years-Experience": {
+        "role": {"type": "string", "description": "Current role of the person."},
+        "years_experience": {
            "type": "number",
            "description": "How many years of full time work experience (excluding internships) does this person have.",
        },
-        "Company": {
+        "current_company": {
            "type": "string",
            "description": "The name of the current company the person works at.",
        },
-        "Prior-Companies": {
+        "prior_companies": {
            "type": "array",
            "items": {"type": "string"},
            "description": "List of previous companies where the person has worked",
        },
    },
    "description": "Person information",
-    "title": "Person-Schema",
+    "title": "Person",
 }


@@ -132,9 +132,9 @@ def transform_dataset_inputs(inputs: dict) -> dict:
    # see the `Example input` in the README for reference on what `inputs` dict should look like
    return {
        "person": {
-            "name": inputs["Person"],
-            "email": inputs["Work-Email"],
-            "linkedin": inputs["Linkedin"],
+            "name": inputs["name"],
+            "email": inputs["work_email"],
+            "linkedin": inputs["linkedin_profile"],
        },
        "extraction_schema": extraction_schema,
    }