feat: added langchain v1 and refactored folders

2026-07-01 19:54:41 -04:00 · 2025-11-22 14:39:09 +01:00
parent 51cbdc0435
commit d716f3c2af
11 changed files with 282 additions and 84 deletions
@@ -1,5 +1,6 @@
 OPENAI_API_KEY=sk-

+# If using LangSmith
 LANGSMITH_TRACING=true
 LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
 LANGSMITH_API_KEY=lsv2_pt_
@@ -1,64 +1,24 @@
 # 🔒 PII Removal LangSmith

-A comprehensive demonstration of how to prevent logging of sensitive data and personally identifiable information (PII) in LangSmith traces using environment variables, client input/output manipulation, custom anonymizers, and LangGraph integration.
+A comprehensive demonstration of how to prevent logging of sensitive data and personally identifiable information (PII) in LangSmith traces. **You can mask PII even without LangChain!** This repository shows multiple approaches including direct LangSmith integration, LangGraph workflows, and LangChain middleware.

 ## ✨ Features

 - **🔐 Automatic PII Masking**: Uses LangSmith's `create_anonymizer` to automatically mask emails, IP addresses, phone numbers, credit cards, SSNs, and dates
+- **🚫 Works Without LangChain**: PII masking works directly with OpenAI and LangSmith - no LangChain required!
 - **🛠️ Multiple Approaches**: Shows different methods for PII removal including environment variables, client manipulation, and custom anonymizers
 - **🔄 LangGraph Integration**: Demonstrates PII masking in a LangGraph agent workflow
+- **🛡️ LangChain PIIMiddleware**: Demonstrates PII detection and handling using LangChain's PIIMiddleware with configurable strategies (redact, mask, hash, block)

-## 🛡️ PII Masking Methods
+## ⚙️ Setup

-You can try these methods in the `remove_pii.ipynb` notebook:
+### Prerequisites

-### 1. Environment Variables
-Set environment variables to hide all inputs/outputs globally:
-```bash
-export LANGCHAIN_HIDE_INPUTS=true
-export LANGCHAIN_HIDE_OUTPUTS=true
-```
-
-### 2. LangSmith Client with Anonymizer
-Use custom regex patterns to mask specific PII types:
-```python
-from langsmith.anonymizer import create_anonymizer
-
-anonymizer = create_anonymizer([
-    {"pattern": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "replace": "[EMAIL_REDACTED]"},
-    {"pattern": r"\b(?:\d{1,3}\.){3}\d{1,3}\b", "replace": "[IP_REDACTED]"}
-])
-
-langsmith_client = Client(anonymizer=anonymizer)
-```
-
-### 3. Wrapped OpenAI Client
-Integrate PII masking directly with OpenAI API calls:
-```python
-from langsmith.wrappers import wrap_openai
-
-openai_client = wrap_openai(openai.Client())
-response = openai_client.chat.completions.create(
-    model="gpt-4o-mini",
-    messages=[...],
-    langsmith_extra={"client": langsmith_client}
-)
-```
-
-## 🚀 LangGraph Integration
-
-The `langgraph/agent.py` demonstrates how to integrate PII masking into a LangGraph workflow, ensuring all inputs are automatically masked before processing.
-
-> [!IMPORTANT]
->  The `@asynccontextmanager` is required to inject the custom LangSmith client with anonymizer into the graph. 
-
-### 📋 Prerequisites
-
- Python 3.10+
+- Python 3.11+
 - OpenAI API key
 - LangSmith API key (optional, for trace viewing)

-### ⚙️ Setup
+### Installation Steps

 1. **Clone the repository**
   ```bash
@@ -74,36 +34,178 @@ The `langgraph/agent.py` demonstrates how to integrate PII masking into a LangGr

 3. **Install dependencies**
   ```bash
-   pip install -r ./langgraph/requirements.txt
+   pip install -r requirements.txt
   ```

-4. **Set up environment variables**
+4. **Create `.env` file**
   ```bash
-   # Create .env file in langgraph directory
-   cp langgraph/.env.example langgraph/.env
-   # Edit langgraph/.env with your API keys
+   # Create .env file in root directory
+   touch .env  # On Windows: type nul > .env
+   ```
+   
+   Add your API keys to `.env`:
+   ```bash
+   OPENAI_API_KEY=your_openai_api_key_here
+   LANGSMITH_API_KEY=your_langsmith_api_key_here  # Optional
+   LANGSMITH_TRACING=true  # Optional, for trace viewing
   ```

-### 🎯 Running the LangGraph Demo
+## 🛡️ PII Masking Methods (No LangChain Required!)
+
+> **💡 Key Point**: You can mask PII in LangSmith traces **without using LangChain**. The methods below work directly with OpenAI and LangSmith.
+
+See the `non-langchain-example/remove_pii.ipynb` notebook for working examples:
+
+### 1. Environment Variables (Simplest Method)
+Hide all inputs/outputs globally - no code changes needed:
+```bash
+export LANGSMITH_HIDE_INPUTS=true
+export LANGSMITH_HIDE_OUTPUTS=true
+```
+
+### 2. LangSmith Client with Anonymizer (Recommended)
+Use custom regex patterns to mask specific PII types - works with any OpenAI client:
+```python
+import openai
+from langsmith import Client
+from langsmith.wrappers import wrap_openai
+from langsmith.anonymizer import create_anonymizer
+
+# Create anonymizer with regex patterns
+anonymizer = create_anonymizer([
+    {"pattern": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "replace": "[EMAIL_REDACTED]"},
+    {"pattern": r"\b(?:\d{1,3}\.){3}\d{1,3}\b", "replace": "[IP_REDACTED]"}
+])
+
+# Use with LangSmith client
+langsmith_client = Client(anonymizer=anonymizer)
+openai_client = wrap_openai(openai.Client())
+
+# PII is automatically masked in traces
+response = openai_client.chat.completions.create(
+    model="gpt-4o-mini",
+    messages=[{"role": "user", "content": "My email is john@example.com"}],
+    langsmith_extra={"client": langsmith_client}
+)
+```
+
+### 3. Custom Input/Output Redaction
+Define custom logic to redact specific fields:
+```python
+from langsmith import Client
+from langsmith.wrappers import wrap_openai
+import openai
+
+def redact_system_messages(inputs: dict) -> dict:
+    """Redact system messages from inputs."""
+    messages = inputs.get("messages", [])
+    redacted = [
+        {"role": m.get("role"), "content": "REDACTED"}
+        if m.get("role") == "system" else m
+        for m in messages
+    ]
+    return {**inputs, "messages": redacted}
+
+langsmith_client = Client(hide_inputs=redact_system_messages)
+openai_client = wrap_openai(openai.Client())
+
+response = openai_client.chat.completions.create(
+    model="gpt-4o-mini",
+    messages=[...],
+    langsmith_extra={"client": langsmith_client}
+)
+```
+
+> **📝 Note**: All three methods work with the standard OpenAI Python SDK - no LangChain dependency required!
+
+## 🚀 LangGraph Integration
+
+The `langgraph-example/agent.py` demonstrates how to integrate PII masking into a LangGraph workflow using LangSmith's anonymizer (the same anonymizer approach shown above - no LangChain required for the masking itself).
+
+> [!IMPORTANT]
+>  The `@asynccontextmanager` is required to inject the custom LangSmith client with anonymizer into the graph.
+
+## 🛡️ LangChain PIIMiddleware Integration
+
+The `langchain-example/agent.py` demonstrates how to use LangChain's `PIIMiddleware` to detect and handle PII with configurable strategies. If you're already using LangChain agents, this provides fine-grained control over PII handling within the agent middleware layer.
+
+### Features
+
+- **Built-in PII Types**: Email, credit card, IP address, MAC address, URL
+- **Custom PII Detectors**: Regex patterns or custom functions for domain-specific PII
+- **Multiple Strategies**: 
+  - `redact`: Replace with `[REDACTED_TYPE]`
+  - `mask`: Partially mask (e.g., `****-****-****-1234`)
+  - `hash`: Replace with deterministic hash
+  - `block`: Raise exception when detected
+- **Flexible Application**: Apply to inputs, outputs, and tool results independently
+
+### Example Usage
+
+```python
+import re
+
+from langchain.agents import create_agent
+from langchain.agents.middleware import PIIMiddleware
+
+# Custom detector function
+def detect_api_key(content: str) -> list[dict[str, str | int]]:
+    matches = []
+    pattern = r"sk-[a-zA-Z0-9]{32,}"
+    for match in re.finditer(pattern, content):
+        matches.append({
+            "text": match.group(0),
+            "start": match.start(),
+            "end": match.end(),
+        })
+    return matches
+
+# Create agent with PII middleware
+agent = create_agent(
+    model="gpt-4o-mini",
+    tools=[...],
+    middleware=[
+        # Built-in email detection with redact strategy
+        PIIMiddleware("email", strategy="redact", apply_to_input=True),
+        # Built-in credit card with mask strategy
+        PIIMiddleware("credit_card", strategy="mask", apply_to_input=True),
+        # Custom API key detector with block strategy
+        PIIMiddleware("api_key", detector=detect_api_key, strategy="block", apply_to_input=True),
+    ],
+)
+```
+
+## 🎯 Running the Examples
+
+### Quick Start: Non-LangChain Example
+
+Try PII masking with just OpenAI and LangSmith:
+
+1. **Run the notebook**
+   ```bash
+   jupyter notebook non-langchain-example/remove_pii.ipynb
+   ```
+
+### Running LangGraph/LangChain Examples

 1. **Start LangGraph Studio**
   ```bash
-   langgraph dev --config langgraph/langgraph.json
+   langgraph dev --config langgraph.json
   ```

 2. **Access the Studio**
   - Open your browser to `http://localhost:2024`
-   - The agent will automatically mask PII in all inputs
+   - Select either `langgraph_pii_masking` or `langchain_pii_masking`

 3. **Test PII Masking**
-   - Try inputs with PII like: "My email is john@example.com and phone is (555) 123-4567"
-   - Observe how LangSmith automatically masks the PII before processing
+   - **LangGraph example**: Try "My email is john@example.com and phone is (555) 123-4567"
+   - **LangChain example**: Try "My email is john@example.com and credit card is 4532-1234-5678-9010"
+   - Observe how PII is automatically handled before processing

 ![LangGraph PII Masking Trace](images/langgraph.png)

 ## 📊 Supported PII Types

-The demo automatically masks:
+### Direct LangSmith Integration (No LangChain Required)
+The `non-langchain-example/remove_pii.ipynb` and `langgraph-example/agent.py` use LangSmith's anonymizer:
 - **📧 Email addresses**: `user@example.com` → `[EMAIL_REDACTED]`
 - **🌐 IP addresses**: `192.168.1.1` → `[IP_REDACTED]`
 - **📞 Phone numbers**: `(555) 123-4567` → `[PHONE_REDACTED]`
@@ -111,11 +213,29 @@ The demo automatically masks:
 - **🆔 Social Security Numbers**: `123-45-6789` → `[SSN_REDACTED]`
 - **📅 Dates**: `12/25/2024` → `[DATE_REDACTED]`

+### LangChain PIIMiddleware (Optional)
+The `langchain-example/agent.py` uses LangChain's middleware:
+- **📧 Email addresses**: Redacted (`[REDACTED_EMAIL]`)
+- **💳 Credit cards**: Masked (shows last 4 digits: `****-****-****-9010`)
+- **🔑 API keys**: Redacted (matched by a custom `sk-...` regex detector)
+
+## 🎯 Which Approach Should I Use?
+
+| Approach | When to Use | Dependencies |
+|----------|-------------|---------------|
+| **LangSmith Anonymizer** | ✅ **Recommended for most cases** - Works with any OpenAI client, no LangChain needed | `openai`, `langsmith` |
+| **Environment Variables** | Simple blanket hiding of all inputs/outputs | None (just env vars) |
+| **LangChain PIIMiddleware** | Only if you're already using LangChain agents and need middleware-level control | `langchain`, `langchain-openai` |
+| **LangGraph + Anonymizer** | When building LangGraph workflows | `langgraph`, `langsmith` |
+
+> **💡 Recommendation**: Start with LangSmith's anonymizer (Method 2 above) - it works with standard OpenAI SDK and requires no LangChain dependencies!
+
 ## 📚 Documentation

 For more detailed information on PII masking and observability, visit:
 - [LangSmith Documentation](https://docs.smith.langchain.com/observability/how_to_guides/mask_inputs_outputs)
 - [LangGraph Documentation](https://langchain-ai.github.io/langgraph/)
+- [LangChain PIIMiddleware Documentation](https://docs.langchain.com/oss/python/langchain/middleware)

 ## 🤝 Contributing

@@ -0,0 +1,79 @@
+from langchain.agents import create_agent
+from langchain.agents.middleware import PIIMiddleware
+from langchain_openai import ChatOpenAI
+from langchain_core.tools import tool
+from dotenv import load_dotenv
+
+load_dotenv(dotenv_path="../.env", override=True)
+
+# Define tools for the agent
+@tool
+def get_weather(location: str) -> str:
+    """Get the weather for a given location."""
+    return f"The weather in {location} is sunny and 72°F."
+
+@tool
+def calculate(expression: str) -> str:
+    """Calculate a simple mathematical expression (basic operations only)."""
+    import operator
+    import ast
+    
+    # Safe evaluation using ast.literal_eval for simple expressions
+    # This only works for simple numeric expressions, not arbitrary code
+    try:
+        # For simple arithmetic, we'll use a safer approach
+        # This is a simplified version - in production, use a proper math parser
+        allowed_ops = {
+            ast.Add: operator.add,
+            ast.Sub: operator.sub,
+            ast.Mult: operator.mul,
+            ast.Div: operator.truediv,
+        }
+        
+        def safe_eval(node):
+            if isinstance(node, ast.Constant):
+                return node.value
+            elif isinstance(node, ast.BinOp):
+                left = safe_eval(node.left)
+                right = safe_eval(node.right)
+                op = allowed_ops.get(type(node.op))
+                if op is None:
+                    raise ValueError("Unsupported operation")
+                return op(left, right)
+            else:
+                raise ValueError("Unsupported expression")
+        
+        tree = ast.parse(expression, mode='eval')
+        result = safe_eval(tree.body)
+        return f"The result is {result}"
+    except:
+        return "Invalid expression. Please use simple arithmetic (e.g., '2 + 2', '10 * 5')"
+
+# Create the agent with PII middleware
+# Demonstrates different strategies: redact, mask, and custom detector
+agent = create_agent(
+    model=ChatOpenAI(model="gpt-4o-mini", temperature=0),
+    tools=[get_weather, calculate],
+    middleware=[
+        # Email addresses - redact strategy (built-in type)
+        PIIMiddleware("email", strategy="redact", apply_to_input=True),
+        # Credit card numbers - mask strategy (shows last 4 digits)
+        PIIMiddleware("credit_card", strategy="mask", apply_to_input=True),
+        # Custom API key detector using regex pattern - redact strategy
+        PIIMiddleware(
+            "api_key",
+            detector=r"sk-[a-zA-Z0-9]{32,}",
+            strategy="redact",
+            apply_to_input=True,
+        ),
+    ],
+)
+
+# For compatibility with async context manager pattern
+from contextlib import asynccontextmanager
+
+@asynccontextmanager
+async def compile_agent():
+    """Async context manager for the agent."""
+    yield agent
+
@@ -1,16 +1,16 @@
 from dotenv import load_dotenv
 from contextlib import asynccontextmanager
-from langchain_core.messages import AnyMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AnyMessage, HumanMessage
 from langgraph.graph import END, START, StateGraph
 from langgraph.graph.message import add_messages
-from typing_extensions import Annotated, List, TypedDict
+from typing_extensions import Annotated, TypedDict
 from langsmith import Client
 from langsmith.wrappers import wrap_openai
 import langsmith as ls
 from langsmith.anonymizer import create_anonymizer
 import openai

-load_dotenv(dotenv_path="./langgraph/.env", override=True)
+load_dotenv(dotenv_path="../.env", override=True)

 # create an anonymizer that masks various PII patterns
 anonymizer = create_anonymizer([
@@ -68,6 +68,6 @@ builder.add_edge("llm_node", END)
 agent = builder.compile()

@asynccontextmanager
-async def create_agent():
+async def compile_agent():
    with ls.tracing_context(client=langsmith_client):
        yield agent
@@ -0,0 +1,10 @@
+{
+    "graphs": {
+        "langgraph_pii_masking": "./langgraph-example/agent.py:compile_agent",
+        "langchain_pii_masking": "./langchain-example/agent.py:compile_agent"
+    },
+    "env": ".env",
+    "python_version": "3.11",
+    "dependencies": ["requirements.txt"],
+    "image_distro": "wolfi"
+}
@@ -1,9 +0,0 @@
-{
-    "graphs": {
-        "pii_masking": "./langgraph/agent.py:create_agent"
-    },
-    "env": ".env",
-    "python_version": "3.11",
-    "dependencies": ["./langgraph/requirements.txt"],
-    "image_distro": "wolfi"
-}
@@ -1,4 +0,0 @@
-langgraph==0.6.1
-langgraph-cli[inmem]==0.3.6
-python-dotenv==1.1.1
-langchain-openai==0.3.28
@@ -1,14 +1,15 @@
-langgraph
-langgraph-sdk
-langgraph-checkpoint-sqlite
-langsmith>=0.2.0
-langchain-community
-langchain-core
-langchain-openai
+langgraph-sdk>=0.2.9
+langgraph-checkpoint-sqlite>=3.0.0
+langchain-community>=0.4.1
 notebook
-python-dotenv
 lxml
 scikit-learn
 pandas
 pyarrow
-openai
+
+langgraph>=1.0.3
+langchain>=1.0.8
+langgraph-cli[inmem]
+python-dotenv
+langchain-openai>=1.0.0
+langsmith>=0.4.0