mirror of
https://github.com/langchain-ai/langsmith-pii-removal.git
synced 2026-07-01 19:54:41 -04:00
feat: added langchain v1 and refactored folders
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
OPENAI_API_KEY=sk-
|
||||
|
||||
# If using LangSmith
|
||||
LANGSMITH_TRACING=true
|
||||
LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
|
||||
LANGSMITH_API_KEY=lsv2_pt_
|
||||
@@ -1,64 +1,24 @@
|
||||
# 🔒 PII Removal LangSmith
|
||||
|
||||
A comprehensive demonstration of how to prevent logging of sensitive data and personally identifiable information (PII) in LangSmith traces using environment variables, client input/output manipulation, custom anonymizers, and LangGraph integration.
|
||||
A comprehensive demonstration of how to prevent logging of sensitive data and personally identifiable information (PII) in LangSmith traces. **You can mask PII even without LangChain!** This repository shows multiple approaches including direct LangSmith integration, LangGraph workflows, and LangChain middleware.
|
||||
|
||||
## ✨ Features
|
||||
|
||||
- **🔐 Automatic PII Masking**: Uses LangSmith's `create_anonymizer` to automatically mask emails, IP addresses, phone numbers, credit cards, SSNs, and dates
|
||||
- **🚫 Works Without LangChain**: PII masking works directly with OpenAI and LangSmith - no LangChain required!
|
||||
- **🛠️ Multiple Approaches**: Shows different methods for PII removal including environment variables, client manipulation, and custom anonymizers
|
||||
- **🔄 LangGraph Integration**: Demonstrates PII masking in a LangGraph agent workflow
|
||||
- **🛡️ LangChain PIIMiddleware**: Demonstrates PII detection and handling using LangChain's PIIMiddleware with configurable strategies (redact, mask, hash, block)
|
||||
|
||||
## 🛡️ PII Masking Methods
|
||||
## ⚙️ Setup
|
||||
|
||||
You can try these methods in the `remove_pii.ipynb` notebook:
|
||||
### Prerequisites
|
||||
|
||||
### 1. Environment Variables
|
||||
Set environment variables to hide all inputs/outputs globally:
|
||||
```bash
|
||||
export LANGCHAIN_HIDE_INPUTS=true
|
||||
export LANGCHAIN_HIDE_OUTPUTS=true
|
||||
```
|
||||
|
||||
### 2. LangSmith Client with Anonymizer
|
||||
Use custom regex patterns to mask specific PII types:
|
||||
```python
|
||||
from langsmith.anonymizer import create_anonymizer
|
||||
|
||||
anonymizer = create_anonymizer([
|
||||
{"pattern": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "replace": "[EMAIL_REDACTED]"},
|
||||
{"pattern": r"\b(?:\d{1,3}\.){3}\d{1,3}\b", "replace": "[IP_REDACTED]"}
|
||||
])
|
||||
|
||||
langsmith_client = Client(anonymizer=anonymizer)
|
||||
```
|
||||
|
||||
### 3. Wrapped OpenAI Client
|
||||
Integrate PII masking directly with OpenAI API calls:
|
||||
```python
|
||||
from langsmith.wrappers import wrap_openai
|
||||
|
||||
openai_client = wrap_openai(openai.Client())
|
||||
response = openai_client.chat.completions.create(
|
||||
model="gpt-4o-mini",
|
||||
messages=[...],
|
||||
langsmith_extra={"client": langsmith_client}
|
||||
)
|
||||
```
|
||||
|
||||
## 🚀 LangGraph Integration
|
||||
|
||||
The `langgraph/agent.py` demonstrates how to integrate PII masking into a LangGraph workflow, ensuring all inputs are automatically masked before processing.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> The `@asynccontextmanager` is required to inject the custom LangSmith client with anonymizer into the graph.
|
||||
|
||||
### 📋 Prerequisites
|
||||
|
||||
- Python 3.10+
|
||||
- Python 3.11+
|
||||
- OpenAI API key
|
||||
- LangSmith API key (optional, for trace viewing)
|
||||
|
||||
### ⚙️ Setup
|
||||
### Installation Steps
|
||||
|
||||
1. **Clone the repository**
|
||||
```bash
|
||||
@@ -74,36 +34,178 @@ The `langgraph/agent.py` demonstrates how to integrate PII masking into a LangGr
|
||||
|
||||
3. **Install dependencies**
|
||||
```bash
|
||||
pip install -r ./langgraph/requirements.txt
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
4. **Set up environment variables**
|
||||
4. **Create `.env` file**
|
||||
```bash
|
||||
# Create .env file in langgraph directory
|
||||
cp langgraph/.env.example langgraph/.env
|
||||
# Edit langgraph/.env with your API keys
|
||||
# Create .env file in root directory
|
||||
touch .env # On Windows: type nul > .env
|
||||
```
|
||||
|
||||
Add your API keys to `.env`:
|
||||
```bash
|
||||
OPENAI_API_KEY=your_openai_api_key_here
|
||||
LANGSMITH_API_KEY=your_langsmith_api_key_here # Optional
|
||||
LANGSMITH_TRACING=true # Optional, for trace viewing
|
||||
```
|
||||
|
||||
### 🎯 Running the LangGraph Demo
|
||||
## 🛡️ PII Masking Methods (No LangChain Required!)
|
||||
|
||||
> **💡 Key Point**: You can mask PII in LangSmith traces **without using LangChain**. The methods below work directly with OpenAI and LangSmith.
|
||||
|
||||
See the `non-langchain-example/remove_pii.ipynb` notebook for working examples:
|
||||
|
||||
### 1. Environment Variables (Simplest Method)
|
||||
Hide all inputs/outputs globally - no code changes needed:
|
||||
```bash
|
||||
export LANGSMITH_HIDE_INPUTS=true
|
||||
export LANGSMITH_HIDE_OUTPUTS=true
|
||||
```
|
||||
|
||||
### 2. LangSmith Client with Anonymizer (Recommended)
|
||||
Use custom regex patterns to mask specific PII types - works with any OpenAI client:
|
||||
```python
|
||||
import openai
|
||||
from langsmith import Client
|
||||
from langsmith.wrappers import wrap_openai
|
||||
from langsmith.anonymizer import create_anonymizer
|
||||
|
||||
# Create anonymizer with regex patterns
|
||||
anonymizer = create_anonymizer([
|
||||
{"pattern": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "replace": "[EMAIL_REDACTED]"},
|
||||
{"pattern": r"\b(?:\d{1,3}\.){3}\d{1,3}\b", "replace": "[IP_REDACTED]"}
|
||||
])
|
||||
|
||||
# Use with LangSmith client
|
||||
langsmith_client = Client(anonymizer=anonymizer)
|
||||
openai_client = wrap_openai(openai.Client())
|
||||
|
||||
# PII is automatically masked in traces
|
||||
response = openai_client.chat.completions.create(
|
||||
model="gpt-4o-mini",
|
||||
messages=[{"role": "user", "content": "My email is john@example.com"}],
|
||||
langsmith_extra={"client": langsmith_client}
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Custom Input/Output Redaction
|
||||
Define custom logic to redact specific fields:
|
||||
```python
|
||||
from langsmith import Client
|
||||
from langsmith.wrappers import wrap_openai
|
||||
import openai
|
||||
|
||||
def redact_system_messages(inputs: dict) -> dict:
|
||||
"""Redact system messages from inputs."""
|
||||
messages = inputs.get("messages", [])
|
||||
redacted = [
|
||||
{"role": m.get("role"), "content": "REDACTED"}
|
||||
if m.get("role") == "system" else m
|
||||
for m in messages
|
||||
]
|
||||
return {**inputs, "messages": redacted}
|
||||
|
||||
langsmith_client = Client(hide_inputs=redact_system_messages)
|
||||
openai_client = wrap_openai(openai.Client())
|
||||
|
||||
response = openai_client.chat.completions.create(
|
||||
model="gpt-4o-mini",
|
||||
messages=[...],
|
||||
langsmith_extra={"client": langsmith_client}
|
||||
)
|
||||
```
|
||||
|
||||
> **📝 Note**: All three methods work with the standard OpenAI Python SDK - no LangChain dependency required!
|
||||
|
||||
## 🚀 LangGraph Integration
|
||||
|
||||
The `langgraph-example/agent.py` demonstrates how to integrate PII masking into a LangGraph workflow using LangSmith's anonymizer (the same anonymizer approach shown above - no LangChain required for the masking itself).
|
||||
|
||||
> [!IMPORTANT]
|
||||
> The `@asynccontextmanager` is required to inject the custom LangSmith client with anonymizer into the graph.
|
||||
|
||||
## 🛡️ LangChain PIIMiddleware Integration
|
||||
|
||||
The `langchain-example/agent.py` demonstrates how to use LangChain's `PIIMiddleware` to detect and handle PII with configurable strategies. If you're already using LangChain agents, this provides fine-grained control over PII handling within the agent middleware layer.
|
||||
|
||||
### Features
|
||||
|
||||
- **Built-in PII Types**: Email, credit card, IP address, MAC address, URL
|
||||
- **Custom PII Detectors**: Regex patterns or custom functions for domain-specific PII
|
||||
- **Multiple Strategies**:
|
||||
- `redact`: Replace with `[REDACTED_TYPE]`
|
||||
- `mask`: Partially mask (e.g., `****-****-****-1234`)
|
||||
- `hash`: Replace with deterministic hash
|
||||
- `block`: Raise exception when detected
|
||||
- **Flexible Application**: Apply to inputs, outputs, and tool results independently
|
||||
|
||||
### Example Usage
|
||||
|
||||
```python
|
||||
import re
from langchain.agents import create_agent
|
||||
from langchain.agents.middleware import PIIMiddleware
|
||||
|
||||
# Custom detector function
|
||||
def detect_api_key(content: str) -> list[dict[str, str | int]]:
|
||||
matches = []
|
||||
pattern = r"sk-[a-zA-Z0-9]{32,}"
|
||||
for match in re.finditer(pattern, content):
|
||||
matches.append({
|
||||
"text": match.group(0),
|
||||
"start": match.start(),
|
||||
"end": match.end(),
|
||||
})
|
||||
return matches
|
||||
|
||||
# Create agent with PII middleware
|
||||
agent = create_agent(
|
||||
model="gpt-4o-mini",
|
||||
tools=[...],
|
||||
middleware=[
|
||||
# Built-in email detection with redact strategy
|
||||
PIIMiddleware("email", strategy="redact", apply_to_input=True),
|
||||
# Built-in credit card with mask strategy
|
||||
PIIMiddleware("credit_card", strategy="mask", apply_to_input=True),
|
||||
# Custom API key detector with block strategy
|
||||
PIIMiddleware("api_key", detector=detect_api_key, strategy="block", apply_to_input=True),
|
||||
],
|
||||
)
|
||||
```
|
||||
|
||||
## 🎯 Running the Examples
|
||||
|
||||
### Quick Start: Non-LangChain Example
|
||||
|
||||
Try PII masking with just OpenAI and LangSmith:
|
||||
|
||||
1. **Run the notebook**
|
||||
```bash
|
||||
jupyter notebook non-langchain-example/remove_pii.ipynb
|
||||
```
|
||||
|
||||
### Running LangGraph/LangChain Examples
|
||||
|
||||
1. **Start LangGraph Studio**
|
||||
```bash
|
||||
langgraph dev --config langgraph/langgraph.json
|
||||
langgraph dev --config langgraph.json
|
||||
```
|
||||
|
||||
2. **Access the Studio**
|
||||
- Open your browser to `http://localhost:2024`
|
||||
- The agent will automatically mask PII in all inputs
|
||||
- Select either `langgraph_pii_masking` or `langchain_pii_masking`
|
||||
|
||||
3. **Test PII Masking**
|
||||
- Try inputs with PII like: "My email is john@example.com and phone is (555) 123-4567"
|
||||
- Observe how LangSmith automatically masks the PII before processing
|
||||
- **LangGraph example**: Try "My email is john@example.com and phone is (555) 123-4567"
|
||||
- **LangChain example**: Try "My email is john@example.com and credit card is 4532-1234-5678-9010"
|
||||
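- Observe how PII is handled automatically: the LangGraph example masks it in the LangSmith trace via the anonymizer, while the LangChain example redacts or masks it in the input through `PIIMiddleware`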
|
||||
|
||||

|
||||
|
||||
## 📊 Supported PII Types
|
||||
|
||||
The demo automatically masks:
|
||||
### Direct LangSmith Integration (No LangChain Required)
|
||||
The `non-langchain-example/remove_pii.ipynb` and `langgraph-example/agent.py` use LangSmith's anonymizer:
|
||||
- **📧 Email addresses**: `user@example.com` → `[EMAIL_REDACTED]`
|
||||
- **🌐 IP addresses**: `192.168.1.1` → `[IP_REDACTED]`
|
||||
- **📞 Phone numbers**: `(555) 123-4567` → `[PHONE_REDACTED]`
|
||||
@@ -111,11 +213,29 @@ The demo automatically masks:
|
||||
- **🆔 Social Security Numbers**: `123-45-6789` → `[SSN_REDACTED]`
|
||||
- **📅 Dates**: `12/25/2024` → `[DATE_REDACTED]`
|
||||
|
||||
### LangChain PIIMiddleware (Optional)
|
||||
The `langchain-example/agent.py` uses LangChain's middleware:
|
||||
- **📧 Email addresses**: Redacted (`[REDACTED_EMAIL]`)
|
||||
- **💳 Credit cards**: Masked (shows last 4 digits: `****-****-****-9010`)
|
||||
- **🔑 API keys**: Redacted (custom regex detector for `sk-...` keys using the `redact` strategy)
|
||||
|
||||
## 🎯 Which Approach Should I Use?
|
||||
|
||||
| Approach | When to Use | Dependencies |
|
||||
|----------|-------------|---------------|
|
||||
| **LangSmith Anonymizer** | ✅ **Recommended for most cases** - Works with any OpenAI client, no LangChain needed | `openai`, `langsmith` |
|
||||
| **Environment Variables** | Simple blanket hiding of all inputs/outputs | None (just env vars) |
|
||||
| **LangChain PIIMiddleware** | Only if you're already using LangChain agents and need middleware-level control | `langchain`, `langchain-openai` |
|
||||
| **LangGraph + Anonymizer** | When building LangGraph workflows | `langgraph`, `langsmith` |
|
||||
|
||||
> **💡 Recommendation**: Start with LangSmith's anonymizer (Method 2 above) - it works with standard OpenAI SDK and requires no LangChain dependencies!
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
For more detailed information on PII masking and observability, visit:
|
||||
- [LangSmith Documentation](https://docs.smith.langchain.com/observability/how_to_guides/mask_inputs_outputs)
|
||||
- [LangGraph Documentation](https://langchain-ai.github.io/langgraph/)
|
||||
- [LangChain PIIMiddleware Documentation](https://python.langchain.com/docs/how_to/pii_middleware)
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
from langchain.agents import create_agent
|
||||
from langchain.agents.middleware import PIIMiddleware
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langchain_core.tools import tool
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(dotenv_path="../.env", override=True)
|
||||
|
||||
# Define tools for the agent
|
||||
@tool
|
||||
def get_weather(location: str) -> str:
    """Get the weather for a given location."""
    # Demo stub: no weather service is queried. Every location receives the
    # same canned report, with the location name interpolated as given.
    report = f"The weather in {location} is sunny and 72°F."
    return report
|
||||
|
||||
@tool
|
||||
def calculate(expression: str) -> str:
    """Calculate a simple mathematical expression (basic operations only)."""
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably what the @tool decorator exposes as the tool description.
    #
    # How it works: the expression is parsed into an AST and only a small
    # whitelist of node types is evaluated (numeric literals, + - * /, and a
    # leading sign). Nothing is ever passed to eval()/exec().
    # This is a simplified evaluator - in production, use a proper math parser.
    #
    # Returns "The result is <value>" on success, or a fixed "Invalid
    # expression" message for anything that cannot be parsed or evaluated.
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
    }
    unary_ops = {
        ast.UAdd: operator.pos,
        ast.USub: operator.neg,
    }

    def safe_eval(node):
        """Recursively evaluate a whitelisted AST node, raising ValueError otherwise."""
        if isinstance(node, ast.Constant):
            value = node.value
            # Only plain numbers are arithmetic operands. Strings/bytes would
            # otherwise allow things like 'a' * 10**9 (huge allocation), and
            # bool is excluded explicitly because it subclasses int.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError("Unsupported constant")
            return value
        if isinstance(node, ast.BinOp):
            op = binary_ops.get(type(node.op))
            if op is None:
                raise ValueError("Unsupported operation")
            return op(safe_eval(node.left), safe_eval(node.right))
        if isinstance(node, ast.UnaryOp):
            # Allows negative numbers such as "-5 + 2".
            op = unary_ops.get(type(node.op))
            if op is None:
                raise ValueError("Unsupported operation")
            return op(safe_eval(node.operand))
        raise ValueError("Unsupported expression")

    try:
        tree = ast.parse(expression, mode="eval")
        result = safe_eval(tree.body)
    except (
        SyntaxError,      # not parseable as a single expression
        ValueError,       # rejected by the whitelist above
        TypeError,        # expression is not text
        ArithmeticError,  # division by zero, overflow
        RecursionError,   # pathologically nested input
        MemoryError,      # parser gave up on oversized/nested input
    ):
        # Deliberately not a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        return "Invalid expression. Please use simple arithmetic (e.g., '2 + 2', '10 * 5')"
    return f"The result is {result}"
|
||||
|
||||
# PII middleware stack for the demo agent. Each entry targets one PII type and
# is applied to input (apply_to_input=True):
#   - "email":       built-in type, "redact" strategy
#   - "credit_card": built-in type, "mask" strategy (shows last 4 digits)
#   - "api_key":     custom type detected by a regex pattern, "redact" strategy
_pii_middleware = [
    PIIMiddleware("email", strategy="redact", apply_to_input=True),
    PIIMiddleware("credit_card", strategy="mask", apply_to_input=True),
    PIIMiddleware(
        "api_key",
        detector=r"sk-[a-zA-Z0-9]{32,}",
        strategy="redact",
        apply_to_input=True,
    ),
]

# Build the agent: chat model, the two demo tools, and the middleware above.
agent = create_agent(
    model=ChatOpenAI(model="gpt-4o-mini", temperature=0),
    tools=[get_weather, calculate],
    middleware=_pii_middleware,
)
|
||||
|
||||
# For compatibility with async context manager pattern
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
@asynccontextmanager
async def compile_agent():
    """Expose the module-level ``agent`` through an async context manager.

    No setup or teardown happens here: the already-built agent is yielded
    as-is, so callers can obtain it with ``async with compile_agent() as a``.
    """
    yield agent
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
from dotenv import load_dotenv
|
||||
from contextlib import asynccontextmanager
|
||||
from langchain_core.messages import AnyMessage, HumanMessage, SystemMessage
|
||||
from langchain_core.messages import AnyMessage, HumanMessage
|
||||
from langgraph.graph import END, START, StateGraph
|
||||
from langgraph.graph.message import add_messages
|
||||
from typing_extensions import Annotated, List, TypedDict
|
||||
from typing_extensions import Annotated, TypedDict
|
||||
from langsmith import Client
|
||||
from langsmith.wrappers import wrap_openai
|
||||
import langsmith as ls
|
||||
from langsmith.anonymizer import create_anonymizer
|
||||
import openai
|
||||
|
||||
load_dotenv(dotenv_path="./langgraph/.env", override=True)
|
||||
load_dotenv(dotenv_path="../.env", override=True)
|
||||
|
||||
# create an anonymizer that masks various PII patterns
|
||||
anonymizer = create_anonymizer([
|
||||
@@ -68,6 +68,6 @@ builder.add_edge("llm_node", END)
|
||||
agent = builder.compile()
|
||||
|
||||
@asynccontextmanager
|
||||
async def create_agent():
|
||||
async def compile_agent():
|
||||
with ls.tracing_context(client=langsmith_client):
|
||||
yield agent
|
||||
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"graphs": {
|
||||
"langgraph_pii_masking": "./langgraph-example/agent.py:compile_agent",
|
||||
"langchain_pii_masking": "./langchain-example/agent.py:compile_agent"
|
||||
},
|
||||
"env": ".env",
|
||||
"python_version": "3.11",
|
||||
"dependencies": ["requirements.txt"],
|
||||
"image_distro": "wolfi"
|
||||
}
|
||||
@@ -1,9 +0,0 @@
|
||||
{
|
||||
"graphs": {
|
||||
"pii_masking": "./langgraph/agent.py:create_agent"
|
||||
},
|
||||
"env": ".env",
|
||||
"python_version": "3.11",
|
||||
"dependencies": ["./langgraph/requirements.txt"],
|
||||
"image_distro": "wolfi"
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
langgraph==0.6.1
|
||||
langgraph-cli[inmem]==0.3.6
|
||||
python-dotenv==1.1.1
|
||||
langchain-openai==0.3.28
|
||||
+10
-9
@@ -1,14 +1,15 @@
|
||||
langgraph
|
||||
langgraph-sdk
|
||||
langgraph-checkpoint-sqlite
|
||||
langsmith>=0.2.0
|
||||
langchain-community
|
||||
langchain-core
|
||||
langchain-openai
|
||||
langgraph-sdk>=0.2.9
|
||||
langgraph-checkpoint-sqlite>=3.0.0
|
||||
langchain-community>=0.4.1
|
||||
notebook
|
||||
python-dotenv
|
||||
lxml
|
||||
scikit-learn
|
||||
pandas
|
||||
pyarrow
|
||||
openai
|
||||
|
||||
langgraph>=1.0.3
|
||||
langchain>=1.0.8
|
||||
langgraph-cli[inmem]
|
||||
python-dotenv
|
||||
langchain-openai>=1.0.0
|
||||
langsmith>=0.4.0
|
||||
Reference in New Issue
Block a user