From 5b31b793317261d2a577b306c8160743e0a0a5d2 Mon Sep 17 00:00:00 2001 From: Stephen Chu Date: Tue, 23 Dec 2025 18:42:56 -0500 Subject: [PATCH] Improve PII scrubbing: Use recursive field name matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Changed scrub_trace.sh to match field names recursively at any depth - Works with arrays and nested objects (e.g., "content" finds all content fields) - Simpler interface: just field names instead of dotted paths - Safer for PII: catches sensitive data in unexpected locations - Updated README with new usage examples and field list 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- README.md | 25 +++++++++++++------------ scrub_trace.sh | 40 ++++++++++++++++++---------------------- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 71ed75a..8f88817 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,8 @@ Customers extract their trace and scrub sensitive data before sending: export LANGSMITH_API_KEY='your-api-key' ./extract_trace.sh 00000000-0000-0000-f319-b36446ca3f23 -# 2. Scrub PII -./scrub_trace.sh trace_00000000-0000-0000-f319-b36446ca3f23.json "inputs.messages,inputs.email" +# 2. Scrub PII (recursively redacts field names) +./scrub_trace.sh trace_00000000-0000-0000-f319-b36446ca3f23.json "content,email" # 3. Review scrubbed file manually @@ -50,7 +50,7 @@ Extract a trace by ID. ### `scrub_trace.sh` -Redact PII fields from trace. +Redact PII fields from trace using recursive field name matching. ```bash ./scrub_trace.sh ",,..." @@ -59,15 +59,16 @@ Redact PII fields from trace. **Output:** `.scrubbed.json` **Common fields to redact:** -- `inputs.messages` - User messages -- `inputs.email` - Email addresses -- `inputs.query` - Search queries -- `outputs.text` - Generated text -- `extra.metadata.session_id` - Session IDs -- `extra.metadata.user_id` - User IDs -- `extra.metadata.api_key` - API keys +- `content` - Message content (finds all content fields) +- `email` - Email addresses +- `messages` - Entire message arrays +- `query` - Search queries +- `text` - Generated text +- `session_id` - Session IDs +- `user_id` - User IDs +- `api_key` - API keys -**Handles nested fields:** Use dot notation (e.g., `extra.metadata.api_key`) +**Recursive matching:** Field names are matched at any depth in the JSON structure, including inside arrays and nested objects. For example, specifying `content` will redact all fields named `content` anywhere in the trace. ### `upload_trace.sh` @@ -88,7 +89,7 @@ export LANGSMITH_API_KEY='lsv2_pt_...' # Scrub ./scrub_trace.sh trace_a1b2c3d4-5678-90ab-cdef-1234567890ab.json \ - "inputs.messages,inputs.email,extra.metadata.session_id" + "content,email,session_id" # Review and send trace_a1b2c3d4-5678-90ab-cdef-1234567890ab.scrubbed.json to support ``` diff --git a/scrub_trace.sh b/scrub_trace.sh index 6d3e21d..86a1d76 100755 --- a/scrub_trace.sh +++ b/scrub_trace.sh @@ -13,24 +13,27 @@ USAGE: ARGUMENTS: trace_file - JSON file with extracted trace - fields - Comma-separated field paths to redact + fields - Comma-separated field names to redact (recursively) OUTPUT: Creates .scrubbed.json EXAMPLES: - # Redact messages and email - $0 trace.json "inputs.messages,inputs.email" + # Redact all 'content' and 'email' fields anywhere in the trace + $0 trace.json "content,email" - # Redact nested metadata - $0 trace.json "extra.metadata.api_key,outputs.user_data" + # Redact nested metadata fields + $0 trace.json "api_key,session_id,user_id" COMMON FIELDS: - inputs.messages - Chat messages - inputs.email - Email addresses - outputs.text - Output text - extra.metadata.session_id - Session IDs - extra.metadata.user_id - User IDs + content - Message content (finds all content fields) + email - Email addresses + messages - Entire messages arrays + session_id - Session IDs + user_id - User IDs + api_key - API keys + +NOTE: Fields are matched recursively at any depth, including inside arrays. EOF } @@ -67,28 +70,21 @@ echo "Output: $OUTPUT_FILE" echo "Fields: $FIELDS" echo "" -# Build jq filter for nested redaction +# Build jq filter for recursive redaction IFS=',' read -ra FIELD_LIST <<< "$FIELDS" -JQ_FILTER='walk(if type == "object" then (' +JQ_FILTER='walk(if type == "object" then' for field in "${FIELD_LIST[@]}"; do # Trim whitespace field="${field#"${field%%[![:space:]]*}"}" # trim leading field="${field%"${field##*[![:space:]]}"}" # trim trailing - # Build path array for getpath/setpath - IFS='.' read -ra PARTS <<< "$field" - JQ_PATH="[" - for part in "${PARTS[@]}"; do - JQ_PATH="$JQ_PATH\"$part\"," - done - JQ_PATH="${JQ_PATH%,}]" - - JQ_FILTER="$JQ_FILTER if getpath($JQ_PATH) then setpath($JQ_PATH; \"[REDACTED]\") else . end |" + # Add recursive field check + JQ_FILTER="$JQ_FILTER if has(\"$field\") then .\"$field\" = \"[REDACTED]\" else . end |" done # Remove trailing pipe and close -JQ_FILTER="${JQ_FILTER% |}) else . end)" +JQ_FILTER="${JQ_FILTER% |} else . end)" # Apply redactions if ! jq "$JQ_FILTER" "$TRACE_FILE" > "$OUTPUT_FILE"; then