feat: Add Module 1 notebooks and shared infrastructure for LangSmith self-hosted workshops

This commit introduces the foundational infrastructure for running LangSmith self-hosted deployment workshops using Jupyter notebooks. - Add `notebooks/shared/_bootstrap.py`: Centralized bootstrap logic that: - Loads environment variables from `.env` or `workshop.env` files - Validates required tools (aws, terraform, helm, kubectl, jq) - Prints AWS identity and region information - Creates artifacts directory for notebook outputs - Automatically installs required Python packages (python-dotenv, pyyaml, requests) - Add `notebooks/shared/_shell.py`: Shell command execution utilities with: - Homebrew path resolution for macOS (fixes PATH issues for subprocess calls) - AWS_PROFILE handling - Streaming and non-streaming command execution - Add `notebooks/shared/_validation.py`: Validation helpers for environment variables and configuration - Add `notebooks/shared/_aws_helpers.py`: AWS-specific helper functions - Add `notebooks/shared/_k8s_helpers.py`: Kubernetes helper functions Create complete set of Module 1 notebooks following the workshop curriculum: - `01_aws_preflight.ipynb`: Pre-deployment environment validation - Tool validation - AWS credentials and region checks - Cluster capacity expectations - Storage prerequisites (EBS CSI, StorageClasses) - S3 blob storage verification - Terraform and Helm repository path validation - `02_terraform_apply.ipynb`: Infrastructure provisioning - Terraform module discovery and validation - Version pinning verification - Remote state configuration - Terraform initialization - Plan creation with environment variable support - Infrastructure application (commented by default) - Output capture for Helm deployment - `03_helm_install_langsmith.ipynb`: LangSmith installation - Helm chart discovery and validation - Chart version pinning - Terraform outputs loading - Values file management - Kubernetes secrets creation - Template rendering before install - Helm installation (commented by default) - `04_validate_ingress_and_ui.ipynb`: 
Deployment validation - Pod readiness checks - PVC binding verification - Ingress provisioning - Endpoint reachability - UI availability - Diagnostic artifact collection - `99_teardown.ipynb`: Cleanup procedures - Helm uninstall - Kubernetes resource cleanup - Terraform destroy - Verification steps - Add `.gitignore`: Comprehensive ignore patterns for Python, Jupyter, environment files, artifacts, and infrastructure tool outputs - Add `env-samples/workshop.env.example`: Template environment file with: - Workshop configuration variables - AWS settings - Terraform and Helm repository paths - PostgreSQL credentials (POSTGRES_USERNAME, POSTGRES_PASSWORD) - Helm configuration - Add additional example env files for AWS, OIDC, and Module 3 - Environment variable expansion: Supports `$VAR` and `${VAR}` syntax in paths (e.g., `$TERRAFORM_REPO_DIR/aws/langsmith`) - Robust path resolution: Handles different Jupyter working directories and automatically finds the notebooks/shared directory - Error handling: Clear error messages with actionable instructions when required tools, directories, or environment variables are missing - Terraform variable passing: Automatically reads POSTGRES_USERNAME and POSTGRES_PASSWORD from environment and passes them to Terraform commands - Clone instructions: Helpful guidance when Terraform or Helm repositories are not found - Artifact management: Centralized artifacts directory for saving outputs, plans, and diagnostic information All notebooks follow best practices: - Use official repositories (no forking) - Pin versions for reproducibility - Plan before applying - Render templates before installing - Validate before proceeding This establishes a solid foundation for the workshop series, ensuring participants start from a supported baseline configuration.
2026-07-01 20:44:14 -04:00 · 2025-12-26 14:31:37 -08:00
commit 3a190f1c19
17 changed files with 3549 additions and 0 deletions
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+*.ipynb_checkpoints/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+Pipfile.lock
+
+# PEP 582
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+*.env
+!*.env.example
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# IDEs and editors
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+*.sublime-project
+*.sublime-workspace
+
+# Project-specific
+artifacts/
+*.log
+*.tmp
+
+# Terraform
+.terraform/
+*.tfstate
+*.tfstate.*
+.terraform.lock.hcl
+terraform.tfvars
+!terraform.tfvars.example
+
+# Kubernetes
+kubeconfig
+*.kubeconfig
+
+# AWS
+.aws/
+
+# Helm
+*.tgz
+
@@ -0,0 +1,21 @@
+# ===== AWS / DNS / TLS =====
+# Copy to env/aws.env and load it with the variables exported:  set -a; source env/aws.env; set +a
+
+# Optional explicit AWS account guardrail
+AWS_ACCOUNT_ID=""
+
+# DNS name for LangSmith (optional in early modules)
+DOMAIN="langsmith.example.com"
+
+# Route53 hosted zone (optional; if unset, notebooks attempt to infer from DOMAIN)
+ROUTE53_HOSTED_ZONE_ID=""
+
+# ACM certificate (optional; if unset, notebooks can check for a matching cert)
+ACM_CERT_ARN=""
+
+# Network flags
+PRIVATE_CLUSTER="false"   # if true, validation should avoid public endpoints
+
+# Cost-control / safety
+AUTO_TEARDOWN="false"
+TEARDOWN_CONFIRMATION_PHRASE="DELETE"  # must match in teardown notebook
@@ -0,0 +1,12 @@
+# ===== Module 3 (optional load/capacity) =====
+
+# If you generate synthetic traces / load
+LOAD_TEST_ENABLED="false"
+LOAD_TEST_RPS="5"
+LOAD_TEST_DURATION_SECONDS="120"
+
+# Namespace/service identifiers for metrics lookups
+LANGSMITH_SERVICE_NAME="langsmith"
+CLICKHOUSE_SERVICE_NAME="clickhouse"
+REDIS_SERVICE_NAME="redis"
+POSTGRES_ENDPOINT=""
@@ -0,0 +1,37 @@
+# ===== Workshop / Notebook Defaults =====
+# Copy to env/workshop.env and load it with the variables exported:  set -a; source env/workshop.env; set +a
+
+# General
+WORKSHOP_NAME="langsmith-self-hosted-operator"
+NAMESPACE="langsmith"
+
+# Prefer AWS_PROFILE if you use named profiles. Otherwise rely on default creds.
+AWS_PROFILE=""
+
+# Region (must match where you deploy infra)
+AWS_REGION="us-west-2"
+
+# Naming (used by notebooks for display + validation)
+CLUSTER_NAME="langsmith-workshop"
+
+# Local repo paths (absolute is safest)
+TERRAFORM_REPO_DIR="$HOME/src/langchain-ai/terraform"
+HELM_REPO_DIR="$HOME/src/langchain-ai/helm"
+
+# Where in the terraform repo the AWS self-hosted module lives (adjust as needed)
+TERRAFORM_DIR="$TERRAFORM_REPO_DIR/aws/langsmith"   # <-- update to real path you standardize on
+
+# Helm release + chart reference (chart reference can be local path or OCI/ref)
+HELM_RELEASE="langsmith"
+HELM_NAMESPACE="$NAMESPACE"
+
+# Use a local chart path by default (stable for workshop)
+HELM_CHART_REF="$HELM_REPO_DIR/charts/langsmith"
+
+# Values file for Helm install (checked into your workshop repo)
+VALUES_FILE="./helm/langsmith-values/values.aws-demo.yaml"
+
+# Output/artifacts
+ARTIFACTS_DIR="./artifacts"
+LOG_LEVEL="info"   # info|debug
+DRY_RUN="true"     # true by default; notebooks should flip this explicitly when applying
@@ -0,0 +1,44 @@
+# ===== Workshop / Notebook Defaults =====
+# Copy to env/workshop.env and load it with the variables exported:  set -a; source env/workshop.env; set +a
+
+# General
+WORKSHOP_NAME="langsmith-self-hosted-operator"
+NAMESPACE="langsmith"
+
+# Prefer AWS_PROFILE if you use named profiles. Otherwise rely on default creds.
+AWS_PROFILE=""
+
+# Region (must match where you deploy infra)
+AWS_REGION="us-east-1"
+
+# AWS account ID
+AWS_ACCOUNT_ID=""
+
+# Naming (used by notebooks for display + validation)
+CLUSTER_NAME="langsmith-workshop"
+
+# Local repo paths (absolute is safest)
+TERRAFORM_REPO_DIR="$HOME/src/langchain-ai/terraform"
+HELM_REPO_DIR="$HOME/src/langchain-ai/helm"
+
+# Where in the terraform repo the AWS self-hosted module lives (adjust as needed)
+TERRAFORM_DIR="$TERRAFORM_REPO_DIR/aws/langsmith"   # <-- update to real path you standardize on
+
+# Helm release + chart reference (chart reference can be local path or OCI/ref)
+HELM_RELEASE="langsmith"
+HELM_NAMESPACE="$NAMESPACE"
+
+# Use a local chart path by default (stable for workshop)
+HELM_CHART_REF="$HELM_REPO_DIR/charts/langsmith"
+
+# Values file for Helm install (checked into your workshop repo)
+VALUES_FILE="./helm/langsmith-values/values.aws-demo.yaml"
+
+# Terraform variables (for RDS PostgreSQL)
+POSTGRES_USERNAME="langsmith"
+POSTGRES_PASSWORD=""  # <-- Set a strong password here
+
+# Output/artifacts
+ARTIFACTS_DIR="./artifacts"
+LOG_LEVEL="info"   # info|debug
+DRY_RUN="true"     # true by default; notebooks should flip this explicitly when applying
@@ -0,0 +1,491 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Module 1: AWS Preflight Checks\n",
+    "\n",
+    "## Overview\n",
+    "\n",
+    "This notebook validates your environment before deploying LangSmith. Most self-hosted failures occur **before** users ever touch the product due to:\n",
+    "\n",
+    "- Mis-sized clusters\n",
+    "- Unsupported ingress setups\n",
+    "- In-cluster databases used past their limits\n",
+    "- Missing storage primitives (blob, PVs)\n",
+    "\n",
+    "This preflight ensures you start from a **supported baseline**.\n",
+    "\n",
+    "## What We'll Check\n",
+    "\n",
+    "1. ✅ Tooling validation (aws, terraform, kubectl, helm, jq)\n",
+    "2. ✅ AWS credentials & region sanity check\n",
+    "3. ✅ Cluster capacity expectations\n",
+    "4. ✅ Storage prerequisites (EBS CSI, StorageClasses)\n",
+    "5. ✅ Blob storage requirement (S3)\n",
+    "\n",
+    "**Estimated time:** 20-30 minutes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bootstrap environment\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Add notebooks directory to path so we can import shared as a package\n",
+    "# Find the notebooks directory by looking for the shared folder\n",
+    "possible_paths = [\n",
+    "    Path.cwd().parent,  # If cwd is module-1, go up one level to notebooks\n",
+    "    Path.cwd(),  # If cwd is already notebooks\n",
+    "    Path.cwd() / \"notebooks\",  # If cwd is workspace root\n",
+    "]\n",
+    "\n",
+    "notebooks_path = None\n",
+    "for path in possible_paths:\n",
+    "    if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        notebooks_path = path\n",
+    "        break\n",
+    "\n",
+    "if not notebooks_path:\n",
+    "    # Fallback: try workspace root\n",
+    "    notebooks_path = Path.cwd() / \"notebooks\"\n",
+    "    if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
+    "\n",
+    "# Add notebooks directory to path so 'shared' can be imported as a package\n",
+    "if str(notebooks_path) not in sys.path:\n",
+    "    sys.path.insert(0, str(notebooks_path))\n",
+    "\n",
+    "from shared._bootstrap import bootstrap\n",
+    "\n",
+    "# Run bootstrap: loads env, checks tools, validates AWS, creates artifacts dir\n",
+    "bootstrap_info = bootstrap()\n",
+    "print(f\"\\nBootstrap complete! Artifacts directory: {bootstrap_info['artifacts_dir']}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## AWS Account & Region Validation\n",
+    "\n",
+    "Verify you're using the correct AWS account and region. This is critical for avoiding accidental deployments to production or wrong regions.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from shared._aws_helpers import aws_region, sts_identity, assert_account\n",
+    "from shared._validation import require_env, print_config, ok, warn\n",
+    "\n",
+    "# Get AWS configuration\n",
+    "region = aws_region()\n",
+    "identity = sts_identity()\n",
+    "\n",
+    "print(\"### Current AWS Session\")\n",
+    "print(f\"Region: {region}\")\n",
+    "print(f\"Account ID: {identity['Account']}\")\n",
+    "print(f\"User ARN: {identity['Arn']}\")\n",
+    "\n",
+    "# Optional: Validate against expected account (set AWS_ACCOUNT_ID in .env if needed)\n",
+    "expected_account = os.environ.get(\"AWS_ACCOUNT_ID\", \"\").strip()\n",
+    "if expected_account:\n",
+    "    assert_account(expected_account)\n",
+    "else:\n",
+    "    warn(\"AWS_ACCOUNT_ID not set in environment - skipping account validation\")\n",
+    "    print(\"💡 Tip: Set AWS_ACCOUNT_ID in your .env file to add a guardrail against wrong account deployments\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Required Environment Variables\n",
+    "\n",
+    "Verify that all required configuration is present. These values will be used throughout the deployment.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check required environment variables\n",
+    "required_vars = [\n",
+    "    \"WORKSHOP_NAME\",\n",
+    "    \"NAMESPACE\",\n",
+    "    \"AWS_REGION\",\n",
+    "    \"CLUSTER_NAME\",\n",
+    "    \"TERRAFORM_DIR\",\n",
+    "    \"HELM_RELEASE\",\n",
+    "    \"HELM_NAMESPACE\",\n",
+    "    \"HELM_CHART_REF\",\n",
+    "]\n",
+    "\n",
+    "config = require_env(*required_vars)\n",
+    "\n",
+    "# Optional but recommended\n",
+    "optional_vars = {\n",
+    "    \"AWS_PROFILE\": os.environ.get(\"AWS_PROFILE\", \"\"),\n",
+    "    \"AWS_ACCOUNT_ID\": os.environ.get(\"AWS_ACCOUNT_ID\", \"\"),\n",
+    "    \"VALUES_FILE\": os.environ.get(\"VALUES_FILE\", \"\"),\n",
+    "}\n",
+    "\n",
+    "print(\"\\n### Configuration Summary\")\n",
+    "print_config(config, redact_keys={\"AWS_PROFILE\"})\n",
+    "print(\"\\n### Optional Configuration\")\n",
+    "for k, v in optional_vars.items():\n",
+    "    if v:\n",
+    "        print(f\"- {k}: {v}\")\n",
+    "    else:\n",
+    "        print(f\"- {k}: (not set)\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cluster Capacity Expectations\n",
+    "\n",
+    "LangSmith requires adequate cluster resources. Before deploying, understand what you'll need:\n",
+    "\n",
+    "- **Minimum:** 3 nodes, 4 vCPU, 16GB RAM each (for development/testing)\n",
+    "- **Recommended:** 3 nodes, 8 vCPU, 32GB RAM each (for production workloads)\n",
+    "- **Storage:** EBS CSI driver required for ClickHouse PVCs\n",
+    "\n",
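+    "Let's check whether the cluster already exists and, if it does, report its status, Kubernetes version, and node group sizing so you can compare them against the expectations above.\n"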
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from shared._aws_helpers import eks_cluster_exists\n",
+    "from shared._shell import run\n",
+    "\n",
+    "cluster_name = os.environ[\"CLUSTER_NAME\"]\n",
+    "region = aws_region()\n",
+    "\n",
+    "print(f\"### Checking EKS Cluster: {cluster_name}\")\n",
+    "print(f\"Region: {region}\\n\")\n",
+    "\n",
+    "if eks_cluster_exists(cluster_name):\n",
+    "    ok(f\"Cluster '{cluster_name}' exists\")\n",
+    "    \n",
+    "    # Get cluster details\n",
+    "    result = run(\n",
+    "        [\"aws\", \"eks\", \"describe-cluster\", \"--name\", cluster_name, \"--region\", region, \"--output\", \"json\"],\n",
+    "        check=True,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    cluster_info = json.loads(result.stdout)[\"cluster\"]\n",
+    "    \n",
+    "    print(f\"\\nCluster Status: {cluster_info['status']}\")\n",
+    "    print(f\"Kubernetes Version: {cluster_info['version']}\")\n",
+    "    print(f\"Platform Version: {cluster_info.get('platformVersion', 'N/A')}\")\n",
+    "    \n",
+    "    # Check node groups\n",
+    "    print(\"\\n### Node Groups\")\n",
+    "    ng_result = run(\n",
+    "        [\"aws\", \"eks\", \"list-nodegroups\", \"--cluster-name\", cluster_name, \"--region\", region, \"--output\", \"json\"],\n",
+    "        check=True,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    nodegroups = json.loads(ng_result.stdout).get(\"nodegroups\", [])\n",
+    "    \n",
+    "    if nodegroups:\n",
+    "        for ng in nodegroups:\n",
+    "            ng_detail = run(\n",
+    "                [\"aws\", \"eks\", \"describe-nodegroup\", \"--cluster-name\", cluster_name, \n",
+    "                 \"--nodegroup-name\", ng, \"--region\", region, \"--output\", \"json\"],\n",
+    "                check=True,\n",
+    "                stream=False\n",
+    "            )\n",
+    "            ng_info = json.loads(ng_detail.stdout)[\"nodegroup\"]\n",
+    "            scaling = ng_info.get(\"scalingConfig\", {})\n",
+    "            print(f\"\\n  Node Group: {ng}\")\n",
+    "            print(f\"    Status: {ng_info['status']}\")\n",
+    "            print(f\"    Desired: {scaling.get('desiredSize', 'N/A')}\")\n",
+    "            print(f\"    Min: {scaling.get('minSize', 'N/A')}\")\n",
+    "            print(f\"    Max: {scaling.get('maxSize', 'N/A')}\")\n",
+    "            print(f\"    Instance Types: {', '.join(ng_info.get('instanceTypes', []))}\")\n",
+    "    else:\n",
+    "        warn(\"No node groups found\")\n",
+    "        print(\"💡 You'll need to create node groups when deploying with Terraform\")\n",
+    "else:\n",
+    "    warn(f\"Cluster '{cluster_name}' does not exist yet\")\n",
+    "    print(\"💡 This is expected if you haven't run Terraform yet. Proceed to notebook 02_terraform_apply.ipynb\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Storage Prerequisites\n",
+    "\n",
+    "LangSmith requires persistent storage for ClickHouse. The EBS CSI driver must be installed and StorageClasses must be configured.\n",
+    "\n",
+    "**Why this matters:** Without EBS CSI, ClickHouse PVCs will remain in `Pending` state forever.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configure kubectl for the EKS cluster, then check the storage prerequisites:\n",
+    "# the EBS CSI driver and at least one EBS-backed StorageClass.\n",
+    "# Uses os, run, aws_region, ok and warn imported by earlier cells.\n",
+    "import json\n",
+    "\n",
+    "cluster_name = os.environ[\"CLUSTER_NAME\"]\n",
+    "region = aws_region()\n",
+    "\n",
+    "print(\"### Configuring kubectl for EKS cluster\")\n",
+    "try:\n",
+    "    # Update kubeconfig\n",
+    "    run(\n",
+    "        [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
+    "        check=True,\n",
+    "        stream=True\n",
+    "    )\n",
+    "    ok(\"kubectl configured for cluster\")\n",
+    "    \n",
+    "    # Check EBS CSI driver\n",
+    "    # Only the driver's node plugin runs as a DaemonSet; ebs-csi-controller is a\n",
+    "    # Deployment, so a DaemonSet query for app=ebs-csi-controller never matches.\n",
+    "    # Select on the driver-wide app.kubernetes.io/name label instead.\n",
+    "    print(\"\\n### Checking EBS CSI Driver\")\n",
+    "    result = run(\n",
+    "        [\"kubectl\", \"get\", \"daemonset\", \"-n\", \"kube-system\",\n",
+    "         \"-l\", \"app.kubernetes.io/name=aws-ebs-csi-driver\", \"-o\", \"json\"],\n",
+    "        check=False,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    \n",
+    "    if result.returncode == 0 and result.stdout.strip():\n",
+    "        ds_info = json.loads(result.stdout)\n",
+    "        if ds_info.get(\"items\"):\n",
+    "            ok(\"EBS CSI driver is installed\")\n",
+    "            print(f\"  DaemonSet: {ds_info['items'][0]['metadata']['name']}\")\n",
+    "        else:\n",
+    "            warn(\"EBS CSI driver not found\")\n",
+    "            print(\"💡 EBS CSI driver must be installed before deploying LangSmith\")\n",
+    "            print(\"   The Terraform module should handle this, but verify after deployment\")\n",
+    "    else:\n",
+    "        warn(\"EBS CSI driver not found\")\n",
+    "        print(\"💡 EBS CSI driver must be installed before deploying LangSmith\")\n",
+    "    \n",
+    "    # Check StorageClasses\n",
+    "    print(\"\\n### Checking StorageClasses\")\n",
+    "    result = run(\n",
+    "        [\"kubectl\", \"get\", \"storageclass\", \"-o\", \"json\"],\n",
+    "        check=True,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    sc_list = json.loads(result.stdout)\n",
+    "    \n",
+    "    # A StorageClass counts as EBS-backed if its name mentions ebs or its\n",
+    "    # provisioner is the EBS CSI provisioner.\n",
+    "    ebs_scs = [sc for sc in sc_list.get(\"items\", []) if \"ebs\" in sc[\"metadata\"][\"name\"].lower() or \n",
+    "               sc.get(\"provisioner\", \"\").endswith(\"ebs.csi.aws.com\")]\n",
+    "    \n",
+    "    if ebs_scs:\n",
+    "        ok(f\"Found {len(ebs_scs)} EBS StorageClass(es):\")\n",
+    "        for sc in ebs_scs:\n",
+    "            name = sc[\"metadata\"][\"name\"]\n",
+    "            default = sc.get(\"metadata\", {}).get(\"annotations\", {}).get(\"storageclass.kubernetes.io/is-default-class\", \"false\")\n",
+    "            print(f\"  - {name} (default: {default})\")\n",
+    "    else:\n",
+    "        warn(\"No EBS StorageClasses found\")\n",
+    "        print(\"💡 At least one EBS StorageClass is required for ClickHouse PVCs\")\n",
+    "        \n",
+    "except Exception as e:\n",
+    "    # Best-effort check: any failure (including a cluster that does not exist yet)\n",
+    "    # is reported as a warning rather than stopping the notebook.\n",
+    "    warn(f\"Could not check storage prerequisites: {e}\")\n",
+    "    print(\"💡 This is expected if the cluster doesn't exist yet\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Blob Storage Requirement (S3)\n",
+    "\n",
+    "**Critical:** LangSmith requires S3 for blob storage in production. Without blob storage, trace payloads are stored inline in ClickHouse and can quickly overwhelm it.\n",
+    "\n",
+    "Let's verify S3 access and check if a bucket exists or needs to be created.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from shared._shell import run\n",
+    "import json\n",
+    "\n",
+    "region = aws_region()\n",
+    "\n",
+    "print(\"### S3 Access Check\")\n",
+    "print(f\"Region: {region}\\n\")\n",
+    "\n",
+    "# Test S3 access\n",
+    "try:\n",
+    "    result = run(\n",
+    "        [\"aws\", \"s3\", \"ls\", \"--region\", region],\n",
+    "        check=True,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    ok(\"S3 access verified\")\n",
+    "    \n",
+    "    # List buckets\n",
+    "    buckets_result = run(\n",
+    "        [\"aws\", \"s3api\", \"list-buckets\", \"--output\", \"json\"],\n",
+    "        check=True,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    buckets = json.loads(buckets_result.stdout).get(\"Buckets\", [])\n",
+    "    \n",
+    "    print(f\"\\nFound {len(buckets)} S3 bucket(s):\")\n",
+    "    for bucket in buckets[:10]:  # Show first 10\n",
+    "        print(f\"  - {bucket['Name']} (created: {bucket['CreationDate']})\")\n",
+    "    \n",
+    "    if len(buckets) > 10:\n",
+    "        print(f\"  ... and {len(buckets) - 10} more\")\n",
+    "    \n",
+    "    print(\"\\n💡 Note: The Terraform module should create an S3 bucket for LangSmith blob storage\")\n",
+    "    print(\"   Verify the bucket exists after Terraform deployment\")\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    warn(f\"S3 access check failed: {e}\")\n",
+    "    print(\"💡 Ensure your AWS credentials have S3 permissions\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Terraform & Helm Repository Paths\n",
+    "\n",
+    "Verify that the Terraform and Helm repository paths are correctly configured and accessible.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "from pathlib import Path\n",
+    "from shared._validation import ok, warn\n",
+    "\n",
+    "def expand_env_vars(path_str: str) -> str:\n",
+    "    \"\"\"Expand environment variable references in a path string.\"\"\"\n",
+    "    # Expand $VAR and ${VAR} references\n",
+    "    def replace_var(match):\n",
+    "        var_name = match.group(1) or match.group(2)\n",
+    "        return os.environ.get(var_name, match.group(0))\n",
+    "    \n",
+    "    # Replace $VAR and ${VAR} patterns\n",
+    "    path_str = re.sub(r'\\$\\{([^}]+)\\}|\\$([a-zA-Z_][a-zA-Z0-9_]*)', replace_var, path_str)\n",
+    "    return path_str\n",
+    "\n",
+    "# Expand environment variables in paths (e.g., $TERRAFORM_REPO_DIR, $HELM_REPO_DIR, $HOME)\n",
+    "terraform_dir_str = expand_env_vars(os.environ[\"TERRAFORM_DIR\"])\n",
+    "terraform_dir = Path(terraform_dir_str).expanduser().resolve()\n",
+    "\n",
+    "helm_chart_ref_str = expand_env_vars(os.environ[\"HELM_CHART_REF\"])\n",
+    "helm_chart_ref = Path(helm_chart_ref_str).expanduser().resolve()\n",
+    "\n",
+    "print(\"### Repository Paths Check\\n\")\n",
+    "\n",
+    "# Check Terraform directory\n",
+    "print(f\"Terraform Directory: {terraform_dir}\")\n",
+    "if terraform_dir.exists():\n",
+    "    ok(f\"Terraform directory exists\")\n",
+    "    \n",
+    "    # Check for main.tf or similar\n",
+    "    tf_files = list(terraform_dir.glob(\"*.tf\"))\n",
+    "    if tf_files:\n",
+    "        print(f\"  Found {len(tf_files)} Terraform file(s)\")\n",
+    "    else:\n",
+    "        warn(\"No .tf files found in Terraform directory\")\n",
+    "        print(\"💡 Ensure you're pointing to the correct Terraform module path\")\n",
+    "else:\n",
+    "    warn(f\"Terraform directory does not exist: {terraform_dir}\")\n",
+    "    print(\"💡 Update TERRAFORM_DIR in your .env file to point to the langchain-ai/terraform repo\")\n",
+    "\n",
+    "# Check Helm chart\n",
+    "print(f\"\\nHelm Chart Reference: {helm_chart_ref}\")\n",
+    "if helm_chart_ref.exists():\n",
+    "    ok(f\"Helm chart path exists\")\n",
+    "    \n",
+    "    # Check for Chart.yaml\n",
+    "    chart_yaml = helm_chart_ref / \"Chart.yaml\"\n",
+    "    if chart_yaml.exists():\n",
+    "        print(f\"  Found Chart.yaml\")\n",
+    "    else:\n",
+    "        warn(\"Chart.yaml not found\")\n",
+    "        print(\"💡 Ensure you're pointing to the correct Helm chart path\")\n",
+    "else:\n",
+    "    warn(f\"Helm chart path does not exist: {helm_chart_ref}\")\n",
+    "    print(\"💡 Update HELM_CHART_REF in your .env file to point to the langchain-ai/helm chart\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preflight Summary\n",
+    "\n",
+    "Review the checklist below. All items should be ✅ before proceeding to Terraform deployment.\n",
+    "\n",
+    "### ✅ Checklist\n",
+    "\n",
+    "- [ ] All required tools installed (aws, terraform, kubectl, helm, jq)\n",
+    "- [ ] AWS credentials valid and correct account/region\n",
+    "- [ ] Required environment variables set\n",
+    "- [ ] Terraform directory path correct\n",
+    "- [ ] Helm chart path correct\n",
+    "- [ ] S3 access verified\n",
+    "- [ ] (If cluster exists) EBS CSI driver installed\n",
+    "- [ ] (If cluster exists) StorageClasses configured\n",
+    "\n",
+    "### Next Steps\n",
+    "\n",
+    "If all checks pass, proceed to **02_terraform_apply.ipynb** to deploy the infrastructure.\n",
+    "\n",
+    "If any checks failed, review the warnings above and fix the issues before continuing.\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.14.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -0,0 +1,668 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Module 1: Terraform - Provisioning the Platform Substrate\n",
+    "\n",
+    "## Overview\n",
+    "\n",
+    "This notebook walks through deploying AWS infrastructure using the **official `langchain-ai/terraform` repository**.\n",
+    "\n",
+    "### Key Principles\n",
+    "\n",
+    "- ✅ Use the **official** Terraform repo (do not fork)\n",
+    "- ✅ Pin module versions for reproducibility\n",
+    "- ✅ Use remote state & locking\n",
+    "- ✅ Plan before applying\n",
+    "- ✅ Capture outputs needed for Helm\n",
+    "\n",
+    "### What We'll Deploy\n",
+    "\n",
+    "- Amazon EKS cluster\n",
+    "- RDS PostgreSQL\n",
+    "- ElastiCache Redis\n",
+    "- S3 bucket for blob storage\n",
+    "- IAM roles and policies\n",
+    "- EBS CSI driver addon\n",
+    "\n",
+    "**Estimated time:** 45-60 minutes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bootstrap environment\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Add notebooks directory to path so we can import shared as a package\n",
+    "# Find the notebooks directory by looking for the shared folder\n",
+    "possible_paths = [\n",
+    "    Path.cwd().parent,  # If cwd is module-1, go up one level to notebooks\n",
+    "    Path.cwd(),  # If cwd is already notebooks\n",
+    "    Path.cwd() / \"notebooks\",  # If cwd is workspace root\n",
+    "]\n",
+    "\n",
+    "notebooks_path = None\n",
+    "for path in possible_paths:\n",
+    "    if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        notebooks_path = path\n",
+    "        break\n",
+    "\n",
+    "if not notebooks_path:\n",
+    "    notebooks_path = Path.cwd() / \"notebooks\"\n",
+    "    if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
+    "\n",
+    "# Add notebooks directory to path so 'shared' can be imported as a package\n",
+    "if str(notebooks_path) not in sys.path:\n",
+    "    sys.path.insert(0, str(notebooks_path))\n",
+    "\n",
+    "from shared._bootstrap import bootstrap\n",
+    "\n",
+    "# Run bootstrap\n",
+    "bootstrap_info = bootstrap()\n",
+    "artifacts_dir = Path(bootstrap_info['artifacts_dir'])\n",
+    "print(f\"\\nArtifacts directory: {artifacts_dir}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Understanding the Official Terraform Repository\n",
+    "\n",
+    "The `langchain-ai/terraform` repository contains modules for deploying LangSmith infrastructure. We use the **official** repository because:\n",
+    "\n",
+    "1. **Support:** Support engineers expect to see standard, unmodified configurations\n",
+    "2. **Updates:** Official modules receive security and feature updates\n",
+    "3. **Documentation:** Official modules are documented and tested\n",
+    "4. **Compatibility:** Ensures compatibility with Helm charts\n",
+    "\n",
+    "**Important:** We do **not** fork the upstream repository. We reference it directly.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import re\n",
+    "from pathlib import Path\n",
+    "from shared._validation import require_env, ok, warn, fail\n",
+    "from shared._shell import run\n",
+    "\n",
+    "def expand_env_vars(path_str: str) -> str:\n",
+    "    \"\"\"Expand environment variable references in a path string.\"\"\"\n",
+    "    # Expand $VAR and ${VAR} references\n",
+    "    def replace_var(match):\n",
+    "        var_name = match.group(1) or match.group(2)\n",
+    "        return os.environ.get(var_name, match.group(0))\n",
+    "    \n",
+    "    # Replace $VAR and ${VAR} patterns\n",
+    "    path_str = re.sub(r'\\$\\{([^}]+)\\}|\\$([a-zA-Z_][a-zA-Z0-9_]*)', replace_var, path_str)\n",
+    "    return path_str\n",
+    "\n",
+    "# Get required configuration\n",
+    "config = require_env(\"TERRAFORM_DIR\", \"CLUSTER_NAME\", \"AWS_REGION\", \"WORKSHOP_NAME\")\n",
+    "\n",
+    "# Expand environment variables in the path (e.g., $TERRAFORM_REPO_DIR, $HOME)\n",
+    "terraform_dir_str = expand_env_vars(config[\"TERRAFORM_DIR\"])\n",
+    "terraform_dir = Path(terraform_dir_str).expanduser().resolve()\n",
+    "\n",
+    "cluster_name = config[\"CLUSTER_NAME\"]\n",
+    "region = config[\"AWS_REGION\"]\n",
+    "workshop_name = config[\"WORKSHOP_NAME\"]\n",
+    "\n",
+    "print(\"### Terraform Configuration\")\n",
+    "print(f\"Terraform Directory: {terraform_dir}\")\n",
+    "print(f\"Cluster Name: {cluster_name}\")\n",
+    "print(f\"Region: {region}\")\n",
+    "print(f\"Workshop Name: {workshop_name}\\n\")\n",
+    "\n",
+    "if not terraform_dir.exists():\n",
+    "    fail(f\"Terraform directory does not exist: {terraform_dir}\")\n",
+    "    print(\"\\n💡 To fix this:\")\n",
+    "    print(\"   1. Clone the official Terraform repository:\")\n",
+    "    print(\"      git clone https://github.com/langchain-ai/terraform.git <target-directory>\")\n",
+    "    print(\"   2. Update TERRAFORM_DIR in your .env file to point to:\")\n",
+    "    print(f\"      TERRAFORM_DIR=\\\"<target-directory>/aws/langsmith\\\"\")\n",
+    "    print(\"   3. Run this notebook again\")\n",
+    "    raise RuntimeError(f\"Terraform directory not found: {terraform_dir}\")\n",
+    "\n",
+    "ok(f\"Terraform directory exists: {terraform_dir}\")\n",
+    "\n",
+    "# Check Terraform version\n",
+    "print(\"\\n### Terraform Version\")\n",
+    "result = run([\"terraform\", \"version\"], check=True, stream=False)\n",
+    "print(result.stdout)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Identifying the Correct Module Path\n",
+    "\n",
+    "The Terraform repository is organized by cloud provider and deployment type. For AWS self-hosted deployments, we need the AWS module.\n",
+    "\n",
+    "**Typical path structure:**\n",
+    "```\n",
+    "terraform/\n",
+    "  modules/\n",
+    "    aws/\n",
+    "      langsmith/    # <-- This is the module we use\n",
+    "        main.tf\n",
+    "        variables.tf\n",
+    "        outputs.tf\n",
+    "        ...\n",
+    "```\n",
+    "\n",
+    "Let's verify the module structure.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Verify Terraform module structure\n",
+    "print(\"### Terraform Module Structure\\n\")\n",
+    "\n",
+    "# Check for key files\n",
+    "key_files = [\"main.tf\", \"variables.tf\", \"outputs.tf\"]\n",
+    "found_files = []\n",
+    "\n",
+    "for file in key_files:\n",
+    "    file_path = terraform_dir / file\n",
+    "    if file_path.exists():\n",
+    "        found_files.append(file)\n",
+    "        ok(f\"Found {file}\")\n",
+    "    else:\n",
+    "        warn(f\"Missing {file}\")\n",
+    "\n",
+    "if len(found_files) == len(key_files):\n",
+    "    ok(\"Terraform module structure looks correct\")\n",
+    "else:\n",
+    "    warn(\"Some expected Terraform files are missing\")\n",
+    "    print(\"💡 Ensure TERRAFORM_DIR points to the correct module path (e.g., terraform/aws/langsmith)\")\n",
+    "\n",
+    "# List all .tf files for reference\n",
+    "print(\"\\n### All Terraform Files in Module\")\n",
+    "tf_files = sorted(terraform_dir.glob(\"*.tf\"))\n",
+    "for tf_file in tf_files:\n",
+    "    print(f\"  - {tf_file.name}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pinning Module Versions\n",
+    "\n",
+    "**Critical:** Always pin Terraform module versions for reproducibility. This ensures:\n",
+    "- Consistent deployments across environments\n",
+    "- Predictable behavior\n",
+    "- Ability to roll back if needed\n",
+    "\n",
+    "Check the `versions.tf` or `main.tf` file to see what versions are pinned.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check for version constraints\n",
+    "print(\"### Checking Module Version Constraints\\n\")\n",
+    "\n",
+    "versions_file = terraform_dir / \"versions.tf\"\n",
+    "if versions_file.exists():\n",
+    "    print(\"Found versions.tf:\")\n",
+    "    print(\"=\" * 60)\n",
+    "    with open(versions_file) as f:\n",
+    "        print(f.read())\n",
+    "    print(\"=\" * 60)\n",
+    "else:\n",
+    "    # Check main.tf for version constraints\n",
+    "    main_file = terraform_dir / \"main.tf\"\n",
+    "    if main_file.exists():\n",
+    "        with open(main_file) as f:\n",
+    "            content = f.read()\n",
+    "            if \"required_version\" in content or \"version\" in content.lower():\n",
+    "                print(\"Version constraints found in main.tf:\")\n",
+    "                print(\"=\" * 60)\n",
+    "                # Show relevant lines\n",
+    "                for i, line in enumerate(content.split('\\n'), 1):\n",
+    "                    if 'version' in line.lower() or 'required' in line.lower():\n",
+    "                        print(f\"{i:4}: {line}\")\n",
+    "                print(\"=\" * 60)\n",
+    "            else:\n",
+    "                warn(\"No version constraints found\")\n",
+    "                print(\"💡 Consider adding version constraints to ensure reproducibility\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Remote State & Locking\n",
+    "\n",
+    "**Why remote state matters:**\n",
+    "- Enables team collaboration\n",
+    "- Keeps state in one shared location instead of on a single machine\n",
+    "- Provides state backup and recovery\n",
+    "\n",
+    "**Why locking matters:**\n",
+    "- Prevents state corruption from concurrent runs\n",
+    "- Required for production deployments\n",
+    "\n",
+    "Check if remote state backend is configured.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check for backend configuration\n",
+    "print(\"### Checking Backend Configuration\\n\")\n",
+    "\n",
+    "backend_file = terraform_dir / \"backend.tf\"\n",
+    "if backend_file.exists():\n",
+    "    print(\"Found backend.tf:\")\n",
+    "    print(\"=\" * 60)\n",
+    "    with open(backend_file) as f:\n",
+    "        print(f.read())\n",
+    "    print(\"=\" * 60)\n",
+    "    ok(\"Backend configuration found\")\n",
+    "else:\n",
+    "    # Check for backend block in other files\n",
+    "    backend_configs = []\n",
+    "    for tf_file in terraform_dir.glob(\"*.tf\"):\n",
+    "        with open(tf_file) as f:\n",
+    "            content = f.read()\n",
+    "            if \"backend\" in content:\n",
+    "                backend_configs.append(tf_file.name)\n",
+    "    \n",
+    "    if backend_configs:\n",
+    "        print(f\"Backend configuration found in: {', '.join(backend_configs)}\")\n",
+    "        ok(\"Backend configuration exists\")\n",
+    "    else:\n",
+    "        warn(\"No backend configuration found\")\n",
+    "        print(\"💡 For production, configure remote state (S3 + DynamoDB for locking)\")\n",
+    "        print(\"   For workshops, local state may be acceptable\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Terraform Initialization\n",
+    "\n",
+    "Before planning or applying, Terraform must be initialized. This downloads providers and modules.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize Terraform\n",
+    "print(\"### Initializing Terraform\\n\")\n",
+    "print(\"This may take a few minutes as it downloads providers and modules...\\n\")\n",
+    "\n",
+    "result = run(\n",
+    "    [\"terraform\", \"init\"],\n",
+    "    cwd=str(terraform_dir),\n",
+    "    check=True,\n",
+    "    stream=True\n",
+    ")\n",
+    "\n",
+    "ok(\"Terraform initialization complete\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Planning vs Applying\n",
+    "\n",
+    "**Always plan before applying.** The plan shows:\n",
+    "- What resources will be created/modified/destroyed\n",
+    "- Any configuration errors\n",
+    "- Estimated costs (only if a separate cost-estimation tool is integrated; `terraform plan` does not report costs on its own)\n",
+    "\n",
+    "**Review the plan carefully** before proceeding to apply.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create terraform plan\n",
+    "plan_file = artifacts_dir / \"terraform-plan.txt\"\n",
+    "\n",
+    "print(\"### Creating Terraform Plan\\n\")\n",
+    "print(\"This will show what resources Terraform intends to create/modify/destroy.\\n\")\n",
+    "print(\"⚠️  Review the plan carefully before applying!\\n\")\n",
+    "\n",
+    "# Collect Terraform variables from environment\n",
+    "terraform_vars = []\n",
+    "postgres_username = os.environ.get(\"POSTGRES_USERNAME\", \"\").strip()\n",
+    "postgres_password = os.environ.get(\"POSTGRES_PASSWORD\", \"\").strip()\n",
+    "\n",
+    "print(\"### Terraform Variables\\n\")\n",
+    "missing_vars = []\n",
+    "\n",
+    "if postgres_username:\n",
+    "    terraform_vars.extend([\"-var\", f\"postgres_username={postgres_username}\"])\n",
+    "    print(f\"✅ POSTGRES_USERNAME: {postgres_username}\")\n",
+    "else:\n",
+    "    missing_vars.append(\"POSTGRES_USERNAME\")\n",
+    "    warn(\"POSTGRES_USERNAME not set in environment\")\n",
+    "\n",
+    "if postgres_password:\n",
+    "    terraform_vars.extend([\"-var\", f\"postgres_password={postgres_password}\"])\n",
+    "    print(f\"✅ POSTGRES_PASSWORD: {'*' * len(postgres_password)} (hidden)\")\n",
+    "else:\n",
+    "    missing_vars.append(\"POSTGRES_PASSWORD\")\n",
+    "    warn(\"POSTGRES_PASSWORD not set in environment\")\n",
+    "\n",
+    "if missing_vars:\n",
+    "    print(f\"\\n❌ Missing required environment variables: {', '.join(missing_vars)}\")\n",
+    "    print(\"💡 To fix this:\")\n",
+    "    print(\"   1. Add these variables to your .env file (or workshop.env):\")\n",
+    "    for var in missing_vars:\n",
+    "        print(f\"      {var}=\\\"your-value-here\\\"\")\n",
+    "    print(\"   2. Re-run the bootstrap cell (first cell) to reload environment variables\")\n",
+    "    print(\"   3. Re-run this cell\")\n",
+    "    raise RuntimeError(f\"Missing required Terraform variables: {', '.join(missing_vars)}\")\n",
+    "\n",
+    "print(f\"\\n✅ All required variables are set. Passing {len(terraform_vars) // 2} variable(s) to Terraform.\\n\")\n",
+    "\n",
+    "# Build terraform plan command\n",
+    "plan_cmd = [\"terraform\", \"plan\", \"-out=tfplan\"] + terraform_vars\n",
+    "\n",
+    "result = run(\n",
+    "    plan_cmd,\n",
+    "    cwd=str(terraform_dir),\n",
+    "    check=False,  # Don't fail if plan has warnings\n",
+    "    stream=True\n",
+    ")\n",
+    "\n",
+    "# Save plan output\n",
+    "with open(plan_file, \"w\") as f:\n",
+    "    f.write(result.stdout)\n",
+    "    if result.stderr:\n",
+    "        f.write(\"\\n\\nSTDERR:\\n\")\n",
+    "        f.write(result.stderr)\n",
+    "\n",
+    "print(f\"\\n💡 Plan output saved to: {plan_file}\")\n",
+    "\n",
+    "if result.returncode == 0:\n",
+    "    ok(\"Terraform plan completed successfully\")\n",
+    "    print(\"\\n⚠️  Review the plan above. If it looks correct, proceed to the next cell to apply.\")\n",
+    "else:\n",
+    "    warn(f\"Terraform plan had issues (rc={result.returncode})\")\n",
+    "    print(\"💡 Review the errors above before proceeding\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Applying Terraform\n",
+    "\n",
+    "**⚠️ WARNING:** This will create real AWS resources and incur costs.\n",
+    "\n",
+    "Only proceed if:\n",
+    "1. ✅ You've reviewed the plan\n",
+    "2. ✅ You're using the correct AWS account/region\n",
+    "3. ✅ You understand the costs involved\n",
+    "\n",
+    "**Estimated deployment time:** 15-30 minutes (EKS cluster creation takes time)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Apply Terraform\n",
+    "# ⚠️  UNCOMMENT THE CODE BELOW TO ACTUALLY APPLY\n",
+    "# This is commented out by default to prevent accidental deployments\n",
+    "\n",
+    "print(\"### Applying Terraform Configuration\\n\")\n",
+    "print(\"⚠️  This cell is currently DISABLED to prevent accidental deployments.\\n\")\n",
+    "print(\"To apply Terraform, uncomment the code below and run this cell.\\n\")\n",
+    "\n",
+    "# UNCOMMENT TO APPLY:\n",
+    "# print(\"Applying Terraform... This will take 15-30 minutes.\\n\")\n",
+    "# result = run(\n",
+    "#     [\"terraform\", \"apply\", \"tfplan\"],\n",
+    "#     cwd=str(terraform_dir),\n",
+    "#     check=True,\n",
+    "#     stream=True\n",
+    "# )\n",
+    "# \n",
+    "# ok(\"Terraform apply completed successfully\")\n",
+    "# \n",
+    "# # Save apply output\n",
+    "# apply_file = artifacts_dir / \"terraform-apply.txt\"\n",
+    "# with open(apply_file, \"w\") as f:\n",
+    "#     f.write(result.stdout)\n",
+    "#     if result.stderr:\n",
+    "#         f.write(\"\\n\\nSTDERR:\\n\")\n",
+    "#         f.write(result.stderr)\n",
+    "# \n",
+    "# print(f\"\\n💡 Apply output saved to: {apply_file}\")\n",
+    "\n",
+    "print(\"💡 To apply, edit this cell and uncomment the code above\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Interpreting Terraform Outputs\n",
+    "\n",
+    "After Terraform applies successfully, we need to capture the outputs. These outputs contain information needed for Helm deployment:\n",
+    "\n",
+    "- Cluster name and endpoint\n",
+    "- RDS connection details\n",
+    "- Redis connection details\n",
+    "- S3 bucket name\n",
+    "- IAM role ARNs\n",
+    "\n",
+    "Let's retrieve and save these outputs.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "# Get Terraform outputs\n",
+    "print(\"### Terraform Outputs\\n\")\n",
+    "\n",
+    "result = run(\n",
+    "    [\"terraform\", \"output\", \"-json\"],\n",
+    "    cwd=str(terraform_dir),\n",
+    "    check=True,\n",
+    "    stream=False\n",
+    ")\n",
+    "\n",
+    "outputs = json.loads(result.stdout)\n",
+    "\n",
+    "# Save outputs to artifacts\n",
+    "outputs_file = artifacts_dir / \"terraform-outputs.json\"\n",
+    "with open(outputs_file, \"w\") as f:\n",
+    "    json.dump(outputs, f, indent=2)\n",
+    "\n",
+    "print(\"Terraform outputs:\")\n",
+    "print(\"=\" * 60)\n",
+    "for key, value in outputs.items():\n",
+    "    if isinstance(value, dict) and \"value\" in value:\n",
+    "        # Terraform outputs are wrapped in {\"value\": ...}\n",
+    "        val = value[\"value\"]\n",
+    "        if isinstance(val, str) and len(val) > 100:\n",
+    "            print(f\"{key}: {val[:100]}... (truncated)\")\n",
+    "        else:\n",
+    "            print(f\"{key}: {val}\")\n",
+    "    else:\n",
+    "        print(f\"{key}: {value}\")\n",
+    "print(\"=\" * 60)\n",
+    "\n",
+    "ok(f\"Outputs saved to: {outputs_file}\")\n",
+    "print(\"\\n💡 These outputs will be needed for Helm deployment in the next notebook\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Verifying Infrastructure\n",
+    "\n",
+    "Let's verify that the key infrastructure components were created successfully.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from shared._aws_helpers import eks_cluster_exists, aws_region\n",
+    "\n",
+    "region = aws_region()\n",
+    "\n",
+    "# Verify EKS cluster\n",
+    "print(\"### Verifying EKS Cluster\\n\")\n",
+    "if eks_cluster_exists(cluster_name):\n",
+    "    ok(f\"Cluster '{cluster_name}' exists\")\n",
+    "    \n",
+    "    # Get cluster endpoint\n",
+    "    result = run(\n",
+    "        [\"aws\", \"eks\", \"describe-cluster\", \"--name\", cluster_name, \"--region\", region, \"--output\", \"json\"],\n",
+    "        check=True,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    cluster_info = json.loads(result.stdout)[\"cluster\"]\n",
+    "    print(f\"  Status: {cluster_info['status']}\")\n",
+    "    print(f\"  Endpoint: {cluster_info['endpoint']}\")\n",
+    "    print(f\"  Version: {cluster_info['version']}\")\n",
+    "else:\n",
+    "    warn(f\"Cluster '{cluster_name}' not found\")\n",
+    "\n",
+    "# Verify kubectl access\n",
+    "print(\"\\n### Configuring kubectl\\n\")\n",
+    "try:\n",
+    "    run(\n",
+    "        [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
+    "        check=True,\n",
+    "        stream=True\n",
+    "    )\n",
+    "    ok(\"kubectl configured\")\n",
+    "    \n",
+    "    # Test cluster access\n",
+    "    result = run(\n",
+    "        [\"kubectl\", \"cluster-info\"],\n",
+    "        check=True,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    print(result.stdout)\n",
+    "except Exception as e:\n",
+    "    warn(f\"Could not configure kubectl: {e}\")\n",
+    "\n",
+    "# Check for RDS (if output available)\n",
+    "if \"rds\" in str(outputs).lower() or \"postgres\" in str(outputs).lower():\n",
+    "    print(\"\\n### RDS PostgreSQL\\n\")\n",
+    "    print(\"💡 Verify RDS instance is available in AWS console\")\n",
+    "    print(\"   Check outputs above for connection details\")\n",
+    "\n",
+    "# Check for ElastiCache (if output available)\n",
+    "if \"redis\" in str(outputs).lower() or \"elasticache\" in str(outputs).lower():\n",
+    "    print(\"\\n### ElastiCache Redis\\n\")\n",
+    "    print(\"💡 Verify Redis cluster is available in AWS console\")\n",
+    "    print(\"   Check outputs above for connection details\")\n",
+    "\n",
+    "# Check for S3 bucket\n",
+    "if \"s3\" in str(outputs).lower() or \"bucket\" in str(outputs).lower():\n",
+    "    print(\"\\n### S3 Bucket\\n\")\n",
+    "    print(\"💡 Verify S3 bucket exists in AWS console\")\n",
+    "    print(\"   Check outputs above for bucket name\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "### ✅ What We Accomplished\n",
+    "\n",
+    "- [ ] Initialized Terraform\n",
+    "- [ ] Created and reviewed Terraform plan\n",
+    "- [ ] Applied Terraform configuration (if you uncommented the apply step)\n",
+    "- [ ] Captured Terraform outputs\n",
+    "- [ ] Verified infrastructure components\n",
+    "\n",
+    "### 📋 Key Takeaways\n",
+    "\n",
+    "1. **Use official Terraform repo** - Don't fork, reference directly\n",
+    "2. **Pin versions** - Ensures reproducibility\n",
+    "3. **Use remote state** - Required for production\n",
+    "4. **Always plan first** - Review before applying\n",
+    "5. **Save outputs** - Needed for Helm deployment\n",
+    "\n",
+    "### 🎯 Next Steps\n",
+    "\n",
+    "Proceed to **03_helm_install_langsmith.ipynb** to install LangSmith using Helm.\n",
+    "\n",
+    "**Important:** Make sure you have:\n",
+    "- ✅ Terraform outputs saved\n",
+    "- ✅ Cluster accessible via kubectl\n",
+    "- ✅ LangSmith license key ready\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.14.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -0,0 +1,727 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Module 1: Helm - Installing LangSmith\n",
+    "\n",
+    "## Overview\n",
+    "\n",
+    "This notebook walks through installing LangSmith using the **official `langchain-ai/helm` chart**.\n",
+    "\n",
+    "### Key Principles\n",
+    "\n",
+    "- ✅ Use the **official** Helm chart (do not fork)\n",
+    "- ✅ Pin chart versions for reproducibility\n",
+    "- ✅ Create minimal, sane values file\n",
+    "- ✅ Inject required secrets properly\n",
+    "- ✅ Render templates before install\n",
+    "- ✅ Understand that \"helm install succeeded\" ≠ \"system is healthy\"\n",
+    "\n",
+    "### What We'll Install\n",
+    "\n",
+    "- LangSmith application components\n",
+    "- External service connections (RDS, Redis, S3)\n",
+    "- Resource requests & limits\n",
+    "- Ingress configuration\n",
+    "\n",
+    "**Estimated time:** 45-60 minutes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bootstrap environment\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Add notebooks directory to path so we can import shared as a package\n",
+    "# Find the notebooks directory by looking for the shared folder\n",
+    "possible_paths = [\n",
+    "    Path.cwd().parent,  # If cwd is module-1, go up one level to notebooks\n",
+    "    Path.cwd(),  # If cwd is already notebooks\n",
+    "    Path.cwd() / \"notebooks\",  # If cwd is workspace root\n",
+    "]\n",
+    "\n",
+    "notebooks_path = None\n",
+    "for path in possible_paths:\n",
+    "    if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        notebooks_path = path\n",
+    "        break\n",
+    "\n",
+    "if not notebooks_path:\n",
+    "    notebooks_path = Path.cwd() / \"notebooks\"\n",
+    "    if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
+    "\n",
+    "# Add notebooks directory to path so 'shared' can be imported as a package\n",
+    "if str(notebooks_path) not in sys.path:\n",
+    "    sys.path.insert(0, str(notebooks_path))\n",
+    "\n",
+    "from shared._bootstrap import bootstrap\n",
+    "\n",
+    "# Run bootstrap\n",
+    "bootstrap_info = bootstrap()\n",
+    "artifacts_dir = Path(bootstrap_info['artifacts_dir'])\n",
+    "print(f\"\\nArtifacts directory: {artifacts_dir}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Understanding the Official Helm Chart\n",
+    "\n",
+    "The `langchain-ai/helm` repository contains the official LangSmith Helm chart. We use the **official** chart because:\n",
+    "\n",
+    "1. **Support:** Support engineers expect to see standard, unmodified configurations\n",
+    "2. **Updates:** Official charts receive security and feature updates\n",
+    "3. **Documentation:** Official charts are documented and tested\n",
+    "4. **Compatibility:** Ensures compatibility with Terraform outputs\n",
+    "\n",
+    "**Important:** We do **not** fork the upstream repository. We reference it directly.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "import re\n",
+    "from pathlib import Path\n",
+    "from shared._validation import require_env, ok, warn, fail\n",
+    "from shared._shell import run\n",
+    "\n",
+    "def expand_env_vars(path_str: str) -> str:\n",
+    "    \"\"\"Expand environment variable references in a path string.\"\"\"\n",
+    "    # Expand $VAR and ${VAR} references\n",
+    "    def replace_var(match):\n",
+    "        var_name = match.group(1) or match.group(2)\n",
+    "        return os.environ.get(var_name, match.group(0))\n",
+    "    \n",
+    "    # Replace $VAR and ${VAR} patterns\n",
+    "    path_str = re.sub(r'\\$\\{([^}]+)\\}|\\$([a-zA-Z_][a-zA-Z0-9_]*)', replace_var, path_str)\n",
+    "    return path_str\n",
+    "\n",
+    "# Get required configuration\n",
+    "config = require_env(\n",
+    "    \"HELM_CHART_REF\", \"HELM_RELEASE\", \"HELM_NAMESPACE\", \n",
+    "    \"CLUSTER_NAME\", \"AWS_REGION\", \"NAMESPACE\"\n",
+    ")\n",
+    "\n",
+    "# Expand environment variables in the path (e.g., $HELM_REPO_DIR, $HOME)\n",
+    "helm_chart_ref_str = expand_env_vars(config[\"HELM_CHART_REF\"])\n",
+    "helm_chart_ref = Path(helm_chart_ref_str).expanduser().resolve()\n",
+    "\n",
+    "helm_release = config[\"HELM_RELEASE\"]\n",
+    "helm_namespace = config[\"HELM_NAMESPACE\"]\n",
+    "cluster_name = config[\"CLUSTER_NAME\"]\n",
+    "region = config[\"AWS_REGION\"]\n",
+    "namespace = config[\"NAMESPACE\"]\n",
+    "\n",
+    "print(\"### Helm Configuration\")\n",
+    "print(f\"Chart Reference: {helm_chart_ref}\")\n",
+    "print(f\"Release Name: {helm_release}\")\n",
+    "print(f\"Namespace: {helm_namespace}\")\n",
+    "print(f\"Cluster: {cluster_name}\")\n",
+    "print(f\"Region: {region}\\n\")\n",
+    "\n",
+    "if not helm_chart_ref.exists():\n",
+    "    fail(f\"Helm chart path does not exist: {helm_chart_ref}\")\n",
+    "    print(\"\\n💡 To fix this:\")\n",
+    "    print(\"   1. Clone the official Helm repository:\")\n",
+    "    print(\"      git clone https://github.com/langchain-ai/helm.git <target-directory>\")\n",
+    "    print(\"   2. Update HELM_CHART_REF in your .env file to point to:\")\n",
+    "    print(f\"      HELM_CHART_REF=\\\"<target-directory>/charts/langsmith\\\"\")\n",
+    "    print(\"   3. Run this notebook again\")\n",
+    "    raise RuntimeError(f\"Helm chart path not found: {helm_chart_ref}\")\n",
+    "\n",
+    "ok(f\"Helm chart path exists: {helm_chart_ref}\")\n",
+    "\n",
+    "# Check Helm version\n",
+    "print(\"\\n### Helm Version\")\n",
+    "result = run([\"helm\", \"version\"], check=True, stream=False)\n",
+    "print(result.stdout)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Discovering the Chart Path\n",
+    "\n",
+    "Verify the Helm chart structure and locate the Chart.yaml file.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Verify Helm chart structure\n",
+    "print(\"### Helm Chart Structure\\n\")\n",
+    "\n",
+    "# Check for Chart.yaml\n",
+    "chart_yaml = helm_chart_ref / \"Chart.yaml\"\n",
+    "if chart_yaml.exists():\n",
+    "    ok(\"Found Chart.yaml\")\n",
+    "    print(\"\\nChart.yaml contents:\")\n",
+    "    print(\"=\" * 60)\n",
+    "    with open(chart_yaml) as f:\n",
+    "        print(f.read())\n",
+    "    print(\"=\" * 60)\n",
+    "else:\n",
+    "    warn(\"Chart.yaml not found\")\n",
+    "    raise RuntimeError(f\"❌ Invalid Helm chart: {helm_chart_ref}\")\n",
+    "\n",
+    "# Check for values.yaml\n",
+    "values_yaml = helm_chart_ref / \"values.yaml\"\n",
+    "if values_yaml.exists():\n",
+    "    ok(\"Found values.yaml (default values)\")\n",
+    "else:\n",
+    "    warn(\"values.yaml not found (may be optional)\")\n",
+    "\n",
+    "# List chart files\n",
+    "print(\"\\n### Chart Files\")\n",
+    "chart_files = sorted(helm_chart_ref.glob(\"*\"))\n",
+    "for f in chart_files[:20]:  # Show first 20\n",
+    "    if f.is_file():\n",
+    "        print(f\"  📄 {f.name}\")\n",
+    "    elif f.is_dir():\n",
+    "        print(f\"  📁 {f.name}/\")\n",
+    "if len(chart_files) > 20:\n",
+    "    print(f\"  ... and {len(chart_files) - 20} more items\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pinning Chart Versions\n",
+    "\n",
+    "**Critical:** Always pin Helm chart versions for reproducibility. Check the Chart.yaml for the version.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract chart version\n",
+    "import yaml\n",
+    "\n",
+    "with open(chart_yaml) as f:\n",
+    "    chart_info = yaml.safe_load(f)\n",
+    "\n",
+    "print(\"### Chart Version Information\\n\")\n",
+    "print(f\"Chart Name: {chart_info.get('name', 'N/A')}\")\n",
+    "print(f\"Chart Version: {chart_info.get('version', 'N/A')}\")\n",
+    "print(f\"App Version: {chart_info.get('appVersion', 'N/A')}\")\n",
+    "print(f\"Description: {chart_info.get('description', 'N/A')[:100]}...\")\n",
+    "\n",
+    "ok(f\"Using chart version: {chart_info.get('version', 'N/A')}\")\n",
+    "print(\"\\n💡 Record this version for reproducibility\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading Terraform Outputs\n",
+    "\n",
+    "We need the Terraform outputs from the previous notebook to configure Helm values (RDS, Redis, S3, etc.).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load Terraform outputs\n",
+    "terraform_outputs_file = artifacts_dir / \"terraform-outputs.json\"\n",
+    "\n",
+    "if not terraform_outputs_file.exists():\n",
+    "    warn(f\"Terraform outputs file not found: {terraform_outputs_file}\")\n",
+    "    print(\"💡 Run notebook 02_terraform_apply.ipynb first to generate outputs\")\n",
+    "    terraform_outputs = {}\n",
+    "else:\n",
+    "    with open(terraform_outputs_file) as f:\n",
+    "        terraform_outputs_raw = json.load(f)\n",
+    "    \n",
+    "    # Unwrap Terraform output format\n",
+    "    terraform_outputs = {}\n",
+    "    for key, value in terraform_outputs_raw.items():\n",
+    "        if isinstance(value, dict) and \"value\" in value:\n",
+    "            terraform_outputs[key] = value[\"value\"]\n",
+    "        else:\n",
+    "            terraform_outputs[key] = value\n",
+    "    \n",
+    "    ok(f\"Loaded Terraform outputs from {terraform_outputs_file}\")\n",
+    "    print(f\"\\nAvailable outputs: {', '.join(terraform_outputs.keys())}\")\n",
+    "    \n",
+    "    # Show key outputs (values longer than 50 characters are truncated; nothing is redacted)\n",
+    "    print(\"\\n### Key Outputs (for reference):\")\n",
+    "    for key in [\"cluster_name\", \"rds_endpoint\", \"redis_endpoint\", \"s3_bucket\"]:\n",
+    "        if key in terraform_outputs:\n",
+    "            val = str(terraform_outputs[key])\n",
+    "            if len(val) > 50:\n",
+    "                print(f\"  {key}: {val[:50]}...\")\n",
+    "            else:\n",
+    "                print(f\"  {key}: {val}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Creating a Minimal Values File\n",
+    "\n",
+    "We'll create a minimal, sane values file that:\n",
+    "- Connects to external services (RDS, Redis, S3)\n",
+    "- Sets resource requests & limits\n",
+    "- Configures ingress\n",
+    "- Includes required secrets\n",
+    "\n",
+    "**Important:** Start minimal. Add complexity only as needed.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check if values file is specified\n",
+    "values_file_env = os.environ.get(\"VALUES_FILE\", \"\").strip()\n",
+    "\n",
+    "if values_file_env:\n",
+    "    values_file_path = Path(values_file_env).expanduser().resolve()\n",
+    "    if values_file_path.exists():\n",
+    "        ok(f\"Using values file from environment: {values_file_path}\")\n",
+    "        print(\"💡 Review the values file to ensure it's configured correctly\")\n",
+    "    else:\n",
+    "        warn(f\"Values file from environment not found: {values_file_path}\")\n",
+    "        print(\"💡 Will need to create a values file\")\n",
+    "        values_file_path = None\n",
+    "else:\n",
+    "    values_file_path = None\n",
+    "    print(\"💡 VALUES_FILE not set in environment\")\n",
+    "    print(\"   We'll create a minimal values file for this deployment\")\n",
+    "\n",
+    "# If no values file, we'll create one\n",
+    "if not values_file_path:\n",
+    "    values_file_path = artifacts_dir / \"langsmith-values.yaml\"\n",
+    "    print(f\"\\nWill create values file at: {values_file_path}\")\n",
+    "    print(\"💡 You can customize this file before installation\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Injecting Required Secrets\n",
+    "\n",
+    "LangSmith requires several secrets:\n",
+    "- **License key** (required)\n",
+    "- Database credentials (if not using IAM auth)\n",
+    "- Redis password (if not using IAM auth)\n",
+    "- S3 credentials (if not using IAM roles)\n",
+    "\n",
+    "Let's prepare the secrets.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check for required secrets\n",
+    "print(\"### Required Secrets\\n\")\n",
+    "\n",
+    "# License key (required)\n",
+    "license_key = os.environ.get(\"LANGSMITH_LICENSE_KEY\", \"\").strip()\n",
+    "if license_key:\n",
+    "    ok(\"LANGSMITH_LICENSE_KEY is set\")\n",
+    "    print(\"💡 License key will be used to create Kubernetes secret\")\n",
+    "else:\n",
+    "    warn(\"LANGSMITH_LICENSE_KEY not set\")\n",
+    "    print(\"💡 You must set LANGSMITH_LICENSE_KEY in your .env file\")\n",
+    "    print(\"   Get your license key from LangSmith support\")\n",
+    "\n",
+    "# Database credentials (may be optional if using IAM auth)\n",
+    "db_user = os.environ.get(\"DB_USER\", \"\").strip()\n",
+    "db_password = os.environ.get(\"DB_PASSWORD\", \"\").strip()\n",
+    "if db_user and db_password:\n",
+    "    ok(\"Database credentials are set\")\n",
+    "else:\n",
+    "    print(\"💡 Database credentials may be optional if using IAM authentication\")\n",
+    "    print(\"   Check your Terraform outputs for connection details\")\n",
+    "\n",
+    "# Redis password (may be optional if using IAM auth)\n",
+    "redis_password = os.environ.get(\"REDIS_PASSWORD\", \"\").strip()\n",
+    "if redis_password:\n",
+    "    ok(\"Redis password is set\")\n",
+    "else:\n",
+    "    print(\"💡 Redis password may be optional if using IAM authentication\")\n",
+    "\n",
+    "print(\"\\n💡 Secrets will be created as Kubernetes secrets before Helm install\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preparing Kubernetes Namespace\n",
+    "\n",
+    "Create the namespace if it doesn't exist.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from shared._k8s_helpers import namespace_exists, kubectl\n",
+    "from shared._aws_helpers import aws_region\n",
+    "\n",
+    "# Ensure kubectl is configured\n",
+    "region = aws_region()\n",
+    "run(\n",
+    "    [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
+    "    check=True,\n",
+    "    stream=False\n",
+    ")\n",
+    "\n",
+    "# Create namespace if needed\n",
+    "print(f\"### Preparing Namespace: {namespace}\\n\")\n",
+    "\n",
+    "if namespace_exists(namespace):\n",
+    "    ok(f\"Namespace '{namespace}' already exists\")\n",
+    "else:\n",
+    "    print(f\"Creating namespace '{namespace}'...\")\n",
+    "    kubectl(\"create\", \"namespace\", namespace, check=True, stream=True)\n",
+    "    ok(f\"Namespace '{namespace}' created\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Creating Kubernetes Secrets\n",
+    "\n",
+    "Create the required secrets in the namespace.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create secrets\n",
+    "print(\"### Creating Kubernetes Secrets\\n\")\n",
+    "\n",
+    "if not license_key:\n",
+    "    raise RuntimeError(\"❌ LANGSMITH_LICENSE_KEY is required\")\n",
+    "\n",
+    "# NOTE: `kubectl create secret` fails if the secret already exists and never updates it,\n",
+    "# so each result is checked rather than reported as success unconditionally.\n",
+    "# Create license key secret\n",
+    "print(\"Creating license key secret...\")\n",
+    "result = run(\n",
+    "    [\n",
+    "        \"kubectl\", \"create\", \"secret\", \"generic\", \"langsmith-license\",\n",
+    "        f\"--from-literal=license-key={license_key}\",\n",
+    "        \"-n\", namespace\n",
+    "    ],\n",
+    "    check=False,  # May already exist\n",
+    "    stream=True\n",
+    ")\n",
+    "if result.returncode == 0:\n",
+    "    ok(\"License key secret created\")\n",
+    "else:\n",
+    "    warn(f\"License key secret not created (rc={result.returncode}); it may already exist\")\n",
+    "\n",
+    "# Create database secret if credentials provided\n",
+    "if db_user and db_password:\n",
+    "    print(\"\\nCreating database secret...\")\n",
+    "    result = run(\n",
+    "        [\n",
+    "            \"kubectl\", \"create\", \"secret\", \"generic\", \"langsmith-db\",\n",
+    "            f\"--from-literal=username={db_user}\",\n",
+    "            f\"--from-literal=password={db_password}\",\n",
+    "            \"-n\", namespace\n",
+    "        ],\n",
+    "        check=False,  # May already exist\n",
+    "        stream=True\n",
+    "    )\n",
+    "    if result.returncode == 0:\n",
+    "        ok(\"Database secret created\")\n",
+    "    else:\n",
+    "        warn(f\"Database secret not created (rc={result.returncode}); it may already exist\")\n",
+    "else:\n",
+    "    print(\"💡 Skipping database secret (using IAM auth or not needed)\")\n",
+    "\n",
+    "# Create Redis secret if password provided\n",
+    "if redis_password:\n",
+    "    print(\"\\nCreating Redis secret...\")\n",
+    "    result = run(\n",
+    "        [\n",
+    "            \"kubectl\", \"create\", \"secret\", \"generic\", \"langsmith-redis\",\n",
+    "            f\"--from-literal=password={redis_password}\",\n",
+    "            \"-n\", namespace\n",
+    "        ],\n",
+    "        check=False,  # May already exist\n",
+    "        stream=True\n",
+    "    )\n",
+    "    if result.returncode == 0:\n",
+    "        ok(\"Redis secret created\")\n",
+    "    else:\n",
+    "        warn(f\"Redis secret not created (rc={result.returncode}); it may already exist\")\n",
+    "else:\n",
+    "    print(\"💡 Skipping Redis secret (using IAM auth or not needed)\")\n",
+    "\n",
+    "print(\"\\n✅ Secrets preparation complete\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Rendering Templates Before Install\n",
+    "\n",
+    "**Critical:** Always render Helm templates before installing. This lets you:\n",
+    "- Verify the configuration is correct\n",
+    "- Catch errors before deployment\n",
+    "- Review what will be created\n",
+    "\n",
+    "This is especially important for understanding resource requests & limits.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Render Helm templates\n",
+    "print(\"### Rendering Helm Templates\\n\")\n",
+    "print(\"This shows what Kubernetes resources will be created...\\n\")\n",
+    "\n",
+    "# Use values file if it exists, otherwise use empty values\n",
+    "values_arg = []\n",
+    "if values_file_path and values_file_path.exists():\n",
+    "    values_arg = [\"-f\", str(values_file_path)]\n",
+    "    print(f\"Using values file: {values_file_path}\\n\")\n",
+    "\n",
+    "result = run(\n",
+    "    [\n",
+    "        \"helm\", \"template\", helm_release, str(helm_chart_ref),\n",
+    "        \"-n\", namespace,\n",
+    "        *values_arg,\n",
+    "        \"--debug\"  # Show computed values\n",
+    "    ],\n",
+    "    check=False,  # Don't fail on warnings\n",
+    "    stream=True\n",
+    ")\n",
+    "\n",
+    "# Save rendered templates\n",
+    "rendered_file = artifacts_dir / \"helm-rendered-templates.yaml\"\n",
+    "with open(rendered_file, \"w\") as f:\n",
+    "    f.write(result.stdout)\n",
+    "    if result.stderr:\n",
+    "        f.write(\"\\n\\nSTDERR:\\n\")\n",
+    "        f.write(result.stderr)\n",
+    "\n",
+    "print(f\"\\n💡 Rendered templates saved to: {rendered_file}\")\n",
+    "\n",
+    "if result.returncode == 0:\n",
+    "    ok(\"Template rendering successful\")\n",
+    "    print(\"\\n⚠️  Review the rendered templates above. If they look correct, proceed to install.\")\n",
+    "else:\n",
+    "    warn(f\"Template rendering had issues (rc={result.returncode})\")\n",
+    "    print(\"💡 Review the errors above before proceeding\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Installing LangSmith with Helm\n",
+    "\n",
+    "**⚠️ WARNING:** This will install LangSmith into your cluster.\n",
+    "\n",
+    "Only proceed if:\n",
+    "1. ✅ You've reviewed the rendered templates\n",
+    "2. ✅ Secrets are created\n",
+    "3. ✅ Values file is correct\n",
+    "4. ✅ Terraform outputs are loaded\n",
+    "\n",
+    "**Estimated installation time:** 5-10 minutes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install LangSmith with Helm\n",
+    "# ⚠️  UNCOMMENT THE CODE BELOW TO ACTUALLY INSTALL\n",
+    "# This is commented out by default to prevent accidental deployments\n",
+    "\n",
+    "print(\"### Installing LangSmith with Helm\\n\")\n",
+    "print(\"⚠️  This cell is currently DISABLED to prevent accidental deployments.\\n\")\n",
+    "print(\"To install, uncomment the code below and run this cell.\\n\")\n",
+    "\n",
+    "# UNCOMMENT TO INSTALL:\n",
+    "# print(\"Installing LangSmith... This may take 5-10 minutes.\\n\")\n",
+    "# \n",
+    "# values_arg = []\n",
+    "# if values_file_path and values_file_path.exists():\n",
+    "#     values_arg = [\"-f\", str(values_file_path)]\n",
+    "# \n",
+    "# result = run(\n",
+    "#     [\n",
+    "#         \"helm\", \"install\", helm_release, str(helm_chart_ref),\n",
+    "#         \"-n\", namespace,\n",
+    "#         \"--create-namespace\",\n",
+    "#         *values_arg,\n",
+    "#         \"--wait\",  # Wait for deployment to be ready\n",
+    "#         \"--timeout\", \"10m\"\n",
+    "#     ],\n",
+    "#     check=False,  # Don't fail immediately, we'll check status\n",
+    "#     stream=True\n",
+    "# )\n",
+    "# \n",
+    "# # Save install output\n",
+    "# install_file = artifacts_dir / \"helm-install.txt\"\n",
+    "# with open(install_file, \"w\") as f:\n",
+    "#     f.write(result.stdout)\n",
+    "#     if result.stderr:\n",
+    "#         f.write(\"\\n\\nSTDERR:\\n\")\n",
+    "#         f.write(result.stderr)\n",
+    "# \n",
+    "# if result.returncode == 0:\n",
+    "#     ok(\"Helm install completed\")\n",
+    "#     print(f\"\\n💡 Install output saved to: {install_file}\")\n",
+    "# else:\n",
+    "#     warn(f\"Helm install had issues (rc={result.returncode})\")\n",
+    "#     print(\"💡 Check the output above for errors\")\n",
+    "\n",
+    "print(\"💡 To install, edit this cell and uncomment the code above\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Understanding: \"helm install succeeded\" ≠ \"system is healthy\"\n",
+    "\n",
+    "**Important:** A successful Helm install only means:\n",
+    "- Resources were created\n",
+    "- Helm release is tracked\n",
+    "\n",
+    "It does **not** mean:\n",
+    "- Pods are running\n",
+    "- Services are healthy\n",
+    "- Ingress is working\n",
+    "- Database connections work\n",
+    "\n",
+    "We'll validate system health in the next notebook.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check Helm release status\n",
+    "print(\"### Helm Release Status\\n\")\n",
+    "\n",
+    "result = run(\n",
+    "    [\"helm\", \"list\", \"-n\", namespace, \"-o\", \"json\"],\n",
+    "    check=True,\n",
+    "    stream=False\n",
+    ")\n",
+    "\n",
+    "releases = json.loads(result.stdout)\n",
+    "langsmith_releases = [r for r in releases if r.get(\"name\") == helm_release]\n",
+    "\n",
+    "if langsmith_releases:\n",
+    "    release = langsmith_releases[0]\n",
+    "    print(f\"Release: {release['name']}\")\n",
+    "    print(f\"Status: {release['status']}\")\n",
+    "    print(f\"Chart: {release['chart']}\")\n",
+    "    print(f\"Namespace: {release['namespace']}\")\n",
+    "    print(f\"Revision: {release['revision']}\")\n",
+    "    \n",
+    "    if release['status'] == 'deployed':\n",
+    "        ok(\"Helm release is deployed\")\n",
+    "        print(\"\\n💡 Remember: 'deployed' doesn't mean healthy!\")\n",
+    "        print(\"   Proceed to validation notebook to check pod status, ingress, etc.\")\n",
+    "    else:\n",
+    "        warn(f\"Helm release status: {release['status']}\")\n",
+    "else:\n",
+    "    warn(f\"Helm release '{helm_release}' not found\")\n",
+    "    print(\"💡 If you just installed, wait a moment and check again\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "### ✅ What We Accomplished\n",
+    "\n",
+    "- [ ] Located and verified Helm chart\n",
+    "- [ ] Pinned chart version\n",
+    "- [ ] Loaded Terraform outputs\n",
+    "- [ ] Created/verified values file\n",
+    "- [ ] Created Kubernetes secrets\n",
+    "- [ ] Rendered templates for review\n",
+    "- [ ] Installed LangSmith (if you uncommented the install step)\n",
+    "- [ ] Checked Helm release status\n",
+    "\n",
+    "### 📋 Key Takeaways\n",
+    "\n",
+    "1. **Use official Helm chart** - Don't fork, reference directly\n",
+    "2. **Pin versions** - Ensures reproducibility\n",
+    "3. **Start minimal** - Add complexity only as needed\n",
+    "4. **Render first** - Always render templates before installing\n",
+    "5. **Secrets matter** - Properly inject required secrets\n",
+    "6. **Install ≠ Healthy** - Validation comes next\n",
+    "\n",
+    "### 🎯 Next Steps\n",
+    "\n",
+    "Proceed to **04_validate_ingress_and_ui.ipynb** to validate:\n",
+    "- Pod readiness\n",
+    "- PVC binding\n",
+    "- Ingress provisioning\n",
+    "- Endpoint reachability\n",
+    "- Basic UI availability\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,590 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Module 1: Validation & Go/No-Go Checklist\n",
+    "\n",
+    "## Overview\n",
+    "\n",
+    "This notebook validates that your LangSmith deployment is healthy and ready for use. This checklist becomes your **baseline reference** for future troubleshooting.\n",
+    "\n",
+    "### What We'll Validate\n",
+    "\n",
+    "1. ✅ Pod readiness (all pods running)\n",
+    "2. ✅ PVC binding (storage provisioned)\n",
+    "3. ✅ Ingress provisioning (ALB created)\n",
+    "4. ✅ Endpoint reachability (services accessible)\n",
+    "5. ✅ Basic UI availability (web interface works)\n",
+    "\n",
+    "### Why This Matters\n",
+    "\n",
+    "Most issues are caught here, before real users onboard. This validation ensures you're on a **supported path**.\n",
+    "\n",
+    "**Estimated time:** 20-30 minutes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bootstrap environment\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Add notebooks directory to path so we can import shared as a package\n",
+    "# Find the notebooks directory by looking for the shared folder\n",
+    "possible_paths = [\n",
+    "    Path.cwd().parent,  # If cwd is module-1, go up one level to notebooks\n",
+    "    Path.cwd(),  # If cwd is already notebooks\n",
+    "    Path.cwd() / \"notebooks\",  # If cwd is workspace root\n",
+    "]\n",
+    "\n",
+    "notebooks_path = None\n",
+    "for path in possible_paths:\n",
+    "    if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        notebooks_path = path\n",
+    "        break\n",
+    "\n",
+    "if not notebooks_path:\n",
+    "    notebooks_path = Path.cwd() / \"notebooks\"\n",
+    "    if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
+    "\n",
+    "# Add notebooks directory to path so 'shared' can be imported as a package\n",
+    "if str(notebooks_path) not in sys.path:\n",
+    "    sys.path.insert(0, str(notebooks_path))\n",
+    "\n",
+    "from shared._bootstrap import bootstrap\n",
+    "\n",
+    "# Run bootstrap\n",
+    "bootstrap_info = bootstrap()\n",
+    "artifacts_dir = Path(bootstrap_info['artifacts_dir'])\n",
+    "print(f\"\\nArtifacts directory: {artifacts_dir}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setting Up Cluster Access\n",
+    "\n",
+    "Ensure kubectl is configured for the EKS cluster.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from shared._validation import require_env, ok, warn  # warn is called by the status checks in later cells\n",
+    "from shared._aws_helpers import aws_region\n",
+    "from shared._shell import run\n",
+    "\n",
+    "# Get configuration\n",
+    "config = require_env(\"CLUSTER_NAME\", \"AWS_REGION\", \"NAMESPACE\")\n",
+    "cluster_name = config[\"CLUSTER_NAME\"]\n",
+    "region = aws_region()\n",
+    "namespace = config[\"NAMESPACE\"]\n",
+    "\n",
+    "# Configure kubectl\n",
+    "print(\"### Configuring kubectl\\n\")\n",
+    "run(\n",
+    "    [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
+    "    check=True,\n",
+    "    stream=True\n",
+    ")\n",
+    "ok(\"kubectl configured\")\n",
+    "\n",
+    "# Test cluster access\n",
+    "result = run([\"kubectl\", \"cluster-info\"], check=True, stream=False)\n",
+    "print(result.stdout)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Pod Readiness Check\n",
+    "\n",
+    "**Critical:** All pods must be in `Running` state with `Ready` status. This is the foundation of a healthy deployment.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from shared._k8s_helpers import get_pods, wait_for_deployments_ready, require_namespace\n",
+    "import json\n",
+    "\n",
+    "# Ensure namespace exists\n",
+    "require_namespace(namespace)\n",
+    "\n",
+    "# Wait for deployments to be ready (with timeout)\n",
+    "print(\"### Waiting for Deployments to be Ready\\n\")\n",
+    "print(\"This may take a few minutes if pods are still starting...\\n\")\n",
+    "\n",
+    "try:\n",
+    "    wait_for_deployments_ready(namespace, timeout=\"10m\")\n",
+    "except Exception as e:\n",
+    "    print(f\"⚠️  Timeout or error waiting for deployments: {e}\")\n",
+    "    print(\"💡 Some pods may still be starting. Continuing with status check...\")\n",
+    "\n",
+    "# Get pod status\n",
+    "print(\"\\n### Pod Status\\n\")\n",
+    "pods_output = get_pods(namespace)\n",
+    "print(pods_output)\n",
+    "\n",
+    "# Parse pod status\n",
+    "result = run(\n",
+    "    [\"kubectl\", \"get\", \"pods\", \"-n\", namespace, \"-o\", \"json\"],\n",
+    "    check=True,\n",
+    "    stream=False\n",
+    ")\n",
+    "pods_data = json.loads(result.stdout)\n",
+    "\n",
+    "# Analyze pod status\n",
+    "running = 0\n",
+    "pending = 0\n",
+    "failed = 0\n",
+    "ready = 0\n",
+    "total = len(pods_data.get(\"items\", []))\n",
+    "\n",
+    "for pod in pods_data.get(\"items\", []):\n",
+    "    status = pod.get(\"status\", {})\n",
+    "    phase = status.get(\"phase\", \"Unknown\")\n",
+    "    conditions = status.get(\"conditions\", [])\n",
+    "    \n",
+    "    if phase == \"Running\":\n",
+    "        running += 1\n",
+    "        # Check ready condition\n",
+    "        for cond in conditions:\n",
+    "            if cond.get(\"type\") == \"Ready\" and cond.get(\"status\") == \"True\":\n",
+    "                ready += 1\n",
+    "                break\n",
+    "    elif phase == \"Pending\":\n",
+    "        pending += 1\n",
+    "    elif phase == \"Failed\":\n",
+    "        failed += 1\n",
+    "\n",
+    "print(f\"\\n### Pod Summary\")\n",
+    "print(f\"Total pods: {total}\")\n",
+    "print(f\"Running: {running}\")\n",
+    "print(f\"Ready: {ready}\")\n",
+    "print(f\"Pending: {pending}\")\n",
+    "print(f\"Failed: {failed}\")\n",
+    "\n",
+    "if ready == total and total > 0:\n",
+    "    ok(f\"All {total} pods are ready\")\n",
+    "elif running == total and total > 0:\n",
+    "    warn(f\"All pods running but {total - ready} not ready yet\")\n",
+    "else:\n",
+    "    warn(f\"Pod status: {running}/{total} running, {ready}/{total} ready\")\n",
+    "    if pending > 0:\n",
+    "        print(\"💡 Some pods are still pending. Check events for issues:\")\n",
+    "        run([\"kubectl\", \"get\", \"events\", \"-n\", namespace, \"--sort-by=.lastTimestamp\"], check=False, stream=True)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
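+   "source": ["## 2. PVC Binding Check\n", "\n", "**Critical:** All PersistentVolumeClaims must reach the `Bound` state. A PVC stuck in `Pending` usually points to a storage problem (EBS CSI driver, StorageClass, or storage quota).\n"]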
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check PVC status\n",
+    "print(\"### Persistent Volume Claims Status\\n\")\n",
+    "\n",
+    "result = run(\n",
+    "    [\"kubectl\", \"get\", \"pvc\", \"-n\", namespace, \"-o\", \"json\"],\n",
+    "    check=True,\n",
+    "    stream=False\n",
+    ")\n",
+    "pvc_data = json.loads(result.stdout)\n",
+    "\n",
+    "# Display PVCs\n",
+    "print(\"PVC Details:\")\n",
+    "print(\"=\" * 80)\n",
+    "run([\"kubectl\", \"get\", \"pvc\", \"-n\", namespace, \"-o\", \"wide\"], check=True, stream=True)\n",
+    "print(\"=\" * 80)\n",
+    "\n",
+    "# Analyze PVC status\n",
+    "bound = 0\n",
+    "pending = 0\n",
+    "total = len(pvc_data.get(\"items\", []))\n",
+    "\n",
+    "for pvc in pvc_data.get(\"items\", []):\n",
+    "    status = pvc.get(\"status\", {})\n",
+    "    phase = status.get(\"phase\", \"Unknown\")\n",
+    "    \n",
+    "    if phase == \"Bound\":\n",
+    "        bound += 1\n",
+    "    elif phase == \"Pending\":\n",
+    "        pending += 1\n",
+    "        # Show details for pending PVCs\n",
+    "        name = pvc.get(\"metadata\", {}).get(\"name\", \"unknown\")\n",
+    "        print(f\"\\n⚠️  PVC '{name}' is Pending\")\n",
+    "        print(\"   Common causes:\")\n",
+    "        print(\"   - EBS CSI driver not installed\")\n",
+    "        print(\"   - No StorageClass available\")\n",
+    "        print(\"   - Insufficient storage quota\")\n",
+    "\n",
+    "print(f\"\\n### PVC Summary\")\n",
+    "print(f\"Total PVCs: {total}\")\n",
+    "print(f\"Bound: {bound}\")\n",
+    "print(f\"Pending: {pending}\")\n",
+    "\n",
+    "if bound == total and total > 0:\n",
+    "    ok(f\"All {total} PVCs are bound\")\n",
+    "elif pending > 0:\n",
+    "    warn(f\"{pending} PVC(s) still pending - storage issue likely\")\n",
+    "    print(\"💡 Check EBS CSI driver and StorageClasses\")\n",
+    "else:\n",
+    "    ok(\"PVC status looks good\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Ingress Provisioning Check\n",
+    "\n",
+    "**Critical:** The AWS ALB (Application Load Balancer) must be provisioned. This is how external traffic reaches LangSmith.\n",
+    "\n",
+    "Common issue: ALB never appears due to wrong ingress assumptions.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check ingress resources\n",
+    "print(\"### Ingress Resources\\n\")\n",
+    "\n",
+    "# Get ingress\n",
+    "result = run(\n",
+    "    [\"kubectl\", \"get\", \"ingress\", \"-n\", namespace, \"-o\", \"json\"],\n",
+    "    check=False,  # May not exist yet\n",
+    "    stream=False\n",
+    ")\n",
+    "\n",
+    "if result.returncode == 0:\n",
+    "    ingress_data = json.loads(result.stdout)\n",
+    "    ingresses = ingress_data.get(\"items\", [])\n",
+    "    \n",
+    "    if ingresses:\n",
+    "        print(\"Ingress Details:\")\n",
+    "        print(\"=\" * 80)\n",
+    "        run([\"kubectl\", \"get\", \"ingress\", \"-n\", namespace, \"-o\", \"wide\"], check=True, stream=True)\n",
+    "        print(\"=\" * 80)\n",
+    "        \n",
+    "        for ingress in ingresses:\n",
+    "            name = ingress.get(\"metadata\", {}).get(\"name\", \"unknown\")\n",
+    "            status = ingress.get(\"status\", {})\n",
+    "            load_balancer = status.get(\"loadBalancer\", {})\n",
+    "            ingress_hosts = []\n",
+    "            \n",
+    "            # Get ingress hosts\n",
+    "            rules = ingress.get(\"spec\", {}).get(\"rules\", [])\n",
+    "            for rule in rules:\n",
+    "                host = rule.get(\"host\", \"\")\n",
+    "                if host:\n",
+    "                    ingress_hosts.append(host)\n",
+    "            \n",
+    "            print(f\"\\nIngress: {name}\")\n",
+    "            if ingress_hosts:\n",
+    "                print(f\"  Hosts: {', '.join(ingress_hosts)}\")\n",
+    "            \n",
+    "            # Check for ALB address\n",
+    "            if load_balancer.get(\"ingress\"):\n",
+    "                alb_addresses = [ing.get(\"hostname\", ing.get(\"ip\", \"\")) for ing in load_balancer[\"ingress\"]]\n",
+    "                if alb_addresses:\n",
+    "                    ok(f\"ALB provisioned: {', '.join(alb_addresses)}\")\n",
+    "                    print(f\"  💡 Access LangSmith at: https://{alb_addresses[0]}\")\n",
+    "                else:\n",
+    "                    warn(\"ALB ingress entry exists but no address found\")\n",
+    "            else:\n",
+    "                warn(\"ALB not yet provisioned (may take a few minutes)\")\n",
+    "                print(\"  💡 Wait a few minutes and check again\")\n",
+    "    else:\n",
+    "        warn(\"No ingress resources found\")\n",
+    "        print(\"💡 Ingress may not be configured in Helm values\")\n",
+    "else:\n",
+    "    warn(\"Could not retrieve ingress resources\")\n",
+    "    print(\"💡 Ingress may not exist yet or namespace is incorrect\")\n",
+    "\n",
+    "# Also check for ALB Ingress Controller\n",
+    "print(\"\\n### ALB Ingress Controller\\n\")\n",
+    "result = run(\n",
+    "    [\"kubectl\", \"get\", \"pods\", \"-n\", \"kube-system\", \"-l\", \"app.kubernetes.io/name=aws-load-balancer-controller\", \"-o\", \"json\"],\n",
+    "    check=False,\n",
+    "    stream=False\n",
+    ")\n",
+    "\n",
+    "if result.returncode == 0:\n",
+    "    controller_data = json.loads(result.stdout)\n",
+    "    controllers = controller_data.get(\"items\", [])\n",
+    "    if controllers:\n",
+    "        ok(f\"ALB Ingress Controller found ({len(controllers)} pod(s))\")\n",
+    "    else:\n",
+    "        warn(\"ALB Ingress Controller not found\")\n",
+    "        print(\"💡 ALB Ingress Controller must be installed for ingress to work\")\n",
+    "else:\n",
+    "    warn(\"Could not check ALB Ingress Controller status\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Endpoint Reachability Check\n",
+    "\n",
+    "Verify that services are accessible and responding. We'll check:\n",
+    "- Service endpoints\n",
+    "- Health check endpoints (if available)\n",
+    "- Internal service connectivity\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check services\n",
+    "print(\"### Service Endpoints\\n\")\n",
+    "\n",
+    "result = run(\n",
+    "    [\"kubectl\", \"get\", \"svc\", \"-n\", namespace, \"-o\", \"json\"],\n",
+    "    check=True,\n",
+    "    stream=False\n",
+    ")\n",
+    "services_data = json.loads(result.stdout)\n",
+    "\n",
+    "print(\"Services:\")\n",
+    "print(\"=\" * 80)\n",
+    "run([\"kubectl\", \"get\", \"svc\", \"-n\", namespace], check=True, stream=True)\n",
+    "print(\"=\" * 80)\n",
+    "\n",
+    "services = services_data.get(\"items\", [])\n",
+    "if services:\n",
+    "    ok(f\"Found {len(services)} service(s)\")\n",
+    "    \n",
+    "    # Check for LoadBalancer services\n",
+    "    lb_services = [svc for svc in services if svc.get(\"spec\", {}).get(\"type\") == \"LoadBalancer\"]\n",
+    "    if lb_services:\n",
+    "        print(f\"\\nLoadBalancer services: {len(lb_services)}\")\n",
+    "        for svc in lb_services:\n",
+    "            name = svc.get(\"metadata\", {}).get(\"name\", \"unknown\")\n",
+    "            status = svc.get(\"status\", {}).get(\"loadBalancer\", {})\n",
+    "            if status.get(\"ingress\"):\n",
+    "                lb_address = status[\"ingress\"][0].get(\"hostname\") or status[\"ingress\"][0].get(\"ip\")\n",
+    "                ok(f\"Service '{name}' has LoadBalancer: {lb_address}\")\n",
+    "            else:\n",
+    "                warn(f\"Service '{name}' LoadBalancer pending\")\n",
+    "    \n",
+    "    # Test internal connectivity (if we can exec into a pod)\n",
+    "    print(\"\\n### Testing Internal Service Connectivity\\n\")\n",
+    "    # Try to find a pod we can exec into\n",
+    "    result = run(\n",
+    "        [\"kubectl\", \"get\", \"pods\", \"-n\", namespace, \"-o\", \"jsonpath={.items[0].metadata.name}\"],\n",
+    "        check=False,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    \n",
+    "    if result.returncode == 0 and result.stdout.strip():\n",
+    "        test_pod = result.stdout.strip()\n",
+    "        print(f\"Testing connectivity from pod: {test_pod}\")\n",
+    "        # Try a simple DNS lookup or curl\n",
+    "        # This is a basic check - actual health endpoints depend on the application\n",
+    "        print(\"💡 Internal connectivity tests depend on application-specific health endpoints\")\n",
+    "else:\n",
+    "    warn(\"No services found\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Basic UI Availability Check\n",
+    "\n",
+    "**Final validation:** Can we actually access the LangSmith UI through the ingress?\n",
+    "\n",
+    "This is the ultimate test - if the UI loads, everything is working.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from urllib.parse import urlparse\n",
+    "\n",
+    "# Get ingress hostname\n",
+    "print(\"### UI Availability Check\\n\")\n",
+    "\n",
+    "result = run(\n",
+    "    [\"kubectl\", \"get\", \"ingress\", \"-n\", namespace, \"-o\", \"jsonpath={.items[0].status.loadBalancer.ingress[0].hostname}\"],\n",
+    "    check=False,\n",
+    "    stream=False\n",
+    ")\n",
+    "\n",
+    "if result.returncode == 0 and result.stdout.strip():\n",
+    "    ingress_host = result.stdout.strip()\n",
+    "    print(f\"Ingress hostname: {ingress_host}\")\n",
+    "    \n",
+    "    # Try to access the UI (HTTPS)\n",
+    "    ui_url = f\"https://{ingress_host}\"\n",
+    "    print(f\"\\nTesting UI availability at: {ui_url}\")\n",
+    "    print(\"(This may take a moment if ALB is still provisioning...)\\n\")\n",
+    "    \n",
+    "    try:\n",
+    "        # Use a short timeout and allow redirects\n",
+    "        response = requests.get(ui_url, timeout=10, allow_redirects=True, verify=False)\n",
+    "        if response.status_code == 200:\n",
+    "            ok(f\"UI is accessible! Status: {response.status_code}\")\n",
+    "            print(f\"💡 Open in browser: {ui_url}\")\n",
+    "        elif response.status_code in [301, 302, 307, 308]:\n",
+    "            ok(f\"UI redirects (status: {response.status_code}) - likely working\")\n",
+    "            print(f\"💡 Redirect location: {response.headers.get('Location', 'N/A')}\")\n",
+    "            print(f\"💡 Open in browser: {ui_url}\")\n",
+    "        else:\n",
+    "            warn(f\"UI returned status {response.status_code}\")\n",
+    "            print(\"💡 UI may still be starting or there may be a configuration issue\")\n",
+    "    except requests.exceptions.SSLError:\n",
+    "        # SSL errors might be expected if using self-signed certs\n",
+    "        warn(\"SSL verification failed (may be expected with self-signed certs)\")\n",
+    "        print(f\"💡 Try accessing: {ui_url}\")\n",
+    "        print(\"   Browser may show security warning - this is normal for self-signed certs\")\n",
+    "    except requests.exceptions.Timeout:\n",
+    "        warn(\"UI request timed out\")\n",
+    "        print(\"💡 ALB may still be provisioning, or ingress is not fully configured\")\n",
+    "        print(f\"   Try again in a few minutes: {ui_url}\")\n",
+    "    except requests.exceptions.ConnectionError as e:\n",
+    "        warn(f\"Could not connect to UI: {e}\")\n",
+    "        print(\"💡 ALB may still be provisioning\")\n",
+    "        print(f\"   Check AWS console for ALB status, then try: {ui_url}\")\n",
+    "    except Exception as e:\n",
+    "        warn(f\"Error accessing UI: {e}\")\n",
+    "        print(f\"💡 Manual check: Open {ui_url} in a browser\")\n",
+    "else:\n",
+    "    warn(\"Could not determine ingress hostname\")\n",
+    "    print(\"💡 Ingress may not be provisioned yet\")\n",
+    "    print(\"   Run the ingress check above and wait for ALB to be created\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Collecting Diagnostic Artifacts\n",
+    "\n",
+    "Save cluster state snapshots for future troubleshooting reference.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "\n",
+    "# Create diagnostic snapshot\n",
+    "print(\"### Collecting Diagnostic Artifacts\\n\")\n",
+    "\n",
+    "timestamp = datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
+    "diagnostics_dir = artifacts_dir / f\"diagnostics-{timestamp}\"\n",
+    "diagnostics_dir.mkdir(exist_ok=True)\n",
+    "\n",
+    "print(f\"Saving diagnostics to: {diagnostics_dir}\\n\")\n",
+    "\n",
+    "# Save various cluster states\n",
+    "diagnostics = [\n",
+    "    (\"pods\", [\"kubectl\", \"get\", \"pods\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
+    "    (\"services\", [\"kubectl\", \"get\", \"svc\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
+    "    (\"ingress\", [\"kubectl\", \"get\", \"ingress\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
+    "    (\"pvc\", [\"kubectl\", \"get\", \"pvc\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
+    "    (\"deployments\", [\"kubectl\", \"get\", \"deployments\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
+    "    (\"events\", [\"kubectl\", \"get\", \"events\", \"-n\", namespace, \"--sort-by=.lastTimestamp\"]),\n",
+    "]\n",
+    "\n",
+    "for name, cmd in diagnostics:\n",
+    "    try:\n",
+    "        result = run(cmd, check=False, stream=False)\n",
+    "        output_file = diagnostics_dir / f\"{name}.txt\"\n",
+    "        with open(output_file, \"w\") as f:\n",
+    "            f.write(result.stdout)\n",
+    "            if result.stderr:\n",
+    "                f.write(\"\\n\\nSTDERR:\\n\")\n",
+    "                f.write(result.stderr)\n",
+    "        print(f\"✅ Saved {name}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"⚠️  Could not save {name}: {e}\")\n",
+    "\n",
+    "ok(f\"Diagnostics saved to: {diagnostics_dir}\")\n",
+    "print(\"\\n💡 These artifacts can be used for troubleshooting or support tickets\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Go/No-Go Checklist\n",
+    "\n",
+    "Review this checklist. All items should be ✅ before considering the deployment ready.\n",
+    "\n",
+    "### ✅ Validation Checklist\n",
+    "\n",
+    "- [ ] All pods are running and ready\n",
+    "- [ ] All PVCs are bound\n",
+    "- [ ] Ingress/ALB is provisioned\n",
+    "- [ ] Services are accessible\n",
+    "- [ ] UI is reachable (or ALB is provisioning)\n",
+    "- [ ] Diagnostic artifacts collected\n",
+    "\n",
+    "### 🎯 Next Steps\n",
+    "\n",
+    "**If all checks pass:**\n",
+    "- ✅ You have a working baseline deployment\n",
+    "- ✅ You're on a supported path\n",
+    "- ✅ Ready to proceed to Module 2 (SSO/OIDC configuration)\n",
+    "\n",
+    "**If checks fail:**\n",
+    "- Review the warnings above\n",
+    "- Check diagnostic artifacts\n",
+    "- Common issues:\n",
+    "  - **PVCs pending:** EBS CSI driver not installed\n",
+    "  - **ALB not appearing:** Wrong ingress configuration\n",
+    "  - **Pods not ready:** Check events and logs\n",
+    "  - **UI not accessible:** Wait for ALB provisioning (can take 5-10 minutes)\n",
+    "\n",
+    "### 📋 Baseline Reference\n",
+    "\n",
+    "This validation checklist becomes your **baseline reference** for future troubleshooting. Save the diagnostic artifacts and refer back to this state when investigating issues.\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,406 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Module 1: Teardown & Cleanup\n",
+    "\n",
+    "## Overview\n",
+    "\n",
+    "This notebook helps you clean up the resources created during Module 1. This is important to:\n",
+    "- Avoid ongoing AWS costs\n",
+    "- Clean up test environments\n",
+    "- Practice proper resource lifecycle management\n",
+    "\n",
+    "### ⚠️ Warning\n",
+    "\n",
+    "This will **destroy** your LangSmith deployment and associated AWS resources. Only run this if you're sure you want to remove everything.\n",
+    "\n",
+    "**What will be destroyed:**\n",
+    "- Helm release (LangSmith application)\n",
+    "- Terraform-managed infrastructure (EKS, RDS, ElastiCache, S3, etc.)\n",
+    "- All associated data\n",
+    "\n",
+    "**Estimated time:** 30-45 minutes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bootstrap environment\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Add notebooks directory to path so we can import shared as a package\n",
+    "# Find the notebooks directory by looking for the shared folder\n",
+    "possible_paths = [\n",
+    "    Path.cwd().parent,  # If cwd is module-1, go up one level to notebooks\n",
+    "    Path.cwd(),  # If cwd is already notebooks\n",
+    "    Path.cwd() / \"notebooks\",  # If cwd is workspace root\n",
+    "]\n",
+    "\n",
+    "notebooks_path = None\n",
+    "for path in possible_paths:\n",
+    "    if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        notebooks_path = path\n",
+    "        break\n",
+    "\n",
+    "if not notebooks_path:\n",
+    "    notebooks_path = Path.cwd() / \"notebooks\"\n",
+    "    if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
+    "        raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
+    "\n",
+    "# Add notebooks directory to path so 'shared' can be imported as a package\n",
+    "if str(notebooks_path) not in sys.path:\n",
+    "    sys.path.insert(0, str(notebooks_path))\n",
+    "\n",
+    "from shared._bootstrap import bootstrap\n",
+    "\n",
+    "# Run bootstrap\n",
+    "bootstrap_info = bootstrap()\n",
+    "artifacts_dir = Path(bootstrap_info['artifacts_dir'])\n",
+    "print(f\"\\nArtifacts directory: {artifacts_dir}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Confirmation\n",
+    "\n",
+    "**⚠️ READ THIS CAREFULLY**\n",
+    "\n",
+    "Before proceeding, confirm:\n",
+    "1. ✅ You want to destroy all resources\n",
+    "2. ✅ You've backed up any important data\n",
+    "3. ✅ You understand this cannot be undone\n",
+    "4. ✅ You're using the correct AWS account/region\n",
+    "\n",
+    "**Double-check your AWS account and region before proceeding!**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from shared._validation import require_env\n",
+    "from shared._aws_helpers import aws_region, sts_identity\n",
+    "\n",
+    "# Show current AWS session\n",
+    "config = require_env(\"CLUSTER_NAME\", \"AWS_REGION\", \"NAMESPACE\", \"HELM_RELEASE\")\n",
+    "region = aws_region()\n",
+    "identity = sts_identity()\n",
+    "\n",
+    "print(\"### Current AWS Session\")\n",
+    "print(\"=\" * 60)\n",
+    "print(f\"Account ID: {identity['Account']}\")\n",
+    "print(f\"Region: {region}\")\n",
+    "print(f\"User ARN: {identity['Arn']}\")\n",
+    "print(\"=\" * 60)\n",
+    "\n",
+    "print(f\"\\n### Resources to be Destroyed\")\n",
+    "print(f\"Cluster: {config['CLUSTER_NAME']}\")\n",
+    "print(f\"Namespace: {config['NAMESPACE']}\")\n",
+    "print(f\"Helm Release: {config['HELM_RELEASE']}\")\n",
+    "print(\"=\" * 60)\n",
+    "\n",
+    "print(\"\\n⚠️  VERIFY THE ABOVE INFORMATION IS CORRECT!\")\n",
+    "print(\"💡 If this is the wrong account/region, STOP NOW and update your .env file\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 1: Uninstall Helm Release\n",
+    "\n",
+    "First, we'll uninstall the LangSmith Helm release. This removes the application but leaves the infrastructure.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from shared._shell import run\n",
+    "from shared._aws_helpers import aws_region\n",
+    "\n",
+    "cluster_name = config[\"CLUSTER_NAME\"]\n",
+    "namespace = config[\"NAMESPACE\"]\n",
+    "helm_release = config[\"HELM_RELEASE\"]\n",
+    "region = aws_region()\n",
+    "\n",
+    "# Ensure kubectl is configured\n",
+    "print(\"### Configuring kubectl\\n\")\n",
+    "run(\n",
+    "    [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
+    "    check=True,\n",
+    "    stream=False\n",
+    ")\n",
+    "\n",
+    "# Check if Helm release exists\n",
+    "print(f\"\\n### Checking Helm Release: {helm_release}\\n\")\n",
+    "result = run(\n",
+    "    [\"helm\", \"list\", \"-n\", namespace, \"-o\", \"json\"],\n",
+    "    check=False,\n",
+    "    stream=False\n",
+    ")\n",
+    "\n",
+    "import json\n",
+    "releases = json.loads(result.stdout) if result.returncode == 0 else []\n",
+    "langsmith_releases = [r for r in releases if r.get(\"name\") == helm_release]\n",
+    "\n",
+    "if langsmith_releases:\n",
+    "    release = langsmith_releases[0]\n",
+    "    print(f\"Found Helm release: {release['name']}\")\n",
+    "    print(f\"Status: {release['status']}\")\n",
+    "    print(f\"Chart: {release['chart']}\")\n",
+    "    \n",
+    "    print(f\"\\n⚠️  UNCOMMENT THE CODE BELOW TO UNINSTALL HELM RELEASE\")\n",
+    "    print(\"This will remove the LangSmith application from the cluster.\\n\")\n",
+    "    \n",
+    "    # UNCOMMENT TO UNINSTALL:\n",
+    "    # print(\"Uninstalling Helm release...\\n\")\n",
+    "    # result = run(\n",
+    "    #     [\"helm\", \"uninstall\", helm_release, \"-n\", namespace],\n",
+    "    #     check=True,\n",
+    "    #     stream=True\n",
+    "    # )\n",
+    "    # print(\"\\n✅ Helm release uninstalled\")\n",
+    "else:\n",
+    "    print(f\"Helm release '{helm_release}' not found\")\n",
+    "    print(\"💡 It may have already been uninstalled, or the namespace is different\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 2: Clean Up Kubernetes Resources\n",
+    "\n",
+    "Remove any remaining Kubernetes resources (secrets, PVCs, etc.) that might not be cleaned up by Helm.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check for remaining resources\n",
+    "print(\"### Checking for Remaining Kubernetes Resources\\n\")\n",
+    "\n",
+    "# List resources in namespace\n",
+    "resources_to_check = [\n",
+    "    (\"pods\", [\"kubectl\", \"get\", \"pods\", \"-n\", namespace]),\n",
+    "    (\"services\", [\"kubectl\", \"get\", \"svc\", \"-n\", namespace]),\n",
+    "    (\"secrets\", [\"kubectl\", \"get\", \"secrets\", \"-n\", namespace]),\n",
+    "    (\"pvc\", [\"kubectl\", \"get\", \"pvc\", \"-n\", namespace]),\n",
+    "]\n",
+    "\n",
+    "remaining = []\n",
+    "for resource_type, cmd in resources_to_check:\n",
+    "    result = run(cmd, check=False, stream=False)\n",
+    "    if result.returncode == 0:\n",
+    "        lines = result.stdout.strip().split('\\n')\n",
+    "        # Skip header line\n",
+    "        if len(lines) > 1:\n",
+    "            remaining.append(resource_type)\n",
+    "            print(f\"⚠️  Found {len(lines) - 1} {resource_type}(s)\")\n",
+    "\n",
+    "if remaining:\n",
+    "    print(f\"\\n💡 The following resource types still exist: {', '.join(remaining)}\")\n",
+    "    print(\"   You may want to clean these up manually:\")\n",
+    "    print(f\"   kubectl delete all --all -n {namespace}\")\n",
+    "    print(f\"   kubectl delete pvc --all -n {namespace}\")\n",
+    "    print(f\"   kubectl delete secrets --all -n {namespace}\")\n",
+    "else:\n",
+    "    print(\"✅ No remaining resources found (or namespace is empty)\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 3: Destroy Terraform Infrastructure\n",
+    "\n",
+    "**⚠️ CRITICAL:** This will destroy all AWS infrastructure including:\n",
+    "- EKS cluster\n",
+    "- RDS PostgreSQL database (and all data)\n",
+    "- ElastiCache Redis (and all data)\n",
+    "- S3 buckets (and all data)\n",
+    "- IAM roles and policies\n",
+    "- VPC resources (if managed by Terraform)\n",
+    "\n",
+    "**This cannot be undone!**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# An unset TERRAFORM_DIR must not silently resolve to the current directory,\n",
+    "# otherwise terraform would run against whatever folder the notebook started in.\n",
+    "terraform_dir_setting = (config.get(\"TERRAFORM_DIR\") or os.environ.get(\"TERRAFORM_DIR\", \"\")).strip()\n",
+    "terraform_dir = Path(terraform_dir_setting).expanduser().resolve() if terraform_dir_setting else None\n",
+    "\n",
+    "if terraform_dir is None:\n",
+    "    print(\"⚠️  TERRAFORM_DIR is not set\")\n",
+    "    print(\"💡 Update TERRAFORM_DIR in your .env file, or destroy infrastructure manually\")\n",
+    "elif not terraform_dir.is_dir():\n",
+    "    print(f\"⚠️  Terraform directory not found: {terraform_dir}\")\n",
+    "    print(\"💡 Update TERRAFORM_DIR in your .env file, or destroy infrastructure manually\")\n",
+    "else:\n",
+    "    print(f\"### Terraform Directory: {terraform_dir}\\n\")\n",
+    "    \n",
+    "    # Check Terraform state\n",
+    "    print(\"Checking Terraform state...\\n\")\n",
+    "    result = run(\n",
+    "        [\"terraform\", \"show\", \"-json\"],\n",
+    "        cwd=str(terraform_dir),\n",
+    "        check=False,\n",
+    "        stream=False\n",
+    "    )\n",
+    "    \n",
+    "    if result.returncode == 0:\n",
+    "        state_data = json.loads(result.stdout)\n",
+    "        if state_data.get(\"values\") and state_data[\"values\"].get(\"root_module\"):\n",
+    "            resources = state_data[\"values\"][\"root_module\"].get(\"resources\", [])\n",
+    "            print(f\"Found {len(resources)} resources in Terraform state\")\n",
+    "            print(\"⚠️  These will all be destroyed!\\n\")\n",
+    "        else:\n",
+    "            print(\"Terraform state appears empty or not initialized\")\n",
+    "    else:\n",
+    "        print(\"Could not read Terraform state\")\n",
+    "        print(\"💡 Terraform may not be initialized, or state file doesn't exist\")\n",
+    "    \n",
+    "    print(\"⚠️  UNCOMMENT THE CODE BELOW TO DESTROY TERRAFORM INFRASTRUCTURE\")\n",
+    "    print(\"This will destroy ALL resources managed by Terraform.\\n\")\n",
+    "    \n",
+    "    # UNCOMMENT TO DESTROY:\n",
+    "    # print(\"Destroying Terraform infrastructure...\")\n",
+    "    # print(\"This will take 15-30 minutes...\\n\")\n",
+    "    # \n",
+    "    # result = run(\n",
+    "    #     [\"terraform\", \"destroy\", \"-auto-approve\"],\n",
+    "    #     cwd=str(terraform_dir),\n",
+    "    #     check=False,  # Don't fail on errors, we'll check return code\n",
+    "    #     stream=True\n",
+    "    # )\n",
+    "    # \n",
+    "    # # Save destroy output\n",
+    "    # destroy_file = artifacts_dir / \"terraform-destroy.txt\"\n",
+    "    # with open(destroy_file, \"w\") as f:\n",
+    "    #     f.write(result.stdout)\n",
+    "    #     if result.stderr:\n",
+    "    #         f.write(\"\\n\\nSTDERR:\\n\")\n",
+    "    #         f.write(result.stderr)\n",
+    "    # \n",
+    "    # if result.returncode == 0:\n",
+    "    #     print(\"\\n✅ Terraform destroy completed successfully\")\n",
+    "    #     print(f\"💡 Destroy output saved to: {destroy_file}\")\n",
+    "    # else:\n",
+    "    #     print(f\"\\n⚠️  Terraform destroy had issues (rc={result.returncode})\")\n",
+    "    #     print(\"💡 Review the output above for errors\")\n",
+    "    #     print(f\"   Destroy output saved to: {destroy_file}\")\n",
+    "    \n",
+    "    print(\"💡 To destroy, edit this cell and uncomment the code above\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Step 4: Verify Cleanup\n",
+    "\n",
+    "After teardown, verify that resources have been removed.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from shared._aws_helpers import eks_cluster_exists\n",
+    "from shared._validation import ok, warn\n",
+    "\n",
+    "print(\"### Verifying Cleanup\\n\")\n",
+    "\n",
+    "# Check if cluster still exists\n",
+    "cluster_name = config[\"CLUSTER_NAME\"]\n",
+    "region = aws_region()\n",
+    "\n",
+    "if eks_cluster_exists(cluster_name):\n",
+    "    warn(f\"Cluster '{cluster_name}' still exists\")\n",
+    "    print(\"💡 Terraform destroy may not have completed, or cluster was created outside Terraform\")\n",
+    "else:\n",
+    "    ok(f\"Cluster '{cluster_name}' does not exist (destroyed or never created)\")\n",
+    "\n",
+    "# Check for remaining S3 buckets (if we know the bucket name)\n",
+    "print(\"\\n### S3 Buckets\\n\")\n",
+    "print(\"💡 Check AWS console for any remaining S3 buckets\")\n",
+    "print(\"   Terraform should have destroyed buckets it created, but verify manually\")\n",
+    "\n",
+    "# Check for remaining RDS instances\n",
+    "print(\"\\n### RDS Instances\\n\")\n",
+    "print(\"💡 Check AWS console for any remaining RDS instances\")\n",
+    "print(\"   Terraform should have destroyed RDS instances it created\")\n",
+    "\n",
+    "# Check for remaining ElastiCache clusters\n",
+    "print(\"\\n### ElastiCache Clusters\\n\")\n",
+    "print(\"💡 Check AWS console for any remaining ElastiCache clusters\")\n",
+    "print(\"   Terraform should have destroyed ElastiCache clusters it created\")\n",
+    "\n",
+    "print(\"\\n✅ Cleanup verification complete\")\n",
+    "print(\"💡 Review AWS console to ensure all resources are removed\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "### ✅ Teardown Checklist\n",
+    "\n",
+    "- [ ] Helm release uninstalled\n",
+    "- [ ] Kubernetes resources cleaned up\n",
+    "- [ ] Terraform infrastructure destroyed\n",
+    "- [ ] EKS cluster removed\n",
+    "- [ ] RDS instance removed\n",
+    "- [ ] ElastiCache cluster removed\n",
+    "- [ ] S3 buckets removed (or emptied)\n",
+    "- [ ] AWS console verified (no remaining resources)\n",
+    "\n",
+    "### 💡 Important Notes\n",
+    "\n",
+    "1. **Data Loss:** If you ran the destroy steps above, all data in RDS, ElastiCache, and S3 has been permanently deleted\n",
+    "2. **Costs:** You should see AWS costs stop accruing within 24 hours\n",
+    "3. **Artifacts:** Diagnostic artifacts in `artifacts/` directory are preserved for reference\n",
+    "4. **Re-deployment:** You can re-run Module 1 notebooks to create a fresh deployment\n",
+    "\n",
+    "### 🎯 Next Steps\n",
+    "\n",
+    "If you want to start over:\n",
+    "1. Review and update your `.env` file\n",
+    "2. Run `01_aws_preflight.ipynb` again\n",
+    "3. Proceed through the module notebooks\n",
+    "\n",
+    "**Thank you for completing Module 1!**\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,39 @@
+from __future__ import annotations
+import os
+from typing import Optional
+from ._shell import run
+from ._validation import ok, warn
+
+def aws_region() -> str:
+    """Return the AWS region to use for CLI calls.
+
+    ``AWS_REGION`` is consulted first, then ``AWS_DEFAULT_REGION``. Values are
+    whitespace-stripped and blank values are ignored. Falls back to
+    ``"us-west-2"`` when neither variable yields a non-empty value.
+    """
+    for var_name in ("AWS_REGION", "AWS_DEFAULT_REGION"):
+        candidate = os.environ.get(var_name, "").strip()
+        if candidate:
+            return candidate
+    return "us-west-2"
+
+def sts_identity() -> dict:
+    """Return the parsed JSON output of ``aws sts get-caller-identity``.
+
+    The command is run with ``check=True``; how a failing command surfaces is
+    decided by ``run`` (imported from ``_shell``).
+    """
+    import json
+
+    identity_cmd = ["aws", "sts", "get-caller-identity", "--output", "json"]
+    completed = run(identity_cmd, check=True, stream=False)
+    return json.loads(completed.stdout)
+
+def assert_account(expected_account_id: Optional[str]) -> None:
+    """Guardrail: verify the active AWS credentials belong to the expected account.
+
+    A falsy ``expected_account_id`` (``None`` or ``""``) disables the check and
+    returns without calling AWS.
+
+    Raises:
+        RuntimeError: if the caller identity's account differs from the expected one.
+    """
+    if expected_account_id:
+        actual = sts_identity().get("Account", "")
+        if actual != expected_account_id:
+            raise RuntimeError(f"❌ AWS account mismatch: expected {expected_account_id}, got {actual}")
+        ok(f"AWS account guardrail matched: {actual}")
+
+def eks_cluster_exists(cluster_name: str) -> bool:
+    """Report whether ``aws eks describe-cluster`` can find the named cluster.
+
+    Returns ``True`` when the command exits 0. Returns ``False`` when the
+    output mentions ``ResourceNotFoundException`` and also, after emitting a
+    warning, for any other failure, so ``False`` can mean "inconclusive".
+    """
+    describe_cmd = [
+        "aws", "eks", "describe-cluster",
+        "--name", cluster_name,
+        "--region", aws_region(),
+        "--output", "json",
+    ]
+    outcome = run(describe_cmd, check=False, stream=False)
+    if outcome.returncode == 0:
+        return True
+    not_found = any("ResourceNotFoundException" in text for text in (outcome.stderr, outcome.stdout))
+    if not not_found:
+        warn("EKS describe-cluster returned an unexpected error; treat as inconclusive.")
+    return False
+
+def alb_target_health(load_balancer_arn: str) -> str:
+    """Return the raw stdout of ``aws elbv2 describe-target-health``.
+
+    NOTE: despite the parameter name, the value is passed to the CLI as
+    ``--target-group-arn``, so callers should supply a target group ARN.
+    The command runs with ``check=False`` and its JSON text is returned
+    unparsed for the caller to inspect.
+    """
+    health_cmd = [
+        "aws", "elbv2", "describe-target-health",
+        "--target-group-arn", load_balancer_arn,
+        "--region", aws_region(),
+        "--output", "json",
+    ]
+    outcome = run(health_cmd, check=False, stream=False)
+    return outcome.stdout
@@ -0,0 +1,182 @@
+from __future__ import annotations
+import os
+import sys
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+# Install required packages if not available
+def _ensure_packages():
+    """Install the workshop's Python dependencies that are not importable yet.
+
+    Each requirement is probed with ``importlib.util.find_spec`` first, so an
+    environment that already has everything skips ``pip`` entirely (no network
+    access and no delay every time this module is imported). Only the missing
+    distributions are installed, using the running interpreter's ``pip``.
+
+    Raises:
+        subprocess.CalledProcessError: if ``pip install`` exits non-zero.
+    """
+    import importlib
+    import importlib.util
+
+    # pip distribution name -> top-level module name used to detect it
+    required_packages = {
+        "python-dotenv": "dotenv",  # For loading .env files
+        "pyyaml": "yaml",  # For parsing YAML files (Chart.yaml, etc.)
+        "requests": "requests",  # For HTTP requests (UI validation)
+    }
+    missing = [
+        dist_name
+        for dist_name, module_name in required_packages.items()
+        if importlib.util.find_spec(module_name) is None
+    ]
+    if not missing:
+        return
+
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *missing])
+    # Let the import system notice the freshly installed packages.
+    importlib.invalidate_caches()
+
+_ensure_packages()
+
+from dotenv import load_dotenv
+from ._shell import run
+from ._validation import ok, warn, fail
+from ._aws_helpers import aws_region, sts_identity
+
+def load_env(env_file: Optional[str] = None) -> None:
+    """
+    Populate os.environ from a dotenv-style file.
+
+    With no env_file argument, the notebooks directory (two levels above this
+    module) is searched for ``.env`` and, if that is absent, ``workshop.env``.
+    Variables already present in the environment are kept (``override=False``).
+
+    Raises RuntimeError, after printing setup instructions, when the resolved
+    file does not exist.
+    """
+    notebooks_dir = Path(__file__).parent.parent
+
+    if env_file is None:
+        # Prefer .env (standard); workshop.env is the Jupyter-friendly fallback
+        standard_env = notebooks_dir / ".env"
+        env_file = standard_env if standard_env.exists() else notebooks_dir / "workshop.env"
+
+    env_path = Path(env_file).expanduser().resolve()
+
+    if env_path.exists():
+        load_dotenv(env_path, override=False)  # Don't override existing env vars
+        ok(f"Loaded environment variables from {env_path.name}")
+        return
+
+    # Show the notebooks directory relative to the repo root for a cleaner message
+    repo_root = notebooks_dir.parent
+    try:
+        notebooks_dir_display = notebooks_dir.relative_to(repo_root)
+    except ValueError:
+        notebooks_dir_display = Path("notebooks")
+
+    print(f"""❌ Environment file not found
+💡 To fix this, create one of these files:
+   Option 1 (via terminal): {notebooks_dir_display}/.env
+   Option 2 (via Jupyter): {notebooks_dir_display}/workshop.env
+
+   Copy a template:
+   cp env-samples/workshop.env.example {notebooks_dir_display}/.env
+   # OR
+   cp env-samples/workshop.env.example {notebooks_dir_display}/workshop.env
+
+   Then edit the file and fill in your configuration values.
+""")
+    raise RuntimeError(f"Missing environment file. Expected {notebooks_dir_display}/.env or {notebooks_dir_display}/workshop.env")
+
+def check_required_tools() -> None:
+    """
+    Verify that the CLI tools the workshop relies on can be executed:
+    - aws cli
+    - terraform
+    - helm
+    - kubectl
+    - jq
+
+    Every tool is probed before the outcome is reported, so a single run lists
+    all problem tools; ``fail`` is then called once with the complete list.
+    """
+    print("### Checking required tools...")
+    tools = [
+        ("aws", ["aws", "--version"]),
+        ("terraform", ["terraform", "version"]),
+        ("helm", ["helm", "version"]),
+        ("kubectl", ["kubectl", "version", "--client"]),
+        ("jq", ["jq", "--version"]),
+    ]
+
+    missing = []
+    for tool_name, version_cmd in tools:
+        try:
+            result = run(version_cmd, check=False, stream=False)
+        except FileNotFoundError:
+            # Only warn here: calling fail() inside the loop could abort
+            # before the remaining tools have been probed.
+            problem = f"{tool_name} not found in PATH"
+        except Exception as e:
+            problem = f"Error checking {tool_name}: {e}"
+        else:
+            if result.returncode == 0:
+                ok(f"{tool_name} is available")
+                continue
+            problem = f"{tool_name} check failed (rc={result.returncode})"
+        missing.append(tool_name)
+        warn(problem)
+
+    if missing:
+        fail(f"Missing required tools: {', '.join(missing)}")
+        # fail() is defined elsewhere and may not raise; never report success
+        # after a failure.
+        return
+
+    ok("All required tools are available")
+
+def print_aws_info() -> None:
+    """
+    Print the resolved AWS region followed by the STS caller identity
+    (account id, ARN, user id; "unknown" for any missing key).
+
+    Any exception raised while gathering the information is reported via ``fail``.
+    """
+    print("### AWS Configuration")
+    try:
+        print(f"Region: {aws_region()}")
+
+        identity = sts_identity()
+        labelled_keys = (("Account ID", "Account"), ("User ARN", "Arn"), ("User ID", "UserId"))
+        details = [(label, identity.get(key, "unknown")) for label, key in labelled_keys]
+        for label, value in details:
+            print(f"{label}: {value}")
+        ok("AWS credentials are valid")
+    except Exception as e:
+        fail(f"Failed to get AWS identity: {e}")
+
+def setup_artifacts_dir(artifacts_dir: Optional[str] = None) -> Path:
+    """
+    Ensure the artifacts directory exists and return its absolute Path.
+
+    When no directory is passed, ``ARTIFACTS_DIR`` from the environment is used,
+    defaulting to ``./artifacts``. The resolved path is written back to
+    ``os.environ["ARTIFACTS_DIR"]`` so later code sees the same location.
+    """
+    chosen = artifacts_dir if artifacts_dir is not None else os.environ.get("ARTIFACTS_DIR", "./artifacts")
+
+    artifacts_path = Path(chosen).expanduser().resolve()
+    artifacts_path.mkdir(parents=True, exist_ok=True)
+    ok(f"Artifacts directory ready: {artifacts_path}")
+
+    # Publish the resolved location for other notebooks
+    os.environ["ARTIFACTS_DIR"] = str(artifacts_path)
+    return artifacts_path
+
+def bootstrap(env_file: Optional[str] = None, artifacts_dir: Optional[str] = None) -> dict:
+    """
+    Prepare the workshop environment, in this order:
+    1. load environment variables from the env file
+    2. check that the required tools exist
+    3. print the AWS identity and region
+    4. create the artifacts directory
+
+    Returns a dict with the keys "artifacts_dir", "aws_region" and "aws_identity".
+    """
+    banner = "=" * 60
+    print(banner)
+    print("Bootstrapping workshop environment...")
+    print(banner)
+
+    load_env(env_file)
+    check_required_tools()
+    print_aws_info()
+    artifacts_path = setup_artifacts_dir(artifacts_dir)
+
+    print(banner)
+    ok("Bootstrap complete!")
+    print(banner)
+
+    summary = {"artifacts_dir": str(artifacts_path)}
+    summary["aws_region"] = aws_region()
+    summary["aws_identity"] = sts_identity()
+    return summary
+
@@ -0,0 +1,31 @@
+from __future__ import annotations
+from typing import Optional
+from ._shell import run
+from ._validation import ok, warn, fail
+
+def kubectl(*args: str, namespace: Optional[str] = None, check: bool = True, stream: bool = True):
+    """
+    Invoke ``kubectl`` with the given arguments through ``run``.
+
+    A truthy ``namespace`` is passed as ``-n <namespace>`` ahead of ``args``.
+    ``check`` and ``stream`` are forwarded to ``run`` unchanged, and whatever
+    ``run`` returns is returned to the caller.
+    """
+    ns_flag = ["-n", namespace] if namespace else []
+    return run(["kubectl", *ns_flag, *args], check=check, stream=stream)
+
+def namespace_exists(ns: str) -> bool:
+    """Return True when ``kubectl get namespace <ns>`` exits with status 0."""
+    # check=False: a non-zero exit is the "not found" answer, not an error.
+    exit_status = kubectl("get", "namespace", ns, check=False, stream=False).returncode
+    return exit_status == 0
+
+def require_namespace(ns: str) -> None:
+    """
+    Confirm that the Kubernetes namespace ``ns`` exists.
+
+    Reports success through ``ok`` when it does; otherwise calls ``fail``,
+    which raises.
+    """
+    if namespace_exists(ns):
+        ok(f"Namespace exists: {ns}")
+        return
+    fail(f"Kubernetes namespace '{ns}' does not exist (did you deploy yet?)")
+
+def get_pods(ns: str) -> str:
+    """Return the captured stdout of ``kubectl get pods -o wide`` for namespace ``ns``."""
+    listing = kubectl("get", "pods", "-o", "wide", namespace=ns, stream=False)
+    return listing.stdout
+
+def wait_for_deployments_ready(ns: str, timeout: str = "10m") -> None:
+    """
+    Run ``kubectl wait`` for every deployment in ``ns`` to become available.
+
+    ``timeout`` is passed through as ``--timeout=<timeout>``. A non-zero exit
+    from kubectl only produces a warning; nothing is raised here.
+    """
+    wait_args = (
+        "wait",
+        "--for=condition=available",
+        "deployment",
+        "--all",
+        f"--timeout={timeout}",
+    )
+    outcome = kubectl(*wait_args, namespace=ns, check=False, stream=True)
+    if outcome.returncode == 0:
+        ok("All deployments available.")
+    else:
+        warn("Not all deployments became ready within timeout. Check pods/events.")
@@ -0,0 +1,102 @@
+from __future__ import annotations
+import os
+import shlex
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Sequence, Union
+
+@dataclass
+class CmdResult:
+    """Outcome of one command executed by ``run``."""
+    # The command as a single display string (shell-quoted when built from a list).
+    cmd: str
+    # Exit status reported by the child process.
+    returncode: int
+    # Everything the child wrote to standard output, as text.
+    stdout: str
+    # Everything the child wrote to standard error, as text.
+    stderr: str
+
+def _env_for_subprocess(extra_env: Optional[dict] = None) -> dict:
+    """
+    Build the environment mapping handed to child processes.
+
+    Starts from a copy of ``os.environ`` and then:
+    - prepends common Homebrew / local tool directories to ``PATH`` when they
+      exist on disk and are not already listed (also when ``PATH`` is empty
+      or unset), keeping the order in which they are declared below;
+    - drops ``AWS_PROFILE`` when it is blank or whitespace-only;
+    - applies ``extra_env`` last, so its entries override everything else.
+
+    Returns a new dict; ``os.environ`` itself is not modified.
+    """
+    env = os.environ.copy()
+
+    # Jupyter kernels often don't inherit the full PATH of the launching shell,
+    # so tools installed via Homebrew may be invisible to subprocess calls.
+    candidate_dirs = [
+        "/opt/homebrew/bin",  # Apple Silicon Macs
+        "/usr/local/bin",     # Intel Macs / Linux
+        "/opt/homebrew/sbin",
+        "/usr/local/sbin",
+    ]
+
+    current_path = env.get("PATH", "")
+    path_parts = current_path.split(os.pathsep) if current_path else []
+
+    # Collect first, then prepend as one group so the declared priority order
+    # is preserved (inserting one by one at index 0 would reverse it).
+    missing = [d for d in candidate_dirs if d not in path_parts and Path(d).exists()]
+    if missing:
+        env["PATH"] = os.pathsep.join(missing + path_parts)
+
+    # A blank AWS_PROFILE is removed rather than passed on; a non-blank one is
+    # left as-is for the AWS CLI / SDK to pick up.
+    if not env.get("AWS_PROFILE", "").strip():
+        env.pop("AWS_PROFILE", None)
+
+    if extra_env:
+        env.update(extra_env)
+    return env
+
+def run(
+    cmd: Union[str, Sequence[str]],
+    cwd: Optional[str] = None,
+    check: bool = True,
+    stream: bool = True,
+    extra_env: Optional[dict] = None,
+) -> CmdResult:
+    """
+    Run a command and capture its output, optionally echoing it as it runs.
+
+    Args:
+        cmd: A list/tuple of arguments (executed directly, no shell) or a
+            string (executed through the shell).
+        cwd: Working directory for the child process, if any.
+        check: When True, a non-zero exit status raises RuntimeError.
+        stream: When True, stdout lines are printed as they arrive and the
+            captured stderr lines are printed once stdout is exhausted; both
+            streams are still captured in the result. When False, nothing is
+            printed.
+        extra_env: Extra environment variables layered over the environment
+            produced by ``_env_for_subprocess``.
+
+    Returns:
+        CmdResult with the display form of the command, the exit status and
+        the captured stdout/stderr text.
+
+    Raises:
+        RuntimeError: if ``check`` is True and the command exits non-zero.
+    """
+    import threading  # local import: only the streaming path needs it
+
+    if isinstance(cmd, (list, tuple)):
+        cmd_str = " ".join(shlex.quote(str(c)) for c in cmd)
+        popen_args = list(cmd)
+        shell = False
+    else:
+        cmd_str = cmd
+        popen_args = cmd
+        shell = True
+
+    env = _env_for_subprocess(extra_env)
+
+    proc = subprocess.Popen(
+        popen_args,
+        cwd=cwd,
+        shell=shell,
+        env=env,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        bufsize=1,  # line-buffered, so streamed output appears line by line
+    )
+
+    out_lines, err_lines = [], []
+    assert proc.stdout and proc.stderr  # always set when PIPE is requested
+
+    if stream:
+        # Drain stderr concurrently. Reading stdout to EOF before touching
+        # stderr can deadlock: a child that fills the stderr pipe buffer blocks
+        # on write while this process is still blocked reading stdout.
+        stderr_pipe = proc.stderr
+
+        def _drain_stderr() -> None:
+            err_lines.extend(stderr_pipe)
+
+        drainer = threading.Thread(target=_drain_stderr, daemon=True)
+        drainer.start()
+
+        for line in proc.stdout:
+            print(line, end="")
+            out_lines.append(line)
+
+        drainer.join()
+        # stderr is echoed after stdout, in the order it was produced.
+        for line in err_lines:
+            print(line, end="")
+
+        proc.stdout.close()
+        proc.stderr.close()
+    else:
+        stdout, stderr = proc.communicate()
+        out_lines.append(stdout or "")
+        err_lines.append(stderr or "")
+
+    rc = proc.wait()
+    result = CmdResult(cmd=cmd_str, returncode=rc, stdout="".join(out_lines), stderr="".join(err_lines))
+
+    if check and rc != 0:
+        raise RuntimeError(f"Command failed (rc={rc}): {cmd_str}\n\nSTDERR:\n{result.stderr}")
+
+    return result
@@ -0,0 +1,39 @@
+from __future__ import annotations
+import os
+
+def ok(msg: str) -> None:
+    """Print ``msg`` as a success line, prefixed with a check-mark emoji."""
+    line = "✅ {}".format(msg)
+    print(line)
+
+def warn(msg: str) -> None:
+    """Print ``msg`` as a warning line, prefixed with a warning emoji."""
+    line = "⚠️  {}".format(msg)
+    print(line)
+
+def fail(msg: str) -> None:
+    """
+    Abort by raising RuntimeError carrying ``msg`` prefixed with a cross emoji.
+
+    This function never returns normally.
+    """
+    message = "❌ {}".format(msg)
+    raise RuntimeError(message)
+
+def require_env(*keys: str) -> dict:
+    """
+    Read the named environment variables and insist that each is non-blank.
+
+    Values are whitespace-stripped; an unset variable counts as empty. If any
+    are empty, ``fail`` is called with the list of missing names. Otherwise a
+    dict mapping each requested name to its stripped value is returned.
+    """
+    cfg = {name: os.environ.get(name, "").strip() for name in keys}
+    missing = [name for name in keys if not cfg[name]]
+    if missing:
+        fail(f"Missing required environment variables: {', '.join(missing)}")
+    return cfg
+
+def redact(value: str, keep: int = 4) -> str:
+    """
+    Mask ``value`` for display.
+
+    - Empty value: returns "".
+    - Value no longer than ``keep``: returns one "*" per character.
+    - Otherwise: returns the first ``keep`` characters followed by an
+      ellipsis and a fixed run of eight asterisks in parentheses.
+    """
+    if not value:
+        return ""
+
+    size = len(value)
+    if size > keep:
+        return "{}…({})".format(value[:keep], "*" * 8)
+    return "*" * size
+
+def print_config(config: dict, redact_keys: set[str] | None = None) -> None:
+    """
+    Print ``config`` as a bulleted list under a "Config (redacted)" heading.
+
+    Values whose key appears in ``redact_keys`` are converted to str and
+    passed through ``redact`` before printing; all other values are printed
+    as they are. ``redact_keys`` of None (or empty) means nothing is masked.
+    """
+    hidden = redact_keys or set()
+    print("### Config (redacted)")
+    for key, value in config.items():
+        shown = redact(str(value)) if key in hidden else value
+        print(f"- {key}: {shown}")