mirror of
https://github.com/langchain-ai/langsmith-self-hosted-workshops.git
synced 2026-07-01 20:44:14 -04:00
feat: Add Module 1 notebooks and shared infrastructure for LangSmith self-hosted workshops
This commit introduces the foundational infrastructure for running LangSmith
self-hosted deployment workshops using Jupyter notebooks.
- Add `notebooks/shared/_bootstrap.py`: Centralized bootstrap logic that:
  - Loads environment variables from `.env` or `workshop.env` files
  - Validates required tools (aws, terraform, helm, kubectl, jq)
  - Prints AWS identity and region information
  - Creates artifacts directory for notebook outputs
  - Automatically installs required Python packages (python-dotenv, pyyaml, requests)
- Add `notebooks/shared/_shell.py`: Shell command execution utilities with:
  - Homebrew path resolution for macOS (fixes PATH issues for subprocess calls)
  - AWS_PROFILE handling
  - Streaming and non-streaming command execution
- Add `notebooks/shared/_validation.py`: Validation helpers for environment
variables and configuration
- Add `notebooks/shared/_aws_helpers.py`: AWS-specific helper functions
- Add `notebooks/shared/_k8s_helpers.py`: Kubernetes helper functions
Create complete set of Module 1 notebooks following the workshop curriculum:
- `01_aws_preflight.ipynb`: Pre-deployment environment validation
  - Tool validation
  - AWS credentials and region checks
  - Cluster capacity expectations
  - Storage prerequisites (EBS CSI, StorageClasses)
  - S3 blob storage verification
  - Terraform and Helm repository path validation
- `02_terraform_apply.ipynb`: Infrastructure provisioning
  - Terraform module discovery and validation
  - Version pinning verification
  - Remote state configuration
  - Terraform initialization
  - Plan creation with environment variable support
  - Infrastructure application (commented by default)
  - Output capture for Helm deployment
- `03_helm_install_langsmith.ipynb`: LangSmith installation
  - Helm chart discovery and validation
  - Chart version pinning
  - Terraform outputs loading
  - Values file management
  - Kubernetes secrets creation
  - Template rendering before install
  - Helm installation (commented by default)
- `04_validate_ingress_and_ui.ipynb`: Deployment validation
  - Pod readiness checks
  - PVC binding verification
  - Ingress provisioning
  - Endpoint reachability
  - UI availability
  - Diagnostic artifact collection
- `99_teardown.ipynb`: Cleanup procedures
  - Helm uninstall
  - Kubernetes resource cleanup
  - Terraform destroy
  - Verification steps
- Add `.gitignore`: Comprehensive ignore patterns for Python, Jupyter,
environment files, artifacts, and infrastructure tool outputs
- Add `env-samples/workshop.env.example`: Template environment file with:
  - Workshop configuration variables
  - AWS settings
  - Terraform and Helm repository paths
  - PostgreSQL credentials (POSTGRES_USERNAME, POSTGRES_PASSWORD)
  - Helm configuration
- Add additional example env files for AWS, OIDC, and Module 3
- Environment variable expansion: Supports `$VAR` and `${VAR}` syntax in paths
(e.g., `$TERRAFORM_REPO_DIR/aws/langsmith`)
- Robust path resolution: Handles different Jupyter working directories and
automatically finds the notebooks/shared directory
- Error handling: Clear error messages with actionable instructions when
required tools, directories, or environment variables are missing
- Terraform variable passing: Automatically reads POSTGRES_USERNAME and
POSTGRES_PASSWORD from environment and passes them to Terraform commands
- Clone instructions: Helpful guidance when Terraform or Helm repositories
are not found
- Artifact management: Centralized artifacts directory for saving outputs,
plans, and diagnostic information
All notebooks follow best practices:
- Use official repositories (no forking)
- Pin versions for reproducibility
- Plan before applying
- Render templates before installing
- Validate before proceeding
This establishes a solid foundation for the workshop series, ensuring
participants start from a supported baseline configuration.
This commit is contained in:
+160
@@ -0,0 +1,160 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
*.ipynb_checkpoints/
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
Pipfile.lock
|
||||
|
||||
# PEP 582
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
*.env
|
||||
!*.env.example
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# IDEs and editors
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
.DS_Store
|
||||
*.sublime-project
|
||||
*.sublime-workspace
|
||||
|
||||
# Project-specific
|
||||
artifacts/
|
||||
*.log
|
||||
*.tmp
|
||||
|
||||
# Terraform
|
||||
.terraform/
|
||||
*.tfstate
|
||||
*.tfstate.*
|
||||
.terraform.lock.hcl
|
||||
terraform.tfvars
|
||||
!terraform.tfvars.example
|
||||
|
||||
# Kubernetes
|
||||
kubeconfig
|
||||
*.kubeconfig
|
||||
|
||||
# AWS
|
||||
.aws/
|
||||
|
||||
# Helm
|
||||
*.tgz
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
# ===== AWS / DNS / TLS =====
|
||||
# Copy to env/aws.env and source it.
|
||||
|
||||
# Optional explicit AWS account guardrail
|
||||
AWS_ACCOUNT_ID=""
|
||||
|
||||
# DNS name for LangSmith (optional in early modules)
|
||||
DOMAIN="langsmith.example.com"
|
||||
|
||||
# Route53 hosted zone (optional; if unset, notebooks attempt to infer from DOMAIN)
|
||||
ROUTE53_HOSTED_ZONE_ID=""
|
||||
|
||||
# ACM certificate (optional; if unset, notebooks can check for a matching cert)
|
||||
ACM_CERT_ARN=""
|
||||
|
||||
# Network flags
|
||||
PRIVATE_CLUSTER="false" # if true, validation should avoid public endpoints
|
||||
|
||||
# Cost-control / safety
|
||||
AUTO_TEARDOWN="false"
|
||||
TEARDOWN_CONFIRMATION_PHRASE="DELETE" # must match in teardown notebook
|
||||
@@ -0,0 +1,12 @@
|
||||
# ===== Module 3 (optional load/capacity) =====
|
||||
|
||||
# If you generate synthetic traces / load
|
||||
LOAD_TEST_ENABLED="false"
|
||||
LOAD_TEST_RPS="5"
|
||||
LOAD_TEST_DURATION_SECONDS="120"
|
||||
|
||||
# Namespace/service identifiers for metrics lookups
|
||||
LANGSMITH_SERVICE_NAME="langsmith"
|
||||
CLICKHOUSE_SERVICE_NAME="clickhouse"
|
||||
REDIS_SERVICE_NAME="redis"
|
||||
POSTGRES_ENDPOINT=""
|
||||
@@ -0,0 +1,37 @@
|
||||
# ===== Workshop / Notebook Defaults =====
|
||||
# Copy to env/workshop.env and source it: source env/workshop.env
|
||||
|
||||
# General
|
||||
WORKSHOP_NAME="langsmith-self-hosted-operator"
|
||||
NAMESPACE="langsmith"
|
||||
|
||||
# Prefer AWS_PROFILE if you use named profiles. Otherwise rely on default creds.
|
||||
AWS_PROFILE=""
|
||||
|
||||
# Region (must match where you deploy infra)
|
||||
AWS_REGION="us-west-2"
|
||||
|
||||
# Naming (used by notebooks for display + validation)
|
||||
CLUSTER_NAME="langsmith-workshop"
|
||||
|
||||
# Local repo paths (absolute is safest)
|
||||
TERRAFORM_REPO_DIR="$HOME/src/langchain-ai/terraform"
|
||||
HELM_REPO_DIR="$HOME/src/langchain-ai/helm"
|
||||
|
||||
# Where in the terraform repo the AWS self-hosted module lives (adjust as needed)
|
||||
TERRAFORM_DIR="$TERRAFORM_REPO_DIR/aws/langsmith" # <-- update to real path you standardize on
|
||||
|
||||
# Helm release + chart reference (chart reference can be local path or OCI/ref)
|
||||
HELM_RELEASE="langsmith"
|
||||
HELM_NAMESPACE="$NAMESPACE"
|
||||
|
||||
# Use a local chart path by default (stable for workshop)
|
||||
HELM_CHART_REF="$HELM_REPO_DIR/charts/langsmith"
|
||||
|
||||
# Values file for Helm install (checked into your workshop repo)
|
||||
VALUES_FILE="./helm/langsmith-values/values.aws-demo.yaml"
|
||||
|
||||
# Output/artifacts
|
||||
ARTIFACTS_DIR="./artifacts"
|
||||
LOG_LEVEL="info" # info|debug
|
||||
DRY_RUN="true" # true by default; notebooks should flip this explicitly when applying
|
||||
@@ -0,0 +1,44 @@
|
||||
# ===== Workshop / Notebook Defaults =====
|
||||
# Copy to env/workshop.env and source it: source env/workshop.env
|
||||
|
||||
# General
|
||||
WORKSHOP_NAME="langsmith-self-hosted-operator"
|
||||
NAMESPACE="langsmith"
|
||||
|
||||
# Prefer AWS_PROFILE if you use named profiles. Otherwise rely on default creds.
|
||||
AWS_PROFILE=""
|
||||
|
||||
# Region (must match where you deploy infra)
|
||||
AWS_REGION="us-east-1"
|
||||
|
||||
# AWS account ID
|
||||
AWS_ACCOUNT_ID=""
|
||||
|
||||
# Naming (used by notebooks for display + validation)
|
||||
CLUSTER_NAME="langsmith-workshop"
|
||||
|
||||
# Local repo paths (absolute is safest)
|
||||
TERRAFORM_REPO_DIR="$HOME/src/langchain-ai/terraform"
|
||||
HELM_REPO_DIR="$HOME/src/langchain-ai/helm"
|
||||
|
||||
# Where in the terraform repo the AWS self-hosted module lives (adjust as needed)
|
||||
TERRAFORM_DIR="$TERRAFORM_REPO_DIR/aws/langsmith" # <-- update to real path you standardize on
|
||||
|
||||
# Helm release + chart reference (chart reference can be local path or OCI/ref)
|
||||
HELM_RELEASE="langsmith"
|
||||
HELM_NAMESPACE="$NAMESPACE"
|
||||
|
||||
# Use a local chart path by default (stable for workshop)
|
||||
HELM_CHART_REF="$HELM_REPO_DIR/charts/langsmith"
|
||||
|
||||
# Values file for Helm install (checked into your workshop repo)
|
||||
VALUES_FILE="./helm/langsmith-values/values.aws-demo.yaml"
|
||||
|
||||
# Terraform variables (for RDS PostgreSQL)
|
||||
POSTGRES_USERNAME="langsmith"
|
||||
POSTGRES_PASSWORD="" # <-- Set a strong password here
|
||||
|
||||
# Output/artifacts
|
||||
ARTIFACTS_DIR="./artifacts"
|
||||
LOG_LEVEL="info" # info|debug
|
||||
DRY_RUN="true" # true by default; notebooks should flip this explicitly when applying
|
||||
@@ -0,0 +1,491 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Module 1: AWS Preflight Checks\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"\n",
|
||||
"This notebook validates your environment before deploying LangSmith. Most self-hosted deployment failures happen **before** users ever touch the product, and are caused by:\n",
|
||||
"\n",
|
||||
"- Mis-sized clusters\n",
|
||||
"- Unsupported ingress setups\n",
|
||||
"- In-cluster databases used past their limits\n",
|
||||
"- Missing storage primitives (blob, PVs)\n",
|
||||
"\n",
|
||||
"This preflight ensures you start from a **supported baseline**.\n",
|
||||
"\n",
|
||||
"## What We'll Check\n",
|
||||
"\n",
|
||||
"1. ✅ Tooling validation (aws, terraform, kubectl, helm, jq)\n",
|
||||
"2. ✅ AWS credentials & region sanity check\n",
|
||||
"3. ✅ Cluster capacity expectations\n",
|
||||
"4. ✅ Storage prerequisites (EBS CSI, StorageClasses)\n",
|
||||
"5. ✅ Blob storage requirement (S3)\n",
|
||||
"\n",
|
||||
"**Estimated time:** 20-30 minutes\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Bootstrap environment\n",
|
||||
"import sys\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so we can import shared as a package\n",
|
||||
"# Find the notebooks directory by looking for the shared folder\n",
|
||||
"possible_paths = [\n",
|
||||
" Path.cwd().parent, # If cwd is module-1, go up one level to notebooks\n",
|
||||
" Path.cwd(), # If cwd is already notebooks\n",
|
||||
" Path.cwd() / \"notebooks\", # If cwd is workspace root\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"notebooks_path = None\n",
|
||||
"for path in possible_paths:\n",
|
||||
" if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" notebooks_path = path\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"if not notebooks_path:\n",
|
||||
" # Fallback: try workspace root\n",
|
||||
" notebooks_path = Path.cwd() / \"notebooks\"\n",
|
||||
" if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so 'shared' can be imported as a package\n",
|
||||
"if str(notebooks_path) not in sys.path:\n",
|
||||
" sys.path.insert(0, str(notebooks_path))\n",
|
||||
"\n",
|
||||
"from shared._bootstrap import bootstrap\n",
|
||||
"\n",
|
||||
"# Run bootstrap: loads env, checks tools, validates AWS, creates artifacts dir\n",
|
||||
"bootstrap_info = bootstrap()\n",
|
||||
"print(f\"\\nBootstrap complete! Artifacts directory: {bootstrap_info['artifacts_dir']}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## AWS Account & Region Validation\n",
|
||||
"\n",
|
||||
"Verify you're using the correct AWS account and region. This is critical for avoiding accidental deployments to production or wrong regions.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"from shared._aws_helpers import aws_region, sts_identity, assert_account\n",
|
||||
"from shared._validation import require_env, print_config, ok, warn\n",
|
||||
"\n",
|
||||
"# Get AWS configuration\n",
|
||||
"region = aws_region()\n",
|
||||
"identity = sts_identity()\n",
|
||||
"\n",
|
||||
"print(\"### Current AWS Session\")\n",
|
||||
"print(f\"Region: {region}\")\n",
|
||||
"print(f\"Account ID: {identity['Account']}\")\n",
|
||||
"print(f\"User ARN: {identity['Arn']}\")\n",
|
||||
"\n",
|
||||
"# Optional: Validate against expected account (set AWS_ACCOUNT_ID in .env if needed)\n",
|
||||
"expected_account = os.environ.get(\"AWS_ACCOUNT_ID\", \"\").strip()\n",
|
||||
"if expected_account:\n",
|
||||
" assert_account(expected_account)\n",
|
||||
"else:\n",
|
||||
" warn(\"AWS_ACCOUNT_ID not set in environment - skipping account validation\")\n",
|
||||
" print(\"💡 Tip: Set AWS_ACCOUNT_ID in your .env file to add a guardrail against wrong account deployments\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Required Environment Variables\n",
|
||||
"\n",
|
||||
"Verify that all required configuration is present. These values will be used throughout the deployment.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check required environment variables\n",
|
||||
"required_vars = [\n",
|
||||
" \"WORKSHOP_NAME\",\n",
|
||||
" \"NAMESPACE\",\n",
|
||||
" \"AWS_REGION\",\n",
|
||||
" \"CLUSTER_NAME\",\n",
|
||||
" \"TERRAFORM_DIR\",\n",
|
||||
" \"HELM_RELEASE\",\n",
|
||||
" \"HELM_NAMESPACE\",\n",
|
||||
" \"HELM_CHART_REF\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"config = require_env(*required_vars)\n",
|
||||
"\n",
|
||||
"# Optional but recommended\n",
|
||||
"optional_vars = {\n",
|
||||
" \"AWS_PROFILE\": os.environ.get(\"AWS_PROFILE\", \"\"),\n",
|
||||
" \"AWS_ACCOUNT_ID\": os.environ.get(\"AWS_ACCOUNT_ID\", \"\"),\n",
|
||||
" \"VALUES_FILE\": os.environ.get(\"VALUES_FILE\", \"\"),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(\"\\n### Configuration Summary\")\n",
|
||||
"print_config(config, redact_keys={\"AWS_PROFILE\"})\n",
|
||||
"print(\"\\n### Optional Configuration\")\n",
|
||||
"for k, v in optional_vars.items():\n",
|
||||
" if v:\n",
|
||||
" print(f\"- {k}: {v}\")\n",
|
||||
" else:\n",
|
||||
" print(f\"- {k}: (not set)\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Cluster Capacity Expectations\n",
|
||||
"\n",
|
||||
"LangSmith requires adequate cluster resources. Before deploying, understand what you'll need:\n",
|
||||
"\n",
|
||||
"- **Minimum:** 3 nodes, 4 vCPU, 16GB RAM each (for development/testing)\n",
|
||||
"- **Recommended:** 3 nodes, 8 vCPU, 32GB RAM each (for production workloads)\n",
|
||||
"- **Storage:** EBS CSI driver required for ClickHouse PVCs\n",
|
||||
"\n",
|
||||
"Let's check if a cluster already exists and validate its configuration.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from shared._aws_helpers import eks_cluster_exists\n",
|
||||
"from shared._shell import run\n",
|
||||
"\n",
|
||||
"cluster_name = os.environ[\"CLUSTER_NAME\"]\n",
|
||||
"region = aws_region()\n",
|
||||
"\n",
|
||||
"print(f\"### Checking EKS Cluster: {cluster_name}\")\n",
|
||||
"print(f\"Region: {region}\\n\")\n",
|
||||
"\n",
|
||||
"if eks_cluster_exists(cluster_name):\n",
|
||||
" ok(f\"Cluster '{cluster_name}' exists\")\n",
|
||||
" \n",
|
||||
" # Get cluster details\n",
|
||||
" result = run(\n",
|
||||
" [\"aws\", \"eks\", \"describe-cluster\", \"--name\", cluster_name, \"--region\", region, \"--output\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" cluster_info = json.loads(result.stdout)[\"cluster\"]\n",
|
||||
" \n",
|
||||
" print(f\"\\nCluster Status: {cluster_info['status']}\")\n",
|
||||
" print(f\"Kubernetes Version: {cluster_info['version']}\")\n",
|
||||
" print(f\"Platform Version: {cluster_info.get('platformVersion', 'N/A')}\")\n",
|
||||
" \n",
|
||||
" # Check node groups\n",
|
||||
" print(\"\\n### Node Groups\")\n",
|
||||
" ng_result = run(\n",
|
||||
" [\"aws\", \"eks\", \"list-nodegroups\", \"--cluster-name\", cluster_name, \"--region\", region, \"--output\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" nodegroups = json.loads(ng_result.stdout).get(\"nodegroups\", [])\n",
|
||||
" \n",
|
||||
" if nodegroups:\n",
|
||||
" for ng in nodegroups:\n",
|
||||
" ng_detail = run(\n",
|
||||
" [\"aws\", \"eks\", \"describe-nodegroup\", \"--cluster-name\", cluster_name, \n",
|
||||
" \"--nodegroup-name\", ng, \"--region\", region, \"--output\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" ng_info = json.loads(ng_detail.stdout)[\"nodegroup\"]\n",
|
||||
" scaling = ng_info.get(\"scalingConfig\", {})\n",
|
||||
" print(f\"\\n Node Group: {ng}\")\n",
|
||||
" print(f\" Status: {ng_info['status']}\")\n",
|
||||
" print(f\" Desired: {scaling.get('desiredSize', 'N/A')}\")\n",
|
||||
" print(f\" Min: {scaling.get('minSize', 'N/A')}\")\n",
|
||||
" print(f\" Max: {scaling.get('maxSize', 'N/A')}\")\n",
|
||||
" print(f\" Instance Types: {', '.join(ng_info.get('instanceTypes', []))}\")\n",
|
||||
" else:\n",
|
||||
" warn(\"No node groups found\")\n",
|
||||
" print(\"💡 You'll need to create node groups when deploying with Terraform\")\n",
|
||||
"else:\n",
|
||||
" warn(f\"Cluster '{cluster_name}' does not exist yet\")\n",
|
||||
" print(\"💡 This is expected if you haven't run Terraform yet. Proceed to notebook 02_terraform_apply.ipynb\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Storage Prerequisites\n",
|
||||
"\n",
|
||||
"LangSmith requires persistent storage for ClickHouse. The EBS CSI driver must be installed and StorageClasses must be configured.\n",
|
||||
"\n",
|
||||
"**Why this matters:** Without EBS CSI, ClickHouse PVCs will remain in `Pending` state forever.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check if kubectl is configured for the cluster\n",
|
||||
"cluster_name = os.environ[\"CLUSTER_NAME\"]\n",
|
||||
"region = aws_region()\n",
|
||||
"\n",
|
||||
"print(\"### Configuring kubectl for EKS cluster\")\n",
|
||||
"try:\n",
|
||||
" # Update kubeconfig\n",
|
||||
" run(\n",
|
||||
" [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
|
||||
" check=True,\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" ok(\"kubectl configured for cluster\")\n",
|
||||
" \n",
|
||||
" # Check EBS CSI driver\n",
|
||||
" print(\"\\n### Checking EBS CSI Driver\")\n",
|
||||
" result = run(\n",
|
||||
" [\"kubectl\", \"get\", \"daemonset\", \"-n\", \"kube-system\", \"-l\", \"app=ebs-csi-controller\", \"-o\", \"json\"],\n",
|
||||
" check=False,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" if result.returncode == 0 and result.stdout.strip():\n",
|
||||
" import json\n",
|
||||
" ds_info = json.loads(result.stdout)\n",
|
||||
" if ds_info.get(\"items\"):\n",
|
||||
" ok(\"EBS CSI driver is installed\")\n",
|
||||
" print(f\" DaemonSet: {ds_info['items'][0]['metadata']['name']}\")\n",
|
||||
" else:\n",
|
||||
" warn(\"EBS CSI driver not found\")\n",
|
||||
" print(\"💡 EBS CSI driver must be installed before deploying LangSmith\")\n",
|
||||
" print(\" The Terraform module should handle this, but verify after deployment\")\n",
|
||||
" else:\n",
|
||||
" warn(\"EBS CSI driver not found\")\n",
|
||||
" print(\"💡 EBS CSI driver must be installed before deploying LangSmith\")\n",
|
||||
" \n",
|
||||
" # Check StorageClasses\n",
|
||||
" print(\"\\n### Checking StorageClasses\")\n",
|
||||
" result = run(\n",
|
||||
" [\"kubectl\", \"get\", \"storageclass\", \"-o\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" sc_list = json.loads(result.stdout)\n",
|
||||
" \n",
|
||||
" ebs_scs = [sc for sc in sc_list.get(\"items\", []) if \"ebs\" in sc[\"metadata\"][\"name\"].lower() or \n",
|
||||
" sc.get(\"provisioner\", \"\").endswith(\"ebs.csi.aws.com\")]\n",
|
||||
" \n",
|
||||
" if ebs_scs:\n",
|
||||
" ok(f\"Found {len(ebs_scs)} EBS StorageClass(es):\")\n",
|
||||
" for sc in ebs_scs:\n",
|
||||
" name = sc[\"metadata\"][\"name\"]\n",
|
||||
" default = sc.get(\"metadata\", {}).get(\"annotations\", {}).get(\"storageclass.kubernetes.io/is-default-class\", \"false\")\n",
|
||||
" print(f\" - {name} (default: {default})\")\n",
|
||||
" else:\n",
|
||||
" warn(\"No EBS StorageClasses found\")\n",
|
||||
" print(\"💡 At least one EBS StorageClass is required for ClickHouse PVCs\")\n",
|
||||
" \n",
|
||||
"except Exception as e:\n",
|
||||
" warn(f\"Could not check storage prerequisites: {e}\")\n",
|
||||
" print(\"💡 This is expected if the cluster doesn't exist yet\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Blob Storage Requirement (S3)\n",
|
||||
"\n",
|
||||
"**Critical:** LangSmith requires S3 for blob storage in production. If blob storage is not configured, trace payloads are stored inline in ClickHouse, and large payloads can quickly overwhelm it.\n",
|
||||
"\n",
|
||||
"Let's verify S3 access and check if a bucket exists or needs to be created.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from shared._shell import run\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"region = aws_region()\n",
|
||||
"\n",
|
||||
"print(\"### S3 Access Check\")\n",
|
||||
"print(f\"Region: {region}\\n\")\n",
|
||||
"\n",
|
||||
"# Test S3 access\n",
|
||||
"try:\n",
|
||||
" result = run(\n",
|
||||
" [\"aws\", \"s3\", \"ls\", \"--region\", region],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" ok(\"S3 access verified\")\n",
|
||||
" \n",
|
||||
" # List buckets\n",
|
||||
" buckets_result = run(\n",
|
||||
" [\"aws\", \"s3api\", \"list-buckets\", \"--output\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" buckets = json.loads(buckets_result.stdout).get(\"Buckets\", [])\n",
|
||||
" \n",
|
||||
" print(f\"\\nFound {len(buckets)} S3 bucket(s):\")\n",
|
||||
" for bucket in buckets[:10]: # Show first 10\n",
|
||||
" print(f\" - {bucket['Name']} (created: {bucket['CreationDate']})\")\n",
|
||||
" \n",
|
||||
" if len(buckets) > 10:\n",
|
||||
" print(f\" ... and {len(buckets) - 10} more\")\n",
|
||||
" \n",
|
||||
" print(\"\\n💡 Note: The Terraform module should create an S3 bucket for LangSmith blob storage\")\n",
|
||||
" print(\" Verify the bucket exists after Terraform deployment\")\n",
|
||||
" \n",
|
||||
"except Exception as e:\n",
|
||||
" warn(f\"S3 access check failed: {e}\")\n",
|
||||
" print(\"💡 Ensure your AWS credentials have S3 permissions\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Terraform & Helm Repository Paths\n",
|
||||
"\n",
|
||||
"Verify that the Terraform and Helm repository paths are correctly configured and accessible.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"from pathlib import Path\n",
|
||||
"from shared._validation import ok, warn\n",
|
||||
"\n",
|
||||
"def expand_env_vars(path_str: str) -> str:\n",
|
||||
" \"\"\"Expand environment variable references in a path string.\"\"\"\n",
|
||||
" # Expand $VAR and ${VAR} references\n",
|
||||
" def replace_var(match):\n",
|
||||
" var_name = match.group(1) or match.group(2)\n",
|
||||
" return os.environ.get(var_name, match.group(0))\n",
|
||||
" \n",
|
||||
" # Replace $VAR and ${VAR} patterns\n",
|
||||
" path_str = re.sub(r'\\$\\{([^}]+)\\}|\\$([a-zA-Z_][a-zA-Z0-9_]*)', replace_var, path_str)\n",
|
||||
" return path_str\n",
|
||||
"\n",
|
||||
"# Expand environment variables in paths (e.g., $TERRAFORM_REPO_DIR, $HELM_REPO_DIR, $HOME)\n",
|
||||
"terraform_dir_str = expand_env_vars(os.environ[\"TERRAFORM_DIR\"])\n",
|
||||
"terraform_dir = Path(terraform_dir_str).expanduser().resolve()\n",
|
||||
"\n",
|
||||
"helm_chart_ref_str = expand_env_vars(os.environ[\"HELM_CHART_REF\"])\n",
|
||||
"helm_chart_ref = Path(helm_chart_ref_str).expanduser().resolve()\n",
|
||||
"\n",
|
||||
"print(\"### Repository Paths Check\\n\")\n",
|
||||
"\n",
|
||||
"# Check Terraform directory\n",
|
||||
"print(f\"Terraform Directory: {terraform_dir}\")\n",
|
||||
"if terraform_dir.exists():\n",
|
||||
" ok(f\"Terraform directory exists\")\n",
|
||||
" \n",
|
||||
" # Check for main.tf or similar\n",
|
||||
" tf_files = list(terraform_dir.glob(\"*.tf\"))\n",
|
||||
" if tf_files:\n",
|
||||
" print(f\" Found {len(tf_files)} Terraform file(s)\")\n",
|
||||
" else:\n",
|
||||
" warn(\"No .tf files found in Terraform directory\")\n",
|
||||
" print(\"💡 Ensure you're pointing to the correct Terraform module path\")\n",
|
||||
"else:\n",
|
||||
" warn(f\"Terraform directory does not exist: {terraform_dir}\")\n",
|
||||
" print(\"💡 Update TERRAFORM_DIR in your .env file to point to the langchain-ai/terraform repo\")\n",
|
||||
"\n",
|
||||
"# Check Helm chart\n",
|
||||
"print(f\"\\nHelm Chart Reference: {helm_chart_ref}\")\n",
|
||||
"if helm_chart_ref.exists():\n",
|
||||
" ok(f\"Helm chart path exists\")\n",
|
||||
" \n",
|
||||
" # Check for Chart.yaml\n",
|
||||
" chart_yaml = helm_chart_ref / \"Chart.yaml\"\n",
|
||||
" if chart_yaml.exists():\n",
|
||||
" print(f\" Found Chart.yaml\")\n",
|
||||
" else:\n",
|
||||
" warn(\"Chart.yaml not found\")\n",
|
||||
" print(\"💡 Ensure you're pointing to the correct Helm chart path\")\n",
|
||||
"else:\n",
|
||||
" warn(f\"Helm chart path does not exist: {helm_chart_ref}\")\n",
|
||||
" print(\"💡 Update HELM_CHART_REF in your .env file to point to the langchain-ai/helm chart\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Preflight Summary\n",
|
||||
"\n",
|
||||
"Review the checklist below. Every applicable item should be checked off before proceeding to Terraform deployment; the cluster-dependent items only apply if the EKS cluster already exists.\n",
|
||||
"\n",
|
||||
"### ✅ Checklist\n",
|
||||
"\n",
|
||||
"- [ ] All required tools installed (aws, terraform, kubectl, helm, jq)\n",
|
||||
"- [ ] AWS credentials valid and correct account/region\n",
|
||||
"- [ ] Required environment variables set\n",
|
||||
"- [ ] Terraform directory path correct\n",
|
||||
"- [ ] Helm chart path correct\n",
|
||||
"- [ ] S3 access verified\n",
|
||||
"- [ ] (If cluster exists) EBS CSI driver installed\n",
|
||||
"- [ ] (If cluster exists) StorageClasses configured\n",
|
||||
"\n",
|
||||
"### Next Steps\n",
|
||||
"\n",
|
||||
"If all checks pass, proceed to **02_terraform_apply.ipynb** to deploy the infrastructure.\n",
|
||||
"\n",
|
||||
"If any checks failed, review the warnings above and fix the issues before continuing.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,668 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Module 1: Terraform - Provisioning the Platform Substrate\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"\n",
|
||||
"This notebook walks through deploying AWS infrastructure using the **official `langchain-ai/terraform` repository**.\n",
|
||||
"\n",
|
||||
"### Key Principles\n",
|
||||
"\n",
|
||||
"- ✅ Use the **official** Terraform repo (do not fork)\n",
|
||||
"- ✅ Pin module versions for reproducibility\n",
|
||||
"- ✅ Use remote state & locking\n",
|
||||
"- ✅ Plan before applying\n",
|
||||
"- ✅ Capture outputs needed for Helm\n",
|
||||
"\n",
|
||||
"### What We'll Deploy\n",
|
||||
"\n",
|
||||
"- Amazon EKS cluster\n",
|
||||
"- RDS PostgreSQL\n",
|
||||
"- ElastiCache Redis\n",
|
||||
"- S3 bucket for blob storage\n",
|
||||
"- IAM roles and policies\n",
|
||||
"- EBS CSI driver addon\n",
|
||||
"\n",
|
||||
"**Estimated time:** 45-60 minutes\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Bootstrap environment\n",
|
||||
"import sys\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so we can import shared as a package\n",
|
||||
"# Find the notebooks directory by looking for the shared folder\n",
|
||||
"possible_paths = [\n",
|
||||
" Path.cwd().parent, # If cwd is module-1, go up one level to notebooks\n",
|
||||
" Path.cwd(), # If cwd is already notebooks\n",
|
||||
" Path.cwd() / \"notebooks\", # If cwd is workspace root\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"notebooks_path = None\n",
|
||||
"for path in possible_paths:\n",
|
||||
" if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" notebooks_path = path\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"if not notebooks_path:\n",
|
||||
" notebooks_path = Path.cwd() / \"notebooks\"\n",
|
||||
" if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so 'shared' can be imported as a package\n",
|
||||
"if str(notebooks_path) not in sys.path:\n",
|
||||
" sys.path.insert(0, str(notebooks_path))\n",
|
||||
"\n",
|
||||
"from shared._bootstrap import bootstrap\n",
|
||||
"\n",
|
||||
"# Run bootstrap\n",
|
||||
"bootstrap_info = bootstrap()\n",
|
||||
"artifacts_dir = Path(bootstrap_info['artifacts_dir'])\n",
|
||||
"print(f\"\\nArtifacts directory: {artifacts_dir}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Understanding the Official Terraform Repository\n",
|
||||
"\n",
|
||||
"The `langchain-ai/terraform` repository contains modules for deploying LangSmith infrastructure. We use the **official** repository because:\n",
|
||||
"\n",
|
||||
"1. **Support:** The support team will expect to see standard, unmodified configurations\n",
|
||||
"2. **Updates:** Official modules receive security and feature updates\n",
|
||||
"3. **Documentation:** Official modules are documented and tested\n",
|
||||
"4. **Compatibility:** Ensures compatibility with Helm charts\n",
|
||||
"\n",
|
||||
"**Important:** We do **not** fork the upstream repository. We reference it directly.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import re\n",
|
||||
"from pathlib import Path\n",
|
||||
"from shared._validation import require_env, ok, warn, fail\n",
|
||||
"from shared._shell import run\n",
|
||||
"\n",
|
||||
"def expand_env_vars(path_str: str) -> str:\n",
|
||||
" \"\"\"Expand environment variable references in a path string.\"\"\"\n",
|
||||
" # Expand $VAR and ${VAR} references\n",
|
||||
" def replace_var(match):\n",
|
||||
" var_name = match.group(1) or match.group(2)\n",
|
||||
" return os.environ.get(var_name, match.group(0))\n",
|
||||
" \n",
|
||||
" # Replace $VAR and ${VAR} patterns\n",
|
||||
" path_str = re.sub(r'\\$\\{([^}]+)\\}|\\$([a-zA-Z_][a-zA-Z0-9_]*)', replace_var, path_str)\n",
|
||||
" return path_str\n",
|
||||
"\n",
|
||||
"# Get required configuration\n",
|
||||
"config = require_env(\"TERRAFORM_DIR\", \"CLUSTER_NAME\", \"AWS_REGION\", \"WORKSHOP_NAME\")\n",
|
||||
"\n",
|
||||
"# Expand environment variables in the path (e.g., $TERRAFORM_REPO_DIR, $HOME)\n",
|
||||
"terraform_dir_str = expand_env_vars(config[\"TERRAFORM_DIR\"])\n",
|
||||
"terraform_dir = Path(terraform_dir_str).expanduser().resolve()\n",
|
||||
"\n",
|
||||
"cluster_name = config[\"CLUSTER_NAME\"]\n",
|
||||
"region = config[\"AWS_REGION\"]\n",
|
||||
"workshop_name = config[\"WORKSHOP_NAME\"]\n",
|
||||
"\n",
|
||||
"print(\"### Terraform Configuration\")\n",
|
||||
"print(f\"Terraform Directory: {terraform_dir}\")\n",
|
||||
"print(f\"Cluster Name: {cluster_name}\")\n",
|
||||
"print(f\"Region: {region}\")\n",
|
||||
"print(f\"Workshop Name: {workshop_name}\\n\")\n",
|
||||
"\n",
|
||||
"if not terraform_dir.exists():\n",
|
||||
" fail(f\"Terraform directory does not exist: {terraform_dir}\")\n",
|
||||
" print(\"\\n💡 To fix this:\")\n",
|
||||
" print(\" 1. Clone the official Terraform repository:\")\n",
|
||||
" print(\" git clone https://github.com/langchain-ai/terraform.git <target-directory>\")\n",
|
||||
" print(\" 2. Update TERRAFORM_DIR in your .env file to point to:\")\n",
|
||||
" print(f\" TERRAFORM_DIR=\\\"<target-directory>/aws/langsmith\\\"\")\n",
|
||||
" print(\" 3. Run this notebook again\")\n",
|
||||
" raise RuntimeError(f\"Terraform directory not found: {terraform_dir}\")\n",
|
||||
"\n",
|
||||
"ok(f\"Terraform directory exists: {terraform_dir}\")\n",
|
||||
"\n",
|
||||
"# Check Terraform version\n",
|
||||
"print(\"\\n### Terraform Version\")\n",
|
||||
"result = run([\"terraform\", \"version\"], check=True, stream=False)\n",
|
||||
"print(result.stdout)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Identifying the Correct Module Path\n",
|
||||
"\n",
|
||||
"The Terraform repository is organized by cloud provider and deployment type. For AWS self-hosted deployments, we need the AWS module.\n",
|
||||
"\n",
|
||||
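"**Typical path structure** (the exact layout can differ between repository versions; `TERRAFORM_DIR` must point at the directory that contains `main.tf`, `variables.tf`, and `outputs.tf`):\n",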
|
||||
"```\n",
|
||||
"terraform/\n",
|
||||
" modules/\n",
|
||||
" aws/\n",
|
||||
" langsmith/ # <-- This is the module we use\n",
|
||||
" main.tf\n",
|
||||
" variables.tf\n",
|
||||
" outputs.tf\n",
|
||||
" ...\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Let's verify the module structure.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Verify Terraform module structure\n",
|
||||
"print(\"### Terraform Module Structure\\n\")\n",
|
||||
"\n",
|
||||
"# Check for key files\n",
|
||||
"key_files = [\"main.tf\", \"variables.tf\", \"outputs.tf\"]\n",
|
||||
"found_files = []\n",
|
||||
"\n",
|
||||
"for file in key_files:\n",
|
||||
" file_path = terraform_dir / file\n",
|
||||
" if file_path.exists():\n",
|
||||
" found_files.append(file)\n",
|
||||
" ok(f\"Found {file}\")\n",
|
||||
" else:\n",
|
||||
" warn(f\"Missing {file}\")\n",
|
||||
"\n",
|
||||
"if len(found_files) == len(key_files):\n",
|
||||
" ok(\"Terraform module structure looks correct\")\n",
|
||||
"else:\n",
|
||||
" warn(\"Some expected Terraform files are missing\")\n",
|
||||
" print(\"💡 Ensure TERRAFORM_DIR points to the correct module path (e.g., terraform/aws/langsmith)\")\n",
|
||||
"\n",
|
||||
"# List all .tf files for reference\n",
|
||||
"print(\"\\n### All Terraform Files in Module\")\n",
|
||||
"tf_files = sorted(terraform_dir.glob(\"*.tf\"))\n",
|
||||
"for tf_file in tf_files:\n",
|
||||
" print(f\" - {tf_file.name}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Pinning Module Versions\n",
|
||||
"\n",
|
||||
"**Critical:** Always pin Terraform module versions for reproducibility. This ensures:\n",
|
||||
"- Consistent deployments across environments\n",
|
||||
"- Predictable behavior\n",
|
||||
"- Ability to roll back if needed\n",
|
||||
"\n",
|
||||
"Check the `versions.tf` or `main.tf` file to see what versions are pinned.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check for version constraints\n",
|
||||
"print(\"### Checking Module Version Constraints\\n\")\n",
|
||||
"\n",
|
||||
"versions_file = terraform_dir / \"versions.tf\"\n",
|
||||
"if versions_file.exists():\n",
|
||||
" print(\"Found versions.tf:\")\n",
|
||||
" print(\"=\" * 60)\n",
|
||||
" with open(versions_file) as f:\n",
|
||||
" print(f.read())\n",
|
||||
" print(\"=\" * 60)\n",
|
||||
"else:\n",
|
||||
" # Check main.tf for version constraints\n",
|
||||
" main_file = terraform_dir / \"main.tf\"\n",
|
||||
" if main_file.exists():\n",
|
||||
" with open(main_file) as f:\n",
|
||||
" content = f.read()\n",
|
||||
" if \"required_version\" in content or \"version\" in content.lower():\n",
|
||||
" print(\"Version constraints found in main.tf:\")\n",
|
||||
" print(\"=\" * 60)\n",
|
||||
" # Show relevant lines\n",
|
||||
" for i, line in enumerate(content.split('\\n'), 1):\n",
|
||||
" if 'version' in line.lower() or 'required' in line.lower():\n",
|
||||
" print(f\"{i:4}: {line}\")\n",
|
||||
" print(\"=\" * 60)\n",
|
||||
" else:\n",
|
||||
" warn(\"No version constraints found\")\n",
|
||||
" print(\"💡 Consider adding version constraints to ensure reproducibility\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Remote State & Locking\n",
|
||||
"\n",
|
||||
"**Why remote state matters:**\n",
|
||||
"- Enables team collaboration\n",
|
||||
"- Prevents concurrent modifications\n",
|
||||
"- Provides state backup and recovery\n",
|
||||
"\n",
|
||||
"**Why locking matters:**\n",
|
||||
"- Prevents state corruption from concurrent runs\n",
|
||||
"- Required for production deployments\n",
|
||||
"\n",
|
||||
"Check whether a remote state backend is configured.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check for backend configuration\n",
|
||||
"print(\"### Checking Backend Configuration\\n\")\n",
|
||||
"\n",
|
||||
"backend_file = terraform_dir / \"backend.tf\"\n",
|
||||
"if backend_file.exists():\n",
|
||||
" print(\"Found backend.tf:\")\n",
|
||||
" print(\"=\" * 60)\n",
|
||||
" with open(backend_file) as f:\n",
|
||||
" print(f.read())\n",
|
||||
" print(\"=\" * 60)\n",
|
||||
" ok(\"Backend configuration found\")\n",
|
||||
"else:\n",
|
||||
" # Check for backend block in other files\n",
|
||||
" backend_configs = []\n",
|
||||
" for tf_file in terraform_dir.glob(\"*.tf\"):\n",
|
||||
" with open(tf_file) as f:\n",
|
||||
" content = f.read()\n",
|
||||
" if \"backend\" in content:\n",
|
||||
" backend_configs.append(tf_file.name)\n",
|
||||
" \n",
|
||||
" if backend_configs:\n",
|
||||
" print(f\"Backend configuration found in: {', '.join(backend_configs)}\")\n",
|
||||
" ok(\"Backend configuration exists\")\n",
|
||||
" else:\n",
|
||||
" warn(\"No backend configuration found\")\n",
|
||||
" print(\"💡 For production, configure remote state (S3 + DynamoDB for locking)\")\n",
|
||||
" print(\" For workshops, local state may be acceptable\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Terraform Initialization\n",
|
||||
"\n",
|
||||
"Before planning or applying, Terraform must be initialized. This downloads providers and modules.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initialize Terraform\n",
|
||||
"print(\"### Initializing Terraform\\n\")\n",
|
||||
"print(\"This may take a few minutes as it downloads providers and modules...\\n\")\n",
|
||||
"\n",
|
||||
"result = run(\n",
|
||||
" [\"terraform\", \"init\"],\n",
|
||||
" cwd=str(terraform_dir),\n",
|
||||
" check=True,\n",
|
||||
" stream=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"ok(\"Terraform initialization complete\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Planning vs Applying\n",
|
||||
"\n",
|
||||
"**Always plan before applying.** The plan shows:\n",
|
||||
"- What resources will be created/modified/destroyed\n",
|
||||
"- Any configuration errors\n",
|
||||
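"- Estimated costs (only when a cost-estimation integration such as HCP Terraform or Infracost is configured; plain `terraform plan` does not report costs)\n",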
|
||||
"\n",
|
||||
"**Review the plan carefully** before proceeding to apply.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create terraform plan\n",
|
||||
"plan_file = artifacts_dir / \"terraform-plan.txt\"\n",
|
||||
"\n",
|
||||
"print(\"### Creating Terraform Plan\\n\")\n",
|
||||
"print(\"This will show what resources Terraform intends to create/modify/destroy.\\n\")\n",
|
||||
"print(\"⚠️ Review the plan carefully before applying!\\n\")\n",
|
||||
"\n",
|
||||
"# Collect Terraform variables from environment\n",
|
||||
"terraform_vars = []\n",
|
||||
"postgres_username = os.environ.get(\"POSTGRES_USERNAME\", \"\").strip()\n",
|
||||
"postgres_password = os.environ.get(\"POSTGRES_PASSWORD\", \"\").strip()\n",
|
||||
"\n",
|
||||
"print(\"### Terraform Variables\\n\")\n",
|
||||
"missing_vars = []\n",
|
||||
"\n",
|
||||
"if postgres_username:\n",
|
||||
" terraform_vars.extend([\"-var\", f\"postgres_username={postgres_username}\"])\n",
|
||||
" print(f\"✅ POSTGRES_USERNAME: {postgres_username}\")\n",
|
||||
"else:\n",
|
||||
" missing_vars.append(\"POSTGRES_USERNAME\")\n",
|
||||
" warn(\"POSTGRES_USERNAME not set in environment\")\n",
|
||||
"\n",
|
||||
"if postgres_password:\n",
|
||||
" terraform_vars.extend([\"-var\", f\"postgres_password={postgres_password}\"])\n",
|
||||
" print(f\"✅ POSTGRES_PASSWORD: {'*' * len(postgres_password)} (hidden)\")\n",
|
||||
"else:\n",
|
||||
" missing_vars.append(\"POSTGRES_PASSWORD\")\n",
|
||||
" warn(\"POSTGRES_PASSWORD not set in environment\")\n",
|
||||
"\n",
|
||||
"if missing_vars:\n",
|
||||
" print(f\"\\n❌ Missing required environment variables: {', '.join(missing_vars)}\")\n",
|
||||
" print(\"💡 To fix this:\")\n",
|
||||
" print(\" 1. Add these variables to your .env file (or workshop.env):\")\n",
|
||||
" for var in missing_vars:\n",
|
||||
" print(f\" {var}=\\\"your-value-here\\\"\")\n",
|
||||
" print(\" 2. Re-run the bootstrap cell (first cell) to reload environment variables\")\n",
|
||||
" print(\" 3. Re-run this cell\")\n",
|
||||
" raise RuntimeError(f\"Missing required Terraform variables: {', '.join(missing_vars)}\")\n",
|
||||
"\n",
|
||||
"print(f\"\\n✅ All required variables are set. Passing {len(terraform_vars) // 2} variable(s) to Terraform.\\n\")\n",
|
||||
"\n",
|
||||
"# Build terraform plan command\n",
|
||||
"plan_cmd = [\"terraform\", \"plan\", \"-out=tfplan\"] + terraform_vars\n",
|
||||
"\n",
|
||||
"result = run(\n",
|
||||
" plan_cmd,\n",
|
||||
" cwd=str(terraform_dir),\n",
|
||||
" check=False, # Do not raise on a non-zero exit; the return code is checked below\n",
|
||||
" stream=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Save plan output\n",
|
||||
"with open(plan_file, \"w\") as f:\n",
|
||||
" f.write(result.stdout)\n",
|
||||
" if result.stderr:\n",
|
||||
" f.write(\"\\n\\nSTDERR:\\n\")\n",
|
||||
" f.write(result.stderr)\n",
|
||||
"\n",
|
||||
"print(f\"\\n💡 Plan output saved to: {plan_file}\")\n",
|
||||
"\n",
|
||||
"if result.returncode == 0:\n",
|
||||
" ok(\"Terraform plan completed successfully\")\n",
|
||||
" print(\"\\n⚠️ Review the plan above. If it looks correct, proceed to the next cell to apply.\")\n",
|
||||
"else:\n",
|
||||
" warn(f\"Terraform plan had issues (rc={result.returncode})\")\n",
|
||||
" print(\"💡 Review the errors above before proceeding\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Applying Terraform\n",
|
||||
"\n",
|
||||
"**⚠️ WARNING:** This will create real AWS resources and incur costs.\n",
|
||||
"\n",
|
||||
"Only proceed if:\n",
|
||||
"1. ✅ You've reviewed the plan\n",
|
||||
"2. ✅ You're using the correct AWS account/region\n",
|
||||
"3. ✅ You understand the costs involved\n",
|
||||
"\n",
|
||||
"**Estimated deployment time:** 15-30 minutes (EKS cluster creation takes time)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Apply Terraform\n",
|
||||
"# ⚠️ UNCOMMENT THE CODE BELOW TO ACTUALLY APPLY\n",
|
||||
"# This is commented out by default to prevent accidental deployments\n",
|
||||
"\n",
|
||||
"print(\"### Applying Terraform Configuration\\n\")\n",
|
||||
"print(\"⚠️ This cell is currently DISABLED to prevent accidental deployments.\\n\")\n",
|
||||
"print(\"To apply Terraform, uncomment the code below and run this cell.\\n\")\n",
|
||||
"\n",
|
||||
"# UNCOMMENT TO APPLY:\n",
|
||||
"# print(\"Applying Terraform... This will take 15-30 minutes.\\n\")\n",
|
||||
"# result = run(\n",
|
||||
"# [\"terraform\", \"apply\", \"tfplan\"],\n",
|
||||
"# cwd=str(terraform_dir),\n",
|
||||
"# check=True,\n",
|
||||
"# stream=True\n",
|
||||
"# )\n",
|
||||
"# \n",
|
||||
"# ok(\"Terraform apply completed successfully\")\n",
|
||||
"# \n",
|
||||
"# # Save apply output\n",
|
||||
"# apply_file = artifacts_dir / \"terraform-apply.txt\"\n",
|
||||
"# with open(apply_file, \"w\") as f:\n",
|
||||
"# f.write(result.stdout)\n",
|
||||
"# if result.stderr:\n",
|
||||
"# f.write(\"\\n\\nSTDERR:\\n\")\n",
|
||||
"# f.write(result.stderr)\n",
|
||||
"# \n",
|
||||
"# print(f\"\\n💡 Apply output saved to: {apply_file}\")\n",
|
||||
"\n",
|
||||
"print(\"💡 To apply, edit this cell and uncomment the code above\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Interpreting Terraform Outputs\n",
|
||||
"\n",
|
||||
"After Terraform applies successfully, we need to capture the outputs. These outputs contain information needed for Helm deployment:\n",
|
||||
"\n",
|
||||
"- Cluster name and endpoint\n",
|
||||
"- RDS connection details\n",
|
||||
"- Redis connection details\n",
|
||||
"- S3 bucket name\n",
|
||||
"- IAM role ARNs\n",
|
||||
"\n",
|
||||
"Let's retrieve and save these outputs.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"# Get Terraform outputs\n",
|
||||
"print(\"### Terraform Outputs\\n\")\n",
|
||||
"\n",
|
||||
"result = run(\n",
|
||||
" [\"terraform\", \"output\", \"-json\"],\n",
|
||||
" cwd=str(terraform_dir),\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"outputs = json.loads(result.stdout)\n",
|
||||
"\n",
|
||||
"# Save outputs to artifacts\n",
|
||||
"outputs_file = artifacts_dir / \"terraform-outputs.json\"\n",
|
||||
"with open(outputs_file, \"w\") as f:\n",
|
||||
" json.dump(outputs, f, indent=2)\n",
|
||||
"\n",
|
||||
"print(\"Terraform outputs:\")\n",
|
||||
"print(\"=\" * 60)\n",
|
||||
"for key, value in outputs.items():\n",
|
||||
" if isinstance(value, dict) and \"value\" in value:\n",
|
||||
" # Terraform outputs are wrapped in {\"value\": ...}\n",
|
||||
" val = value[\"value\"]\n",
|
||||
" if isinstance(val, str) and len(val) > 100:\n",
|
||||
" print(f\"{key}: {val[:100]}... (truncated)\")\n",
|
||||
" else:\n",
|
||||
" print(f\"{key}: {val}\")\n",
|
||||
" else:\n",
|
||||
" print(f\"{key}: {value}\")\n",
|
||||
"print(\"=\" * 60)\n",
|
||||
"\n",
|
||||
"ok(f\"Outputs saved to: {outputs_file}\")\n",
|
||||
"print(\"\\n💡 These outputs will be needed for Helm deployment in the next notebook\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Verifying Infrastructure\n",
|
||||
"\n",
|
||||
"Let's verify that the key infrastructure components were created successfully.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from shared._aws_helpers import eks_cluster_exists, aws_region\n",
|
||||
"\n",
|
||||
"region = aws_region()\n",
|
||||
"\n",
|
||||
"# Verify EKS cluster\n",
|
||||
"print(\"### Verifying EKS Cluster\\n\")\n",
|
||||
"if eks_cluster_exists(cluster_name):\n",
|
||||
" ok(f\"Cluster '{cluster_name}' exists\")\n",
|
||||
" \n",
|
||||
" # Get cluster details (status, endpoint, version)\n",
|
||||
" result = run(\n",
|
||||
" [\"aws\", \"eks\", \"describe-cluster\", \"--name\", cluster_name, \"--region\", region, \"--output\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" cluster_info = json.loads(result.stdout)[\"cluster\"]\n",
|
||||
" print(f\" Status: {cluster_info['status']}\")\n",
|
||||
" print(f\" Endpoint: {cluster_info['endpoint']}\")\n",
|
||||
" print(f\" Version: {cluster_info['version']}\")\n",
|
||||
"else:\n",
|
||||
" warn(f\"Cluster '{cluster_name}' not found\")\n",
|
||||
"\n",
|
||||
"# Verify kubectl access\n",
|
||||
"print(\"\\n### Configuring kubectl\\n\")\n",
|
||||
"try:\n",
|
||||
" run(\n",
|
||||
" [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
|
||||
" check=True,\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" ok(\"kubectl configured\")\n",
|
||||
" \n",
|
||||
" # Test cluster access\n",
|
||||
" result = run(\n",
|
||||
" [\"kubectl\", \"cluster-info\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" print(result.stdout)\n",
|
||||
"except Exception as e:\n",
|
||||
" warn(f\"Could not configure kubectl: {e}\")\n",
|
||||
"\n",
|
||||
"# Check for RDS (if output available)\n",
|
||||
"if \"rds\" in str(outputs).lower() or \"postgres\" in str(outputs).lower():\n",
|
||||
" print(\"\\n### RDS PostgreSQL\\n\")\n",
|
||||
" print(\"💡 Verify RDS instance is available in AWS console\")\n",
|
||||
" print(\" Check outputs above for connection details\")\n",
|
||||
"\n",
|
||||
"# Check for ElastiCache (if output available)\n",
|
||||
"if \"redis\" in str(outputs).lower() or \"elasticache\" in str(outputs).lower():\n",
|
||||
" print(\"\\n### ElastiCache Redis\\n\")\n",
|
||||
" print(\"💡 Verify Redis cluster is available in AWS console\")\n",
|
||||
" print(\" Check outputs above for connection details\")\n",
|
||||
"\n",
|
||||
"# Check for S3 bucket\n",
|
||||
"if \"s3\" in str(outputs).lower() or \"bucket\" in str(outputs).lower():\n",
|
||||
" print(\"\\n### S3 Bucket\\n\")\n",
|
||||
" print(\"💡 Verify S3 bucket exists in AWS console\")\n",
|
||||
" print(\" Check outputs above for bucket name\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary\n",
|
||||
"\n",
|
||||
"### ✅ What We Accomplished\n",
|
||||
"\n",
|
||||
"- [ ] Initialized Terraform\n",
|
||||
"- [ ] Created and reviewed Terraform plan\n",
|
||||
"- [ ] Applied Terraform configuration (if you uncommented the apply step)\n",
|
||||
"- [ ] Captured Terraform outputs\n",
|
||||
"- [ ] Verified infrastructure components\n",
|
||||
"\n",
|
||||
"### 📋 Key Takeaways\n",
|
||||
"\n",
|
||||
"1. **Use the official Terraform repo** - Don't fork it; reference it directly\n",
|
||||
"2. **Pin versions** - Ensures reproducibility\n",
|
||||
"3. **Use remote state** - Required for production\n",
|
||||
"4. **Always plan first** - Review before applying\n",
|
||||
"5. **Save outputs** - Needed for Helm deployment\n",
|
||||
"\n",
|
||||
"### 🎯 Next Steps\n",
|
||||
"\n",
|
||||
"Proceed to **03_helm_install_langsmith.ipynb** to install LangSmith using Helm.\n",
|
||||
"\n",
|
||||
"**Important:** Make sure you have:\n",
|
||||
"- ✅ Terraform outputs saved\n",
|
||||
"- ✅ Cluster accessible via kubectl\n",
|
||||
"- ✅ LangSmith license key ready\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,727 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Module 1: Helm - Installing LangSmith\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"\n",
|
||||
"This notebook walks through installing LangSmith using the **official `langchain-ai/helm` chart**.\n",
|
||||
"\n",
|
||||
"### Key Principles\n",
|
||||
"\n",
|
||||
"- ✅ Use the **official** Helm chart (do not fork)\n",
|
||||
"- ✅ Pin chart versions for reproducibility\n",
|
||||
"- ✅ Create a minimal, sane values file\n",
|
||||
"- ✅ Inject required secrets properly\n",
|
||||
"- ✅ Render templates before install\n",
|
||||
"- ✅ Understand that \"helm install succeeded\" ≠ \"system is healthy\"\n",
|
||||
"\n",
|
||||
"### What We'll Install\n",
|
||||
"\n",
|
||||
"- LangSmith application components\n",
|
||||
"- External service connections (RDS, Redis, S3)\n",
|
||||
"- Resource requests & limits\n",
|
||||
"- Ingress configuration\n",
|
||||
"\n",
|
||||
"**Estimated time:** 45-60 minutes\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Bootstrap environment\n",
|
||||
"import sys\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so we can import shared as a package\n",
|
||||
"# Find the notebooks directory by looking for the shared folder\n",
|
||||
"possible_paths = [\n",
|
||||
" Path.cwd().parent, # If cwd is module-1, go up one level to notebooks\n",
|
||||
" Path.cwd(), # If cwd is already notebooks\n",
|
||||
" Path.cwd() / \"notebooks\", # If cwd is workspace root\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"notebooks_path = None\n",
|
||||
"for path in possible_paths:\n",
|
||||
" if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" notebooks_path = path\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"if not notebooks_path:\n",
|
||||
" notebooks_path = Path.cwd() / \"notebooks\"\n",
|
||||
" if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so 'shared' can be imported as a package\n",
|
||||
"if str(notebooks_path) not in sys.path:\n",
|
||||
" sys.path.insert(0, str(notebooks_path))\n",
|
||||
"\n",
|
||||
"from shared._bootstrap import bootstrap\n",
|
||||
"\n",
|
||||
"# Run bootstrap\n",
|
||||
"bootstrap_info = bootstrap()\n",
|
||||
"artifacts_dir = Path(bootstrap_info['artifacts_dir'])\n",
|
||||
"print(f\"\\nArtifacts directory: {artifacts_dir}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Understanding the Official Helm Chart\n",
|
||||
"\n",
|
||||
"The `langchain-ai/helm` repository contains the official LangSmith Helm chart. We use the **official** chart because:\n",
|
||||
"\n",
|
||||
"1. **Support:** The support team will expect standard configurations\n",
|
||||
"2. **Updates:** Official charts receive security and feature updates\n",
|
||||
"3. **Documentation:** Official charts are documented and tested\n",
|
||||
"4. **Compatibility:** Ensures compatibility with Terraform outputs\n",
|
||||
"\n",
|
||||
"**Important:** We do **not** fork the upstream repository. We reference it directly.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"import re\n",
|
||||
"from pathlib import Path\n",
|
||||
"from shared._validation import require_env, ok, warn, fail\n",
|
||||
"from shared._shell import run\n",
|
||||
"\n",
|
||||
"def expand_env_vars(path_str: str) -> str:\n",
|
||||
" \"\"\"Expand environment variable references in a path string.\"\"\"\n",
|
||||
" # Expand $VAR and ${VAR} references; variables that are not set are left unchanged\n",
|
||||
" def replace_var(match):\n",
|
||||
" var_name = match.group(1) or match.group(2)\n",
|
||||
" return os.environ.get(var_name, match.group(0))\n",
|
||||
" \n",
|
||||
" # Replace $VAR and ${VAR} patterns\n",
|
||||
" path_str = re.sub(r'\\$\\{([^}]+)\\}|\\$([a-zA-Z_][a-zA-Z0-9_]*)', replace_var, path_str)\n",
|
||||
" return path_str\n",
|
||||
"\n",
|
||||
"# Get required configuration\n",
|
||||
"config = require_env(\n",
|
||||
" \"HELM_CHART_REF\", \"HELM_RELEASE\", \"HELM_NAMESPACE\", \n",
|
||||
" \"CLUSTER_NAME\", \"AWS_REGION\", \"NAMESPACE\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Expand environment variables in the path (e.g., $HELM_REPO_DIR, $HOME)\n",
|
||||
"helm_chart_ref_str = expand_env_vars(config[\"HELM_CHART_REF\"])\n",
|
||||
"helm_chart_ref = Path(helm_chart_ref_str).expanduser().resolve()\n",
|
||||
"\n",
|
||||
"helm_release = config[\"HELM_RELEASE\"]\n",
|
||||
"helm_namespace = config[\"HELM_NAMESPACE\"]\n",
|
||||
"cluster_name = config[\"CLUSTER_NAME\"]\n",
|
||||
"region = config[\"AWS_REGION\"]\n",
|
||||
"namespace = config[\"NAMESPACE\"]\n",
|
||||
"\n",
|
||||
"print(\"### Helm Configuration\")\n",
|
||||
"print(f\"Chart Reference: {helm_chart_ref}\")\n",
|
||||
"print(f\"Release Name: {helm_release}\")\n",
|
||||
"print(f\"Namespace: {helm_namespace}\")\n",
|
||||
"print(f\"Cluster: {cluster_name}\")\n",
|
||||
"print(f\"Region: {region}\\n\")\n",
|
||||
"\n",
|
||||
"if not helm_chart_ref.exists():\n",
|
||||
" fail(f\"Helm chart path does not exist: {helm_chart_ref}\")\n",
|
||||
" print(\"\\n💡 To fix this:\")\n",
|
||||
" print(\" 1. Clone the official Helm repository:\")\n",
|
||||
" print(\" git clone https://github.com/langchain-ai/helm.git <target-directory>\")\n",
|
||||
" print(\" 2. Update HELM_CHART_REF in your .env file to point to:\")\n",
|
||||
" print(f\" HELM_CHART_REF=\\\"<target-directory>/charts/langsmith\\\"\")\n",
|
||||
" print(\" 3. Run this notebook again\")\n",
|
||||
" raise RuntimeError(f\"Helm chart path not found: {helm_chart_ref}\")\n",
|
||||
"\n",
|
||||
"ok(f\"Helm chart path exists: {helm_chart_ref}\")\n",
|
||||
"\n",
|
||||
"# Check Helm version\n",
|
||||
"print(\"\\n### Helm Version\")\n",
|
||||
"result = run([\"helm\", \"version\"], check=True, stream=False)\n",
|
||||
"print(result.stdout)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Discovering the Chart Path\n",
|
||||
"\n",
|
||||
"Verify the Helm chart structure and locate the Chart.yaml file.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Verify Helm chart structure\n",
|
||||
"print(\"### Helm Chart Structure\\n\")\n",
|
||||
"\n",
|
||||
"# Check for Chart.yaml\n",
|
||||
"chart_yaml = helm_chart_ref / \"Chart.yaml\"\n",
|
||||
"if chart_yaml.exists():\n",
|
||||
" ok(\"Found Chart.yaml\")\n",
|
||||
" print(\"\\nChart.yaml contents:\")\n",
|
||||
" print(\"=\" * 60)\n",
|
||||
" with open(chart_yaml) as f:\n",
|
||||
" print(f.read())\n",
|
||||
" print(\"=\" * 60)\n",
|
||||
"else:\n",
|
||||
" warn(\"Chart.yaml not found\")\n",
|
||||
" raise RuntimeError(f\"❌ Invalid Helm chart: {helm_chart_ref}\")\n",
|
||||
"\n",
|
||||
"# Check for values.yaml\n",
|
||||
"values_yaml = helm_chart_ref / \"values.yaml\"\n",
|
||||
"if values_yaml.exists():\n",
|
||||
" ok(\"Found values.yaml (default values)\")\n",
|
||||
"else:\n",
|
||||
" warn(\"values.yaml not found (may be optional)\")\n",
|
||||
"\n",
|
||||
"# List chart files\n",
|
||||
"print(\"\\n### Chart Files\")\n",
|
||||
"chart_files = sorted(helm_chart_ref.glob(\"*\"))\n",
|
||||
"for f in chart_files[:20]: # Show first 20\n",
|
||||
" if f.is_file():\n",
|
||||
" print(f\" 📄 {f.name}\")\n",
|
||||
" elif f.is_dir():\n",
|
||||
" print(f\" 📁 {f.name}/\")\n",
|
||||
"if len(chart_files) > 20:\n",
|
||||
" print(f\" ... and {len(chart_files) - 20} more items\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Pinning Chart Versions\n",
|
||||
"\n",
|
||||
"**Critical:** Always pin Helm chart versions for reproducibility. Check the Chart.yaml for the version.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Extract chart version\n",
|
||||
"import yaml\n",
|
||||
"\n",
|
||||
"with open(chart_yaml) as f:\n",
|
||||
" chart_info = yaml.safe_load(f)\n",
|
||||
"\n",
|
||||
"print(\"### Chart Version Information\\n\")\n",
|
||||
"print(f\"Chart Name: {chart_info.get('name', 'N/A')}\")\n",
|
||||
"print(f\"Chart Version: {chart_info.get('version', 'N/A')}\")\n",
|
||||
"print(f\"App Version: {chart_info.get('appVersion', 'N/A')}\")\n",
|
||||
"print(f\"Description: {chart_info.get('description', 'N/A')[:100]}...\")\n",
|
||||
"\n",
|
||||
"ok(f\"Using chart version: {chart_info.get('version', 'N/A')}\")\n",
|
||||
"print(\"\\n💡 Record this version for reproducibility\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Loading Terraform Outputs\n",
|
||||
"\n",
|
||||
"We need the Terraform outputs from the previous notebook to configure Helm values (RDS, Redis, S3, etc.).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load Terraform outputs\n",
|
||||
"terraform_outputs_file = artifacts_dir / \"terraform-outputs.json\"\n",
|
||||
"\n",
|
||||
"if not terraform_outputs_file.exists():\n",
|
||||
" warn(f\"Terraform outputs file not found: {terraform_outputs_file}\")\n",
|
||||
" print(\"💡 Run notebook 02_terraform_apply.ipynb first to generate outputs\")\n",
|
||||
" terraform_outputs = {}\n",
|
||||
"else:\n",
|
||||
" with open(terraform_outputs_file) as f:\n",
|
||||
" terraform_outputs_raw = json.load(f)\n",
|
||||
" \n",
|
||||
" # Unwrap Terraform output format\n",
|
||||
" terraform_outputs = {}\n",
|
||||
" for key, value in terraform_outputs_raw.items():\n",
|
||||
" if isinstance(value, dict) and \"value\" in value:\n",
|
||||
" terraform_outputs[key] = value[\"value\"]\n",
|
||||
" else:\n",
|
||||
" terraform_outputs[key] = value\n",
|
||||
" \n",
|
||||
" ok(f\"Loaded Terraform outputs from {terraform_outputs_file}\")\n",
|
||||
" print(f\"\\nAvailable outputs: {', '.join(terraform_outputs.keys())}\")\n",
|
||||
" \n",
|
||||
" # Show key outputs (values longer than 50 characters are truncated; nothing is redacted)\n",
|
||||
" print(\"\\n### Key Outputs (for reference):\")\n",
|
||||
" for key in [\"cluster_name\", \"rds_endpoint\", \"redis_endpoint\", \"s3_bucket\"]:\n",
|
||||
" if key in terraform_outputs:\n",
|
||||
" val = str(terraform_outputs[key])\n",
|
||||
" if len(val) > 50:\n",
|
||||
" print(f\" {key}: {val[:50]}...\")\n",
|
||||
" else:\n",
|
||||
" print(f\" {key}: {val}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Creating a Minimal Values File\n",
|
||||
"\n",
|
||||
"We'll create a minimal, sane values file that:\n",
|
||||
"- Connects to external services (RDS, Redis, S3)\n",
|
||||
"- Sets resource requests & limits\n",
|
||||
"- Configures ingress\n",
|
||||
"- Includes required secrets\n",
|
||||
"\n",
|
||||
"**Important:** Start minimal. Add complexity only as needed.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check if values file is specified\n",
|
||||
"values_file_env = os.environ.get(\"VALUES_FILE\", \"\").strip()\n",
|
||||
"\n",
|
||||
"if values_file_env:\n",
|
||||
" values_file_path = Path(values_file_env).expanduser().resolve()\n",
|
||||
" if values_file_path.exists():\n",
|
||||
" ok(f\"Using values file from environment: {values_file_path}\")\n",
|
||||
" print(\"💡 Review the values file to ensure it's configured correctly\")\n",
|
||||
" else:\n",
|
||||
" warn(f\"Values file from environment not found: {values_file_path}\")\n",
|
||||
" print(\"💡 Will need to create a values file\")\n",
|
||||
" values_file_path = None\n",
|
||||
"else:\n",
|
||||
" values_file_path = None\n",
|
||||
" print(\"💡 VALUES_FILE not set in environment\")\n",
|
||||
" print(\" We'll create a minimal values file for this deployment\")\n",
|
||||
"\n",
|
||||
"# If no usable values file was given, default to a path in the artifacts directory (the file itself is not written here)\n",
|
||||
"if not values_file_path:\n",
|
||||
" values_file_path = artifacts_dir / \"langsmith-values.yaml\"\n",
|
||||
" print(f\"\\nWill create values file at: {values_file_path}\")\n",
|
||||
" print(\"💡 You can customize this file before installation\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Injecting Required Secrets\n",
|
||||
"\n",
|
||||
"LangSmith requires several secrets:\n",
|
||||
"- **License key** (required)\n",
|
||||
"- Database credentials (if not using IAM auth)\n",
|
||||
"- Redis password (if not using IAM auth)\n",
|
||||
"- S3 credentials (if not using IAM roles)\n",
|
||||
"\n",
|
||||
"Let's prepare the secrets.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check for required secrets\n",
|
||||
"print(\"### Required Secrets\\n\")\n",
|
||||
"\n",
|
||||
"# License key (required)\n",
|
||||
"license_key = os.environ.get(\"LANGSMITH_LICENSE_KEY\", \"\").strip()\n",
|
||||
"if license_key:\n",
|
||||
" ok(\"LANGSMITH_LICENSE_KEY is set\")\n",
|
||||
" print(\"💡 License key will be used to create Kubernetes secret\")\n",
|
||||
"else:\n",
|
||||
" warn(\"LANGSMITH_LICENSE_KEY not set\")\n",
|
||||
" print(\"💡 You must set LANGSMITH_LICENSE_KEY in your .env file\")\n",
|
||||
" print(\" Get your license key from LangSmith support\")\n",
|
||||
"\n",
|
||||
"# Database credentials (may be optional if using IAM auth)\n",
|
||||
"db_user = os.environ.get(\"DB_USER\", \"\").strip()\n",
|
||||
"db_password = os.environ.get(\"DB_PASSWORD\", \"\").strip()\n",
|
||||
"if db_user and db_password:\n",
|
||||
" ok(\"Database credentials are set\")\n",
|
||||
"else:\n",
|
||||
" print(\"💡 Database credentials may be optional if using IAM authentication\")\n",
|
||||
" print(\" Check your Terraform outputs for connection details\")\n",
|
||||
"\n",
|
||||
"# Redis password (may be optional if using IAM auth)\n",
|
||||
"redis_password = os.environ.get(\"REDIS_PASSWORD\", \"\").strip()\n",
|
||||
"if redis_password:\n",
|
||||
" ok(\"Redis password is set\")\n",
|
||||
"else:\n",
|
||||
" print(\"💡 Redis password may be optional if using IAM authentication\")\n",
|
||||
"\n",
|
||||
"print(\"\\n💡 Secrets will be created as Kubernetes secrets before Helm install\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Preparing Kubernetes Namespace\n",
|
||||
"\n",
|
||||
"Create the namespace if it doesn't exist.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from shared._k8s_helpers import namespace_exists, kubectl\n",
|
||||
"from shared._aws_helpers import aws_region\n",
|
||||
"\n",
|
||||
"# Ensure kubectl is configured\n",
|
||||
"region = aws_region()\n",
|
||||
"run(\n",
|
||||
" [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Create namespace if needed\n",
|
||||
"print(f\"### Preparing Namespace: {namespace}\\n\")\n",
|
||||
"\n",
|
||||
"if namespace_exists(namespace):\n",
|
||||
" ok(f\"Namespace '{namespace}' already exists\")\n",
|
||||
"else:\n",
|
||||
" print(f\"Creating namespace '{namespace}'...\")\n",
|
||||
" kubectl(\"create\", \"namespace\", namespace, check=True, stream=True)\n",
|
||||
" ok(f\"Namespace '{namespace}' created\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Creating Kubernetes Secrets\n",
|
||||
"\n",
|
||||
"Create the required secrets in the namespace.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create secrets\n",
|
||||
"print(\"### Creating Kubernetes Secrets\\n\")\n",
|
||||
"\n",
|
||||
"if not license_key:\n",
|
||||
" raise RuntimeError(\"❌ LANGSMITH_LICENSE_KEY is required\")\n",
|
||||
"\n",
|
||||
"# Create license key secret\n",
|
||||
"print(\"Creating license key secret...\")\n",
|
||||
"run(\n",
|
||||
" [\n",
|
||||
" \"kubectl\", \"create\", \"secret\", \"generic\", \"langsmith-license\",\n",
|
||||
" f\"--from-literal=license-key={license_key}\",\n",
|
||||
" \"-n\", namespace,\n",
|
||||
" \"--dry-run=client\", \"-o\", \"yaml\"\n",
|
||||
" ],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"# Actually create it (remove dry-run)\n",
|
||||
"run(\n",
|
||||
" [\n",
|
||||
" \"kubectl\", \"create\", \"secret\", \"generic\", \"langsmith-license\",\n",
|
||||
" f\"--from-literal=license-key={license_key}\",\n",
|
||||
" \"-n\", namespace\n",
|
||||
" ],\n",
|
||||
" check=False, # May already exist\n",
|
||||
" stream=True\n",
|
||||
")\n",
|
||||
"ok(\"License key secret created/updated\")\n",
|
||||
"\n",
|
||||
"# Create database secret if credentials provided\n",
|
||||
"if db_user and db_password:\n",
|
||||
" print(\"\\nCreating database secret...\")\n",
|
||||
" run(\n",
|
||||
" [\n",
|
||||
" \"kubectl\", \"create\", \"secret\", \"generic\", \"langsmith-db\",\n",
|
||||
" f\"--from-literal=username={db_user}\",\n",
|
||||
" f\"--from-literal=password={db_password}\",\n",
|
||||
" \"-n\", namespace\n",
|
||||
" ],\n",
|
||||
" check=False, # May already exist\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" ok(\"Database secret created/updated\")\n",
|
||||
"else:\n",
|
||||
" print(\"💡 Skipping database secret (using IAM auth or not needed)\")\n",
|
||||
"\n",
|
||||
"# Create Redis secret if password provided\n",
|
||||
"if redis_password:\n",
|
||||
" print(\"\\nCreating Redis secret...\")\n",
|
||||
" run(\n",
|
||||
" [\n",
|
||||
" \"kubectl\", \"create\", \"secret\", \"generic\", \"langsmith-redis\",\n",
|
||||
" f\"--from-literal=password={redis_password}\",\n",
|
||||
" \"-n\", namespace\n",
|
||||
" ],\n",
|
||||
" check=False, # May already exist\n",
|
||||
" stream=True\n",
|
||||
" )\n",
|
||||
" ok(\"Redis secret created/updated\")\n",
|
||||
"else:\n",
|
||||
" print(\"💡 Skipping Redis secret (using IAM auth or not needed)\")\n",
|
||||
"\n",
|
||||
"print(\"\\n✅ Secrets preparation complete\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Rendering Templates Before Install\n",
|
||||
"\n",
|
||||
"**Critical:** Always render Helm templates before installing. This lets you:\n",
|
||||
"- Verify the configuration is correct\n",
|
||||
"- Catch errors before deployment\n",
|
||||
"- Review what will be created\n",
|
||||
"\n",
|
||||
"This is especially important for understanding resource requests & limits.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Render Helm templates\n",
|
||||
"print(\"### Rendering Helm Templates\\n\")\n",
|
||||
"print(\"This shows what Kubernetes resources will be created...\\n\")\n",
|
||||
"\n",
|
||||
"# Use the values file if it exists; otherwise no -f flag is passed and the chart's default values apply\n",
|
||||
"values_arg = []\n",
|
||||
"if values_file_path and values_file_path.exists():\n",
|
||||
" values_arg = [\"-f\", str(values_file_path)]\n",
|
||||
" print(f\"Using values file: {values_file_path}\\n\")\n",
|
||||
"\n",
|
||||
"result = run(\n",
|
||||
" [\n",
|
||||
" \"helm\", \"template\", helm_release, str(helm_chart_ref),\n",
|
||||
" \"-n\", namespace,\n",
|
||||
" *values_arg,\n",
|
||||
" \"--debug\" # Show computed values\n",
|
||||
" ],\n",
|
||||
" check=False, # Don't fail on warnings\n",
|
||||
" stream=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Save rendered templates\n",
|
||||
"rendered_file = artifacts_dir / \"helm-rendered-templates.yaml\"\n",
|
||||
"with open(rendered_file, \"w\") as f:\n",
|
||||
" f.write(result.stdout)\n",
|
||||
" if result.stderr:\n",
|
||||
" f.write(\"\\n\\nSTDERR:\\n\")\n",
|
||||
" f.write(result.stderr)\n",
|
||||
"\n",
|
||||
"print(f\"\\n💡 Rendered templates saved to: {rendered_file}\")\n",
|
||||
"\n",
|
||||
"if result.returncode == 0:\n",
|
||||
" ok(\"Template rendering successful\")\n",
|
||||
" print(\"\\n⚠️ Review the rendered templates above. If they look correct, proceed to install.\")\n",
|
||||
"else:\n",
|
||||
" warn(f\"Template rendering had issues (rc={result.returncode})\")\n",
|
||||
" print(\"💡 Review the errors above before proceeding\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Installing LangSmith with Helm\n",
|
||||
"\n",
|
||||
"**⚠️ WARNING:** This will install LangSmith into your cluster.\n",
|
||||
"\n",
|
||||
"Only proceed if:\n",
|
||||
"1. ✅ You've reviewed the rendered templates\n",
|
||||
"2. ✅ Secrets are created\n",
|
||||
"3. ✅ Values file is correct\n",
|
||||
"4. ✅ Terraform outputs are loaded\n",
|
||||
"\n",
|
||||
"**Estimated installation time:** 5-10 minutes\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Install LangSmith with Helm\n",
|
||||
"# ⚠️ UNCOMMENT THE CODE BELOW TO ACTUALLY INSTALL\n",
|
||||
"# This is commented out by default to prevent accidental deployments\n",
|
||||
"\n",
|
||||
"print(\"### Installing LangSmith with Helm\\n\")\n",
|
||||
"print(\"⚠️ This cell is currently DISABLED to prevent accidental deployments.\\n\")\n",
|
||||
"print(\"To install, uncomment the code below and run this cell.\\n\")\n",
|
||||
"\n",
|
||||
"# UNCOMMENT TO INSTALL:\n",
|
||||
"# print(\"Installing LangSmith... This may take 5-10 minutes.\\n\")\n",
|
||||
"# \n",
|
||||
"# values_arg = []\n",
|
||||
"# if values_file_path and values_file_path.exists():\n",
|
||||
"# values_arg = [\"-f\", str(values_file_path)]\n",
|
||||
"# \n",
|
||||
"# result = run(\n",
|
||||
"# [\n",
|
||||
"# \"helm\", \"install\", helm_release, str(helm_chart_ref),\n",
|
||||
"# \"-n\", namespace,\n",
|
||||
"# \"--create-namespace\",\n",
|
||||
"# *values_arg,\n",
|
||||
"# \"--wait\", # Wait for deployment to be ready\n",
|
||||
"# \"--timeout\", \"10m\"\n",
|
||||
"# ],\n",
|
||||
"# check=False, # Don't fail immediately, we'll check status\n",
|
||||
"# stream=True\n",
|
||||
"# )\n",
|
||||
"# \n",
|
||||
"# # Save install output\n",
|
||||
"# install_file = artifacts_dir / \"helm-install.txt\"\n",
|
||||
"# with open(install_file, \"w\") as f:\n",
|
||||
"# f.write(result.stdout)\n",
|
||||
"# if result.stderr:\n",
|
||||
"# f.write(\"\\n\\nSTDERR:\\n\")\n",
|
||||
"# f.write(result.stderr)\n",
|
||||
"# \n",
|
||||
"# if result.returncode == 0:\n",
|
||||
"# ok(\"Helm install completed\")\n",
|
||||
"# print(f\"\\n💡 Install output saved to: {install_file}\")\n",
|
||||
"# else:\n",
|
||||
"# warn(f\"Helm install had issues (rc={result.returncode})\")\n",
|
||||
"# print(\"💡 Check the output above for errors\")\n",
|
||||
"\n",
|
||||
"print(\"💡 To install, edit this cell and uncomment the code above\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Understanding: \"helm install succeeded\" ≠ \"system is healthy\"\n",
|
||||
"\n",
|
||||
"**Important:** A successful Helm install only means:\n",
|
||||
"- Resources were created\n",
|
||||
"- Helm release is tracked\n",
|
||||
"\n",
|
||||
"It does **not** mean:\n",
|
||||
"- Pods are running\n",
|
||||
"- Services are healthy\n",
|
||||
"- Ingress is working\n",
|
||||
"- Database connections work\n",
|
||||
"\n",
|
||||
"We'll validate system health in the next notebook.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check Helm release status\n",
|
||||
"print(\"### Helm Release Status\\n\")\n",
|
||||
"\n",
|
||||
"result = run(\n",
|
||||
" [\"helm\", \"list\", \"-n\", namespace, \"-o\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"releases = json.loads(result.stdout)\n",
|
||||
"langsmith_releases = [r for r in releases if r.get(\"name\") == helm_release]\n",
|
||||
"\n",
|
||||
"if langsmith_releases:\n",
|
||||
" release = langsmith_releases[0]\n",
|
||||
" print(f\"Release: {release['name']}\")\n",
|
||||
" print(f\"Status: {release['status']}\")\n",
|
||||
" print(f\"Chart: {release['chart']}\")\n",
|
||||
" print(f\"Namespace: {release['namespace']}\")\n",
|
||||
" print(f\"Revision: {release['revision']}\")\n",
|
||||
" \n",
|
||||
" if release['status'] == 'deployed':\n",
|
||||
" ok(\"Helm release is deployed\")\n",
|
||||
" print(\"\\n💡 Remember: 'deployed' doesn't mean healthy!\")\n",
|
||||
" print(\" Proceed to validation notebook to check pod status, ingress, etc.\")\n",
|
||||
" else:\n",
|
||||
" warn(f\"Helm release status: {release['status']}\")\n",
|
||||
"else:\n",
|
||||
" warn(f\"Helm release '{helm_release}' not found\")\n",
|
||||
" print(\"💡 If you just installed, wait a moment and check again\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary\n",
|
||||
"\n",
|
||||
"### ✅ What We Accomplished\n",
|
||||
"\n",
|
||||
"- [ ] Located and verified Helm chart\n",
|
||||
"- [ ] Pinned chart version\n",
|
||||
"- [ ] Loaded Terraform outputs\n",
|
||||
"- [ ] Created/verified values file\n",
|
||||
"- [ ] Created Kubernetes secrets\n",
|
||||
"- [ ] Rendered templates for review\n",
|
||||
"- [ ] Installed LangSmith (if you uncommented the install step)\n",
|
||||
"- [ ] Checked Helm release status\n",
|
||||
"\n",
|
||||
"### 📋 Key Takeaways\n",
|
||||
"\n",
|
||||
"1. **Use official Helm chart** - Don't fork, reference directly\n",
|
||||
"2. **Pin versions** - Ensures reproducibility\n",
|
||||
"3. **Start minimal** - Add complexity only as needed\n",
|
||||
"4. **Render first** - Always render templates before installing\n",
|
||||
"5. **Secrets matter** - Properly inject required secrets\n",
|
||||
"6. **Install ≠ Healthy** - Validation comes next\n",
|
||||
"\n",
|
||||
"### 🎯 Next Steps\n",
|
||||
"\n",
|
||||
"Proceed to **04_validate_ingress_and_ui.ipynb** to validate:\n",
|
||||
"- Pod readiness\n",
|
||||
"- PVC binding\n",
|
||||
"- Ingress provisioning\n",
|
||||
"- Endpoint reachability\n",
|
||||
"- Basic UI availability\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,590 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Module 1: Validation & Go/No-Go Checklist\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"\n",
|
||||
"This notebook validates that your LangSmith deployment is healthy and ready for use. This checklist becomes your **baseline reference** for future troubleshooting.\n",
|
||||
"\n",
|
||||
"### What We'll Validate\n",
|
||||
"\n",
|
||||
"1. ✅ Pod readiness (all pods running)\n",
|
||||
"2. ✅ PVC binding (storage provisioned)\n",
|
||||
"3. ✅ Ingress provisioning (ALB created)\n",
|
||||
"4. ✅ Endpoint reachability (services accessible)\n",
|
||||
"5. ✅ Basic UI availability (web interface works)\n",
|
||||
"\n",
|
||||
"### Why This Matters\n",
|
||||
"\n",
|
||||
"Most issues are caught here, before real users onboard. This validation ensures you're on a **supported path**.\n",
|
||||
"\n",
|
||||
"**Estimated time:** 20-30 minutes\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Bootstrap environment\n",
|
||||
"import sys\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so we can import shared as a package\n",
|
||||
"# Find the notebooks directory by looking for the shared folder\n",
|
||||
"possible_paths = [\n",
|
||||
" Path.cwd().parent, # If cwd is module-1, go up one level to notebooks\n",
|
||||
" Path.cwd(), # If cwd is already notebooks\n",
|
||||
" Path.cwd() / \"notebooks\", # If cwd is workspace root\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"notebooks_path = None\n",
|
||||
"for path in possible_paths:\n",
|
||||
" if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" notebooks_path = path\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"if not notebooks_path:\n",
|
||||
" notebooks_path = Path.cwd() / \"notebooks\"\n",
|
||||
" if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so 'shared' can be imported as a package\n",
|
||||
"if str(notebooks_path) not in sys.path:\n",
|
||||
" sys.path.insert(0, str(notebooks_path))\n",
|
||||
"\n",
|
||||
"from shared._bootstrap import bootstrap\n",
|
||||
"\n",
|
||||
"# Run bootstrap\n",
|
||||
"bootstrap_info = bootstrap()\n",
|
||||
"artifacts_dir = Path(bootstrap_info['artifacts_dir'])\n",
|
||||
"print(f\"\\nArtifacts directory: {artifacts_dir}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setting Up Cluster Access\n",
|
||||
"\n",
|
||||
"Ensure kubectl is configured for the EKS cluster.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from shared._validation import require_env, ok\n",
|
||||
"from shared._aws_helpers import aws_region\n",
|
||||
"from shared._shell import run\n",
|
||||
"\n",
|
||||
"# Get configuration\n",
|
||||
"config = require_env(\"CLUSTER_NAME\", \"AWS_REGION\", \"NAMESPACE\")\n",
|
||||
"cluster_name = config[\"CLUSTER_NAME\"]\n",
|
||||
"region = aws_region()\n",
|
||||
"namespace = config[\"NAMESPACE\"]\n",
|
||||
"\n",
|
||||
"# Configure kubectl\n",
|
||||
"print(\"### Configuring kubectl\\n\")\n",
|
||||
"run(\n",
|
||||
" [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
|
||||
" check=True,\n",
|
||||
" stream=True\n",
|
||||
")\n",
|
||||
"ok(\"kubectl configured\")\n",
|
||||
"\n",
|
||||
"# Test cluster access\n",
|
||||
"result = run([\"kubectl\", \"cluster-info\"], check=True, stream=False)\n",
|
||||
"print(result.stdout)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1. Pod Readiness Check\n",
|
||||
"\n",
|
||||
"**Critical:** All pods must be in `Running` state with `Ready` status. This is the foundation of a healthy deployment.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from shared._k8s_helpers import get_pods, wait_for_deployments_ready, require_namespace\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"# Ensure namespace exists\n",
|
||||
"require_namespace(namespace)\n",
|
||||
"\n",
|
||||
"# Wait for deployments to be ready (with timeout)\n",
|
||||
"print(\"### Waiting for Deployments to be Ready\\n\")\n",
|
||||
"print(\"This may take a few minutes if pods are still starting...\\n\")\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" wait_for_deployments_ready(namespace, timeout=\"10m\")\n",
|
||||
"except Exception as e:\n",
|
||||
" print(f\"⚠️ Timeout or error waiting for deployments: {e}\")\n",
|
||||
" print(\"💡 Some pods may still be starting. Continuing with status check...\")\n",
|
||||
"\n",
|
||||
"# Get pod status\n",
|
||||
"print(\"\\n### Pod Status\\n\")\n",
|
||||
"pods_output = get_pods(namespace)\n",
|
||||
"print(pods_output)\n",
|
||||
"\n",
|
||||
"# Parse pod status\n",
|
||||
"result = run(\n",
|
||||
" [\"kubectl\", \"get\", \"pods\", \"-n\", namespace, \"-o\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"pods_data = json.loads(result.stdout)\n",
|
||||
"\n",
|
||||
"# Analyze pod status\n",
|
||||
"running = 0\n",
|
||||
"pending = 0\n",
|
||||
"failed = 0\n",
|
||||
"ready = 0\n",
|
||||
"total = len(pods_data.get(\"items\", []))\n",
|
||||
"\n",
|
||||
"for pod in pods_data.get(\"items\", []):\n",
|
||||
" status = pod.get(\"status\", {})\n",
|
||||
" phase = status.get(\"phase\", \"Unknown\")\n",
|
||||
" conditions = status.get(\"conditions\", [])\n",
|
||||
" \n",
|
||||
" if phase == \"Running\":\n",
|
||||
" running += 1\n",
|
||||
" # Check ready condition\n",
|
||||
" for cond in conditions:\n",
|
||||
" if cond.get(\"type\") == \"Ready\" and cond.get(\"status\") == \"True\":\n",
|
||||
" ready += 1\n",
|
||||
" break\n",
|
||||
" elif phase == \"Pending\":\n",
|
||||
" pending += 1\n",
|
||||
" elif phase == \"Failed\":\n",
|
||||
" failed += 1\n",
|
||||
"\n",
|
||||
"print(f\"\\n### Pod Summary\")\n",
|
||||
"print(f\"Total pods: {total}\")\n",
|
||||
"print(f\"Running: {running}\")\n",
|
||||
"print(f\"Ready: {ready}\")\n",
|
||||
"print(f\"Pending: {pending}\")\n",
|
||||
"print(f\"Failed: {failed}\")\n",
|
||||
"\n",
|
||||
"if ready == total and total > 0:\n",
|
||||
" ok(f\"All {total} pods are ready\")\n",
|
||||
"elif running == total and total > 0:\n",
|
||||
" warn(f\"All pods running but {total - ready} not ready yet\")\n",
|
||||
"else:\n",
|
||||
" warn(f\"Pod status: {running}/{total} running, {ready}/{total} ready\")\n",
|
||||
" if pending > 0:\n",
|
||||
" print(\"💡 Some pods are still pending. Check events for issues:\")\n",
|
||||
" run([\"kubectl\", \"get\", \"events\", \"-n\", namespace, \"--sort-by=.lastTimestamp\"], check=False, stream=True)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check PVC status\n",
|
||||
"print(\"### Persistent Volume Claims Status\\n\")\n",
|
||||
"\n",
|
||||
"result = run(\n",
|
||||
" [\"kubectl\", \"get\", \"pvc\", \"-n\", namespace, \"-o\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"pvc_data = json.loads(result.stdout)\n",
|
||||
"\n",
|
||||
"# Display PVCs\n",
|
||||
"print(\"PVC Details:\")\n",
|
||||
"print(\"=\" * 80)\n",
|
||||
"run([\"kubectl\", \"get\", \"pvc\", \"-n\", namespace, \"-o\", \"wide\"], check=True, stream=True)\n",
|
||||
"print(\"=\" * 80)\n",
|
||||
"\n",
|
||||
"# Analyze PVC status\n",
|
||||
"bound = 0\n",
|
||||
"pending = 0\n",
|
||||
"total = len(pvc_data.get(\"items\", []))\n",
|
||||
"\n",
|
||||
"for pvc in pvc_data.get(\"items\", []):\n",
|
||||
" status = pvc.get(\"status\", {})\n",
|
||||
" phase = status.get(\"phase\", \"Unknown\")\n",
|
||||
" \n",
|
||||
" if phase == \"Bound\":\n",
|
||||
" bound += 1\n",
|
||||
" elif phase == \"Pending\":\n",
|
||||
" pending += 1\n",
|
||||
" # Show details for pending PVCs\n",
|
||||
" name = pvc.get(\"metadata\", {}).get(\"name\", \"unknown\")\n",
|
||||
" print(f\"\\n⚠️ PVC '{name}' is Pending\")\n",
|
||||
" print(\" Common causes:\")\n",
|
||||
" print(\" - EBS CSI driver not installed\")\n",
|
||||
" print(\" - No StorageClass available\")\n",
|
||||
" print(\" - Insufficient storage quota\")\n",
|
||||
"\n",
|
||||
"print(f\"\\n### PVC Summary\")\n",
|
||||
"print(f\"Total PVCs: {total}\")\n",
|
||||
"print(f\"Bound: {bound}\")\n",
|
||||
"print(f\"Pending: {pending}\")\n",
|
||||
"\n",
|
||||
"if bound == total and total > 0:\n",
|
||||
" ok(f\"All {total} PVCs are bound\")\n",
|
||||
"elif pending > 0:\n",
|
||||
" warn(f\"{pending} PVC(s) still pending - storage issue likely\")\n",
|
||||
" print(\"💡 Check EBS CSI driver and StorageClasses\")\n",
|
||||
"else:\n",
|
||||
" ok(\"PVC status looks good\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 3. Ingress Provisioning Check\n",
|
||||
"\n",
|
||||
"**Critical:** The AWS ALB (Application Load Balancer) must be provisioned. This is how external traffic reaches LangSmith.\n",
|
||||
"\n",
|
||||
"Common issue: ALB never appears due to wrong ingress assumptions.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check ingress resources\n",
|
||||
"print(\"### Ingress Resources\\n\")\n",
|
||||
"\n",
|
||||
"# Get ingress\n",
|
||||
"result = run(\n",
|
||||
" [\"kubectl\", \"get\", \"ingress\", \"-n\", namespace, \"-o\", \"json\"],\n",
|
||||
" check=False, # May not exist yet\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if result.returncode == 0:\n",
|
||||
" ingress_data = json.loads(result.stdout)\n",
|
||||
" ingresses = ingress_data.get(\"items\", [])\n",
|
||||
" \n",
|
||||
" if ingresses:\n",
|
||||
" print(\"Ingress Details:\")\n",
|
||||
" print(\"=\" * 80)\n",
|
||||
" run([\"kubectl\", \"get\", \"ingress\", \"-n\", namespace, \"-o\", \"wide\"], check=True, stream=True)\n",
|
||||
" print(\"=\" * 80)\n",
|
||||
" \n",
|
||||
" for ingress in ingresses:\n",
|
||||
" name = ingress.get(\"metadata\", {}).get(\"name\", \"unknown\")\n",
|
||||
" status = ingress.get(\"status\", {})\n",
|
||||
" load_balancer = status.get(\"loadBalancer\", {})\n",
|
||||
" ingress_hosts = []\n",
|
||||
" \n",
|
||||
" # Get ingress hosts\n",
|
||||
" rules = ingress.get(\"spec\", {}).get(\"rules\", [])\n",
|
||||
" for rule in rules:\n",
|
||||
" host = rule.get(\"host\", \"\")\n",
|
||||
" if host:\n",
|
||||
" ingress_hosts.append(host)\n",
|
||||
" \n",
|
||||
" print(f\"\\nIngress: {name}\")\n",
|
||||
" if ingress_hosts:\n",
|
||||
" print(f\" Hosts: {', '.join(ingress_hosts)}\")\n",
|
||||
" \n",
|
||||
" # Check for ALB address\n",
|
||||
" if load_balancer.get(\"ingress\"):\n",
|
||||
" alb_addresses = [ing.get(\"hostname\", ing.get(\"ip\", \"\")) for ing in load_balancer[\"ingress\"]]\n",
|
||||
" if alb_addresses:\n",
|
||||
" ok(f\"ALB provisioned: {', '.join(alb_addresses)}\")\n",
|
||||
" print(f\" 💡 Access LangSmith at: https://{alb_addresses[0]}\")\n",
|
||||
" else:\n",
|
||||
" warn(\"ALB ingress entry exists but no address found\")\n",
|
||||
" else:\n",
|
||||
" warn(\"ALB not yet provisioned (may take a few minutes)\")\n",
|
||||
" print(\" 💡 Wait a few minutes and check again\")\n",
|
||||
" else:\n",
|
||||
" warn(\"No ingress resources found\")\n",
|
||||
" print(\"💡 Ingress may not be configured in Helm values\")\n",
|
||||
"else:\n",
|
||||
" warn(\"Could not retrieve ingress resources\")\n",
|
||||
" print(\"💡 Ingress may not exist yet or namespace is incorrect\")\n",
|
||||
"\n",
|
||||
"# Also check for ALB Ingress Controller\n",
|
||||
"print(\"\\n### ALB Ingress Controller\\n\")\n",
|
||||
"result = run(\n",
|
||||
" [\"kubectl\", \"get\", \"pods\", \"-n\", \"kube-system\", \"-l\", \"app.kubernetes.io/name=aws-load-balancer-controller\", \"-o\", \"json\"],\n",
|
||||
" check=False,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if result.returncode == 0:\n",
|
||||
" controller_data = json.loads(result.stdout)\n",
|
||||
" controllers = controller_data.get(\"items\", [])\n",
|
||||
" if controllers:\n",
|
||||
" ok(f\"ALB Ingress Controller found ({len(controllers)} pod(s))\")\n",
|
||||
" else:\n",
|
||||
" warn(\"ALB Ingress Controller not found\")\n",
|
||||
" print(\"💡 ALB Ingress Controller must be installed for ingress to work\")\n",
|
||||
"else:\n",
|
||||
" warn(\"Could not check ALB Ingress Controller status\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 4. Endpoint Reachability Check\n",
|
||||
"\n",
|
||||
"Verify that services are accessible and responding. We'll check:\n",
|
||||
"- Service endpoints\n",
|
||||
"- Health check endpoints (if available)\n",
|
||||
"- Internal service connectivity\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check services\n",
|
||||
"print(\"### Service Endpoints\\n\")\n",
|
||||
"\n",
|
||||
"result = run(\n",
|
||||
" [\"kubectl\", \"get\", \"svc\", \"-n\", namespace, \"-o\", \"json\"],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"services_data = json.loads(result.stdout)\n",
|
||||
"\n",
|
||||
"print(\"Services:\")\n",
|
||||
"print(\"=\" * 80)\n",
|
||||
"run([\"kubectl\", \"get\", \"svc\", \"-n\", namespace], check=True, stream=True)\n",
|
||||
"print(\"=\" * 80)\n",
|
||||
"\n",
|
||||
"services = services_data.get(\"items\", [])\n",
|
||||
"if services:\n",
|
||||
" ok(f\"Found {len(services)} service(s)\")\n",
|
||||
" \n",
|
||||
" # Check for LoadBalancer services\n",
|
||||
" lb_services = [svc for svc in services if svc.get(\"spec\", {}).get(\"type\") == \"LoadBalancer\"]\n",
|
||||
" if lb_services:\n",
|
||||
" print(f\"\\nLoadBalancer services: {len(lb_services)}\")\n",
|
||||
" for svc in lb_services:\n",
|
||||
" name = svc.get(\"metadata\", {}).get(\"name\", \"unknown\")\n",
|
||||
" status = svc.get(\"status\", {}).get(\"loadBalancer\", {})\n",
|
||||
" if status.get(\"ingress\"):\n",
|
||||
" lb_address = status[\"ingress\"][0].get(\"hostname\") or status[\"ingress\"][0].get(\"ip\")\n",
|
||||
" ok(f\"Service '{name}' has LoadBalancer: {lb_address}\")\n",
|
||||
" else:\n",
|
||||
" warn(f\"Service '{name}' LoadBalancer pending\")\n",
|
||||
" \n",
|
||||
" # Test internal connectivity (if we can exec into a pod)\n",
|
||||
" print(\"\\n### Testing Internal Service Connectivity\\n\")\n",
|
||||
" # Try to find a pod we can exec into\n",
|
||||
" result = run(\n",
|
||||
" [\"kubectl\", \"get\", \"pods\", \"-n\", namespace, \"-o\", \"jsonpath={.items[0].metadata.name}\"],\n",
|
||||
" check=False,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" if result.returncode == 0 and result.stdout.strip():\n",
|
||||
" test_pod = result.stdout.strip()\n",
|
||||
" print(f\"Testing connectivity from pod: {test_pod}\")\n",
|
||||
" # Try a simple DNS lookup or curl\n",
|
||||
" # This is a basic check - actual health endpoints depend on the application\n",
|
||||
" print(\"💡 Internal connectivity tests depend on application-specific health endpoints\")\n",
|
||||
"else:\n",
|
||||
" warn(\"No services found\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 5. Basic UI Availability Check\n",
|
||||
"\n",
|
||||
"**Final validation:** Can we actually access the LangSmith UI through the ingress?\n",
|
||||
"\n",
|
||||
"This is the ultimate test - if the UI loads, everything is working.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"from urllib.parse import urlparse\n",
|
||||
"\n",
|
||||
"# Get ingress hostname\n",
|
||||
"print(\"### UI Availability Check\\n\")\n",
|
||||
"\n",
|
||||
"result = run(\n",
|
||||
" [\"kubectl\", \"get\", \"ingress\", \"-n\", namespace, \"-o\", \"jsonpath={.items[0].status.loadBalancer.ingress[0].hostname}\"],\n",
|
||||
" check=False,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"if result.returncode == 0 and result.stdout.strip():\n",
|
||||
" ingress_host = result.stdout.strip()\n",
|
||||
" print(f\"Ingress hostname: {ingress_host}\")\n",
|
||||
" \n",
|
||||
" # Try to access the UI (HTTPS)\n",
|
||||
" ui_url = f\"https://{ingress_host}\"\n",
|
||||
" print(f\"\\nTesting UI availability at: {ui_url}\")\n",
|
||||
" print(\"(This may take a moment if ALB is still provisioning...)\\n\")\n",
|
||||
" \n",
|
||||
" try:\n",
|
||||
" # Use a short timeout and allow redirects\n",
|
||||
" response = requests.get(ui_url, timeout=10, allow_redirects=True, verify=False)\n",
|
||||
" if response.status_code == 200:\n",
|
||||
" ok(f\"UI is accessible! Status: {response.status_code}\")\n",
|
||||
" print(f\"💡 Open in browser: {ui_url}\")\n",
|
||||
" elif response.status_code in [301, 302, 307, 308]:\n",
|
||||
" ok(f\"UI redirects (status: {response.status_code}) - likely working\")\n",
|
||||
" print(f\"💡 Redirect location: {response.headers.get('Location', 'N/A')}\")\n",
|
||||
" print(f\"💡 Open in browser: {ui_url}\")\n",
|
||||
" else:\n",
|
||||
" warn(f\"UI returned status {response.status_code}\")\n",
|
||||
" print(\"💡 UI may still be starting or there may be a configuration issue\")\n",
|
||||
" except requests.exceptions.SSLError:\n",
|
||||
" # SSL errors might be expected if using self-signed certs\n",
|
||||
" warn(\"SSL verification failed (may be expected with self-signed certs)\")\n",
|
||||
" print(f\"💡 Try accessing: {ui_url}\")\n",
|
||||
" print(\" Browser may show security warning - this is normal for self-signed certs\")\n",
|
||||
" except requests.exceptions.Timeout:\n",
|
||||
" warn(\"UI request timed out\")\n",
|
||||
" print(\"💡 ALB may still be provisioning, or ingress is not fully configured\")\n",
|
||||
" print(f\" Try again in a few minutes: {ui_url}\")\n",
|
||||
" except requests.exceptions.ConnectionError as e:\n",
|
||||
" warn(f\"Could not connect to UI: {e}\")\n",
|
||||
" print(\"💡 ALB may still be provisioning\")\n",
|
||||
" print(f\" Check AWS console for ALB status, then try: {ui_url}\")\n",
|
||||
" except Exception as e:\n",
|
||||
" warn(f\"Error accessing UI: {e}\")\n",
|
||||
" print(f\"💡 Manual check: Open {ui_url} in a browser\")\n",
|
||||
"else:\n",
|
||||
" warn(\"Could not determine ingress hostname\")\n",
|
||||
" print(\"💡 Ingress may not be provisioned yet\")\n",
|
||||
" print(\" Run the ingress check above and wait for ALB to be created\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Collecting Diagnostic Artifacts\n",
|
||||
"\n",
|
||||
"Save cluster state snapshots for future troubleshooting reference.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"# Create diagnostic snapshot\n",
|
||||
"print(\"### Collecting Diagnostic Artifacts\\n\")\n",
|
||||
"\n",
|
||||
"timestamp = datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
|
||||
"diagnostics_dir = artifacts_dir / f\"diagnostics-{timestamp}\"\n",
|
||||
"diagnostics_dir.mkdir(exist_ok=True)\n",
|
||||
"\n",
|
||||
"print(f\"Saving diagnostics to: {diagnostics_dir}\\n\")\n",
|
||||
"\n",
|
||||
"# Save various cluster states\n",
|
||||
"diagnostics = [\n",
|
||||
" (\"pods\", [\"kubectl\", \"get\", \"pods\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
|
||||
" (\"services\", [\"kubectl\", \"get\", \"svc\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
|
||||
" (\"ingress\", [\"kubectl\", \"get\", \"ingress\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
|
||||
" (\"pvc\", [\"kubectl\", \"get\", \"pvc\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
|
||||
" (\"deployments\", [\"kubectl\", \"get\", \"deployments\", \"-n\", namespace, \"-o\", \"yaml\"]),\n",
|
||||
" (\"events\", [\"kubectl\", \"get\", \"events\", \"-n\", namespace, \"--sort-by=.lastTimestamp\"]),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for name, cmd in diagnostics:\n",
|
||||
" try:\n",
|
||||
" result = run(cmd, check=False, stream=False)\n",
|
||||
" output_file = diagnostics_dir / f\"{name}.txt\"\n",
|
||||
" with open(output_file, \"w\") as f:\n",
|
||||
" f.write(result.stdout)\n",
|
||||
" if result.stderr:\n",
|
||||
" f.write(\"\\n\\nSTDERR:\\n\")\n",
|
||||
" f.write(result.stderr)\n",
|
||||
" print(f\"✅ Saved {name}\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"⚠️ Could not save {name}: {e}\")\n",
|
||||
"\n",
|
||||
"ok(f\"Diagnostics saved to: {diagnostics_dir}\")\n",
|
||||
"print(\"\\n💡 These artifacts can be used for troubleshooting or support tickets\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Go/No-Go Checklist\n",
|
||||
"\n",
|
||||
"Review this checklist. All items should be ✅ before considering the deployment ready.\n",
|
||||
"\n",
|
||||
"### ✅ Validation Checklist\n",
|
||||
"\n",
|
||||
"- [ ] All pods are running and ready\n",
|
||||
"- [ ] All PVCs are bound\n",
|
||||
"- [ ] Ingress/ALB is provisioned\n",
|
||||
"- [ ] Services are accessible\n",
|
||||
"- [ ] UI is reachable (or ALB is provisioning)\n",
|
||||
"- [ ] Diagnostic artifacts collected\n",
|
||||
"\n",
|
||||
"### 🎯 Next Steps\n",
|
||||
"\n",
|
||||
"**If all checks pass:**\n",
|
||||
"- ✅ You have a working baseline deployment\n",
|
||||
"- ✅ You're on a supported path\n",
|
||||
"- ✅ Ready to proceed to Module 2 (SSO/OIDC configuration)\n",
|
||||
"\n",
|
||||
"**If checks fail:**\n",
|
||||
"- Review the warnings above\n",
|
||||
"- Check diagnostic artifacts\n",
|
||||
"- Common issues:\n",
|
||||
" - **PVCs pending:** EBS CSI driver not installed\n",
|
||||
" - **ALB not appearing:** Wrong ingress configuration\n",
|
||||
" - **Pods not ready:** Check events and logs\n",
|
||||
" - **UI not accessible:** Wait for ALB provisioning (can take 5-10 minutes)\n",
|
||||
"\n",
|
||||
"### 📋 Baseline Reference\n",
|
||||
"\n",
|
||||
"This validation checklist becomes your **baseline reference** for future troubleshooting. Save the diagnostic artifacts and refer back to this state when investigating issues.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,406 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Module 1: Teardown & Cleanup\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"\n",
|
||||
"This notebook helps you clean up the resources created during Module 1. This is important to:\n",
|
||||
"- Avoid ongoing AWS costs\n",
|
||||
"- Clean up test environments\n",
|
||||
"- Practice proper resource lifecycle management\n",
|
||||
"\n",
|
||||
"### ⚠️ Warning\n",
|
||||
"\n",
|
||||
"This will **destroy** your LangSmith deployment and associated AWS resources. Only run this if you're sure you want to remove everything.\n",
|
||||
"\n",
|
||||
"**What will be destroyed:**\n",
|
||||
"- Helm release (LangSmith application)\n",
|
||||
"- Terraform-managed infrastructure (EKS, RDS, ElastiCache, S3, etc.)\n",
|
||||
"- All associated data\n",
|
||||
"\n",
|
||||
"**Estimated time:** 30-45 minutes\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Bootstrap environment\n",
|
||||
"import sys\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so we can import shared as a package\n",
|
||||
"# Find the notebooks directory by looking for the shared folder\n",
|
||||
"possible_paths = [\n",
|
||||
" Path.cwd().parent, # If cwd is module-1, go up one level to notebooks\n",
|
||||
" Path.cwd(), # If cwd is already notebooks\n",
|
||||
" Path.cwd() / \"notebooks\", # If cwd is workspace root\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"notebooks_path = None\n",
|
||||
"for path in possible_paths:\n",
|
||||
" if path and (path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" notebooks_path = path\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"if not notebooks_path:\n",
|
||||
" notebooks_path = Path.cwd() / \"notebooks\"\n",
|
||||
" if not (notebooks_path / \"shared\" / \"_bootstrap.py\").exists():\n",
|
||||
" raise RuntimeError(f\"Could not find notebooks/shared directory. Current dir: {Path.cwd()}\")\n",
|
||||
"\n",
|
||||
"# Add notebooks directory to path so 'shared' can be imported as a package\n",
|
||||
"if str(notebooks_path) not in sys.path:\n",
|
||||
" sys.path.insert(0, str(notebooks_path))\n",
|
||||
"\n",
|
||||
"from shared._bootstrap import bootstrap\n",
|
||||
"\n",
|
||||
"# Run bootstrap\n",
|
||||
"bootstrap_info = bootstrap()\n",
|
||||
"artifacts_dir = Path(bootstrap_info['artifacts_dir'])\n",
|
||||
"print(f\"\\nArtifacts directory: {artifacts_dir}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Confirmation\n",
|
||||
"\n",
|
||||
"**⚠️ READ THIS CAREFULLY**\n",
|
||||
"\n",
|
||||
"Before proceeding, confirm:\n",
|
||||
"1. ✅ You want to destroy all resources\n",
|
||||
"2. ✅ You've backed up any important data\n",
|
||||
"3. ✅ You understand this cannot be undone\n",
|
||||
"4. ✅ You're using the correct AWS account/region\n",
|
||||
"\n",
|
||||
"**Double-check your AWS account and region before proceeding!**\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from shared._validation import require_env\n",
|
||||
"from shared._aws_helpers import aws_region, sts_identity\n",
|
||||
"\n",
|
||||
"# Show current AWS session\n",
|
||||
"config = require_env(\"CLUSTER_NAME\", \"AWS_REGION\", \"NAMESPACE\", \"HELM_RELEASE\")\n",
|
||||
"region = aws_region()\n",
|
||||
"identity = sts_identity()\n",
|
||||
"\n",
|
||||
"print(\"### Current AWS Session\")\n",
|
||||
"print(\"=\" * 60)\n",
|
||||
"print(f\"Account ID: {identity['Account']}\")\n",
|
||||
"print(f\"Region: {region}\")\n",
|
||||
"print(f\"User ARN: {identity['Arn']}\")\n",
|
||||
"print(\"=\" * 60)\n",
|
||||
"\n",
|
||||
"print(f\"\\n### Resources to be Destroyed\")\n",
|
||||
"print(f\"Cluster: {config['CLUSTER_NAME']}\")\n",
|
||||
"print(f\"Namespace: {config['NAMESPACE']}\")\n",
|
||||
"print(f\"Helm Release: {config['HELM_RELEASE']}\")\n",
|
||||
"print(\"=\" * 60)\n",
|
||||
"\n",
|
||||
"print(\"\\n⚠️ VERIFY THE ABOVE INFORMATION IS CORRECT!\")\n",
|
||||
"print(\"💡 If this is the wrong account/region, STOP NOW and update your .env file\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 1: Uninstall Helm Release\n",
|
||||
"\n",
|
||||
"First, we'll uninstall the LangSmith Helm release. This removes the application but leaves the infrastructure.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from shared._shell import run\n",
|
||||
"from shared._aws_helpers import aws_region\n",
|
||||
"\n",
|
||||
"cluster_name = config[\"CLUSTER_NAME\"]\n",
|
||||
"namespace = config[\"NAMESPACE\"]\n",
|
||||
"helm_release = config[\"HELM_RELEASE\"]\n",
|
||||
"region = aws_region()\n",
|
||||
"\n",
|
||||
"# Ensure kubectl is configured\n",
|
||||
"print(\"### Configuring kubectl\\n\")\n",
|
||||
"run(\n",
|
||||
" [\"aws\", \"eks\", \"update-kubeconfig\", \"--name\", cluster_name, \"--region\", region],\n",
|
||||
" check=True,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Check if Helm release exists\n",
|
||||
"print(f\"\\n### Checking Helm Release: {helm_release}\\n\")\n",
|
||||
"result = run(\n",
|
||||
" [\"helm\", \"list\", \"-n\", namespace, \"-o\", \"json\"],\n",
|
||||
" check=False,\n",
|
||||
" stream=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"import json\n",
|
||||
"releases = json.loads(result.stdout) if result.returncode == 0 else []\n",
|
||||
"langsmith_releases = [r for r in releases if r.get(\"name\") == helm_release]\n",
|
||||
"\n",
|
||||
"if langsmith_releases:\n",
|
||||
" release = langsmith_releases[0]\n",
|
||||
" print(f\"Found Helm release: {release['name']}\")\n",
|
||||
" print(f\"Status: {release['status']}\")\n",
|
||||
" print(f\"Chart: {release['chart']}\")\n",
|
||||
" \n",
|
||||
" print(f\"\\n⚠️ UNCOMMENT THE CODE BELOW TO UNINSTALL HELM RELEASE\")\n",
|
||||
" print(\"This will remove the LangSmith application from the cluster.\\n\")\n",
|
||||
" \n",
|
||||
" # UNCOMMENT TO UNINSTALL:\n",
|
||||
" # print(\"Uninstalling Helm release...\\n\")\n",
|
||||
" # result = run(\n",
|
||||
" # [\"helm\", \"uninstall\", helm_release, \"-n\", namespace],\n",
|
||||
" # check=True,\n",
|
||||
" # stream=True\n",
|
||||
" # )\n",
|
||||
" # print(\"\\n✅ Helm release uninstalled\")\n",
|
||||
"else:\n",
|
||||
" print(f\"Helm release '{helm_release}' not found\")\n",
|
||||
" print(\"💡 It may have already been uninstalled, or the namespace is different\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 2: Clean Up Kubernetes Resources\n",
|
||||
"\n",
|
||||
"Remove any remaining Kubernetes resources (secrets, PVCs, etc.) that might not be cleaned up by Helm.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check for remaining resources\n",
|
||||
"print(\"### Checking for Remaining Kubernetes Resources\\n\")\n",
|
||||
"\n",
|
||||
"# List resources in namespace\n",
|
||||
"resources_to_check = [\n",
|
||||
" (\"pods\", [\"kubectl\", \"get\", \"pods\", \"-n\", namespace]),\n",
|
||||
" (\"services\", [\"kubectl\", \"get\", \"svc\", \"-n\", namespace]),\n",
|
||||
" (\"secrets\", [\"kubectl\", \"get\", \"secrets\", \"-n\", namespace]),\n",
|
||||
" (\"pvc\", [\"kubectl\", \"get\", \"pvc\", \"-n\", namespace]),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"remaining = []\n",
|
||||
"for resource_type, cmd in resources_to_check:\n",
|
||||
" result = run(cmd, check=False, stream=False)\n",
|
||||
" if result.returncode == 0:\n",
|
||||
" lines = result.stdout.strip().split('\\n')\n",
|
||||
" # Skip header line\n",
|
||||
" if len(lines) > 1:\n",
|
||||
" remaining.append(resource_type)\n",
|
||||
" print(f\"⚠️ Found {len(lines) - 1} {resource_type}(s)\")\n",
|
||||
"\n",
|
||||
"if remaining:\n",
|
||||
" print(f\"\\n💡 The following resource types still exist: {', '.join(remaining)}\")\n",
|
||||
" print(\" You may want to clean these up manually:\")\n",
|
||||
" print(f\" kubectl delete all --all -n {namespace}\")\n",
|
||||
" print(f\" kubectl delete pvc --all -n {namespace}\")\n",
|
||||
" print(f\" kubectl delete secrets --all -n {namespace}\")\n",
|
||||
"else:\n",
|
||||
" print(\"✅ No remaining resources found (or namespace is empty)\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 3: Destroy Terraform Infrastructure\n",
|
||||
"\n",
|
||||
"**⚠️ CRITICAL:** This will destroy all AWS infrastructure including:\n",
|
||||
"- EKS cluster\n",
|
||||
"- RDS PostgreSQL database (and all data)\n",
|
||||
"- ElastiCache Redis (and all data)\n",
|
||||
"- S3 buckets (and all data)\n",
|
||||
"- IAM roles and policies\n",
|
||||
"- VPC resources (if managed by Terraform)\n",
|
||||
"\n",
|
||||
"**This cannot be undone!**\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"terraform_dir = Path(config.get(\"TERRAFORM_DIR\", \"\")).expanduser().resolve()\n",
|
||||
"\n",
|
||||
"if not terraform_dir.exists():\n",
|
||||
" print(f\"⚠️ Terraform directory not found: {terraform_dir}\")\n",
|
||||
" print(\"💡 Update TERRAFORM_DIR in your .env file, or destroy infrastructure manually\")\n",
|
||||
"else:\n",
|
||||
" print(f\"### Terraform Directory: {terraform_dir}\\n\")\n",
|
||||
" \n",
|
||||
" # Check Terraform state\n",
|
||||
" print(\"Checking Terraform state...\\n\")\n",
|
||||
" result = run(\n",
|
||||
" [\"terraform\", \"show\", \"-json\"],\n",
|
||||
" cwd=str(terraform_dir),\n",
|
||||
" check=False,\n",
|
||||
" stream=False\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" if result.returncode == 0:\n",
|
||||
" state_data = json.loads(result.stdout)\n",
|
||||
" if state_data.get(\"values\") and state_data[\"values\"].get(\"root_module\"):\n",
|
||||
" resources = state_data[\"values\"][\"root_module\"].get(\"resources\", [])\n",
|
||||
" print(f\"Found {len(resources)} resources in Terraform state\")\n",
|
||||
" print(\"⚠️ These will all be destroyed!\\n\")\n",
|
||||
" else:\n",
|
||||
" print(\"Terraform state appears empty or not initialized\")\n",
|
||||
" else:\n",
|
||||
" print(\"Could not read Terraform state\")\n",
|
||||
" print(\"💡 Terraform may not be initialized, or state file doesn't exist\")\n",
|
||||
" \n",
|
||||
" print(\"⚠️ UNCOMMENT THE CODE BELOW TO DESTROY TERRAFORM INFRASTRUCTURE\")\n",
|
||||
" print(\"This will destroy ALL resources managed by Terraform.\\n\")\n",
|
||||
" \n",
|
||||
" # UNCOMMENT TO DESTROY:\n",
|
||||
" # print(\"Destroying Terraform infrastructure...\")\n",
|
||||
" # print(\"This will take 15-30 minutes...\\n\")\n",
|
||||
" # \n",
|
||||
" # result = run(\n",
|
||||
" # [\"terraform\", \"destroy\", \"-auto-approve\"],\n",
|
||||
" # cwd=str(terraform_dir),\n",
|
||||
" # check=False, # Don't fail on errors, we'll check return code\n",
|
||||
" # stream=True\n",
|
||||
" # )\n",
|
||||
" # \n",
|
||||
" # # Save destroy output\n",
|
||||
" # destroy_file = artifacts_dir / \"terraform-destroy.txt\"\n",
|
||||
" # with open(destroy_file, \"w\") as f:\n",
|
||||
" # f.write(result.stdout)\n",
|
||||
" # if result.stderr:\n",
|
||||
" # f.write(\"\\n\\nSTDERR:\\n\")\n",
|
||||
" # f.write(result.stderr)\n",
|
||||
" # \n",
|
||||
" # if result.returncode == 0:\n",
|
||||
" # print(\"\\n✅ Terraform destroy completed successfully\")\n",
|
||||
" # print(f\"💡 Destroy output saved to: {destroy_file}\")\n",
|
||||
" # else:\n",
|
||||
" # print(f\"\\n⚠️ Terraform destroy had issues (rc={result.returncode})\")\n",
|
||||
" # print(\"💡 Review the output above for errors\")\n",
|
||||
" # print(f\" Destroy output saved to: {destroy_file}\")\n",
|
||||
" \n",
|
||||
" print(\"💡 To destroy, edit this cell and uncomment the code above\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 4: Verify Cleanup\n",
|
||||
"\n",
|
||||
"After teardown, verify that resources have been removed.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from shared._aws_helpers import eks_cluster_exists\n",
|
||||
"\n",
|
||||
"print(\"### Verifying Cleanup\\n\")\n",
|
||||
"\n",
|
||||
"# Check if cluster still exists\n",
|
||||
"cluster_name = config[\"CLUSTER_NAME\"]\n",
|
||||
"region = aws_region()\n",
|
||||
"\n",
|
||||
"if eks_cluster_exists(cluster_name):\n",
|
||||
" warn(f\"Cluster '{cluster_name}' still exists\")\n",
|
||||
" print(\"💡 Terraform destroy may not have completed, or cluster was created outside Terraform\")\n",
|
||||
"else:\n",
|
||||
" ok(f\"Cluster '{cluster_name}' does not exist (destroyed or never created)\")\n",
|
||||
"\n",
|
||||
"# Check for remaining S3 buckets (if we know the bucket name)\n",
|
||||
"print(\"\\n### S3 Buckets\\n\")\n",
|
||||
"print(\"💡 Check AWS console for any remaining S3 buckets\")\n",
|
||||
"print(\" Terraform should have destroyed buckets it created, but verify manually\")\n",
|
||||
"\n",
|
||||
"# Check for remaining RDS instances\n",
|
||||
"print(\"\\n### RDS Instances\\n\")\n",
|
||||
"print(\"💡 Check AWS console for any remaining RDS instances\")\n",
|
||||
"print(\" Terraform should have destroyed RDS instances it created\")\n",
|
||||
"\n",
|
||||
"# Check for remaining ElastiCache clusters\n",
|
||||
"print(\"\\n### ElastiCache Clusters\\n\")\n",
|
||||
"print(\"💡 Check AWS console for any remaining ElastiCache clusters\")\n",
|
||||
"print(\" Terraform should have destroyed ElastiCache clusters it created\")\n",
|
||||
"\n",
|
||||
"print(\"\\n✅ Cleanup verification complete\")\n",
|
||||
"print(\"💡 Review AWS console to ensure all resources are removed\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary\n",
|
||||
"\n",
|
||||
"### ✅ Teardown Checklist\n",
|
||||
"\n",
|
||||
"- [ ] Helm release uninstalled\n",
|
||||
"- [ ] Kubernetes resources cleaned up\n",
|
||||
"- [ ] Terraform infrastructure destroyed\n",
|
||||
"- [ ] EKS cluster removed\n",
|
||||
"- [ ] RDS instance removed\n",
|
||||
"- [ ] ElastiCache cluster removed\n",
|
||||
"- [ ] S3 buckets removed (or emptied)\n",
|
||||
"- [ ] AWS console verified (no remaining resources)\n",
|
||||
"\n",
|
||||
"### 💡 Important Notes\n",
|
||||
"\n",
|
||||
"1. **Data Loss:** All data in RDS, ElastiCache, and S3 has been permanently deleted\n",
|
||||
"2. **Costs:** You should see AWS costs stop accruing within 24 hours\n",
|
||||
"3. **Artifacts:** Diagnostic artifacts in `artifacts/` directory are preserved for reference\n",
|
||||
"4. **Re-deployment:** You can re-run Module 1 notebooks to create a fresh deployment\n",
|
||||
"\n",
|
||||
"### 🎯 Next Steps\n",
|
||||
"\n",
|
||||
"If you want to start over:\n",
|
||||
"1. Review and update your `.env` file\n",
|
||||
"2. Run `01_aws_preflight.ipynb` again\n",
|
||||
"3. Proceed through the module notebooks\n",
|
||||
"\n",
|
||||
"**Thank you for completing Module 1!**\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
from __future__ import annotations
|
||||
import os
|
||||
from typing import Optional
|
||||
from ._shell import run
|
||||
from ._validation import ok, warn
|
||||
|
||||
def aws_region() -> str:
    """Return the AWS region the helpers should target.

    The first non-blank value among ``AWS_REGION`` and ``AWS_DEFAULT_REGION``
    (in that order, with surrounding whitespace stripped) wins; when neither
    is set the function falls back to ``"us-west-2"``.
    """
    for variable in ("AWS_REGION", "AWS_DEFAULT_REGION"):
        candidate = os.environ.get(variable, "").strip()
        if candidate:
            return candidate
    return "us-west-2"
|
||||
|
||||
def sts_identity() -> dict:
    """Return the parsed JSON output of ``aws sts get-caller-identity``.

    The command is executed with ``check=True`` and without live streaming,
    so a non-zero exit status is reported by ``run`` rather than returned.
    """
    import json

    completed = run(
        ["aws", "sts", "get-caller-identity", "--output", "json"],
        check=True,
        stream=False,
    )
    return json.loads(completed.stdout)
|
||||
|
||||
def assert_account(expected_account_id: Optional[str]) -> None:
    """Guard against operating on the wrong AWS account.

    Does nothing when ``expected_account_id`` is ``None`` or empty. Otherwise
    the ``Account`` field of the current caller identity is compared with it.

    Raises:
        RuntimeError: if the active account differs from the expected one.
    """
    if expected_account_id:
        account = sts_identity().get("Account", "")
        if account != expected_account_id:
            raise RuntimeError(f"❌ AWS account mismatch: expected {expected_account_id}, got {account}")
        ok(f"AWS account guardrail matched: {account}")
|
||||
|
||||
def eks_cluster_exists(cluster_name: str) -> bool:
    """Report whether ``aws eks describe-cluster`` can find ``cluster_name``.

    Returns ``True`` when the command exits with status 0. Returns ``False``
    when the output mentions ``ResourceNotFoundException``, and also for any
    other failure -- in that case a warning is printed first because the
    result is inconclusive rather than a confirmed absence.
    """
    describe_cmd = [
        "aws", "eks", "describe-cluster",
        "--name", cluster_name,
        "--region", aws_region(),
        "--output", "json",
    ]
    outcome = run(describe_cmd, check=False, stream=False)
    if outcome.returncode == 0:
        return True

    not_found = any(
        "ResourceNotFoundException" in text
        for text in (outcome.stderr, outcome.stdout)
    )
    if not not_found:
        warn("EKS describe-cluster returned an unexpected error; treat as inconclusive.")
    return False
|
||||
|
||||
def alb_target_health(load_balancer_arn: str) -> str:
    """Return the raw JSON text of ``aws elbv2 describe-target-health``.

    NOTE(review): despite the parameter name, the value is passed to
    ``--target-group-arn``, so callers presumably need to supply a target
    group ARN rather than a load balancer ARN -- confirm against callers.

    The command runs with ``check=False``; whatever the CLI wrote to stdout
    is returned unparsed for the caller to inspect.
    """
    health_cmd = [
        "aws", "elbv2", "describe-target-health",
        "--target-group-arn", load_balancer_arn,
        "--region", aws_region(),
        "--output", "json",
    ]
    outcome = run(health_cmd, check=False, stream=False)
    return outcome.stdout
|
||||
@@ -0,0 +1,182 @@
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Install required packages if not available
|
||||
def _ensure_packages():
    """Install the required Python packages that are not importable yet.

    Each pip distribution is mapped to the module it provides; only the
    distributions whose module cannot be found are passed to ``pip install``.
    When everything is already present no subprocess is started, so importing
    this module stays fast and works without network access.

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits non-zero.
    """
    import importlib.util

    # pip distribution name -> importable module name
    required_packages = {
        "python-dotenv": "dotenv",  # For loading .env files
        "pyyaml": "yaml",           # For parsing YAML files (Chart.yaml, etc.)
        "requests": "requests",     # For HTTP requests (UI validation)
    }
    missing = [
        distribution
        for distribution, module_name in required_packages.items()
        if importlib.util.find_spec(module_name) is None
    ]
    if not missing:
        return

    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *missing])
    # Make the freshly installed modules visible to the import system.
    importlib.invalidate_caches()
|
||||
|
||||
# Runs at import time, before the third-party `dotenv` import below.
_ensure_packages()
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from ._shell import run
|
||||
from ._validation import ok, warn, fail
|
||||
from ._aws_helpers import aws_region, sts_identity
|
||||
|
||||
def load_env(env_file: Optional[str] = None) -> None:
    """
    Populate the process environment from a dotenv-style file.

    When ``env_file`` is omitted, ``.env`` in the notebooks directory (two
    levels above this module) is used, falling back to ``workshop.env`` in
    the same directory. Variables that are already set are left untouched.

    Raises RuntimeError (after printing setup instructions) if the chosen
    file does not exist.
    """
    notebooks_dir = Path(__file__).parent.parent

    if env_file is None:
        # Prefer .env (standard); fall back to workshop.env (Jupyter-friendly)
        standard_env = notebooks_dir / ".env"
        env_file = standard_env if standard_env.exists() else notebooks_dir / "workshop.env"

    env_path = Path(env_file).expanduser().resolve()

    if env_path.exists():
        load_dotenv(env_path, override=False)  # Don't override existing env vars
        ok(f"Loaded environment variables from {env_path.name}")
        return

    # Show the notebooks directory relative to the repo root for a cleaner message
    repo_root = notebooks_dir.parent
    try:
        notebooks_dir_display = notebooks_dir.relative_to(repo_root)
    except ValueError:
        notebooks_dir_display = Path("notebooks")

    print(f"""❌ Environment file not found
💡 To fix this, create one of these files:
Option 1 (via terminal): {notebooks_dir_display}/.env
Option 2 (via Jupyter): {notebooks_dir_display}/workshop.env

Copy a template:
cp env-samples/workshop.env.example {notebooks_dir_display}/.env
# OR
cp env-samples/workshop.env.example {notebooks_dir_display}/workshop.env

Then edit the file and fill in your configuration values.
""")
    raise RuntimeError(f"Missing environment file. Expected {notebooks_dir_display}/.env or {notebooks_dir_display}/workshop.env")
|
||||
|
||||
def check_required_tools() -> None:
    """
    Check that all required tools are available:
    - aws cli
    - terraform
    - helm
    - kubectl
    - jq

    Every tool is probed by running its version command. Problems are
    reported as warnings while the loop continues, so a single run lists
    every missing tool instead of stopping at the first one.

    Raises RuntimeError (via ``fail``) naming all tools that could not be
    run successfully.
    """
    print("### Checking required tools...")
    tools = [
        ("aws", ["aws", "--version"]),
        ("terraform", ["terraform", "version"]),
        ("helm", ["helm", "version"]),
        ("kubectl", ["kubectl", "version", "--client"]),
        ("jq", ["jq", "--version"]),
    ]

    missing = []
    for tool_name, version_cmd in tools:
        try:
            result = run(version_cmd, check=False, stream=False)
        except FileNotFoundError:
            # Only warn here: raising now would hide the remaining tools and
            # bypass the aggregated error below.
            missing.append(tool_name)
            warn(f"{tool_name} not found in PATH")
            continue
        except Exception as e:
            # Deliberately broad: any launch problem counts as "tool unusable".
            missing.append(tool_name)
            warn(f"Error checking {tool_name}: {e}")
            continue

        if result.returncode == 0:
            ok(f"{tool_name} is available")
        else:
            missing.append(tool_name)
            warn(f"{tool_name} check failed (rc={result.returncode})")

    if missing:
        fail(f"Missing required tools: {', '.join(missing)}")

    ok("All required tools are available")
|
||||
|
||||
def print_aws_info() -> None:
    """
    Print the active AWS region and caller identity.

    Any exception raised while gathering the information is converted into
    a failure via ``fail``.
    """
    print("### AWS Configuration")
    try:
        print(f"Region: {aws_region()}")

        identity = sts_identity()
        # (label shown to the user, key in the STS response)
        fields = (
            ("Account ID", "Account"),
            ("User ARN", "Arn"),
            ("User ID", "UserId"),
        )
        for label, key in fields:
            print(f"{label}: {identity.get(key, 'unknown')}")
        ok("AWS credentials are valid")
    except Exception as e:
        fail(f"Failed to get AWS identity: {e}")
|
||||
|
||||
def setup_artifacts_dir(artifacts_dir: Optional[str] = None) -> Path:
    """
    Ensure the artifacts directory exists and return its absolute Path.

    When ``artifacts_dir`` is None the ``ARTIFACTS_DIR`` environment variable
    is used, defaulting to ``./artifacts``. The resolved location is written
    back to ``ARTIFACTS_DIR`` in ``os.environ``.
    """
    chosen = os.environ.get("ARTIFACTS_DIR", "./artifacts") if artifacts_dir is None else artifacts_dir

    target = Path(chosen).expanduser().resolve()
    target.mkdir(parents=True, exist_ok=True)
    ok(f"Artifacts directory ready: {target}")

    # Publish the resolved location through the environment
    os.environ["ARTIFACTS_DIR"] = str(target)
    return target
|
||||
|
||||
def bootstrap(env_file: Optional[str] = None, artifacts_dir: Optional[str] = None) -> dict:
    """
    Prepare the workshop environment in four steps:
    1. Load environment variables from the env file
    2. Verify the required command-line tools
    3. Print AWS identity and region
    4. Create the artifacts directory

    Returns a dict with the keys ``artifacts_dir``, ``aws_region`` and
    ``aws_identity``.
    """
    banner = "=" * 60

    print(banner)
    print("Bootstrapping workshop environment...")
    print(banner)

    load_env(env_file)
    check_required_tools()
    print_aws_info()
    artifacts_path = setup_artifacts_dir(artifacts_dir)

    print(banner)
    ok("Bootstrap complete!")
    print(banner)

    summary = {"artifacts_dir": str(artifacts_path)}
    summary["aws_region"] = aws_region()
    summary["aws_identity"] = sts_identity()
    return summary
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
from __future__ import annotations
|
||||
from typing import Optional
|
||||
from ._shell import run
|
||||
from ._validation import ok, warn, fail
|
||||
|
||||
def kubectl(*args: str, namespace: Optional[str] = None, check: bool = True, stream: bool = True):
    """Run ``kubectl`` with ``args`` and return the result object from ``run``.

    When ``namespace`` is given, ``-n <namespace>`` is placed directly after
    ``kubectl`` and before the remaining arguments. ``check`` and ``stream``
    are forwarded to ``run`` unchanged.
    """
    scope = ["-n", namespace] if namespace else []
    return run(["kubectl", *scope, *args], check=check, stream=stream)
|
||||
|
||||
def namespace_exists(ns: str) -> bool:
    """Return True when ``kubectl get namespace <ns>`` exits with status 0."""
    return kubectl("get", "namespace", ns, check=False, stream=False).returncode == 0
|
||||
|
||||
def require_namespace(ns: str) -> None:
    """Fail (via ``fail``) unless the namespace ``ns`` exists; confirm it otherwise."""
    present = namespace_exists(ns)
    if not present:
        fail(f"Kubernetes namespace '{ns}' does not exist (did you deploy yet?)")
    ok(f"Namespace exists: {ns}")
|
||||
|
||||
def get_pods(ns: str) -> str:
    """Return the stdout of ``kubectl get pods -o wide`` for namespace ``ns``."""
    listing = kubectl("get", "pods", "-o", "wide", namespace=ns, stream=False)
    return listing.stdout
|
||||
|
||||
def wait_for_deployments_ready(ns: str, timeout: str = "10m") -> None:
    """Wait for every deployment in ``ns`` to report the ``available`` condition.

    Runs ``kubectl wait`` with the given ``timeout`` and live output. The
    outcome is only reported (ok / warning); no exception is raised when the
    wait fails.
    """
    wait_args = (
        "wait",
        "--for=condition=available",
        "deployment",
        "--all",
        f"--timeout={timeout}",
    )
    outcome = kubectl(*wait_args, namespace=ns, check=False, stream=True)
    if outcome.returncode == 0:
        ok("All deployments available.")
    else:
        warn("Not all deployments became ready within timeout. Check pods/events.")
|
||||
@@ -0,0 +1,102 @@
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import shlex
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional, Sequence, Union
|
||||
|
||||
@dataclass
class CmdResult:
    """Captured outcome of a command executed by ``run``."""

    # The command as a single display string (list commands are shell-quoted and joined).
    cmd: str
    # Exit status of the process.
    returncode: int
    # Everything the process wrote to stdout.
    stdout: str
    # Everything the process wrote to stderr.
    stderr: str
|
||||
|
||||
def _env_for_subprocess(extra_env: Optional[dict] = None) -> dict:
|
||||
env = os.environ.copy()
|
||||
|
||||
# Add common Homebrew paths to PATH if they exist (macOS)
|
||||
# Jupyter notebooks often don't inherit full PATH from launching shell
|
||||
homebrew_paths = [
|
||||
"/opt/homebrew/bin", # Apple Silicon Macs
|
||||
"/usr/local/bin", # Intel Macs / Linux
|
||||
"/opt/homebrew/sbin",
|
||||
"/usr/local/sbin",
|
||||
]
|
||||
|
||||
current_path = env.get("PATH", "")
|
||||
path_parts = current_path.split(os.pathsep) if current_path else []
|
||||
|
||||
# Add Homebrew paths if they exist and aren't already in PATH
|
||||
for brew_path in homebrew_paths:
|
||||
if Path(brew_path).exists() and brew_path not in path_parts:
|
||||
path_parts.insert(0, brew_path)
|
||||
|
||||
if path_parts != current_path.split(os.pathsep) if current_path else []:
|
||||
env["PATH"] = os.pathsep.join(path_parts)
|
||||
|
||||
# Respect AWS_PROFILE if set (AWS CLI + boto3 will use it)
|
||||
if env.get("AWS_PROFILE", "").strip() == "":
|
||||
env.pop("AWS_PROFILE", None)
|
||||
if extra_env:
|
||||
env.update(extra_env)
|
||||
return env
|
||||
|
||||
def run(
    cmd: Union[str, Sequence[str]],
    cwd: Optional[str] = None,
    check: bool = True,
    stream: bool = True,
    extra_env: Optional[dict] = None,
) -> CmdResult:
    """
    Run a command, optionally echoing its output live, and capture the result.

    Args:
        cmd: a list/tuple of arguments (executed directly) or a string
            (executed through the shell).
        cwd: working directory for the child process.
        check: raise RuntimeError when the exit status is non-zero.
        stream: print stdout and stderr lines as they arrive while still
            capturing them; when False the output is only captured.
        extra_env: additional environment variables for the child process.

    Returns:
        CmdResult with the display command, exit status, stdout and stderr.

    Raises:
        RuntimeError: if ``check`` is true and the command exits non-zero.
    """
    import threading  # local import: only the streaming path needs it

    if isinstance(cmd, (list, tuple)):
        cmd_str = " ".join(shlex.quote(str(c)) for c in cmd)
        popen_args = list(cmd)
        shell = False
    else:
        cmd_str = cmd
        popen_args = cmd
        shell = True

    env = _env_for_subprocess(extra_env)

    out_lines, err_lines = [], []

    def _pump(pipe, sink):
        # Echo each line as it arrives and keep a copy for the result.
        for line in pipe:
            print(line, end="")
            sink.append(line)

    # The context manager closes both pipes and reaps the child on exit.
    with subprocess.Popen(
        popen_args,
        cwd=cwd,
        shell=shell,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    ) as proc:
        assert proc.stdout and proc.stderr

        if stream:
            # Drain stderr concurrently: reading stdout to EOF first would
            # deadlock as soon as the child fills the stderr pipe buffer.
            err_reader = threading.Thread(
                target=_pump, args=(proc.stderr, err_lines), daemon=True
            )
            err_reader.start()
            _pump(proc.stdout, out_lines)
            err_reader.join()
        else:
            stdout, stderr = proc.communicate()
            out_lines.append(stdout or "")
            err_lines.append(stderr or "")

        rc = proc.wait()

    result = CmdResult(cmd=cmd_str, returncode=rc, stdout="".join(out_lines), stderr="".join(err_lines))

    if check and rc != 0:
        raise RuntimeError(f"Command failed (rc={rc}): {cmd_str}\n\nSTDERR:\n{result.stderr}")

    return result
|
||||
@@ -0,0 +1,39 @@
|
||||
from __future__ import annotations
|
||||
import os
|
||||
|
||||
def ok(msg: str) -> None:
    """Print ``msg`` as a success line prefixed with a check mark."""
    line = f"✅ {msg}"
    print(line)
|
||||
|
||||
def warn(msg: str) -> None:
    """Print ``msg`` as a warning line prefixed with a warning sign."""
    line = f"⚠️ {msg}"
    print(line)
|
||||
|
||||
def fail(msg: str) -> None:
    """Abort by raising RuntimeError with ``msg`` prefixed by a cross mark."""
    message = f"❌ {msg}"
    raise RuntimeError(message)
|
||||
|
||||
def require_env(*keys: str) -> dict:
    """Collect the named environment variables into a dict.

    Values are whitespace-stripped. If any requested variable is unset or
    blank, ``fail`` is called with the list of missing names.
    """
    cfg = {name: os.environ.get(name, "").strip() for name in keys}
    missing = [name for name in keys if not cfg[name]]
    if missing:
        fail(f"Missing required environment variables: {', '.join(missing)}")
    return cfg
|
||||
|
||||
def redact(value: str, keep: int = 4) -> str:
    """Mask ``value`` for display.

    Empty input yields an empty string. Values no longer than ``keep``
    characters are replaced entirely by asterisks; longer values keep their
    first ``keep`` characters followed by a fixed-width mask.
    """
    if not value:
        return ""
    if len(value) > keep:
        visible = value[:keep]
        return f"{visible}…(********)"
    return "*" * len(value)
|
||||
|
||||
def print_config(config: dict, redact_keys: set[str] | None = None) -> None:
    """Print every config entry as a ``- key: value`` bullet under a heading.

    Values whose key appears in ``redact_keys`` are converted to ``str`` and
    passed through ``redact`` before printing; all others are shown as-is.
    """
    hidden = redact_keys or set()
    print("### Config (redacted)")
    for key, raw in config.items():
        shown = redact(str(raw)) if key in hidden else raw
        print(f"- {key}: {shown}")
|
||||
Reference in New Issue
Block a user