fix(migration-analyzer): correct misleading lock messaging and scoring (#40913)

2026-02-04 03:01:23 +01:00 · 2025-11-05 14:21:38 +01:00
parent 8455fb7ef3
commit 9d7d8801dc
7 changed files with 102 additions and 56 deletions
--- a/.github/workflows/ci-backend.yml
+++ b/.github/workflows/ci-backend.yml
@@ -225,32 +225,41 @@ jobs:
                    fi
                  done

-                  # Delete previous comments from this workflow
+                  # Get existing comments
                  COMMENTS=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
                    -H "Accept: application/vnd.github.v3+json" \
                    "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments")

-                  echo "Output from listing comments: $COMMENTS"
-                  echo "$COMMENTS" | jq -r '.[] | select(.body | startswith("## Migration SQL Changes")) | .id' | while read -r comment_id; do
-                    echo "Deleting comment $comment_id"
-                    curl -X DELETE \
-                      -H "Authorization: token $GITHUB_TOKEN" \
-                      -H "Accept: application/vnd.github.v3+json" \
-                      "https://api.github.com/repos/${{ github.repository }}/issues/comments/$comment_id"
-                  done
+                  # Extract comment ID if exists
+                  SQL_COMMENT_ID=$(echo "$COMMENTS" | jq -r '.[] | select(.body | startswith("## Migration SQL Changes")) | .id' | head -1)
+
+                  # Add timestamp and commit SHA to SQL changes
+                  TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M UTC')
+                  COMMIT_SHA="${{ github.event.pull_request.head.sha }}"
+                  COMMIT_SHORT="${COMMIT_SHA:0:7}"
+                  COMMENT_BODY+="\n*Last updated: $TIMESTAMP ([${COMMIT_SHORT}](https://github.com/${{ github.repository }}/commit/${COMMIT_SHA}))*"

                  # Convert \n into actual newlines
                  COMMENT_BODY=$(printf '%b' "$COMMENT_BODY")
                  COMMENT_BODY_JSON=$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')

-                  # Post SQL comment to PR
-                  echo "Posting SQL comment to PR"
-                  echo "$COMMENT_BODY_JSON"
-                  curl -X POST \
-                    -H "Authorization: token $GITHUB_TOKEN" \
-                    -H "Accept: application/vnd.github.v3+json" \
-                    "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \
-                    -d "$COMMENT_BODY_JSON"
+                  if [ -n "$SQL_COMMENT_ID" ]; then
+                    # Update existing comment
+                    echo "Updating existing SQL comment $SQL_COMMENT_ID"
+                    curl -X PATCH \
+                      -H "Authorization: token $GITHUB_TOKEN" \
+                      -H "Accept: application/vnd.github.v3+json" \
+                      "https://api.github.com/repos/${{ github.repository }}/issues/comments/$SQL_COMMENT_ID" \
+                      -d "$COMMENT_BODY_JSON"
+                  else
+                    # Post new SQL comment to PR
+                    echo "Posting new SQL comment to PR"
+                    curl -X POST \
+                      -H "Authorization: token $GITHUB_TOKEN" \
+                      -H "Accept: application/vnd.github.v3+json" \
+                      "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \
+                      -d "$COMMENT_BODY_JSON"
+                  fi

            - name: Run migration risk analysis and post comment
              if: github.event_name == 'pull_request'
@@ -263,34 +272,54 @@ jobs:
                  EXIT_CODE=$?
                  set -e  # Re-enable exit on error

+                  # Save analysis to file for artifact upload
+                  if [ -n "$RISK_ANALYSIS" ]; then
+                    echo "$RISK_ANALYSIS" > migration_analysis.md
+                  fi
+
                  # Get existing comments
                  COMMENTS=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \
                    -H "Accept: application/vnd.github.v3+json" \
                    "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments")

-                  # Delete previous risk analysis comments
-                  echo "$COMMENTS" | jq -r '.[] | select(.body | startswith("## 🔍 Migration Risk Analysis")) | .id' | while read -r comment_id; do
-                    echo "Deleting risk analysis comment $comment_id"
-                    curl -X DELETE \
-                      -H "Authorization: token $GITHUB_TOKEN" \
-                      -H "Accept: application/vnd.github.v3+json" \
-                      "https://api.github.com/repos/${{ github.repository }}/issues/comments/$comment_id"
-                  done
+                  # Extract comment ID if exists
+                  COMMENT_ID=$(echo "$COMMENTS" | jq -r '.[] | select(.body | startswith("## 🔍 Migration Risk Analysis")) | .id' | head -1)

                  if [ -n "$RISK_ANALYSIS" ] && echo "$RISK_ANALYSIS" | grep -q "Summary:"; then
-                    # Post risk analysis comment (output already contains markdown formatting with legend)
-                    RISK_COMMENT="## 🔍 Migration Risk Analysis\n\nWe've analyzed your migrations for potential risks.\n\n$RISK_ANALYSIS"
+                    # Add timestamp and commit SHA to analysis
+                    TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M UTC')
+                    COMMIT_SHA="${{ github.event.pull_request.head.sha }}"
+                    COMMIT_SHORT="${COMMIT_SHA:0:7}"
+                    RISK_COMMENT="## 🔍 Migration Risk Analysis\n\nWe've analyzed your migrations for potential risks.\n\n$RISK_ANALYSIS\n\n*Last updated: $TIMESTAMP ([${COMMIT_SHORT}](https://github.com/${{ github.repository }}/commit/${COMMIT_SHA}))*"
                    RISK_COMMENT=$(printf '%b' "$RISK_COMMENT")
                    RISK_COMMENT_JSON=$(jq -n --arg body "$RISK_COMMENT" '{body: $body}')

-                    echo "Posting risk analysis comment to PR"
-                    curl -X POST \
+                    if [ -n "$COMMENT_ID" ]; then
+                      # Update existing comment
+                      echo "Updating existing risk analysis comment $COMMENT_ID"
+                      curl -X PATCH \
+                        -H "Authorization: token $GITHUB_TOKEN" \
+                        -H "Accept: application/vnd.github.v3+json" \
+                        "https://api.github.com/repos/${{ github.repository }}/issues/comments/$COMMENT_ID" \
+                        -d "$RISK_COMMENT_JSON"
+                    else
+                      # Create new comment if none exists
+                      echo "Posting new risk analysis comment to PR"
+                      curl -X POST \
+                        -H "Authorization: token $GITHUB_TOKEN" \
+                        -H "Accept: application/vnd.github.v3+json" \
+                        "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \
+                        -d "$RISK_COMMENT_JSON"
+                    fi
+                  elif [ -n "$COMMENT_ID" ]; then
+                    # No migrations to analyze but comment exists - delete it
+                    echo "Deleting risk analysis comment (no migrations to analyze)"
+                    curl -X DELETE \
                      -H "Authorization: token $GITHUB_TOKEN" \
                      -H "Accept: application/vnd.github.v3+json" \
-                      "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \
-                      -d "$RISK_COMMENT_JSON"
+                      "https://api.github.com/repos/${{ github.repository }}/issues/comments/$COMMENT_ID"
                  else
-                    echo "No migrations to analyze or risk analysis output empty - previous comments cleaned up"
+                    echo "No migrations to analyze and no existing comment"
                  fi

                  # Fail the job if there were blocked migrations
@@ -298,6 +327,14 @@ jobs:
                    exit $EXIT_CODE
                  fi

+            - name: Upload migration analysis artifact
+              if: always() && github.event_name == 'pull_request'
+              uses: actions/upload-artifact@v4
+              with:
+                  name: migration-analysis
+                  path: migration_analysis.md
+                  if-no-files-found: ignore
+
            - name: Run migrations for this PR
              run: |
                  # Run Django migrations first (excluding managed=False models)
--- a/posthog/management/commands/analyze_migration_risk.py
+++ b/posthog/management/commands/analyze_migration_risk.py
@@ -143,7 +143,7 @@ class Command(BaseCommand):
                        return ""

                    # Prepend Summary for CI workflow, wrap Django's output in code block
-                    return f"**Summary:** ⚠️ Missing migrations detected\n\n```\n{filtered_output}```\n\nRun `python manage.py makemigrations` to create them.\n"
+                    return f"**Summary:** ⚠️ Missing migrations detected\n\n```\n{filtered_output}\n```\n\nRun `python manage.py makemigrations` to create them.\n"
                return ""
        except Exception:
            # Ignore other errors (e.g., can't connect to DB)
--- a/posthog/management/migration_analysis/analyzer.py
+++ b/posthog/management/migration_analysis/analyzer.py
@@ -28,9 +28,10 @@ class RiskAnalyzer:
    Analyzes Django migration operations and assigns risk scores.

    Risk scoring rules:
-    0-1: Safe - No locks, backwards compatible
-    2-3: Needs Review - May have performance impact or needs careful deployment
-    4-5: Blocked - Causes locks, breaks backwards compatibility, or can't rollback
+    0: Safe - No contention risk (new tables, concurrent operations)
+    1: Needs Review - Brief lock required, review for high-traffic tables
+    2-3: Needs Review - Extended operations or performance impact
+    4-5: Blocked - Table rewrites, breaks backwards compatibility, or can't rollback
    """

    # Registry of operation analyzers
--- a/posthog/management/migration_analysis/formatters.py
+++ b/posthog/management/migration_analysis/formatters.py
@@ -69,8 +69,8 @@ class ConsoleTreeFormatter(RiskFormatter):
        # Add explanation for each level
        explanations = {
            RiskLevel.BLOCKED: "Causes locks or breaks compatibility",
-            RiskLevel.NEEDS_REVIEW: "May have performance impact",
-            RiskLevel.SAFE: "No locks, backwards compatible",
+            RiskLevel.NEEDS_REVIEW: "Requires brief lock, review for high-traffic tables",
+            RiskLevel.SAFE: "No contention risk, backwards compatible",
        }
        explanation = explanations.get(level, "")
        lines.append(f"## {icon} {level.category}\n")
--- a/posthog/management/migration_analysis/models.py
+++ b/posthog/management/migration_analysis/models.py
@@ -7,8 +7,8 @@ from enum import Enum
 class RiskLevel(Enum):
    """Risk level definitions with scoring ranges"""

-    SAFE = ("Safe", 0, 1)
-    NEEDS_REVIEW = ("Needs Review", 2, 3)
+    SAFE = ("Safe", 0, 0)
+    NEEDS_REVIEW = ("Needs Review", 1, 3)
    BLOCKED = ("Blocked", 4, 5)

    def __init__(self, category: str, min_score: int, max_score: int):
--- a/posthog/management/migration_analysis/operations.py
+++ b/posthog/management/migration_analysis/operations.py
@@ -45,12 +45,20 @@ class AddFieldAnalyzer(OperationAnalyzer):
        return self._analyze_not_null_with_default(op, field)

    def _analyze_nullable_field(self, op) -> OperationRisk:
-        """Nullable fields are always safe."""
+        """Nullable fields require brief lock but no table rewrite."""
        return OperationRisk(
            type=self.operation_type,
-            score=0,
-            reason="Adding nullable field is safe",
+            score=1,
+            reason="Adding nullable field requires brief lock",
            details={"model": op.model_name, "field": op.name},
+            guidance="""While this operation doesn't rewrite the table, it still acquires an ACCESS EXCLUSIVE lock briefly.
+
+For high-traffic tables, consider:
+- Deploy during low-traffic periods
+- Monitor lock contention and query timeouts during deployment
+- Have a rollback plan ready
+
+For low-traffic tables, this operation is generally safe to deploy anytime.""",
        )

    def _risk_not_null_no_default(self, op) -> OperationRisk:
--- a/posthog/management/migration_analysis/test/test_risk_analyzer.py
+++ b/posthog/management/migration_analysis/test/test_risk_analyzer.py
@@ -18,9 +18,9 @@ def create_mock_operation(op_class, **kwargs):
 class TestRiskLevelScoring:
    def test_safe_scores(self):
        assert RiskLevel.from_score(0) == RiskLevel.SAFE
-        assert RiskLevel.from_score(1) == RiskLevel.SAFE

    def test_needs_review_scores(self):
+        assert RiskLevel.from_score(1) == RiskLevel.NEEDS_REVIEW
        assert RiskLevel.from_score(2) == RiskLevel.NEEDS_REVIEW
        assert RiskLevel.from_score(3) == RiskLevel.NEEDS_REVIEW

@@ -48,9 +48,9 @@ class TestAddFieldOperations:

        risk = self.analyzer.analyze_operation(op)

-        assert risk.score == 0
+        assert risk.score == 1
        assert "nullable" in risk.reason.lower()
-        assert risk.level == RiskLevel.SAFE
+        assert risk.level == RiskLevel.NEEDS_REVIEW

    def test_add_blank_field_without_null(self):
        """blank=True doesn't make database safe - only null=True does."""
@@ -66,7 +66,7 @@ class TestAddFieldOperations:

        # blank=True is just form validation, so this needs a default to be safe
        assert risk.score == 1
-        assert risk.level == RiskLevel.SAFE
+        assert risk.level == RiskLevel.NEEDS_REVIEW

    def test_add_not_null_with_default(self):
        field: models.Field = models.CharField(max_length=100, default="test", null=False, blank=False)
@@ -81,7 +81,7 @@ class TestAddFieldOperations:

        assert risk.score == 1
        assert "constant" in risk.reason.lower()
-        assert risk.level == RiskLevel.SAFE
+        assert risk.level == RiskLevel.NEEDS_REVIEW

    def test_add_not_null_without_default(self):
        """Test NOT NULL field without default - Django doesn't set default, it's NOT_PROVIDED by default."""
@@ -374,7 +374,7 @@ class TestRunSQLOperations:
        assert risk.level == RiskLevel.NEEDS_REVIEW

    def test_run_sql_with_concurrent_index_with_if_not_exists(self):
-        """Test CREATE INDEX CONCURRENTLY with IF NOT EXISTS - score 1 (SAFE)."""
+        """Test CREATE INDEX CONCURRENTLY with IF NOT EXISTS - score 1 (NEEDS_REVIEW)."""
        op = create_mock_operation(
            migrations.RunSQL,
            sql="CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_foo ON users(foo);",
@@ -383,7 +383,7 @@ class TestRunSQLOperations:
        risk = self.analyzer.analyze_operation(op)

        assert risk.score == 1
-        assert risk.level == RiskLevel.SAFE
+        assert risk.level == RiskLevel.NEEDS_REVIEW
        assert "safe" in risk.reason.lower() or "non-blocking" in risk.reason.lower()

    def test_run_sql_with_concurrent_index_without_if_not_exists(self):
@@ -400,7 +400,7 @@ class TestRunSQLOperations:
        assert risk.guidance and "if not exists" in risk.guidance.lower()

    def test_run_sql_with_drop_index_concurrent_with_if_exists(self):
-        """Test DROP INDEX CONCURRENTLY with IF EXISTS - score 1 (SAFE)."""
+        """Test DROP INDEX CONCURRENTLY with IF EXISTS - score 1 (NEEDS_REVIEW)."""
        op = create_mock_operation(
            migrations.RunSQL,
            sql="DROP INDEX CONCURRENTLY IF EXISTS idx_foo;",
@@ -409,7 +409,7 @@ class TestRunSQLOperations:
        risk = self.analyzer.analyze_operation(op)

        assert risk.score == 1
-        assert risk.level == RiskLevel.SAFE
+        assert risk.level == RiskLevel.NEEDS_REVIEW
        assert "safe" in risk.reason.lower() or "non-blocking" in risk.reason.lower()

    def test_run_sql_with_drop_index_concurrent_without_if_exists(self):
@@ -426,7 +426,7 @@ class TestRunSQLOperations:
        assert risk.guidance and "if exists" in risk.guidance.lower()

    def test_run_sql_with_reindex_concurrent(self):
-        """Test REINDEX CONCURRENTLY - should be safe (score 1)."""
+        """Test REINDEX CONCURRENTLY - should be needs review (score 1)."""
        op = create_mock_operation(
            migrations.RunSQL,
            sql="REINDEX INDEX CONCURRENTLY idx_foo;",
@@ -435,11 +435,11 @@ class TestRunSQLOperations:
        risk = self.analyzer.analyze_operation(op)

        assert risk.score == 1
-        assert risk.level == RiskLevel.SAFE
+        assert risk.level == RiskLevel.NEEDS_REVIEW
        assert "safe" in risk.reason.lower() or "non-blocking" in risk.reason.lower()

    def test_run_sql_add_constraint_not_valid(self):
-        """Test ADD CONSTRAINT ... NOT VALID - safe (score 1)."""
+        """Test ADD CONSTRAINT ... NOT VALID - needs review (score 1)."""
        op = create_mock_operation(
            migrations.RunSQL,
            sql="ALTER TABLE users ADD CONSTRAINT check_age CHECK (age >= 0) NOT VALID;",
@@ -448,7 +448,7 @@ class TestRunSQLOperations:
        risk = self.analyzer.analyze_operation(op)

        assert risk.score == 1
-        assert risk.level == RiskLevel.SAFE
+        assert risk.level == RiskLevel.NEEDS_REVIEW
        assert "not valid" in risk.reason.lower() or "validates new rows" in risk.reason.lower()

    def test_run_sql_validate_constraint(self):