mirror of
https://github.com/BillyOutlast/posthog.git
synced 2026-02-04 03:01:23 +01:00
refactor(decide): Send error metrics to logs instead of sentry for kn… (#15223)
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import base64
|
||||
import json
|
||||
from unittest.mock import patch
|
||||
|
||||
from django.core.cache import cache
|
||||
from django.db import connection
|
||||
@@ -1055,7 +1056,8 @@ class TestDecide(BaseTest, QueryMatchingTest):
|
||||
) # different hash, different variant assigned
|
||||
self.assertFalse(response.json()["errorsWhileComputingFlags"])
|
||||
|
||||
def test_feature_flags_v3_with_database_errors(self):
|
||||
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
|
||||
def test_feature_flags_v3_with_database_errors(self, mock_counter):
|
||||
self.team.app_urls = ["https://example.com"]
|
||||
self.team.save()
|
||||
self.client.logout()
|
||||
@@ -1141,6 +1143,8 @@ class TestDecide(BaseTest, QueryMatchingTest):
|
||||
self.assertEqual("first-variant", response.json()["featureFlags"]["multivariate-flag"])
|
||||
self.assertTrue(response.json()["errorsWhileComputingFlags"])
|
||||
|
||||
mock_counter.labels.assert_called_once_with(reason="timeout")
|
||||
|
||||
def test_feature_flags_v3_with_database_errors_and_no_flags(self):
|
||||
self.team.app_urls = ["https://example.com"]
|
||||
self.team.save()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import datetime
|
||||
import json
|
||||
from typing import Dict, List, Optional
|
||||
from unittest.mock import patch
|
||||
from unittest.mock import call, patch
|
||||
|
||||
from django.core.cache import cache
|
||||
from django.db import connection
|
||||
@@ -2614,7 +2614,7 @@ class TestBlastRadius(ClickhouseTestMixin, APIBaseTest):
|
||||
class QueryTimeoutWrapper:
|
||||
def __call__(self, execute, *args, **kwargs):
|
||||
|
||||
raise OperationalError("I am a timeout error")
|
||||
raise OperationalError("canceling statement due to statement timeout")
|
||||
# return execute(*args, **kwargs)
|
||||
|
||||
|
||||
@@ -2722,7 +2722,8 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
|
||||
self.assertTrue("default-flag" not in all_flags)
|
||||
self.assertTrue(errors)
|
||||
|
||||
def test_feature_flags_v3_with_person_properties(self):
|
||||
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
|
||||
def test_feature_flags_v3_with_person_properties(self, mock_counter):
|
||||
self.organization = Organization.objects.create(name="test")
|
||||
self.team = Team.objects.create(organization=self.organization)
|
||||
self.user = User.objects.create_and_join(self.organization, "random@test.com", "password", "first_name")
|
||||
@@ -2792,6 +2793,10 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
|
||||
self.assertTrue(all_flags["default-flag"])
|
||||
self.assertFalse(errors)
|
||||
|
||||
mock_counter.labels.assert_called_once_with(reason="timeout")
|
||||
mock_counter.labels.return_value.inc.assert_called_once_with()
|
||||
|
||||
mock_counter.reset_mock()
|
||||
# # now db is down, but decide was sent email parameter with different email
|
||||
with self.assertNumQueries(0):
|
||||
all_flags, _, _, errors = get_all_feature_flags(
|
||||
@@ -2801,6 +2806,8 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
|
||||
self.assertTrue(all_flags["default-flag"])
|
||||
self.assertFalse(errors)
|
||||
|
||||
mock_counter.labels.assert_not_called()
|
||||
|
||||
def test_feature_flags_v3_with_a_working_slow_db(self):
|
||||
self.organization = Organization.objects.create(name="test")
|
||||
self.team = Team.objects.create(organization=self.organization)
|
||||
@@ -2882,7 +2889,8 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
|
||||
self.assertTrue(all_flags["default-flag"])
|
||||
self.assertFalse(errors)
|
||||
|
||||
def test_feature_flags_v3_with_slow_db_doesnt_try_to_compute_conditions_again(self):
|
||||
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
|
||||
def test_feature_flags_v3_with_slow_db_doesnt_try_to_compute_conditions_again(self, mock_counter):
|
||||
self.organization = Organization.objects.create(name="test")
|
||||
self.team = Team.objects.create(organization=self.organization)
|
||||
self.user = User.objects.create_and_join(self.organization, "random@test.com", "password", "first_name")
|
||||
@@ -2955,7 +2963,17 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
|
||||
self.assertTrue(all_flags["default-flag"])
|
||||
self.assertTrue(errors)
|
||||
|
||||
def test_feature_flags_v3_with_group_properties_and_slow_db(self):
|
||||
mock_counter.labels.assert_has_calls(
|
||||
[
|
||||
call(reason="timeout"),
|
||||
call().inc(),
|
||||
call(reason="flag_condition_retry"),
|
||||
call().inc(),
|
||||
]
|
||||
)
|
||||
|
||||
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
|
||||
def test_feature_flags_v3_with_group_properties_and_slow_db(self, mock_counter):
|
||||
self.organization = Organization.objects.create(name="test")
|
||||
self.team = Team.objects.create(organization=self.organization)
|
||||
self.user = User.objects.create_and_join(self.organization, "randomXYZ@test.com", "password", "first_name")
|
||||
@@ -3039,6 +3057,15 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
|
||||
self.assertTrue("default-flag" not in all_flags)
|
||||
self.assertTrue(errors)
|
||||
|
||||
mock_counter.labels.assert_has_calls(
|
||||
[
|
||||
call(reason="timeout"),
|
||||
call().inc(),
|
||||
call(reason="group_mapping_retry"),
|
||||
call().inc(),
|
||||
]
|
||||
)
|
||||
|
||||
# # now db is down, but decide was sent different group property overrides
|
||||
with self.assertNumQueries(2):
|
||||
all_flags, _, _, errors = get_all_feature_flags(
|
||||
@@ -3052,7 +3079,8 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
|
||||
self.assertTrue("default-flag" not in all_flags)
|
||||
self.assertTrue(errors)
|
||||
|
||||
def test_feature_flags_v3_with_experience_continuity_working_slow_db(self):
|
||||
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
|
||||
def test_feature_flags_v3_with_experience_continuity_working_slow_db(self, mock_counter):
|
||||
self.organization = Organization.objects.create(name="test")
|
||||
self.team = Team.objects.create(organization=self.organization)
|
||||
self.user = User.objects.create_and_join(self.organization, "random12@test.com", "password", "first_name")
|
||||
@@ -3125,3 +3153,10 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
|
||||
self.assertTrue("property-flag" not in all_flags)
|
||||
self.assertTrue(all_flags["default-flag"])
|
||||
self.assertTrue(errors)
|
||||
|
||||
mock_counter.labels.assert_has_calls(
|
||||
[
|
||||
call(reason="timeout"),
|
||||
call().inc(),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -5,7 +5,8 @@ import time
|
||||
import structlog
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from django.db import DatabaseError, IntegrityError
|
||||
from prometheus_client import Counter
|
||||
from django.db import DatabaseError, IntegrityError, OperationalError
|
||||
from django.db.models.expressions import ExpressionWrapper, RawSQL
|
||||
from django.db.models.fields import BooleanField
|
||||
from django.db.models.query import QuerySet
|
||||
@@ -34,6 +35,12 @@ __LONG_SCALE__ = float(0xFFFFFFFFFFFFFFF)
|
||||
|
||||
FLAG_MATCHING_QUERY_TIMEOUT_MS = 1 * 1000 # 1 second. Any longer and we'll just error out.
|
||||
|
||||
FLAG_EVALUATION_ERROR_COUNTER = Counter(
|
||||
"flag_evaluation_error_total",
|
||||
"Failed decide requests with reason.",
|
||||
labelnames=["reason"],
|
||||
)
|
||||
|
||||
|
||||
class FeatureFlagMatchReason(str, Enum):
|
||||
CONDITION_MATCH = "condition_match"
|
||||
@@ -180,7 +187,7 @@ class FeatureFlagMatcher:
|
||||
}
|
||||
except Exception as err:
|
||||
faced_error_computing_flags = True
|
||||
capture_exception(err)
|
||||
handle_feature_flag_exception(err, "[Feature Flags] Error computing flags")
|
||||
|
||||
return flag_values, flag_evaluation_reasons, flag_payloads, faced_error_computing_flags
|
||||
|
||||
@@ -504,7 +511,7 @@ def get_all_feature_flags(
|
||||
# since the set_feature_flag_hash_key_overrides call will fail.
|
||||
|
||||
# For this case, and for any other case, do not error out on decide, just continue assuming continuity couldn't happen.
|
||||
capture_exception(e)
|
||||
handle_feature_flag_exception(e, "[Feature Flags] Error while setting feature flag hash key overrides")
|
||||
|
||||
# This is the read-path for experience continuity. We need to get the overrides, and to do that, we get the person_id.
|
||||
try:
|
||||
@@ -593,3 +600,27 @@ def set_feature_flag_hash_key_overrides(team_id: int, distinct_ids: List[str], h
|
||||
time.sleep(retry_delay)
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
||||
def handle_feature_flag_exception(err: Exception, log_message: str = ""):
|
||||
logger.exception(log_message)
|
||||
reason = parse_exception_for_error_message(err)
|
||||
FLAG_EVALUATION_ERROR_COUNTER.labels(reason=reason).inc()
|
||||
if reason == "unknown":
|
||||
capture_exception(err)
|
||||
|
||||
|
||||
def parse_exception_for_error_message(err: Exception):
|
||||
reason = "unknown"
|
||||
if isinstance(err, OperationalError):
|
||||
if "statement timeout" in str(err):
|
||||
reason = "timeout"
|
||||
elif "no more connections" in str(err):
|
||||
reason = "no_more_connections"
|
||||
elif isinstance(err, DatabaseError):
|
||||
if "Failed to fetch conditions" in str(err):
|
||||
reason = "flag_condition_retry"
|
||||
elif "Failed to fetch group" in str(err):
|
||||
reason = "group_mapping_retry"
|
||||
|
||||
return reason
|
||||
|
||||
Reference in New Issue
Block a user