refactor(decide): Send error metrics to logs instead of sentry for kn… (#15223)

This commit is contained in:
Neil Kakkar
2023-04-25 10:41:47 +01:00
committed by GitHub
parent d9dfff1469
commit db6d8598b5
3 changed files with 80 additions and 10 deletions

View File

@@ -1,5 +1,6 @@
import base64
import json
from unittest.mock import patch
from django.core.cache import cache
from django.db import connection
@@ -1055,7 +1056,8 @@ class TestDecide(BaseTest, QueryMatchingTest):
) # different hash, different variant assigned
self.assertFalse(response.json()["errorsWhileComputingFlags"])
def test_feature_flags_v3_with_database_errors(self):
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
def test_feature_flags_v3_with_database_errors(self, mock_counter):
self.team.app_urls = ["https://example.com"]
self.team.save()
self.client.logout()
@@ -1141,6 +1143,8 @@ class TestDecide(BaseTest, QueryMatchingTest):
self.assertEqual("first-variant", response.json()["featureFlags"]["multivariate-flag"])
self.assertTrue(response.json()["errorsWhileComputingFlags"])
mock_counter.labels.assert_called_once_with(reason="timeout")
def test_feature_flags_v3_with_database_errors_and_no_flags(self):
self.team.app_urls = ["https://example.com"]
self.team.save()

View File

@@ -1,7 +1,7 @@
import datetime
import json
from typing import Dict, List, Optional
from unittest.mock import patch
from unittest.mock import call, patch
from django.core.cache import cache
from django.db import connection
@@ -2614,7 +2614,7 @@ class TestBlastRadius(ClickhouseTestMixin, APIBaseTest):
class QueryTimeoutWrapper:
def __call__(self, execute, *args, **kwargs):
raise OperationalError("I am a timeout error")
raise OperationalError("canceling statement due to statement timeout")
# return execute(*args, **kwargs)
@@ -2722,7 +2722,8 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
self.assertTrue("default-flag" not in all_flags)
self.assertTrue(errors)
def test_feature_flags_v3_with_person_properties(self):
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
def test_feature_flags_v3_with_person_properties(self, mock_counter):
self.organization = Organization.objects.create(name="test")
self.team = Team.objects.create(organization=self.organization)
self.user = User.objects.create_and_join(self.organization, "random@test.com", "password", "first_name")
@@ -2792,6 +2793,10 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
self.assertTrue(all_flags["default-flag"])
self.assertFalse(errors)
mock_counter.labels.assert_called_once_with(reason="timeout")
mock_counter.labels.return_value.inc.assert_called_once_with()
mock_counter.reset_mock()
# # now db is down, but decide was sent email parameter with different email
with self.assertNumQueries(0):
all_flags, _, _, errors = get_all_feature_flags(
@@ -2801,6 +2806,8 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
self.assertTrue(all_flags["default-flag"])
self.assertFalse(errors)
mock_counter.labels.assert_not_called()
def test_feature_flags_v3_with_a_working_slow_db(self):
self.organization = Organization.objects.create(name="test")
self.team = Team.objects.create(organization=self.organization)
@@ -2882,7 +2889,8 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
self.assertTrue(all_flags["default-flag"])
self.assertFalse(errors)
def test_feature_flags_v3_with_slow_db_doesnt_try_to_compute_conditions_again(self):
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
def test_feature_flags_v3_with_slow_db_doesnt_try_to_compute_conditions_again(self, mock_counter):
self.organization = Organization.objects.create(name="test")
self.team = Team.objects.create(organization=self.organization)
self.user = User.objects.create_and_join(self.organization, "random@test.com", "password", "first_name")
@@ -2955,7 +2963,17 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
self.assertTrue(all_flags["default-flag"])
self.assertTrue(errors)
def test_feature_flags_v3_with_group_properties_and_slow_db(self):
mock_counter.labels.assert_has_calls(
[
call(reason="timeout"),
call().inc(),
call(reason="flag_condition_retry"),
call().inc(),
]
)
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
def test_feature_flags_v3_with_group_properties_and_slow_db(self, mock_counter):
self.organization = Organization.objects.create(name="test")
self.team = Team.objects.create(organization=self.organization)
self.user = User.objects.create_and_join(self.organization, "randomXYZ@test.com", "password", "first_name")
@@ -3039,6 +3057,15 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
self.assertTrue("default-flag" not in all_flags)
self.assertTrue(errors)
mock_counter.labels.assert_has_calls(
[
call(reason="timeout"),
call().inc(),
call(reason="group_mapping_retry"),
call().inc(),
]
)
# # now db is down, but decide was sent different group property overrides
with self.assertNumQueries(2):
all_flags, _, _, errors = get_all_feature_flags(
@@ -3052,7 +3079,8 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
self.assertTrue("default-flag" not in all_flags)
self.assertTrue(errors)
def test_feature_flags_v3_with_experience_continuity_working_slow_db(self):
@patch("posthog.models.feature_flag.flag_matching.FLAG_EVALUATION_ERROR_COUNTER")
def test_feature_flags_v3_with_experience_continuity_working_slow_db(self, mock_counter):
self.organization = Organization.objects.create(name="test")
self.team = Team.objects.create(organization=self.organization)
self.user = User.objects.create_and_join(self.organization, "random12@test.com", "password", "first_name")
@@ -3125,3 +3153,10 @@ class TestResiliency(TransactionTestCase, QueryMatchingTest):
self.assertTrue("property-flag" not in all_flags)
self.assertTrue(all_flags["default-flag"])
self.assertTrue(errors)
mock_counter.labels.assert_has_calls(
[
call(reason="timeout"),
call().inc(),
]
)

View File

@@ -5,7 +5,8 @@ import time
import structlog
from typing import Any, Dict, List, Optional, Tuple, Union
from django.db import DatabaseError, IntegrityError
from prometheus_client import Counter
from django.db import DatabaseError, IntegrityError, OperationalError
from django.db.models.expressions import ExpressionWrapper, RawSQL
from django.db.models.fields import BooleanField
from django.db.models.query import QuerySet
@@ -34,6 +35,12 @@ __LONG_SCALE__ = float(0xFFFFFFFFFFFFFFF)
FLAG_MATCHING_QUERY_TIMEOUT_MS = 1 * 1000 # 1 second. Any longer and we'll just error out.
FLAG_EVALUATION_ERROR_COUNTER = Counter(
"flag_evaluation_error_total",
"Failed decide requests with reason.",
labelnames=["reason"],
)
class FeatureFlagMatchReason(str, Enum):
CONDITION_MATCH = "condition_match"
@@ -180,7 +187,7 @@ class FeatureFlagMatcher:
}
except Exception as err:
faced_error_computing_flags = True
capture_exception(err)
handle_feature_flag_exception(err, "[Feature Flags] Error computing flags")
return flag_values, flag_evaluation_reasons, flag_payloads, faced_error_computing_flags
@@ -504,7 +511,7 @@ def get_all_feature_flags(
# since the set_feature_flag_hash_key_overrides call will fail.
# For this case, and for any other case, do not error out on decide, just continue assuming continuity couldn't happen.
capture_exception(e)
handle_feature_flag_exception(e, "[Feature Flags] Error while setting feature flag hash key overrides")
# This is the read-path for experience continuity. We need to get the overrides, and to do that, we get the person_id.
try:
@@ -593,3 +600,27 @@ def set_feature_flag_hash_key_overrides(team_id: int, distinct_ids: List[str], h
time.sleep(retry_delay)
else:
raise e
def handle_feature_flag_exception(err: Exception, log_message: str = ""):
logger.exception(log_message)
reason = parse_exception_for_error_message(err)
FLAG_EVALUATION_ERROR_COUNTER.labels(reason=reason).inc()
if reason == "unknown":
capture_exception(err)
def parse_exception_for_error_message(err: Exception):
reason = "unknown"
if isinstance(err, OperationalError):
if "statement timeout" in str(err):
reason = "timeout"
elif "no more connections" in str(err):
reason = "no_more_connections"
elif isinstance(err, DatabaseError):
if "Failed to fetch conditions" in str(err):
reason = "flag_condition_retry"
elif "Failed to fetch group" in str(err):
reason = "group_mapping_retry"
return reason