mirror of https://github.com/BillyOutlast/posthog.git
synced 2026-02-04 03:01:23 +01:00
205 lines · 7.9 KiB · Python
"""
|
|
Shared utilities for experiment-related Dagster schedules and sensors.
|
|
"""
|
|
|
|
from datetime import time
|
|
|
|
from django.db import connection
|
|
from django.db.models import Q
|
|
|
|
import dagster
|
|
|
|
from posthog.schema import ExperimentQueryResponse
|
|
|
|
from posthog.models.experiment import Experiment
|
|
from posthog.models.team import Team
|
|
|
|
# Default hour (UTC) for experiment recalculation when team has no specific time set.
# Teams with experiment_recalculation_time=NULL are picked up at this hour by
# schedule_experiment_metric_partitions below.
DEFAULT_EXPERIMENT_RECALCULATION_HOUR: int = 2  # 02:00 UTC
|
|
|
|
|
|
def remove_step_sessions_from_experiment_result(result: ExperimentQueryResponse) -> ExperimentQueryResponse:
    """
    Strip step_sessions values from an experiment result to reduce API response size.

    Mutates the given response in place (baseline and every variant result) and
    returns the same object for convenience.
    """
    baseline = result.baseline
    if baseline is not None:
        baseline.step_sessions = None

    variants = result.variant_results
    if variants is not None:
        for variant_result in variants:
            variant_result.step_sessions = None

    return result
|
|
|
|
|
|
def _parse_partition_key(partition_key: str) -> tuple[int, str, str]:
|
|
"""
|
|
Parse partition key to extract experiment ID, metric UUID, and fingerprint.
|
|
|
|
The partition key format is: experiment_{id}_metric_{uuid}_{fingerprint}
|
|
"""
|
|
parts = partition_key.split("_")
|
|
if len(parts) < 5 or parts[0] != "experiment" or parts[2] != "metric":
|
|
raise ValueError(f"Invalid partition key format: {partition_key}")
|
|
|
|
try:
|
|
experiment_id = int(parts[1])
|
|
metric_uuid = parts[3]
|
|
fingerprint = parts[4]
|
|
return experiment_id, metric_uuid, fingerprint
|
|
except ValueError as e:
|
|
raise ValueError(f"Failed to parse partition key {partition_key}: {e}")
|
|
|
|
|
|
def schedule_experiment_metric_partitions(
    context: dagster.ScheduleEvaluationContext,
    partition_name: str,
) -> list[dagster.RunRequest] | dagster.SkipReason:
    """
    Get experiment partitions that should run at the current scheduled hour based on team settings.

    This function filters experiments by their team's configured recalculation time and returns
    RunRequests for the matching partitions.

    Args:
        context: Dagster schedule evaluation context
        partition_name: Name of the dynamic partition set (e.g., "experiment_regular_metrics")

    Returns:
        List of RunRequests for partitions to process, or SkipReason if none found

    Raises:
        dagster.Failure: If any step fails; the original exception is chained as the cause.
    """
    try:
        connection.close()  # Reset the (possibly stale) Django DB connection before querying

        current_hour = context.scheduled_execution_time.hour
        target_time = time(current_hour, 0, 0)

        # Teams without an explicit recalculation time fall back to the default hour.
        if current_hour == DEFAULT_EXPERIMENT_RECALCULATION_HOUR:
            # At default hour, include teams with NULL (not set) or explicitly set to this hour
            time_filter = Q(experiment_recalculation_time=target_time) | Q(experiment_recalculation_time__isnull=True)
        else:
            # At other hours, only include teams explicitly set to this hour
            time_filter = Q(experiment_recalculation_time=target_time)

        # Running (started, not ended), non-deleted timeseries experiments whose team
        # is scheduled at this hour.
        target_experiment_ids = set(
            Experiment.objects.filter(
                deleted=False,
                stats_config__timeseries=True,
                start_date__isnull=False,
                end_date__isnull=True,
                team__in=Team.objects.filter(time_filter),
            ).values_list("id", flat=True)
        )

        if not target_experiment_ids:
            return dagster.SkipReason(f"No experiments found for teams scheduled at {current_hour}:00 UTC")

        all_partitions = list(context.instance.get_dynamic_partitions(partition_name))

        if not all_partitions:
            return dagster.SkipReason(f"No {partition_name} partitions exist")

        # Keep only partitions that belong to the experiments selected above;
        # malformed keys are logged and skipped rather than failing the schedule.
        partitions_to_run = []
        for partition_key in all_partitions:
            try:
                experiment_id, _, _ = _parse_partition_key(partition_key)
            except ValueError:
                context.log.warning(f"Skipping partition with invalid key format: {partition_key}")
                continue
            if experiment_id in target_experiment_ids:
                partitions_to_run.append(partition_key)

        if not partitions_to_run:
            return dagster.SkipReason(f"No partitions to process for teams at {current_hour}:00 UTC")

        context.log.info(
            f"Scheduling refresh for {len(partitions_to_run)} partitions from {partition_name} for teams at {current_hour}:00 UTC"
        )

        return [
            dagster.RunRequest(
                # Hour-granular run key deduplicates repeat evaluations within the same hour.
                run_key=f"scheduled_{partition_key}_{context.scheduled_execution_time.strftime('%Y%m%d_%H')}",
                partition_key=partition_key,
            )
            for partition_key in partitions_to_run
        ]

    except Exception as e:
        context.log.exception(f"Failed to schedule refresh for {partition_name}")
        # Chain the cause (`from e`) so the original traceback survives into the Dagster failure.
        raise dagster.Failure(f"Failed to schedule refresh for {partition_name}: {e}") from e
|
|
|
|
|
|
def discover_experiment_metric_partitions(
    context: dagster.SensorEvaluationContext,
    partition_name: str,
    partitions_def: dagster.DynamicPartitionsDefinition,
    get_metrics_fn,
) -> dagster.SensorResult | dagster.SkipReason:
    """
    Automatically discover new experiment-metric combinations and trigger timeseries calculation.

    This function continuously monitors for new experiments or metrics that need timeseries
    analysis. When new combinations are found, it creates dynamic partitions and triggers
    processing only for the new partitions.

    Args:
        context: Dagster sensor evaluation context
        partition_name: Name of the dynamic partition set (e.g., "experiment_regular_metrics")
        partitions_def: Dynamic partitions definition object
        get_metrics_fn: Function to get current experiment-metric combinations

    Returns:
        SensorResult with run requests and partition requests, or SkipReason if none found

    Raises:
        dagster.Failure: If discovery fails; the original exception is chained as the cause.
    """
    try:
        connection.close()  # Reset the (possibly stale) Django DB connection before querying

        current_experiment_metrics = get_metrics_fn(context)
        if not current_experiment_metrics:
            context.log.debug(f"No {partition_name} found for timeseries analysis")
            return dagster.SkipReason(f"No experiments with {partition_name} found")

        # Generate partition keys in format: experiment_{id}_metric_{uuid}_{fingerprint}
        current_partition_keys = [
            f"experiment_{exp_id}_metric_{metric_uuid}_{fingerprint}"
            for exp_id, metric_uuid, fingerprint in current_experiment_metrics
        ]

        # Check which partitions are new
        existing_partitions = set(context.instance.get_dynamic_partitions(partition_name))
        new_partitions = [key for key in current_partition_keys if key not in existing_partitions]

        if not new_partitions:
            context.log.debug(f"No new {partition_name} discovered for timeseries analysis")
            return dagster.SkipReason(f"No new {partition_name} to process")

        context.log.info(
            f"Discovered {len(new_partitions)} new {partition_name} combinations for timeseries analysis"
        )

        # Register the new partitions and trigger processing for them only.
        dynamic_partitions_requests = [partitions_def.build_add_request(new_partitions)]
        run_requests = [
            dagster.RunRequest(
                # Cursor-scoped run key deduplicates repeat evaluations at the same cursor.
                run_key=f"sensor_{partition_key}_{context.cursor or 'initial'}",
                partition_key=partition_key,
            )
            for partition_key in new_partitions
        ]

        return dagster.SensorResult(
            run_requests=run_requests,
            dynamic_partitions_requests=dynamic_partitions_requests,
        )

    except Exception as e:
        context.log.exception(f"Failed to discover {partition_name} experiments")
        # Chain the cause (`from e`) so the original traceback survives into the Dagster failure.
        raise dagster.Failure(f"Failed to discover {partition_name} experiments: {e}") from e
|