feat(experiments): sample size calculator for Funnel (#29487)
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
|
Before Width: | Height: | Size: 279 KiB After Width: | Height: | Size: 286 KiB |
|
Before Width: | Height: | Size: 282 KiB After Width: | Height: | Size: 290 KiB |
|
Before Width: | Height: | Size: 60 KiB After Width: | Height: | Size: 61 KiB |
|
Before Width: | Height: | Size: 59 KiB After Width: | Height: | Size: 61 KiB |
|
Before Width: | Height: | Size: 87 KiB After Width: | Height: | Size: 89 KiB |
|
Before Width: | Height: | Size: 88 KiB After Width: | Height: | Size: 90 KiB |
|
Before Width: | Height: | Size: 66 KiB After Width: | Height: | Size: 67 KiB |
|
Before Width: | Height: | Size: 66 KiB After Width: | Height: | Size: 67 KiB |
@@ -121,7 +121,7 @@ export function ExperimentView(): JSX.Element {
|
||||
/>
|
||||
</div>
|
||||
<div>
|
||||
<span className="card-secondary">Sample size:</span>{' '}
|
||||
<span className="card-secondary">Recommended sample size:</span>{' '}
|
||||
<span className="font-semibold">
|
||||
{humanFriendlyNumber(
|
||||
experiment.parameters.recommended_sample_size || 0,
|
||||
@@ -131,7 +131,7 @@ export function ExperimentView(): JSX.Element {
|
||||
</span>
|
||||
</div>
|
||||
<div>
|
||||
<span className="card-secondary">Running time:</span>{' '}
|
||||
<span className="card-secondary">Estimated running time:</span>{' '}
|
||||
<span className="font-semibold">
|
||||
{humanFriendlyNumber(
|
||||
experiment.parameters.recommended_running_time || 0,
|
||||
|
||||
@@ -21,6 +21,7 @@ export function RunningTimeCalculatorModal(): JSX.Element {
|
||||
uniqueUsers,
|
||||
averageEventsPerUser,
|
||||
averagePropertyValuePerUser,
|
||||
conversionRate,
|
||||
metricResultLoading,
|
||||
} = useValues(runningTimeCalculatorLogic({ experimentId }))
|
||||
const { setMinimumDetectableEffect, setMetricIndex } = useActions(runningTimeCalculatorLogic({ experimentId }))
|
||||
@@ -102,18 +103,20 @@ export function RunningTimeCalculatorModal(): JSX.Element {
|
||||
<Spinner className="text-3xl transform -translate-y-[-10px]" />
|
||||
</div>
|
||||
</div>
|
||||
) : uniqueUsers !== null && standardDeviation !== null ? (
|
||||
) : (
|
||||
<div className="border-t pt-2">
|
||||
<div className="grid grid-cols-3 gap-4">
|
||||
<div>
|
||||
<div className="card-secondary">Unique users</div>
|
||||
<div className="font-semibold">
|
||||
~{humanFriendlyNumber(uniqueUsers || 0, 0)} persons
|
||||
{uniqueUsers !== null && (
|
||||
<div>
|
||||
<div className="card-secondary">Unique users</div>
|
||||
<div className="font-semibold">
|
||||
~{humanFriendlyNumber(uniqueUsers || 0, 0)} persons
|
||||
</div>
|
||||
<div className="text-xs text-muted">
|
||||
Last {TIMEFRAME_HISTORICAL_DATA_DAYS} days
|
||||
</div>
|
||||
</div>
|
||||
<div className="text-xs text-muted">
|
||||
Last {TIMEFRAME_HISTORICAL_DATA_DAYS} days
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
{averageEventsPerUser !== null && (
|
||||
<div>
|
||||
<div className="card-secondary">Avg. events per user</div>
|
||||
@@ -130,15 +133,25 @@ export function RunningTimeCalculatorModal(): JSX.Element {
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
<div>
|
||||
<div className="card-secondary">Estimated standard deviation</div>
|
||||
<div className="font-semibold">
|
||||
~{humanFriendlyNumber(standardDeviation, 0)}
|
||||
{conversionRate !== null && (
|
||||
<div>
|
||||
<div className="card-secondary">Conversion rate</div>
|
||||
<div className="font-semibold">
|
||||
~{humanFriendlyNumber(conversionRate * 100, 2)}%
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
{standardDeviation !== null && (
|
||||
<div>
|
||||
<div className="card-secondary">Estimated standard deviation</div>
|
||||
<div className="font-semibold">
|
||||
~{humanFriendlyNumber(standardDeviation, 0)}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
) : null}
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -119,4 +119,51 @@ describe('runningTimeCalculatorLogic', () => {
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
// Should match https://docs.google.com/spreadsheets/d/11alyC8n7uqewZFLKfV4UAbW-0zH__EdV_Hrk2OQ4140/edit?gid=0#gid=0
|
||||
describe('calculations for FUNNEL', () => {
    beforeEach(() => {
        // Funnel metric stub: only `metric_type` is set for this suite.
        const funnelMetric = {
            metric_type: ExperimentMetricType.FUNNEL,
        } as ExperimentMetric

        // Two variants split 50/50.
        const twoVariantFlag = {
            filters: {
                multivariate: {
                    variants: [
                        { key: 'control', rollout_percentage: 50 },
                        { key: 'test', rollout_percentage: 50 },
                    ],
                },
            },
        } as unknown as FeatureFlagBasicType

        experimentLogic.actions.setExperiment({
            metrics: [funnelMetric],
            feature_flag: twoVariantFlag,
        })

        logic.actions.setMetricIndex(0)
    })

    it('calculates recommended sample size and running time correctly for FUNNEL', async () => {
        logic.actions.setMinimumDetectableEffect(50)
        // Inject the historical data directly instead of running the query.
        logic.actions.setMetricResult({
            uniqueUsers: 1000,
            conversionRate: 0.1,
        })

        await expectLogic(logic).toFinishAllListeners()

        // Sample size: d = 0.5 * 0.1 = 0.05, so 16 * 0.1 * 0.9 / 0.05^2 = 576 per variant,
        // times 2 variants = 1152.
        // NOTE(review): running time is presumably 1152 / (1000 users / 14 days) ~= 16.1 days;
        // the running-time selector is not visible from here, confirm against it.
        await expectLogic(logic).toMatchValues({
            minimumDetectableEffect: 50,
            recommendedSampleSize: expect.closeTo(1152, 0),
            recommendedRunningTime: expect.closeTo(16.1, 1),
        })
    })
})
|
||||
})
|
||||
|
||||
@@ -6,14 +6,122 @@ import { dayjs } from 'lib/dayjs'
|
||||
import { experimentLogic } from 'scenes/experiments/experimentLogic'
|
||||
|
||||
import { performQuery } from '~/queries/query'
|
||||
import { ExperimentMetric, ExperimentMetricType, NodeKind, TrendsQueryResponse } from '~/queries/schema/schema-general'
|
||||
import { BaseMathType, CountPerActorMathType, Experiment, ExperimentMetricMathType, PropertyMathType } from '~/types'
|
||||
import {
|
||||
ExperimentMetric,
|
||||
ExperimentMetricType,
|
||||
FunnelsQuery,
|
||||
NodeKind,
|
||||
TrendsQuery,
|
||||
TrendsQueryResponse,
|
||||
} from '~/queries/schema/schema-general'
|
||||
import {
|
||||
BaseMathType,
|
||||
CountPerActorMathType,
|
||||
Experiment,
|
||||
ExperimentMetricMathType,
|
||||
FunnelVizType,
|
||||
PropertyMathType,
|
||||
} from '~/types'
|
||||
|
||||
import type { runningTimeCalculatorLogicType } from './runningTimeCalculatorLogicType'
|
||||
|
||||
export const TIMEFRAME_HISTORICAL_DATA_DAYS = 14
|
||||
export const VARIANCE_SCALING_FACTOR_COUNT = 2
|
||||
export const VARIANCE_SCALING_FACTOR_CONTINUOUS = 0.25
|
||||
export const VARIANCE_SCALING_FACTOR_TOTAL_COUNT = 2
|
||||
export const VARIANCE_SCALING_FACTOR_SUM = 0.25
|
||||
|
||||
/**
 * Picks the query node kind that corresponds to a metric's config kind.
 *
 * Event configs map to `EventsNode`, action configs map to `ActionsNode`,
 * and every other config kind falls through to `DataWarehouseNode`.
 */
const getKindField = (metric: ExperimentMetric): NodeKind => {
    const configKind = metric.metric_config.kind

    if (configKind === NodeKind.ExperimentEventMetricConfig) {
        return NodeKind.EventsNode
    }
    if (configKind === NodeKind.ExperimentActionMetricConfig) {
        return NodeKind.ActionsNode
    }
    return NodeKind.DataWarehouseNode
}
|
||||
|
||||
/**
 * Returns the identifier used as the series `event` for a metric:
 * the event for event configs, the action for action configs, and the
 * table name for any other config kind.
 */
const getEventField = (metric: ExperimentMetric): string | number => {
    const config = metric.metric_config

    if (config.kind === NodeKind.ExperimentEventMetricConfig) {
        return config.event
    }
    if (config.kind === NodeKind.ExperimentActionMetricConfig) {
        return config.action
    }
    return config.table_name
}
|
||||
|
||||
/**
 * Builds the trends query used for total-count metrics.
 *
 * The query has two series over the metric's event/action/table:
 * unique users first, then the per-actor average count. It spans the last
 * `EXPERIMENT_DEFAULT_DURATION` days up to the end of the current day.
 * Test accounts are filtered only when the experiment's exposure criteria
 * set `filterTestAccounts` to exactly `true`.
 */
const getTotalCountQuery = (metric: ExperimentMetric, experiment: Experiment): TrendsQuery => {
    const kind = getKindField(metric)
    const event = getEventField(metric)

    const dateFormat = 'YYYY-MM-DDTHH:mm'
    const dateFrom = dayjs().subtract(EXPERIMENT_DEFAULT_DURATION, 'day').format(dateFormat)
    const dateTo = dayjs().endOf('d').format(dateFormat)

    return {
        kind: NodeKind.TrendsQuery,
        series: [
            { kind, event, math: BaseMathType.UniqueUsers },
            { kind, event, math: CountPerActorMathType.Average },
        ],
        trendsFilter: {},
        filterTestAccounts: experiment.exposure_criteria?.filterTestAccounts === true,
        dateRange: {
            date_from: dateFrom,
            date_to: dateTo,
            explicitDate: true,
        },
    } as TrendsQuery
}
|
||||
|
||||
/**
 * Builds the trends query used for sum metrics.
 *
 * The query has two series over the metric's event/action/table:
 * unique users first, then the sum of the metric's `math_property`
 * (declared as a numerical event property). It spans the last
 * `EXPERIMENT_DEFAULT_DURATION` days up to the end of the current day.
 * Test accounts are filtered only when the experiment's exposure criteria
 * set `filterTestAccounts` to exactly `true`.
 */
const getSumQuery = (metric: ExperimentMetric, experiment: Experiment): TrendsQuery => {
    const kind = getKindField(metric)
    const event = getEventField(metric)

    const dateFormat = 'YYYY-MM-DDTHH:mm'
    const dateFrom = dayjs().subtract(EXPERIMENT_DEFAULT_DURATION, 'day').format(dateFormat)
    const dateTo = dayjs().endOf('d').format(dateFormat)

    return {
        kind: NodeKind.TrendsQuery,
        series: [
            { kind, event, math: BaseMathType.UniqueUsers },
            {
                kind,
                event,
                math: PropertyMathType.Sum,
                math_property: metric.metric_config.math_property,
                math_property_type: TaxonomicFilterGroupType.NumericalEventProperties,
            },
        ],
        trendsFilter: {},
        filterTestAccounts: experiment.exposure_criteria?.filterTestAccounts === true,
        dateRange: {
            date_from: dateFrom,
            date_to: dateTo,
            explicitDate: true,
        },
    } as TrendsQuery
}
|
||||
|
||||
/**
 * Builds the two-step funnel query used for funnel metrics.
 *
 * Step one is the `$feature_flag_called` event and step two is the metric's
 * own event/action/table. The funnel uses the steps visualisation with a
 * daily interval and spans the last `EXPERIMENT_DEFAULT_DURATION` days up to
 * the end of the current day. Test accounts are filtered only when the
 * experiment's exposure criteria set `filterTestAccounts` to exactly `true`.
 */
const getFunnelQuery = (metric: ExperimentMetric, experiment: Experiment): FunnelsQuery => {
    const exposureStep = {
        kind: NodeKind.EventsNode,
        event: '$feature_flag_called',
    }
    const conversionStep = {
        kind: getKindField(metric),
        event: getEventField(metric),
    }

    const dateFormat = 'YYYY-MM-DDTHH:mm'
    const dateFrom = dayjs().subtract(EXPERIMENT_DEFAULT_DURATION, 'day').format(dateFormat)
    const dateTo = dayjs().endOf('d').format(dateFormat)

    return {
        kind: NodeKind.FunnelsQuery,
        series: [exposureStep, conversionStep],
        funnelsFilter: {
            funnelVizType: FunnelVizType.Steps,
        },
        filterTestAccounts: experiment.exposure_criteria?.filterTestAccounts === true,
        dateRange: {
            date_from: dateFrom,
            date_to: dateTo,
            explicitDate: true,
        },
        interval: 'day',
    } as FunnelsQuery
}
|
||||
|
||||
export interface RunningTimeCalculatorLogicProps {
|
||||
experimentId?: Experiment['id']
|
||||
@@ -31,6 +139,7 @@ export const runningTimeCalculatorLogic = kea<runningTimeCalculatorLogicType>([
|
||||
uniqueUsers: number
|
||||
averageEventsPerUser?: number
|
||||
averagePropertyValuePerUser?: number
|
||||
conversionRate?: number
|
||||
}) => ({ value }),
|
||||
}),
|
||||
reducers({
|
||||
@@ -61,57 +170,14 @@ export const runningTimeCalculatorLogic = kea<runningTimeCalculatorLogicType>([
|
||||
return null
|
||||
}
|
||||
|
||||
const series = []
|
||||
|
||||
const kindField =
|
||||
metric.metric_config.kind === NodeKind.ExperimentEventMetricConfig
|
||||
? NodeKind.EventsNode
|
||||
: metric.metric_config.kind === NodeKind.ExperimentActionMetricConfig
|
||||
? NodeKind.ActionsNode
|
||||
: NodeKind.DataWarehouseNode
|
||||
|
||||
const eventField =
|
||||
metric.metric_config.kind === NodeKind.ExperimentEventMetricConfig
|
||||
? metric.metric_config.event
|
||||
: metric.metric_config.kind === NodeKind.ExperimentActionMetricConfig
|
||||
? metric.metric_config.action
|
||||
: metric.metric_config.table_name
|
||||
|
||||
series.push({
|
||||
kind: kindField,
|
||||
event: eventField,
|
||||
math: BaseMathType.UniqueUsers,
|
||||
})
|
||||
|
||||
if (metric.metric_type === ExperimentMetricType.MEAN) {
|
||||
if (metric.metric_config.math === ExperimentMetricMathType.Sum) {
|
||||
series.push({
|
||||
kind: kindField,
|
||||
event: eventField,
|
||||
math: PropertyMathType.Sum,
|
||||
math_property: metric.metric_config.math_property,
|
||||
math_property_type: TaxonomicFilterGroupType.NumericalEventProperties,
|
||||
})
|
||||
} else {
|
||||
series.push({
|
||||
kind: kindField,
|
||||
event: eventField,
|
||||
math: CountPerActorMathType.Average,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
const query = {
|
||||
kind: NodeKind.TrendsQuery,
|
||||
series,
|
||||
trendsFilter: {},
|
||||
filterTestAccounts: values.experiment.exposure_criteria?.filterTestAccounts === true,
|
||||
dateRange: {
|
||||
date_from: dayjs().subtract(EXPERIMENT_DEFAULT_DURATION, 'day').format('YYYY-MM-DDTHH:mm'),
|
||||
date_to: dayjs().endOf('d').format('YYYY-MM-DDTHH:mm'),
|
||||
explicitDate: true,
|
||||
},
|
||||
}
|
||||
const query =
|
||||
metric.metric_type === ExperimentMetricType.MEAN &&
|
||||
metric.metric_config.math === ExperimentMetricMathType.TotalCount
|
||||
? getTotalCountQuery(metric, values.experiment)
|
||||
: metric.metric_type === ExperimentMetricType.MEAN &&
|
||||
metric.metric_config.math === ExperimentMetricMathType.Sum
|
||||
? getSumQuery(metric, values.experiment)
|
||||
: getFunnelQuery(metric, values.experiment)
|
||||
|
||||
const result = (await performQuery(query)) as Partial<TrendsQueryResponse>
|
||||
|
||||
@@ -125,6 +191,9 @@ export const runningTimeCalculatorLogic = kea<runningTimeCalculatorLogicType>([
|
||||
metric.metric_config.math === ExperimentMetricMathType.Sum
|
||||
? { averagePropertyValuePerUser: result?.results?.[1]?.count ?? null }
|
||||
: {}),
|
||||
...(metric.metric_type === ExperimentMetricType.FUNNEL
|
||||
? { conversionRate: result?.results?.[1]?.count / result?.results?.[0]?.count || null }
|
||||
: {}),
|
||||
}
|
||||
},
|
||||
// For testing purposes, we want to be able set the metric result directly
|
||||
@@ -154,6 +223,10 @@ export const runningTimeCalculatorLogic = kea<runningTimeCalculatorLogicType>([
|
||||
(metricResult: { averagePropertyValuePerUser: number }) =>
|
||||
metricResult?.averagePropertyValuePerUser ?? null,
|
||||
],
|
||||
conversionRate: [
|
||||
(s) => [s.metricResult],
|
||||
(metricResult: { conversionRate: number }) => metricResult?.conversionRate ?? null,
|
||||
],
|
||||
variance: [
|
||||
(s) => [s.metric, s.averageEventsPerUser, s.averagePropertyValuePerUser],
|
||||
(metric: ExperimentMetric, averageEventsPerUser: number, averagePropertyValuePerUser: number) => {
|
||||
@@ -165,17 +238,17 @@ export const runningTimeCalculatorLogic = kea<runningTimeCalculatorLogicType>([
|
||||
metric.metric_type === ExperimentMetricType.MEAN &&
|
||||
metric.metric_config.math === ExperimentMetricMathType.TotalCount
|
||||
) {
|
||||
return VARIANCE_SCALING_FACTOR_COUNT * averageEventsPerUser
|
||||
return VARIANCE_SCALING_FACTOR_TOTAL_COUNT * averageEventsPerUser
|
||||
} else if (
|
||||
metric.metric_type === ExperimentMetricType.MEAN &&
|
||||
metric.metric_config.math === ExperimentMetricMathType.Sum
|
||||
) {
|
||||
return VARIANCE_SCALING_FACTOR_CONTINUOUS * averagePropertyValuePerUser ** 2
|
||||
return VARIANCE_SCALING_FACTOR_SUM * averagePropertyValuePerUser ** 2
|
||||
}
|
||||
return null
|
||||
},
|
||||
],
|
||||
standardDeviation: [(s) => [s.variance], (variance: number) => Math.sqrt(variance)],
|
||||
standardDeviation: [(s) => [s.variance], (variance: number) => (variance ? Math.sqrt(variance) : null)],
|
||||
numberOfVariants: [
|
||||
(s) => [s.experiment],
|
||||
(experiment: Experiment) => experiment.feature_flag?.filters.multivariate?.variants.length,
|
||||
@@ -187,6 +260,7 @@ export const runningTimeCalculatorLogic = kea<runningTimeCalculatorLogicType>([
|
||||
s.variance,
|
||||
s.averageEventsPerUser,
|
||||
s.averagePropertyValuePerUser,
|
||||
s.conversionRate,
|
||||
s.numberOfVariants,
|
||||
],
|
||||
(
|
||||
@@ -195,6 +269,7 @@ export const runningTimeCalculatorLogic = kea<runningTimeCalculatorLogicType>([
|
||||
variance: number,
|
||||
averageEventsPerUser: number,
|
||||
averagePropertyValuePerUser: number,
|
||||
conversionRate: number,
|
||||
numberOfVariants: number
|
||||
): number | null => {
|
||||
if (!metric) {
|
||||
@@ -204,38 +279,96 @@ export const runningTimeCalculatorLogic = kea<runningTimeCalculatorLogicType>([
|
||||
const minimumDetectableEffectDecimal = minimumDetectableEffect / 100
|
||||
|
||||
let d // Represents the absolute effect size (difference we want to detect)
|
||||
let sampleSizeFormula // The correct sample size formula for each metric type
|
||||
|
||||
if (
|
||||
metric.metric_type === ExperimentMetricType.MEAN &&
|
||||
metric.metric_config.math === ExperimentMetricMathType.TotalCount
|
||||
) {
|
||||
/*
|
||||
Count Per User Metric:
|
||||
- "mean" is the average number of events per user (e.g., clicks per user).
|
||||
- MDE is applied as a percentage of this mean to compute `d`.
|
||||
|
||||
Formula:
|
||||
d = MDE * averageEventsPerUser
|
||||
*/
|
||||
d = minimumDetectableEffectDecimal * averageEventsPerUser
|
||||
|
||||
/*
|
||||
Sample size formula:
|
||||
|
||||
N = (16 * variance) / d^2
|
||||
|
||||
Where:
|
||||
- `16` comes from statistical power analysis:
|
||||
- Based on a 95% confidence level (Z_alpha/2 = 1.96) and 80% power (Z_beta = 0.84),
|
||||
the combined squared Z-scores yield approximately 16.
|
||||
- `variance` is the estimated variance of the event count per user.
|
||||
- `d` is the absolute effect size (MDE * mean).
|
||||
*/
|
||||
sampleSizeFormula = (16 * variance) / d ** 2
|
||||
} else if (
|
||||
metric.metric_type === ExperimentMetricType.MEAN &&
|
||||
metric.metric_config.math === ExperimentMetricMathType.Sum
|
||||
) {
|
||||
/*
|
||||
Continuous property metric:
|
||||
- "mean" is the average value of the measured property per user (e.g., revenue per user).
|
||||
- MDE is applied as a percentage of this mean to compute `d`.
|
||||
|
||||
Formula:
|
||||
d = MDE * averagePropertyValuePerUser
|
||||
*/
|
||||
d = minimumDetectableEffectDecimal * averagePropertyValuePerUser
|
||||
|
||||
/*
|
||||
Sample Size Formula for Continuous metrics:
|
||||
|
||||
N = (16 * variance) / d^2
|
||||
|
||||
Where:
|
||||
- `variance` is the estimated variance of the continuous property.
|
||||
- The formula is identical to the Count metric case.
|
||||
*/
|
||||
sampleSizeFormula = (16 * variance) / d ** 2
|
||||
} else if (metric.metric_type === ExperimentMetricType.FUNNEL) {
|
||||
/*
|
||||
Binomial metric (conversion rate):
|
||||
- Here, "mean" does not exist in the same way as for count/continuous metrics.
|
||||
- Instead, we use `p`, the baseline conversion rate (historical probability of success).
|
||||
- MDE is applied as a relative percentage change to `p`; the absolute effect size is `d = MDE * p`.
|
||||
|
||||
Formula:
|
||||
d = MDE * conversionRate
|
||||
*/
|
||||
d = minimumDetectableEffectDecimal * conversionRate
|
||||
|
||||
/*
|
||||
Sample size formula:
|
||||
|
||||
N = (16 * p * (1 - p)) / d^2
|
||||
|
||||
Where:
|
||||
- `p` is the historical conversion rate (baseline success probability).
|
||||
- `d` is the absolute effect size, MDE * p (e.g., a 50% relative MDE on p = 0.1 means `d = 0.05`).
|
||||
- The variance is inherent in `p(1 - p)`, which represents binomial variance.
|
||||
*/
|
||||
if (conversionRate !== null) {
|
||||
sampleSizeFormula = (16 * conversionRate * (1 - conversionRate)) / d ** 2
|
||||
} else {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
if (!d) {
|
||||
if (!d || !sampleSizeFormula) {
|
||||
return null
|
||||
}
|
||||
|
||||
/*
|
||||
N = (16 * variance) / d^2
|
||||
|
||||
Where:
|
||||
- `16` comes from statistical power analysis:
|
||||
- Based on a 95% confidence level (Z_alpha/2 = 1.96) and 80% power (Z_beta = 0.84),
|
||||
the combined squared Z-scores yield approximately 16.
|
||||
- `variance` is the estimated variance of the metric being measured.
|
||||
- `d` is the absolute effect size (MDE * mean).
|
||||
- The formula ensures that larger variance increases required sample size,
|
||||
and smaller detectable effects (MDE) also require more samples.
|
||||
*/
|
||||
return ((16 * variance) / (d * d)) * numberOfVariants
|
||||
return sampleSizeFormula * numberOfVariants
|
||||
},
|
||||
],
|
||||
|
||||
recommendedRunningTime: [
|
||||
(s) => [s.recommendedSampleSize, s.uniqueUsers],
|
||||
(recommendedSampleSize: number, uniqueUsers: number): number => {
|
||||
|
||||