fix(retention): Properly apply Monday as the first day of the week (#18481)

* fix(retention): Properly apply Monday as the first day of the week

* Update query snapshots

---------

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Michael Matloka
2023-11-08 21:29:44 +00:00
committed by GitHub
parent b3d88eaaee
commit f986c242d5
5 changed files with 249 additions and 55 deletions

View File

@@ -5,7 +5,7 @@
NULL as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target
FROM events e
WHERE team_id = 2
@@ -17,13 +17,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
@@ -70,7 +70,7 @@
[0] as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target
FROM events e
WHERE team_id = 2
@@ -82,13 +82,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
@@ -128,7 +128,7 @@
NULL as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_1" as target
FROM events e
WHERE team_id = 2
@@ -140,13 +140,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_1" as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
@@ -190,7 +190,7 @@
NULL as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target
FROM events e
WHERE team_id = 2
@@ -202,13 +202,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
@@ -255,7 +255,7 @@
[0] as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target
FROM events e
WHERE team_id = 2
@@ -267,13 +267,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
@@ -313,7 +313,7 @@
NULL as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_1" as target
FROM events e
WHERE team_id = 2
@@ -325,13 +325,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_1" as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
@@ -375,7 +375,7 @@
NULL as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e.person_id as target
FROM events e
LEFT JOIN
@@ -394,13 +394,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e.person_id as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
@@ -451,7 +451,7 @@
NULL as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e.person_id as target
FROM events e
LEFT JOIN
@@ -470,13 +470,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e.person_id as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
@@ -632,7 +632,7 @@
[0] as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target
FROM events e
WHERE team_id = 2
@@ -644,13 +644,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
@@ -693,7 +693,7 @@
[0] as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target
FROM events e
WHERE team_id = 2
@@ -705,13 +705,13 @@
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC')) AS event_date,
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
e."$group_0" as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC')),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'))
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e

View File

@@ -76,10 +76,16 @@ class RetentionDateDerivedMixin(PeriodMixin, TotalIntervalsMixin, DateMixin, Sel
if self.period == "Hour":
return self.date_to - tdelta
elif self.period == "Week":
date_from = self.date_to - tdelta
return date_from - timedelta(days=date_from.isoweekday() % 7)
date_from: datetime = self.date_to - tdelta
week_start_alignment_days = date_from.isoweekday() % 7
if team := getattr(self, "team", None):
from posthog.models.team.team import WeekStartDay
if team.week_start_day == WeekStartDay.MONDAY:
week_start_alignment_days = date_from.weekday()
return date_from - timedelta(days=week_start_alignment_days)
else:
date_to = self.date_to.replace(hour=0, minute=0, second=0, microsecond=0)
date_to: datetime = self.date_to.replace(hour=0, minute=0, second=0, microsecond=0)
return date_to - tdelta
@cached_property

View File

@@ -459,3 +459,147 @@
intervals_from_base
'
---
# name: TestFOSSRetention.test_week_interval
'
WITH actor_query AS
(WITH 'Week' as period,
NULL as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
pdi.person_id as target
FROM events e
INNER JOIN
(SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 2
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0) AS pdi ON e.distinct_id = pdi.distinct_id
WHERE team_id = 2
AND e.event = '$pageview'
AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00', 'UTC')
AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00', 'UTC')
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0) AS event_date,
pdi.person_id as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-07 00:00:00', 'UTC'), 0),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 0)
)
] as breakdown_values
FROM events e
INNER JOIN
(SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 2
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0) AS pdi ON e.distinct_id = pdi.distinct_id
WHERE team_id = 2
AND e.event = '$pageview'
AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00', 'UTC')
AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00', 'UTC') ) SELECT DISTINCT breakdown_values,
intervals_from_base,
actor_id
FROM
(SELECT target_event.breakdown_values AS breakdown_values,
datediff(period, target_event.event_date, returning_event.event_date) AS intervals_from_base,
returning_event.target AS actor_id
FROM target_event_query AS target_event
JOIN returning_event_query AS returning_event ON returning_event.target = target_event.target
WHERE returning_event.event_date > target_event.event_date
UNION ALL SELECT target_event.breakdown_values AS breakdown_values,
0 AS intervals_from_base,
target_event.target AS actor_id
FROM target_event_query AS target_event)
WHERE (breakdown_values_filter is NULL
OR breakdown_values = breakdown_values_filter)
AND (selected_interval is NULL
OR intervals_from_base = selected_interval) )
SELECT actor_activity.breakdown_values AS breakdown_values,
actor_activity.intervals_from_base AS intervals_from_base,
COUNT(DISTINCT actor_activity.actor_id) AS count
FROM actor_query AS actor_activity
GROUP BY breakdown_values,
intervals_from_base
ORDER BY breakdown_values,
intervals_from_base
'
---
# name: TestFOSSRetention.test_week_interval.1
'
WITH actor_query AS
(WITH 'Week' as period,
NULL as breakdown_values_filter,
NULL as selected_interval,
returning_event_query as
(SELECT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 3) AS event_date,
pdi.person_id as target
FROM events e
INNER JOIN
(SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 2
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0) AS pdi ON e.distinct_id = pdi.distinct_id
WHERE team_id = 2
AND e.event = '$pageview'
AND toDateTime(e.timestamp) >= toDateTime('2020-06-08 00:00:00', 'UTC')
AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00', 'UTC')
GROUP BY target,
event_date),
target_event_query as
(SELECT DISTINCT toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 3) AS event_date,
pdi.person_id as target,
[
dateDiff(
'Week',
toStartOfWeek(toDateTime('2020-06-08 00:00:00', 'UTC'), 3),
toStartOfWeek(toTimeZone(toDateTime(e.timestamp, 'UTC'), 'UTC'), 3)
)
] as breakdown_values
FROM events e
INNER JOIN
(SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 2
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0) AS pdi ON e.distinct_id = pdi.distinct_id
WHERE team_id = 2
AND e.event = '$pageview'
AND toDateTime(e.timestamp) >= toDateTime('2020-06-08 00:00:00', 'UTC')
AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00', 'UTC') ) SELECT DISTINCT breakdown_values,
intervals_from_base,
actor_id
FROM
(SELECT target_event.breakdown_values AS breakdown_values,
datediff(period, target_event.event_date, returning_event.event_date) AS intervals_from_base,
returning_event.target AS actor_id
FROM target_event_query AS target_event
JOIN returning_event_query AS returning_event ON returning_event.target = target_event.target
WHERE returning_event.event_date > target_event.event_date
UNION ALL SELECT target_event.breakdown_values AS breakdown_values,
0 AS intervals_from_base,
target_event.target AS actor_id
FROM target_event_query AS target_event)
WHERE (breakdown_values_filter is NULL
OR breakdown_values = breakdown_values_filter)
AND (selected_interval is NULL
OR intervals_from_base = selected_interval) )
SELECT actor_activity.breakdown_values AS breakdown_values,
actor_activity.intervals_from_base AS intervals_from_base,
COUNT(DISTINCT actor_activity.actor_id) AS count
FROM actor_query AS actor_activity
GROUP BY breakdown_values,
intervals_from_base
ORDER BY breakdown_values,
intervals_from_base
'
---

View File

@@ -419,6 +419,7 @@ def retention_test_factory(retention):
],
)
@snapshot_clickhouse_queries
def test_week_interval(self):
_create_person(
team=self.team,
@@ -449,24 +450,27 @@ def retention_test_factory(retention):
],
)
result = retention().run(
RetentionFilter(
data={
"date_to": _date(10, month=1, hour=0),
"period": "Week",
"total_intervals": 7,
}
),
test_filter = RetentionFilter(
data={
"date_to": _date(10, month=1, hour=0),
"period": "Week",
"total_intervals": 7,
}
)
# Starting with Sunday
result_sunday = retention().run(
test_filter,
self.team,
)
self.assertEqual(
pluck(result, "label"),
pluck(result_sunday, "label"),
["Week 0", "Week 1", "Week 2", "Week 3", "Week 4", "Week 5", "Week 6"],
)
self.assertEqual(
pluck(result, "values", "count"),
pluck(result_sunday, "values", "count"),
[
[2, 2, 1, 2, 2, 0, 1],
[2, 1, 2, 2, 0, 1],
@@ -479,7 +483,7 @@ def retention_test_factory(retention):
)
self.assertEqual(
pluck(result, "date"),
pluck(result_sunday, "date"),
[
datetime(2020, 6, 7, 0, tzinfo=ZoneInfo("UTC")),
datetime(2020, 6, 14, 0, tzinfo=ZoneInfo("UTC")),
@@ -491,6 +495,46 @@ def retention_test_factory(retention):
],
)
# Starting with Monday
self.team.week_start_day = 1 # WeekStartDay.MONDAY's concrete value
self.team.save()
result_monday = retention().run(
test_filter,
self.team,
)
self.assertEqual(
pluck(result_monday, "label"),
["Week 0", "Week 1", "Week 2", "Week 3", "Week 4", "Week 5", "Week 6"],
)
self.assertEqual(
pluck(result_monday, "values", "count"),
[
[2, 2, 1, 2, 2, 0, 1],
[2, 1, 2, 2, 0, 1],
[1, 1, 1, 0, 0],
[2, 2, 0, 1],
[2, 0, 1],
[0, 0],
[1],
],
)
self.assertEqual(
pluck(result_monday, "date"),
[
datetime(2020, 6, 8, 0, tzinfo=ZoneInfo("UTC")),
datetime(2020, 6, 15, 0, tzinfo=ZoneInfo("UTC")),
datetime(2020, 6, 22, 0, tzinfo=ZoneInfo("UTC")),
datetime(2020, 6, 29, 0, tzinfo=ZoneInfo("UTC")),
datetime(2020, 7, 6, 0, tzinfo=ZoneInfo("UTC")),
datetime(2020, 7, 13, 0, tzinfo=ZoneInfo("UTC")),
datetime(2020, 7, 20, 0, tzinfo=ZoneInfo("UTC")),
],
)
def test_hour_interval(self):
_create_person(
team=self.team,

View File

@@ -107,7 +107,7 @@ def get_start_of_interval_sql(
elif "%(timezone)s" not in source:
source = f"toTimeZone(toDateTime({source}, 'UTC'), %(timezone)s)"
trunc_func_args = [source]
if interval == "week":
if trunc_func == "toStartOfWeek":
trunc_func_args.append((WeekStartDay(team.week_start_day or 0)).clickhouse_mode)
interval_sql = f"{trunc_func}({', '.join(trunc_func_args)})"
# For larger intervals dates are returned instead of datetimes, and we always want datetimes for comparisons