mirror of
https://github.com/BillyOutlast/posthog.git
synced 2026-02-04 03:01:23 +01:00
feat(clickhouse): Start writing property groups on events (#24152)
Co-authored-by: Daesgar <daesgar90@gmail.com> Co-authored-by: James Greenhill <fuziontech@gmail.com>
This commit is contained in:
@@ -0,0 +1,10 @@
|
||||
from posthog.clickhouse.client.migration_tools import run_sql_with_exceptions
|
||||
from posthog.clickhouse.property_groups import sharded_events_property_groups
|
||||
|
||||
operations = [
|
||||
run_sql_with_exceptions(statement)
|
||||
for statement in [
|
||||
*sharded_events_property_groups.get_alter_create_statements("custom"),
|
||||
*sharded_events_property_groups.get_alter_create_statements("feature_flags"),
|
||||
]
|
||||
]
|
||||
75
posthog/clickhouse/property_groups.py
Normal file
75
posthog/clickhouse/property_groups.py
Normal file
@@ -0,0 +1,75 @@
|
||||
from collections.abc import Iterable, MutableMapping
|
||||
from dataclasses import dataclass
|
||||
|
||||
from posthog import settings
|
||||
|
||||
|
||||
@dataclass
|
||||
class PropertyGroupDefinition:
|
||||
filter_expression: str
|
||||
codec: str = "ZSTD(1)"
|
||||
|
||||
|
||||
class PropertyGroupManager:
|
||||
def __init__(self, cluster: str, table: str, source_column: str) -> None:
|
||||
self.__cluster = cluster
|
||||
self.__table = table
|
||||
self.__source_column = source_column
|
||||
self.__groups: MutableMapping[str, PropertyGroupDefinition] = {}
|
||||
|
||||
def register(self, name: str, definition: PropertyGroupDefinition) -> None:
|
||||
assert name not in self.__groups, "property group names can only be used once"
|
||||
self.__groups[name] = definition
|
||||
|
||||
def __get_map_expression(self, definition: PropertyGroupDefinition) -> str:
|
||||
return f"mapSort(mapFilter((key, _) -> {definition.filter_expression}, CAST(JSONExtractKeysAndValues({self.__source_column}, 'String'), 'Map(String, String)')))"
|
||||
|
||||
def get_alter_create_statements(self, name: str) -> Iterable[str]:
|
||||
column_name = f"{self.__source_column}_group_{name}"
|
||||
definition = self.__groups[name]
|
||||
return [
|
||||
f"ALTER TABLE {self.__table} ON CLUSTER {self.__cluster} ADD COLUMN {column_name} Map(String, String) MATERIALIZED {self.__get_map_expression(definition)} CODEC({definition.codec})",
|
||||
f"ALTER TABLE {self.__table} ON CLUSTER {self.__cluster} ADD INDEX {column_name}_keys_bf mapKeys({column_name}) TYPE bloom_filter",
|
||||
f"ALTER TABLE {self.__table} ON CLUSTER {self.__cluster} ADD INDEX {column_name}_values_bf mapValues({column_name}) TYPE bloom_filter",
|
||||
]
|
||||
|
||||
|
||||
sharded_events_property_groups = PropertyGroupManager(settings.CLICKHOUSE_CLUSTER, "sharded_events", "properties")
|
||||
|
||||
ignore_custom_properties = [
|
||||
# `token` & `distinct_id` properties are sent with ~50% of events and by
|
||||
# many teams, and should not be treated as custom properties and their use
|
||||
# should be optimized separately
|
||||
"token",
|
||||
"distinct_id",
|
||||
# campaign properties are defined by external entities and are commonly used
|
||||
# across a large number of teams, and should also be optimized separately
|
||||
"utm_source",
|
||||
"utm_medium",
|
||||
"utm_campaign",
|
||||
"utm_content",
|
||||
"utm_term",
|
||||
"gclid", # google ads
|
||||
"gad_source", # google ads
|
||||
"gclsrc", # google ads 360
|
||||
"dclid", # google display ads
|
||||
"gbraid", # google ads, web to app
|
||||
"wbraid", # google ads, app to web
|
||||
"fbclid", # facebook
|
||||
"msclkid", # microsoft
|
||||
"twclid", # twitter
|
||||
"li_fat_id", # linkedin
|
||||
"mc_cid", # mailchimp campaign id
|
||||
"igshid", # instagram
|
||||
"ttclid", # tiktok
|
||||
"rdt_cid", # reddit
|
||||
]
|
||||
|
||||
sharded_events_property_groups.register(
|
||||
"custom",
|
||||
PropertyGroupDefinition(
|
||||
f"key NOT LIKE '$%' AND key NOT IN (" + f", ".join(f"'{name}'" for name in ignore_custom_properties) + f")"
|
||||
),
|
||||
)
|
||||
|
||||
sharded_events_property_groups.register("feature_flags", PropertyGroupDefinition("key like '$feature/%'"))
|
||||
Reference in New Issue
Block a user