Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: warming and filter #25674

Merged
merged 10 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
udf_event_array_filter
  • Loading branch information
aspicer committed Oct 18, 2024
commit 69876b711cef786ad6a1842442fcae4329b28962
19 changes: 19 additions & 0 deletions posthog/hogql_queries/insights/funnels/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,25 @@ def get_step_counts_query(self) -> ast.SelectQuery:
def get_step_counts_without_aggregation_query(self) -> ast.SelectQuery:
raise NotImplementedError()

# This is a simple heuristic to reduce the number of events we look at in UDF funnels (thus are serialized and sent over)
# We remove an event if it matches one or zero steps and there was already the same type of event before and after it (that don't have the same timestamp)
# arrayRotateRight turns [1,2,3] into [3,1,2]
# arrayRotateLeft turns [1,2,3] into [2,3,1]
# For some reason, using these uses much less memory than using indexing in clickhouse to check the previous and next element
def udf_event_array_filter(self, timestamp_index: int, prop_val_index: int, steps_index: int) -> str:
    """Return a ClickHouse ``arrayFilter`` SQL snippet that prunes redundant events from ``events_array``.

    An event is dropped when it matches at most one step and both its rotated
    neighbors (previous/next after sorting) have the same matched-steps array and
    the same breakdown property value, with strictly distinct timestamps.

    Args:
        timestamp_index: 1-based tuple index of the event timestamp in ``events_array`` tuples.
        prop_val_index: 1-based tuple index of the breakdown property value.
        steps_index: 1-based tuple index of the matched-steps array.

    Returns:
        A SQL expression string; callers interpolate it into a larger query.
    """
    return f"""arrayFilter(
    (x, x_before, x_after) -> not (
        length(x.{steps_index}) <= 1
        and x.{steps_index} == x_before.{steps_index}
        and x.{steps_index} == x_after.{steps_index}
        and x.{prop_val_index} == x_before.{prop_val_index}
        and x.{prop_val_index} == x_after.{prop_val_index}
        and x.{timestamp_index} > x_before.{timestamp_index}
        and x.{timestamp_index} < x_after.{timestamp_index}),
    events_array,
    arrayRotateRight(events_array, 1),
    arrayRotateLeft(events_array, 1))"""

@cached_property
def breakdown_cohorts(self) -> list[Cohort]:
team, breakdown = self.context.team, self.context.breakdown
Expand Down
6 changes: 3 additions & 3 deletions posthog/hogql_queries/insights/funnels/funnel_trends_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from posthog.hogql.constants import HogQLQuerySettings
from posthog.hogql.parser import parse_select, parse_expr
from posthog.hogql_queries.insights.funnels import FunnelTrends
from posthog.hogql_queries.insights.funnels.funnel_udf import udf_event_array_filter
from posthog.hogql_queries.insights.utils.utils import get_start_of_interval_hogql_str
from posthog.schema import BreakdownType, BreakdownAttributionType
from posthog.utils import DATERANGE_MAP, relative_date_parse
Expand Down Expand Up @@ -103,15 +102,16 @@ def _inner_aggregation_query(self):
_toUInt64(toDateTime({get_start_of_interval_hogql_str(self.context.interval.value, team=self.context.team, source='timestamp')})),
uuid,
{prop_selector},
arrayFilter((x) -> x != 0, [{steps}{exclusions}])))) as events_array,
arrayFilter((x) -> x != 0, [{steps}{exclusions}])
))) as events_array,
arrayJoin({fn}(
{from_step},
{max_steps},
{self.conversion_window_limit()},
'{breakdown_attribution_string}',
'{self.context.funnelsFilter.funnelOrderType}',
{prop_vals},
{udf_event_array_filter()}
{self.udf_event_array_filter(1, 4, 5)}
)) as af_tuple,
toTimeZone(toDateTime(_toUInt64(af_tuple.1)), '{self.context.team.timezone}') as entrance_period_start,
af_tuple.2 as success_bool,
Expand Down
24 changes: 7 additions & 17 deletions posthog/hogql_queries/insights/funnels/funnel_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,6 @@
HUMAN_READABLE_TIMESTAMP_FORMAT = "%-d-%b-%Y"


# This is a simple heuristic to reduce the number of events we look at in UDF funnels (thus are serialized and sent over)
# We remove an event if it matches one or zero steps and there was already the same type of event before and after it (that don't have the same timestamp)
# arrayRotateRight turns [1,2,3] into [3,1,2]
# arrayRotateLeft turns [1,2,3] into [2,3,1]
# For some reason, using these uses much less memory than using indexing in clickhouse to check the previous and next element
def udf_event_array_filter() -> str:
    """Return a ClickHouse ``arrayFilter`` SQL snippet that prunes redundant events.

    Tuple indices are hard-coded to this query's event-tuple layout:
    ``.1`` timestamp, ``.3`` breakdown property value, ``.4`` matched-steps array.

    Returns:
        A SQL expression string interpolated into the funnel aggregation query.
    """
    return """
    arrayFilter(
        (x, x_before, x_after) -> not (length(x.4) <= 1 and x.4 == x_before.4 and x.4 == x_after.4 and x.3 == x_before.3 and x.3 == x_after.3 and x.1 > x_before.1 and x.1 < x_after.1),
        events_array,
        arrayRotateRight(events_array, 1),
        arrayRotateLeft(events_array, 1))
    """


class FunnelUDF(FunnelBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -92,14 +77,19 @@ def _inner_aggregation_query(self):
inner_select = parse_select(
f"""
SELECT
arraySort(t -> t.1, groupArray(tuple(toFloat(timestamp), uuid, {prop_selector}, arrayFilter((x) -> x != 0, [{steps}{exclusions}])))) as events_array,
arraySort(t -> t.1, groupArray(tuple(
toFloat(timestamp),
uuid,
{prop_selector},
arrayFilter((x) -> x != 0, [{steps}{exclusions}])
))) as events_array,
arrayJoin({fn}(
{self.context.max_steps},
{self.conversion_window_limit()},
'{breakdown_attribution_string}',
'{self.context.funnelsFilter.funnelOrderType}',
{prop_vals},
{udf_event_array_filter()}
{self.udf_event_array_filter(1, 3, 4)}
)) as af_tuple,
af_tuple.1 as step_reached,
af_tuple.1 + 1 as steps, -- Backward compatibility
Expand Down
Loading