Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: warming and filter #25674

Merged
merged 10 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
udf_event_array_filter
  • Loading branch information
aspicer committed Oct 18, 2024
commit 69876b711cef786ad6a1842442fcae4329b28962
19 changes: 19 additions & 0 deletions posthog/hogql_queries/insights/funnels/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,25 @@ def get_step_counts_query(self) -> ast.SelectQuery:
def get_step_counts_without_aggregation_query(self) -> ast.SelectQuery:
raise NotImplementedError()

# This is a simple heuristic to reduce the number of events we look at in UDF funnels (thus are serialized and sent over)
# We remove an event if it matches one or zero steps and there was already the same type of event before and after it (that don't have the same timestamp)
# arrayRotateRight turns [1,2,3] into [3,1,2]
# arrayRotateLeft turns [1,2,3] into [2,3,1]
# For some reason, using these uses much less memory than using indexing in clickhouse to check the previous and next element
def udf_event_array_filter(self, timestamp_index: int, prop_val_index: int, steps_index: int) -> str:
    """Return a ClickHouse ``arrayFilter`` SQL snippet that prunes redundant events from ``events_array``.

    An event is dropped when it matches at most one step and both its rotated
    neighbors (previous/next after sorting) have the same matched-steps array and
    the same breakdown property value, with strictly distinct timestamps.

    Args:
        timestamp_index: 1-based tuple index of the event timestamp in ``events_array`` tuples.
        prop_val_index: 1-based tuple index of the breakdown property value.
        steps_index: 1-based tuple index of the matched-steps array.

    Returns:
        A SQL expression string; callers interpolate it into a larger query.
    """
    return f"""arrayFilter(
    (x, x_before, x_after) -> not (
        length(x.{steps_index}) <= 1
        and x.{steps_index} == x_before.{steps_index}
        and x.{steps_index} == x_after.{steps_index}
        and x.{prop_val_index} == x_before.{prop_val_index}
        and x.{prop_val_index} == x_after.{prop_val_index}
        and x.{timestamp_index} > x_before.{timestamp_index}
        and x.{timestamp_index} < x_after.{timestamp_index}),
    events_array,
    arrayRotateRight(events_array, 1),
    arrayRotateLeft(events_array, 1))"""

@cached_property
def breakdown_cohorts(self) -> list[Cohort]:
team, breakdown = self.context.team, self.context.breakdown
Expand Down
6 changes: 3 additions & 3 deletions posthog/hogql_queries/insights/funnels/funnel_trends_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from posthog.hogql.constants import HogQLQuerySettings
from posthog.hogql.parser import parse_select, parse_expr
from posthog.hogql_queries.insights.funnels import FunnelTrends
from posthog.hogql_queries.insights.funnels.funnel_udf import udf_event_array_filter
from posthog.hogql_queries.insights.utils.utils import get_start_of_interval_hogql_str
from posthog.schema import BreakdownType, BreakdownAttributionType
from posthog.utils import DATERANGE_MAP, relative_date_parse
Expand Down Expand Up @@ -103,15 +102,16 @@ def _inner_aggregation_query(self):
_toUInt64(toDateTime({get_start_of_interval_hogql_str(self.context.interval.value, team=self.context.team, source='timestamp')})),
uuid,
{prop_selector},
arrayFilter((x) -> x != 0, [{steps}{exclusions}])))) as events_array,
arrayFilter((x) -> x != 0, [{steps}{exclusions}])
))) as events_array,
arrayJoin({fn}(
{from_step},
{max_steps},
{self.conversion_window_limit()},
'{breakdown_attribution_string}',
'{self.context.funnelsFilter.funnelOrderType}',
{prop_vals},
{udf_event_array_filter()}
{self.udf_event_array_filter(1, 4, 5)}
)) as af_tuple,
toTimeZone(toDateTime(_toUInt64(af_tuple.1)), '{self.context.team.timezone}') as entrance_period_start,
af_tuple.2 as success_bool,
Expand Down
24 changes: 7 additions & 17 deletions posthog/hogql_queries/insights/funnels/funnel_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,6 @@
HUMAN_READABLE_TIMESTAMP_FORMAT = "%-d-%b-%Y"


# This is a simple heuristic to reduce the number of events we look at in UDF funnels (thus are serialized and sent over)
# We remove an event if it matches one or zero steps and there was already the same type of event before and after it (that don't have the same timestamp)
# arrayRotateRight turns [1,2,3] into [3,1,2]
# arrayRotateLeft turns [1,2,3] into [2,3,1]
# For some reason, using these uses much less memory than using indexing in clickhouse to check the previous and next element
def udf_event_array_filter() -> str:
    """Return a ClickHouse ``arrayFilter`` SQL snippet that prunes redundant events.

    Tuple indices are hard-coded to this query's event-tuple layout:
    ``.1`` timestamp, ``.3`` breakdown property value, ``.4`` matched-steps array.

    Returns:
        A SQL expression string interpolated into the funnel aggregation query.
    """
    return """
    arrayFilter(
        (x, x_before, x_after) -> not (length(x.4) <= 1 and x.4 == x_before.4 and x.4 == x_after.4 and x.3 == x_before.3 and x.3 == x_after.3 and x.1 > x_before.1 and x.1 < x_after.1),
        events_array,
        arrayRotateRight(events_array, 1),
        arrayRotateLeft(events_array, 1))
    """


class FunnelUDF(FunnelBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -92,14 +77,19 @@ def _inner_aggregation_query(self):
inner_select = parse_select(
f"""
SELECT
arraySort(t -> t.1, groupArray(tuple(toFloat(timestamp), uuid, {prop_selector}, arrayFilter((x) -> x != 0, [{steps}{exclusions}])))) as events_array,
arraySort(t -> t.1, groupArray(tuple(
toFloat(timestamp),
uuid,
{prop_selector},
arrayFilter((x) -> x != 0, [{steps}{exclusions}])
))) as events_array,
arrayJoin({fn}(
{self.context.max_steps},
{self.conversion_window_limit()},
'{breakdown_attribution_string}',
'{self.context.funnelsFilter.funnelOrderType}',
{prop_vals},
{udf_event_array_filter()}
{self.udf_event_array_filter(1, 3, 4)}
)) as af_tuple,
af_tuple.1 as step_reached,
af_tuple.1 + 1 as steps, -- Backward compatibility
Expand Down
Loading