tweak _patch_dask_expr

rapidsai · rapids-bot · Nov 4, 2024 · Oct 29, 2024 · Oct 29, 2024 · Oct 29, 2024
commit db4556fd547ebc132031dfc3ef723330184b794d
diff --git a/python/dask_cudf/dask_cudf/_expr.py b/python/dask_cudf/dask_cudf/_expr.py
@@ -62,142 +62,149 @@ def _simplify_down(self):
 ##
 
 
-def _patch_dask_expr():
-    # This can be removed after cudf#15176 is addressed.
-    # See: https://github.com/rapidsai/cudf/issues/15176
-    class PatchCumulativeBlockwise(CumulativeBlockwise):
-        @property
-        def _args(self) -> list:
-            return self.operands[:1]
-
-        @property
-        def _kwargs(self) -> dict:
-            # Must pass axis and skipna as kwargs in cudf
-            return {"axis": self.axis, "skipna": self.skipna}
-
-    CumulativeBlockwise._args = PatchCumulativeBlockwise._args
-    CumulativeBlockwise._kwargs = PatchCumulativeBlockwise._kwargs
-
-    # The upstream Var code uses `Series.values`, and relies on numpy
-    # for most of the logic. Unfortunately, cudf -> cupy conversion
-    # is not supported for data containing null values. Therefore,
-    # we must implement our own version of Var for now. This logic
-    # is mostly copied from dask-cudf.
-
-    class VarCudf(Reduction):
-        # Uses the parallel version of Welford's online algorithm (Chan '79)
-        # (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf)
-        _parameters = [
-            "frame",
-            "skipna",
-            "ddof",
-            "numeric_only",
-            "split_every",
-        ]
-        _defaults = {
-            "skipna": True,
-            "ddof": 1,
-            "numeric_only": False,
-            "split_every": False,
-        }
-
-        @functools.cached_property
-        def _meta(self):
-            return make_meta(
-                meta_nonempty(self.frame._meta).var(
-                    skipna=self.skipna, numeric_only=self.numeric_only
-                )
+# This can be removed after cudf#15176 is addressed.
+# See: https://github.com/rapidsai/cudf/issues/15176
+class PatchCumulativeBlockwise(CumulativeBlockwise):
+    @property
+    def _args(self) -> list:
+        return self.operands[:1]
+
+    @property
+    def _kwargs(self) -> dict:
+        # Must pass axis and skipna as kwargs in cudf
+        return {"axis": self.axis, "skipna": self.skipna}
+
+
+# The upstream Var code uses `Series.values`, and relies on numpy
+# for most of the logic. Unfortunately, cudf -> cupy conversion
+# is not supported for data containing null values. Therefore,
+# we must implement our own version of Var for now. This logic
+# is mostly copied from dask-cudf.
+
+
+class VarCudf(Reduction):
+    # Uses the parallel version of Welford's online algorithm (Chan '79)
+    # (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf)
+    _parameters = [
+        "frame",
+        "skipna",
+        "ddof",
+        "numeric_only",
+        "split_every",
+    ]
+    _defaults = {
+        "skipna": True,
+        "ddof": 1,
+        "numeric_only": False,
+        "split_every": False,
+    }
+
+    @functools.cached_property
+    def _meta(self):
+        return make_meta(
+            meta_nonempty(self.frame._meta).var(
+                skipna=self.skipna, numeric_only=self.numeric_only
             )
+        )
 
-        @property
-        def chunk_kwargs(self):
-            return dict(skipna=self.skipna, numeric_only=self.numeric_only)
+    @property
+    def chunk_kwargs(self):
+        return dict(skipna=self.skipna, numeric_only=self.numeric_only)
+
+    @property
+    def combine_kwargs(self):
+        return {}
+
+    @property
+    def aggregate_kwargs(self):
+        return dict(ddof=self.ddof)
+
+    @classmethod
+    def reduction_chunk(cls, x, skipna=True, numeric_only=False):
+        kwargs = {"numeric_only": numeric_only} if is_dataframe_like(x) else {}
+        if skipna or numeric_only:
+            n = x.count(**kwargs)
+            kwargs["skipna"] = skipna
+            avg = x.mean(**kwargs)
+        else:
+            # Not skipping nulls, so might as well
+            # avoid the full `count` operation
+            n = len(x)
+            kwargs["skipna"] = skipna
+            avg = x.sum(**kwargs) / n
+        if numeric_only:
+            # Workaround for cudf bug
+            # (see: https://github.com/rapidsai/cudf/issues/13731)
+            x = x[n.index]
+        m2 = ((x - avg) ** 2).sum(**kwargs)
+        return n, avg, m2
+
+    @classmethod
+    def reduction_combine(cls, parts):
+        n, avg, m2 = parts[0]
+        for i in range(1, len(parts)):
+            n_a, avg_a, m2_a = n, avg, m2
+            n_b, avg_b, m2_b = parts[i]
+            n = n_a + n_b
+            avg = (n_a * avg_a + n_b * avg_b) / n
+            delta = avg_b - avg_a
+            m2 = m2_a + m2_b + delta**2 * n_a * n_b / n
+        return n, avg, m2
+
+    @classmethod
+    def reduction_aggregate(cls, vals, ddof=1):
+        vals = cls.reduction_combine(vals)
+        n, _, m2 = vals
+        return m2 / (n - ddof)
+
+
+def _patched_var(
+    self,
+    axis=0,
+    skipna=True,
+    ddof=1,
+    numeric_only=False,
+    split_every=False,
+):
+    if axis == 0:
+        if hasattr(self._meta, "to_pandas"):
+            return VarCudf(self, skipna, ddof, numeric_only, split_every)
+        else:
+            return Var(self, skipna, ddof, numeric_only, split_every)
+    elif axis == 1:
+        return VarColumns(self, skipna, ddof, numeric_only)
+    else:
+        raise ValueError(f"axis={axis} not supported. Please specify 0 or 1")
 
-        @property
-        def combine_kwargs(self):
-            return {}
 
-        @property
-        def aggregate_kwargs(self):
-            return dict(ddof=self.ddof)
+# Temporary work-around for missing cudf + categorical support
+# See: https://github.com/rapidsai/cudf/issues/11795
+# TODO: Fix RepartitionQuantiles and remove this in cudf>24.06
+
+_original_get_divisions = _shuffle_module._get_divisions
 
-        @classmethod
-        def reduction_chunk(cls, x, skipna=True, numeric_only=False):
-            kwargs = (
-                {"numeric_only": numeric_only} if is_dataframe_like(x) else {}
-            )
-            if skipna or numeric_only:
-                n = x.count(**kwargs)
-                kwargs["skipna"] = skipna
-                avg = x.mean(**kwargs)
-            else:
-                # Not skipping nulls, so might as well
-                # avoid the full `count` operation
-                n = len(x)
-                kwargs["skipna"] = skipna
-                avg = x.sum(**kwargs) / n
-            if numeric_only:
-                # Workaround for cudf bug
-                # (see: https://github.com/rapidsai/cudf/issues/13731)
-                x = x[n.index]
-            m2 = ((x - avg) ** 2).sum(**kwargs)
-            return n, avg, m2
-
-        @classmethod
-        def reduction_combine(cls, parts):
-            n, avg, m2 = parts[0]
-            for i in range(1, len(parts)):
-                n_a, avg_a, m2_a = n, avg, m2
-                n_b, avg_b, m2_b = parts[i]
-                n = n_a + n_b
-                avg = (n_a * avg_a + n_b * avg_b) / n
-                delta = avg_b - avg_a
-                m2 = m2_a + m2_b + delta**2 * n_a * n_b / n
-            return n, avg, m2
-
-        @classmethod
-        def reduction_aggregate(cls, vals, ddof=1):
-            vals = cls.reduction_combine(vals)
-            n, _, m2 = vals
-            return m2 / (n - ddof)
-
-    def _patched_var(
-        self,
-        axis=0,
-        skipna=True,
-        ddof=1,
-        numeric_only=False,
-        split_every=False,
-    ):
-        if axis == 0:
-            if hasattr(self._meta, "to_pandas"):
-                return VarCudf(self, skipna, ddof, numeric_only, split_every)
-            else:
-                return Var(self, skipna, ddof, numeric_only, split_every)
-        elif axis == 1:
-            return VarColumns(self, skipna, ddof, numeric_only)
-        else:
-            raise ValueError(
-                f"axis={axis} not supported. Please specify 0 or 1"
-            )
 
-    Expr.var = _patched_var
+def _patched_get_divisions(frame, other, *args, **kwargs):
+    # NOTE: The following two lines contains the "patch"
+    # (we simply convert the partitioning column to pandas)
+    if is_categorical_dtype(other._meta.dtype) and hasattr(
+        other.frame._meta, "to_pandas"
+    ):
+        other = new_collection(other).to_backend("pandas")._expr
 
-    # Temporary work-around for missing cudf + categorical support
-    # See: https://github.com/rapidsai/cudf/issues/11795
-    # TODO: Fix RepartitionQuantiles and remove this in cudf>24.06
+    # Call "original" function
+    return _original_get_divisions(frame, other, *args, **kwargs)
 
-    _original_get_divisions = _shuffle_module._get_divisions
 
-    def _patched_get_divisions(frame, other, *args, **kwargs):
-        # NOTE: The following two lines contains the "patch"
-        # (we simply convert the partitioning column to pandas)
-        if is_categorical_dtype(other._meta.dtype) and hasattr(
-            other.frame._meta, "to_pandas"
-        ):
-            other = new_collection(other).to_backend("pandas")._expr
+_PATCHED = False
 
-        # Call "original" function
-        return _original_get_divisions(frame, other, *args, **kwargs)
 
-    _shuffle_module._get_divisions = _patched_get_divisions
+def _patch_dask_expr():
+    global _PATCHED
+
+    if not _PATCHED:
+        CumulativeBlockwise._args = PatchCumulativeBlockwise._args
+        CumulativeBlockwise._kwargs = PatchCumulativeBlockwise._kwargs
+        Expr.var = _patched_var
+        _shuffle_module._get_divisions = _patched_get_divisions
+        _PATCHED = True