rapidsai · rapids-bot · Mar 23, 2023 · Dec 1, 2022 · Dec 1, 2022 · Dec 1, 2022
@@ -1,19 +1,35 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
 from abc import ABC, abstractmethod
 
+import cupy as cp
+
 import cudf
 
 from cuspatial.core._column.geocolumn import GeoColumn
 from cuspatial.core.binpreds.contains import contains_properly
 from cuspatial.utils.column_utils import (
     contains_only_linestrings,
     contains_only_multipoints,
+    contains_only_points,
     contains_only_polygons,
     has_same_geometry,
 )
 
 
+class PreprocessorOutput:
+    """Container pairing a coordinate buffer with its point indices.
+
+    Exposes `xy` as a property and `point_indices` as a method, which are
+    the two accessors `EqualsBinpred._op` reads from its operands.
+    """
+
+    def __init__(self, coords, indices) -> None:
+        # `coords` is stored as `vertices` and surfaced through `xy`.
+        self.vertices = coords
+        # `indices` is surfaced through `point_indices()`.
+        self.indices = indices
+
+    @property
+    def xy(self):
+        """Return the coordinates passed to the constructor."""
+        return self.vertices
+
+    def point_indices(self):
+        """Return the indices passed to the constructor."""
+        return self.indices
+
+
 class BinaryPredicate(ABC):
     @abstractmethod
     def preprocess(self, lhs, rhs):
@@ -43,15 +59,19 @@ def preprocess(self, lhs, rhs):
             The right-hand-side of the internal binary predicate, may be
             reordered.
         """
-        pass
+        return (lhs, rhs, cudf.RangeIndex(len(rhs)))
 
     @abstractmethod
-    def postprocess(self, point_indices, point_result):
+    def postprocess(
+        self, point_indices: cudf.Series, point_result: cudf.Series
+    ) -> cudf.Series:
         """Postprocess the output data for the binary predicate. This method
         should be implemented by subclasses.
 
         Postprocess converts the raw results of the binary predicate into
-        the final result. This is where the discrete math rules are applied.
+        the final result. At this step the results for none, any, and all
+        are applied to the result of the equals, intersects, and
+        point-in-polygon predicates.
 
         Parameters
         ----------
@@ -114,6 +134,15 @@ def __init__(self, lhs, rhs, align=True):
         (self.lhs, self.rhs) = lhs.align(rhs) if align else (lhs, rhs)
         self.align = align
 
+    def _cancel_op(self, lhs, rhs, result):
+        """Used to disable computation of the binary predicate.
+
+        This occurs when the binary predicate is not supported for the
+        input types, and a final result can be computed only using
+        `preprocess` and `postprocess`.
+
+        Parameters
+        ----------
+        lhs, rhs
+            Passed through unchanged in the returned tuple.
+        result
+            The value that the replaced `_op` will return, and the third
+            element of the returned tuple.
+
+        Returns
+        -------
+        tuple
+            `(lhs, rhs, result)`.
+        """
+        # Shadow `_op` on this instance with a stub that ignores both of its
+        # arguments and always returns the precomputed `result`.
+        self._op = lambda x, y: result
+        return (lhs, rhs, result)
+
     def __call__(self) -> cudf.Series:
         """Return the result of the binary predicate."""
         # Type disambiguation
@@ -185,18 +214,21 @@ def postprocess(self, point_indices, point_result):
         correct type for the predicate."""
         result = cudf.DataFrame({"idx": point_indices, "pip": point_result})
         df_result = result
-        # Discrete math recombination
         if (
             contains_only_linestrings(self.rhs)
             or contains_only_polygons(self.rhs)
             or contains_only_multipoints(self.rhs)
         ):
-            # process for completed linestrings, polygons, and multipoints.
-            # Not necessary for points.
+            # Compute the set of results for each point-in-polygon predicate.
+            # Group them by the original index, and sum the results. If the
+            # sum of points in the rhs feature is equal to the number of
+            # points found in the polygon, then the polygon contains the
+            # feature.
             df_result = (
                 result.groupby("idx").sum().sort_index()
                 == result.groupby("idx").count().sort_index()
 # RHS conditioning: 
 point_indices = None 
 # point in polygon 
 if contains_only_linestrings(rhs): 
     # condition for linestrings 
     geom = rhs.lines 
 elif contains_only_polygons(rhs) is True: 
     # polygon in polygon 
     geom = rhs.polygons 
 elif contains_only_multipoints(rhs) is True: 
     # mpoint in polygon 
     geom = rhs.multipoints 
 else: 
     # no conditioning is required 
     geom = rhs.points 
 xy_points = geom.xy 
 # Arrange into shape for calling point-in-polygon, intersection, or 
 # equals 
 point_indices = geom.point_indices() 
 from cuspatial.core.geoseries import GeoSeries 
 final_rhs = GeoSeries( 
     GeoColumn._from_points_xy(xy_points._column) 
 ).points 
 return (lhs, final_rhs, point_indices) 
 # RHS conditioning: 
 point_indices = None 
 # point in polygon 
 if contains_only_linestrings(rhs): 
     # condition for linestrings 
     geom = rhs.lines 
 elif contains_only_polygons(rhs) is True: 
     # polygon in polygon 
     geom = rhs.polygons 
 elif contains_only_multipoints(rhs) is True: 
     # mpoint in polygon 
     geom = rhs.multipoints 
 else: 
     # no conditioning is required 
     geom = rhs.points 
 xy_points = geom.xy 
  
 # Arrange into shape for calling point-in-polygon, intersection, or 
 # equals 
 point_indices = geom.point_indices() 
 from cuspatial.core.geoseries import GeoSeries 
  
 final_rhs = GeoSeries( 
     GeoColumn._from_points_xy(xy_points._column) 
 ).points 
 return (lhs, final_rhs, point_indices) 
             )
+        # Wrap the "pip" column in a cudf.Series with a fresh RangeIndex.
         point_result = cudf.Series(
             df_result["pip"], index=cudf.RangeIndex(0, len(df_result))
         )
@@ -234,3 +266,229 @@ def preprocess(self, lhs, rhs):
         if contains_only_polygons(rhs):
             (lhs, rhs) = (rhs, lhs)
         return super().preprocess(lhs, rhs)
+
+    def postprocess(self, point_indices, point_result):
+        """Postprocess the output GeoSeries to ensure that they are of the
+        correct type for the predicate.
+
+        Parameters
+        ----------
+        point_indices
+            Stored in the "idx" column; used as the grouping key.
+        point_result
+            Stored in the "pip" column alongside `point_indices`.
+
+        Returns
+        -------
+        cudf.Series
+            The "pip" column of the (possibly grouped) result, with its
+            name cleared.
+        """
+        result = cudf.DataFrame({"idx": point_indices, "pip": point_result})
+        df_result = result
+        # Discrete math recombination
+        if (
+            contains_only_linestrings(self.rhs)
+            or contains_only_polygons(self.rhs)
+            or contains_only_multipoints(self.rhs)
+        ):
+            # process for completed linestrings, polygons, and multipoints.
+            # Not necessary for points.
+            # For each "idx" group, compare the sum of "pip" with the number
+            # of rows in the group: the two are equal only when every row of
+            # the group contributed 1 (presumably boolean "pip" values --
+            # TODO confirm dtype against the caller).
+            df_result = (
+                result.groupby("idx").sum().sort_index()
+                == result.groupby("idx").count().sort_index()
+            )
+        # Build the returned Series from the "pip" column, passing a
+        # RangeIndex that spans the rows of `df_result`.
+        point_result = cudf.Series(
+            df_result["pip"], index=cudf.RangeIndex(0, len(df_result))
+        )
+        point_result.name = None
+        return point_result
+
+
+class EqualsBinpred(BinaryPredicate):
+    """Binary predicate that compares geometries for pairwise equality.
+
+    `preprocess` narrows the inputs to pairs whose dtypes and offset
+    lengths match, `_op` compares the interleaved xy coordinates of the
+    remaining pairs, and `postprocess` writes those results back into the
+    boolean mask produced by `preprocess`.
+    """
+
+    def _offset_equals(self, lhs, rhs):
+        """Compute the pairwise length equality of two offset arrays"""
+        # The differences are taken as `a[:-1] - a[1:]`, i.e. the negated
+        # lengths; only their equality is used, so the sign does not matter.
+        # NOTE(review): assumes the slices subtract positionally; if the
+        # offsets are index-aligned Series this would align on labels
+        # instead -- confirm the offset type.
+        lhs_lengths = lhs[:-1] - lhs[1:]
+        rhs_lengths = rhs[:-1] - rhs[1:]
+        return lhs_lengths == rhs_lengths
+
+    def _sort_multipoints(self, lhs, rhs, initial):
+        """Sort xy according to bins defined by offset
+
+        Returns a tuple of two `PreprocessorOutput` objects holding the
+        sorted lhs and rhs coordinates, followed by `initial` unchanged.
+        """
+        # Repeat every point index twice so there is one label per value of
+        # the xy buffer (presumably interleaved x, y -- see
+        # `_vertices_equals`).
+        sort_indices = cp.repeat(lhs.point_indices(), 2)
+        lhs_xy, rhs_xy = lhs.xy, rhs.xy
+        # Both sides are labelled with the indices derived from `lhs`.
+        # NOTE(review): presumably valid because `preprocess` only passes
+        # pairs whose offset lengths match -- confirm.
+        lhs_xy.index = sort_indices
+        rhs_xy.index = sort_indices
+        # Move the labels into an "index" column next to the "xy" values.
+        lhs_df = lhs_xy.reset_index(drop=False, name="xy")
+        rhs_df = rhs_xy.reset_index(drop=False, name="xy")
+        # Sort by geometry label first, then by coordinate value.
+        # NOTE(review): this orders individual x and y values together
+        # within each geometry rather than ordering (x, y) pairs -- confirm
+        # this is the intended comparison.
+        lhs_sorted = lhs_df.sort_values(by=["index", "xy"]).reset_index(
+            drop=True
+        )
+        rhs_sorted = rhs_df.sort_values(by=["index", "xy"]).reset_index(
+            drop=True
+        )
+        return (
+            PreprocessorOutput(lhs_sorted["xy"], lhs.point_indices()),
+            PreprocessorOutput(rhs_sorted["xy"], rhs.point_indices()),
+            initial,
+        )
+
+    def _sort_linestrings(self, lhs, rhs, initial):
+        """Swap first and last values of each linestring to ensure that
+        the first point is the lowest value. This is necessary to ensure
+        that the endpoints are not included in the comparison."""
+        # Save temporary values since lhs.x cannot be modified directly.
+        lhs_x = lhs.x
+        lhs_y = lhs.y
+        rhs_x = rhs.x
+        rhs_y = rhs.y
+        # Group point positions by the lhs point indices so that
+        # `indices.first()` / `indices.last()` give the position of the
+        # first / last point of each group. The same grouping is reused for
+        # rhs below.
+        point_range = cudf.Series(cp.arange(len(lhs_x)))
+        indices = point_range.groupby(lhs.point_indices())
+        # Create masks for the first and last values of each linestring
+        # (True where the last value is smaller than the first).
+        # NOTE(review): x and y get independent masks, so the x and y of an
+        # endpoint may be swapped independently -- confirm intended.
+        swap_lx = lhs_x[indices.last()].reset_index(drop=True) < lhs_x[
+            indices.first()
+        ].reset_index(drop=True)
+        swap_ly = lhs_y[indices.last()].reset_index(drop=True) < lhs_y[
+            indices.first()
+        ].reset_index(drop=True)
+        swap_rx = rhs_x[indices.last()].reset_index(drop=True) < rhs_x[
+            indices.first()
+        ].reset_index(drop=True)
+        swap_ry = rhs_y[indices.last()].reset_index(drop=True) < rhs_y[
+            indices.first()
+        ].reset_index(drop=True)
+        # Swap the first and last values of each linestring
+        # (tuple assignment: the right-hand side is read before either
+        # target is written).
+        (
+            lhs_x.iloc[indices.last()[swap_lx]],
+            lhs_x.iloc[indices.first()[swap_lx]],
+        ) = (
+            lhs_x.iloc[indices.first()[swap_lx]],
+            lhs_x.iloc[indices.last()[swap_lx]],
+        )
+        (
+            lhs_y.iloc[indices.last()[swap_ly]],
+            lhs_y.iloc[indices.first()[swap_ly]],
+        ) = (
+            lhs_y.iloc[indices.first()[swap_ly]],
+            lhs_y.iloc[indices.last()[swap_ly]],
+        )
+        (
+            rhs_x.iloc[indices.last()[swap_rx]],
+            rhs_x.iloc[indices.first()[swap_rx]],
+        ) = (
+            rhs_x.iloc[indices.first()[swap_rx]],
+            rhs_x.iloc[indices.last()[swap_rx]],
+        )
+        (
+            rhs_y.iloc[indices.last()[swap_ry]],
+            rhs_y.iloc[indices.first()[swap_ry]],
+        ) = (
+            rhs_y.iloc[indices.first()[swap_ry]],
+            rhs_y.iloc[indices.last()[swap_ry]],
+        )
+        # Reconstruct the xy columns
+        # (even positions receive x, odd positions receive y).
+        lhs_xy = lhs.xy
+        rhs_xy = rhs.xy
+        lhs_xy.iloc[::2] = lhs_x
+        lhs_xy.iloc[1::2] = lhs_y
+        rhs_xy.iloc[::2] = rhs_x
+        rhs_xy.iloc[1::2] = rhs_y
+
+        return (
+            PreprocessorOutput(lhs_xy, lhs.point_indices()),
+            PreprocessorOutput(rhs_xy, rhs.point_indices()),
+            initial,
+        )
+
+    def preprocess(self, lhs, rhs):
+        """Select the pairs that need a coordinate comparison.
+
+        Returns a 3-tuple whose first two elements are the operands handed
+        to `_op` and whose third element is the mask later passed to
+        `postprocess`. When no comparison is needed the tuple comes from
+        `_cancel_op`, which also replaces `_op` with a stub.
+
+        NOTE(review): if `lhs` matches none of the `contains_only_*`
+        checks below, the method falls off the end and returns None --
+        confirm whether mixed inputs can reach this point.
+        """
+        # Compare types
+        type_compare = lhs.dtype == rhs.dtype
+        # Any unmatched type is not equal
+        # NOTE(review): the condition below only triggers when *every*
+        # comparison is False, not when any single one is -- confirm.
+        if (type_compare == False).all():  # noqa: E712
+            # Override _op so that it will not be run.
+            return self._cancel_op(lhs, rhs, type_compare)
+        # Get indices of matching types
+        if contains_only_multipoints(lhs):
+            lengths_equal = self._offset_equals(
+                lhs.multipoints.geometry_offset,
+                rhs.multipoints.geometry_offset,
+            )
+            if lengths_equal.any():
+                # Multipoints are equal if they contain the
+                # same unordered points.
+                return self._sort_multipoints(
+                    lhs[lengths_equal].multipoints,
+                    rhs[lengths_equal].multipoints,
+                    lengths_equal,
+                )
+            else:
+                # No lengths are equal, so none can be equal.
+                return self._cancel_op(lhs, rhs, lengths_equal)
+        elif contains_only_linestrings(lhs):
+            lengths_equal = self._offset_equals(
+                lhs.lines.part_offset, rhs.lines.part_offset
+            )
+            if lengths_equal.any():
+                # Linestrings are equal if their sorted points
+                # are equal. This is unintuitive and perhaps
+                # incorrect, but it is the behavior of shapely.
+                return self._sort_linestrings(
+                    lhs[lengths_equal].lines,
+                    rhs[lengths_equal].lines,
+                    lengths_equal,
+                )
+            else:
+                # No lengths are equal, so none can be equal.
+                return self._cancel_op(lhs, rhs, lengths_equal)
+        elif contains_only_polygons(lhs):
+            # First compare the part offsets, then compare the ring offsets
+            # of only those pairs whose part offsets matched.
+            geoms_equal = self._offset_equals(
+                lhs.polygons.part_offset, rhs.polygons.part_offset
+            )
+            lengths_equal = self._offset_equals(
+                lhs[geoms_equal].polygons.ring_offset,
+                rhs[geoms_equal].polygons.ring_offset,
+            )
+            if lengths_equal.any():
+                # Don't sort polygons
+                return (
+                    lhs[lengths_equal].polygons,
+                    rhs[lengths_equal].polygons,
+                    lengths_equal,
+                )
+            else:
+                # No lengths are equal, so none can be equal.
+                return self._cancel_op(lhs, rhs, lengths_equal)
+        elif contains_only_points(lhs):
+            # Points need no length filtering; the dtype comparison is used
+            # as the mask.
+            return (lhs.points, rhs.points, type_compare)
+
+    def postprocess(self, lengths_equal, point_result):
+        """Merge the `_op` results back into the mask from `preprocess`.
+
+        Returns `lengths_equal` wrapped in a `cudf.Series`, after
+        overwriting the entries labelled by `point_result.index` when
+        `point_result` is a `cudf.Series`.
+        """
+        # if point_result is not a Series, preprocessing terminated
+        # the results early.
+        if isinstance(point_result, cudf.Series):
+            point_result = point_result.sort_index()
+            lengths_equal[point_result.index] = point_result
+        return cudf.Series(lengths_equal)
+
+    def _vertices_equals(self, lhs, rhs):
+        """Compute the equals relationship between interleaved xy
+        coordinate buffers."""
+        # Only the common prefix of the two buffers is compared.
+        length = min(len(lhs), len(rhs))
+        # Even positions (x) and odd positions (y) are compared separately;
+        # a vertex matches only when both comparisons are True.
+        a = lhs[:length:2]._column == rhs[:length:2]._column
+        b = rhs[1:length:2]._column == lhs[1:length:2]._column
+        return a & b
+
+    def _op(self, lhs, rhs):
+        """Compare the vertices of `lhs` and `rhs` and reduce per geometry.
+
+        Reads `xy` and `point_indices()` from the operands, and returns a
+        Series, labelled by point index, that is True where every compared
+        vertex in that group matched.
+        """
+        indices = lhs.point_indices()
+        result = self._vertices_equals(lhs.xy, rhs.xy)
+        # Truncate the indices to the number of compared vertices.
+        result_df = cudf.DataFrame(
+            {"idx": indices[: len(result)], "equals": result}
+        )
+        gb_idx = result_df.groupby("idx")
+        # A group is equal when its count of True values (sum) matches its
+        # row count.
+        result = (gb_idx.sum().sort_index() == gb_idx.count().sort_index())[
+            "equals"
+        ]
+        result.index.name = None
+        result.name = None
+        return result
+
+
+class CrossesBinpred(EqualsBinpred):
+    """Predicate that inherits from `EqualsBinpred` and overrides only
+    `postprocess`."""
+
+    def postprocess(self, point_indices, point_result):
+        """Return the final result Series for the predicate.
+
+        When both sides have the same geometry type and `lhs` contains only
+        points, an all-False Series of length `len(self.lhs)` is returned.
+        Otherwise `point_result` is returned as a Series with a fresh
+        RangeIndex and no name.
+        """
+        if has_same_geometry(self.lhs, self.rhs) and contains_only_points(
+            self.lhs
+        ):
+            return cudf.Series([False] * len(self.lhs))
+        df_result = cudf.DataFrame({"idx": point_indices, "pip": point_result})
+        # Only the "pip" column is used below; "idx" is not read again.
+        point_result = cudf.Series(
+            df_result["pip"], index=cudf.RangeIndex(0, len(df_result))
+        )
+        point_result.name = None
+        return point_result
+
+
+class CoversBinpred(EqualsBinpred):
+    """Predicate that inherits from `EqualsBinpred` and overrides only
+    `postprocess`."""
+
+    def postprocess(self, point_indices, point_result):
+        """Return `point_result` as a `cudf.Series`, passing
+        `point_indices` as its index."""
+        return cudf.Series(point_result, index=point_indices)