[Data] Fixing aggregation protocol to be appropriately associative #50757

Merged 39 commits on Feb 21, 2025

Commits
14fb287 Make sure tests are run for both PA and Pandas (alexeykudinkin, Feb 20, 2025)
d8ae5fa Updated CI names to include PA version (alexeykudinkin, Feb 20, 2025)
a0722c8 Fixed Pandas block avoiding noisy warning (alexeykudinkin, Feb 20, 2025)
e8df5ad Tidying up (alexeykudinkin, Feb 20, 2025)
d4446ef Revisited null-safe aggregation protocol rebasing it onto `_Optional` (alexeykudinkin, Feb 20, 2025)
62013c3 Added formal test for aggregation protocol verifying all aggregation … (alexeykudinkin, Feb 20, 2025)
bdb90b2 Fixed NaN handling in Std/Mean (alexeykudinkin, Feb 20, 2025)
20db5a2 Fixed Std handling of ddof (alexeykudinkin, Feb 20, 2025)
cbae683 Fixed `Quantile` finalization seq (alexeykudinkin, Feb 20, 2025)
e70377c Updated ATA tests (alexeykudinkin, Feb 20, 2025)
16ee17a `lint` (alexeykudinkin, Feb 20, 2025)
7d7673d Added AbsMax & Unique to aggregation protocol test (alexeykudinkin, Feb 20, 2025)
715928b Fixed AbsMax aggregation seq (alexeykudinkin, Feb 20, 2025)
effa4cb Fixing test (alexeykudinkin, Feb 20, 2025)
f9165df Updated `Block.count` to add `ignore_nulls` param (alexeykudinkin, Feb 20, 2025)
a99abb8 Fixed Count aggregation to properly respect `ignore_nulls` (alexeykudinkin, Feb 20, 2025)
b304174 Added Count aggregation to protocol test (alexeykudinkin, Feb 20, 2025)
db1a7ea `lint` (alexeykudinkin, Feb 20, 2025)
c0fa3aa Refactored test to avoid unnecessary parameterization (alexeykudinkin, Feb 20, 2025)
0ea546c Fixed test to avoid sharing zero values (alexeykudinkin, Feb 20, 2025)
08ea9ea Aligned test with actual aggregation protocol (alexeykudinkin, Feb 20, 2025)
874ec56 Fixed global count case (alexeykudinkin, Feb 20, 2025)
3d567a9 Fixed `PandasBlockAccessor.count` to respect `ignore_nulls` (alexeykudinkin, Feb 20, 2025)
8b5e634 Properly propagate `ignore_nulls` (alexeykudinkin, Feb 20, 2025)
746b480 `lint` (alexeykudinkin, Feb 20, 2025)
6dba288 Added `zero_factory` param to produce "zero" for aggregation monoids (alexeykudinkin, Feb 20, 2025)
c1ea200 Simplified combination protocol (to avoid the need for `_Optional` co… (alexeykudinkin, Feb 20, 2025)
45db8e8 Added py-doc (alexeykudinkin, Feb 20, 2025)
986b751 Fixed Mean finalization seq (alexeykudinkin, Feb 20, 2025)
e2cddf6 Fixed Std finalization seq to properly handle zero (alexeykudinkin, Feb 20, 2025)
c0ff8df Fixed tests incorrect ref (alexeykudinkin, Feb 20, 2025)
66f10ee Fixed combination seq to avoid coercing nans into nones (alexeykudinkin, Feb 20, 2025)
48ca317 Added py-doc for combining (alexeykudinkin, Feb 20, 2025)
a405547 Tidying up (alexeykudinkin, Feb 20, 2025)
7530205 Tidying up more (alexeykudinkin, Feb 20, 2025)
7d1b555 `lint` (alexeykudinkin, Feb 20, 2025)
0b27650 Missing epilogue (alexeykudinkin, Feb 21, 2025)
fda3d6d Lifted ignore_nulls parameterization into creation path instead of ex… (alexeykudinkin, Feb 21, 2025)
1769d89 `lint` (alexeykudinkin, Feb 21, 2025)
8 changes: 4 additions & 4 deletions .buildkite/data.rayci.yml
@@ -54,7 +54,7 @@ steps:
       --only-tags data_non_parallel
     depends_on: data9build

-  - label: ":database: data: arrow latest tests"
+  - label: ":database: data: arrow v18 tests"
     tags:
       - python
       - data
@@ -68,7 +68,7 @@
       --except-tags data_integration,doctest,data_non_parallel
     depends_on: datalbuild

-  - label: ":database: data: arrow latest tests (data_non_parallel)"
+  - label: ":database: data: arrow v18 tests (data_non_parallel)"
     tags:
       - python
       - data
@@ -80,7 +80,7 @@
       --only-tags data_non_parallel
     depends_on: datalbuild

-  - label: ":database: data: arrow latest {{matrix.python}} tests ({{matrix.worker_id}})"
+  - label: ":database: data: arrow v18 {{matrix.python}} tests ({{matrix.worker_id}})"
     key: datal_python_tests
     if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
     tags:
@@ -98,7 +98,7 @@
         python: ["3.12"]
         worker_id: ["0", "1"]

-  - label: ":database: data: arrow latest {{matrix.python}} tests (data_non_parallel)"
+  - label: ":database: data: arrow v18 {{matrix.python}} tests (data_non_parallel)"
     key: datal_python_non_parallel_tests
     if: build.pull_request.labels includes "continuous-build" || pipeline.id == "0189e759-8c96-4302-b6b5-b4274406bf89" || pipeline.id == "018f4f1e-1b73-4906-9802-92422e3badaa"
     tags:
8 changes: 8 additions & 0 deletions python/ray/data/BUILD
@@ -153,6 +153,14 @@ py_test(
     deps = ["//:ray_lib", ":conftest"],
 )

+py_test(
+    name = "test_aggregate",
+    size = "small",
+    srcs = ["tests/test_aggregate.py"],
+    tags = ["team:data", "exclusive"],
+    deps = ["//:ray_lib", ":conftest"],
+)
+
 py_test(
     name = "test_avro",
     size = "small",
6 changes: 4 additions & 2 deletions python/ray/data/_internal/arrow_block.py
@@ -340,7 +340,7 @@ def _sample(self, n_samples: int, sort_key: "SortKey") -> "pyarrow.Table":
         table = self._table.select(sort_key.get_columns())
         return transform_pyarrow.take_table(table, indices)

-    def count(self, on: str) -> Optional[U]:
+    def count(self, on: str, ignore_nulls: bool = False) -> Optional[U]:
         """Count the number of non-null values in the provided column."""
         import pyarrow.compute as pac

@@ -353,8 +353,10 @@ def count(self, on: str) -> Optional[U]:
         if self.num_rows() == 0:
             return None

+        mode = "only_valid" if ignore_nulls else "all"
+
         col = self._table[on]
-        return pac.count(col).as_py()
+        return pac.count(col, mode=mode).as_py()

     def _apply_arrow_compute(
         self, compute_fn: Callable, on: str, ignore_nulls: bool
18 changes: 10 additions & 8 deletions python/ray/data/_internal/pandas_block.py
@@ -64,8 +64,6 @@ class PandasRow(TableRow):
     def __getitem__(self, key: Union[str, List[str]]) -> Any:
         from ray.data.extensions import TensorArrayElement

-        pd = lazy_import_pandas()
-
         def get_item(keys: List[str]) -> Any:
             col = self._row[keys]
             if len(col) == 0:
@@ -75,14 +73,16 @@ def get_item(keys: List[str]) -> Any:
             if isinstance(items.iloc[0], TensorArrayElement):
                 # Getting an item in a Pandas tensor column may return
                 # a TensorArrayElement, which we have to convert to an ndarray.
-                return pd.Series(item.to_numpy() for item in items)
+                return tuple(item.to_numpy() for item in items)

             try:
                 # Try to interpret this as a numpy-type value.
                 # See https://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types. # noqa: E501
-                return pd.Series(item.as_py() for item in items)
+                return tuple(item for item in items)

-            except (AttributeError, ValueError):
+            except (AttributeError, ValueError) as e:
+                logger.warning(f"Failed to convert {items} to a tuple", exc_info=e)
+
                 # Fallback to the original form.
                 return items

@@ -94,7 +94,7 @@ def get_item(keys: List[str]) -> Any:
         if items is None:
             return None
         elif is_single_item:
-            return items.iloc[0]
+            return items[0]
         else:
             return items

@@ -447,8 +447,10 @@ def _apply_agg(

         return val

-    def count(self, on: str) -> Optional[U]:
-        return self._apply_agg(lambda col: col.count(), on)
+    def count(self, on: str, ignore_nulls: bool = False) -> Optional[U]:
+        return self._apply_agg(
+            lambda col: col.count() if ignore_nulls else len(col), on
+        )

     def sum(self, on: str, ignore_nulls: bool) -> Optional[U]:
         if on is not None and not isinstance(on, str):
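The Pandas-side `count` mirrors the same semantics: `Series.count()` already excludes nulls, while `len(col)` counts every row including nulls. For example:

```python
import pandas as pd

col = pd.Series([1.0, None, 3.0])

# ignore_nulls=True: Series.count() skips NaN/None values.
print(int(col.count()))  # 2

# ignore_nulls=False: len() counts all rows, nulls included.
print(len(col))  # 3
```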
2 changes: 1 addition & 1 deletion python/ray/data/_internal/table_block.py
@@ -327,7 +327,7 @@ def iter_groups() -> Iterator[Tuple[Sequence[KeyType], Block]]:
         if len(group_keys) == 1:
             init_vals = group_keys[0]

-        accumulators = [(agg.init(init_vals) if agg.init else None) for agg in aggs]
+        accumulators = [agg.init(init_vals) for agg in aggs]
         for i in range(len(aggs)):
             accessor = BlockAccessor.for_block(group_view)
             # Skip empty blocks
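The table_block change above drops the `if agg.init else None` guard: per the `zero_factory` commit, every aggregation can now produce a "zero" accumulator, so `init` is always well-defined. A hypothetical sketch of the idea (the `AggregateFn` shape and parameter names here are illustrative assumptions, not the library's exact signature):

```python
class AggregateFn:
    """Illustrative aggregation whose init() always yields the monoid identity."""

    def __init__(self, zero_factory, accumulate, combine):
        # zero_factory produces a fresh identity element per group,
        # so accumulators are never shared or None.
        self._zero_factory = zero_factory
        self.accumulate = accumulate
        self.combine = combine

    def init(self, _key):
        return self._zero_factory()


sum_agg = AggregateFn(
    zero_factory=lambda: 0,
    accumulate=lambda acc, v: acc + v,
    combine=lambda a, b: a + b,
)

acc = sum_agg.init(None)
for v in [1, 2, 3]:
    acc = sum_agg.accumulate(acc, v)

# Combining with a fresh zero accumulator leaves the result unchanged,
# which is exactly the identity law the zero must satisfy.
assert sum_agg.combine(acc, sum_agg.init(None)) == acc
print(acc)  # 6
```

Using a factory rather than a shared zero value also avoids the bug fixed in commit 0ea546c, where mutable zero accumulators were shared across groups.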