apache · Rachelint · May 21, 2025 · May 21, 2025 · May 21, 2025 · May 21, 2025
diff --git a/datafusion/physical-plan/src/aggregates/group_values/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/mod.rs
@@ -40,8 +40,9 @@ pub(crate) use single_group_by::primitive::HashValue;
 
 use crate::aggregates::{
     group_values::single_group_by::{
-        bytes::GroupValuesByes, bytes_view::GroupValuesBytesView,
-        primitive::GroupValuesPrimitive,
+        bytes::GroupValuesByes,
+        bytes_view::GroupValuesBytesView,
+        primitive::{GroupValuesLargePrimitive, GroupValuesPrimitive},
     },
     order::GroupOrdering,
 };
@@ -134,6 +135,12 @@ pub(crate) fn new_group_values(
     if schema.fields.len() == 1 {
         let d = schema.fields[0].data_type();
 
+        macro_rules! large_downcast_helper {
+            ($t:ty, $d:ident) => {
+                return Ok(Box::new(GroupValuesLargePrimitive::<$t>::new($d.clone())))
+            };
+        }
+
         macro_rules! downcast_helper {
             ($t:ty, $d:ident) => {
                 return Ok(Box::new(GroupValuesPrimitive::<$t>::new($d.clone())))
@@ -169,7 +176,7 @@ pub(crate) fn new_group_values(
                 TimeUnit::Nanosecond => downcast_helper!(TimestampNanosecondType, d),
             },
             DataType::Decimal128(_, _) => {
-                downcast_helper!(Decimal128Type, d);
+                large_downcast_helper!(Decimal128Type, d);
             }
             DataType::Utf8 => {
                 return Ok(Box::new(GroupValuesByes::<i32>::new(OutputType::Utf8)));

diff --git a/...on/physical-plan/src/aggregates/group_values/single_group_by/primitive/large_primitive.rs b/...on/physical-plan/src/aggregates/group_values/single_group_by/primitive/large_primitive.rs
@@ -0,0 +1,139 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::aggregates::group_values::single_group_by::primitive::{
+    emit_internal, HashValue,
+};
+use crate::aggregates::group_values::GroupValues;
+use ahash::RandomState;
+use arrow::array::{
+    cast::AsArray, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, PrimitiveArray,
+};
+use arrow::datatypes::DataType;
+use arrow::record_batch::RecordBatch;
+use datafusion_common::Result;
+use datafusion_execution::memory_pool::proxy::VecAllocExt;
+use datafusion_expr::EmitTo;
+use hashbrown::hash_table::HashTable;
+use std::mem::size_of;
+
+/// A [`GroupValues`] storing a single column of large primitive values (bits > 64)
+///
+/// This specialization is significantly faster than using the more general
+/// purpose `Row`s format
-/// This specialization is significantly faster than using the more general
-/// purpose `Row`s format
+/// This specialization is faster than [`GroupValuesPrimitive`] because it does not store values directly
+/// in the hash table, but instead stores an index into self.vales
-/// This specialization is significantly faster than using the more general
-/// purpose `Row`s format
+/// This specialization is faster than [`GroupValuesPrimitive`] because it does not store values directly
+/// in the hash table, but instead stores an index into self.vales
+pub struct GroupValuesLargePrimitive<T: ArrowPrimitiveType> {
+    /// The data type of the output array
+    data_type: DataType,
+    /// Stores the `(group_index, hash)` based on the hash of its value
+    ///
+    /// We also store `hash` is for reducing cost of rehashing. Such cost
+    /// is obvious in high cardinality group by situation.
+    /// More details can see:
+    /// <https://github.com/apache/datafusion/issues/15961>
+    ///
+    map: HashTable<(usize, u64)>,
+    /// The group index of the null value if any
+    null_group: Option<usize>,
+    /// The values for each group index
+    values: Vec<T::Native>,
+    /// The random state used to generate hashes
+    random_state: RandomState,
+}
+
+impl<T: ArrowPrimitiveType> GroupValuesLargePrimitive<T> {
+    pub fn new(data_type: DataType) -> Self {
+        assert!(PrimitiveArray::<T>::is_compatible(&data_type));
+        Self {
+            data_type,
+            map: HashTable::with_capacity(128),
+            values: Vec::with_capacity(128),
+            null_group: None,
+            random_state: Default::default(),
+        }
+    }
+}
+
+impl<T: ArrowPrimitiveType> GroupValues for GroupValuesLargePrimitive<T>
+where
+    T::Native: HashValue,
+{
+    fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> {
+        assert_eq!(cols.len(), 1);
+        groups.clear();
+
+        for v in cols[0].as_primitive::<T>() {
+            let group_id = match v {
+                None => *self.null_group.get_or_insert_with(|| {
+                    let group_id = self.values.len();
+                    self.values.push(Default::default());
+                    group_id
+                }),
+                Some(key) => {
+                    let state = &self.random_state;
+                    let hash = key.hash(state);
+                    let insert = self.map.entry(
+                        hash,
+                        |&(g, _)| unsafe { self.values.get_unchecked(g).is_eq(key) },
+                        |&(_, h)| h,
+                    );
+
+                    match insert {
+                        hashbrown::hash_table::Entry::Occupied(o) => o.get().0,
+                        hashbrown::hash_table::Entry::Vacant(v) => {
+                            let g = self.values.len();
+                            v.insert((g, hash));
+                            self.values.push(key);
+                            g
+                        }
+                    }
+                }
+            };
+            groups.push(group_id)
+        }
+        Ok(())
+    }
+
+    fn size(&self) -> usize {
+        self.map.capacity() * size_of::<(usize, u64)>() + self.values.allocated_size()
+    }
+
+    fn is_empty(&self) -> bool {
+        self.values.is_empty()
+    }
+
+    fn len(&self) -> usize {
+        self.values.len()
+    }
+
+    fn emit(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
+        emit_internal::<T, u64>(
+            emit_to,
+            &mut self.values,
+            &mut self.null_group,
+            &mut self.map,
+            self.data_type.clone(),
+        )
+    }
+
+    fn clear_shrink(&mut self, batch: &RecordBatch) {
+        let count = batch.num_rows();
+        self.values.clear();
+        self.values.shrink_to(count);
+        self.map.clear();
+        self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared
+    }
+}