@@ -81,11 +81,14 @@ hash_float!(f16, f32, f64);
81
81
pub struct GroupValuesPrimitive < T : ArrowPrimitiveType > {
82
82
/// The data type of the output array
83
83
data_type : DataType ,
84
- /// Stores the group index based on the hash of its value
84
+ /// Stores a `(group_index, hash)` entry per group, located by the hash of the group's value
85
85
///
86
- /// We don't store the hashes as hashing fixed width primitives
87
- /// is fast enough for this not to benefit performance
88
- map : HashTable < usize > ,
86
+ /// We also store the `hash` to reduce the cost of rehashing. This cost
87
+ /// is noticeable in high-cardinality group-by workloads.
88
+ /// For more details, see:
89
+ /// <https://github.com/apache/datafusion/issues/15961>
90
+ ///
91
+ map : HashTable < ( usize , u64 ) > ,
89
92
/// The group index of the null value if any
90
93
null_group : Option < usize > ,
91
94
/// The values for each group index
@@ -127,15 +130,15 @@ where
127
130
let hash = key. hash ( state) ;
128
131
let insert = self . map . entry (
129
132
hash,
130
- |g | unsafe { self . values . get_unchecked ( * g) . is_eq ( key) } ,
131
- |g| unsafe { self . values . get_unchecked ( * g ) . hash ( state ) } ,
133
+ |& ( g , _ ) | unsafe { self . values . get_unchecked ( g) . is_eq ( key) } ,
134
+ |& ( _ , h ) | h ,
132
135
) ;
133
136
134
137
match insert {
135
- hashbrown:: hash_table:: Entry :: Occupied ( o) => * o. get ( ) ,
138
+ hashbrown:: hash_table:: Entry :: Occupied ( o) => o. get ( ) . 0 ,
136
139
hashbrown:: hash_table:: Entry :: Vacant ( v) => {
137
140
let g = self . values . len ( ) ;
138
- v. insert ( g ) ;
141
+ v. insert ( ( g , hash ) ) ;
139
142
self . values . push ( key) ;
140
143
g
141
144
}
@@ -148,7 +151,7 @@ where
148
151
}
149
152
150
153
fn size ( & self ) -> usize {
151
- self . map . capacity ( ) * size_of :: < usize > ( ) + self . values . allocated_size ( )
154
+ self . map . capacity ( ) * size_of :: < ( usize , u64 ) > ( ) + self . values . allocated_size ( )
152
155
}
153
156
154
157
fn is_empty ( & self ) -> bool {
@@ -181,12 +184,13 @@ where
181
184
build_primitive ( std:: mem:: take ( & mut self . values ) , self . null_group . take ( ) )
182
185
}
183
186
EmitTo :: First ( n) => {
184
- self . map . retain ( |group_idx | {
187
+ self . map . retain ( |entry | {
185
188
// Decrement group index by n
189
+ let group_idx = entry. 0 ;
186
190
match group_idx. checked_sub ( n) {
187
191
// Group index was >= n, shift value down
188
192
Some ( sub) => {
189
- * group_idx = sub;
193
+ entry . 0 = sub;
190
194
true
191
195
}
192
196
// Group index was < n, so remove from table
0 commit comments