1717
1818//! Functionality used both on logical and physical plans
1919
20- #[ cfg( not( feature = "force_hash_collisions" ) ) ]
21- use std:: sync:: Arc ;
22-
2320use ahash:: RandomState ;
2421use arrow:: array:: types:: { IntervalDayTime , IntervalMonthDayNano } ;
2522use arrow:: array:: * ;
@@ -215,12 +212,11 @@ fn hash_dictionary<K: ArrowDictionaryKeyType>(
215212 // Hash each dictionary value once, and then use that computed
216213 // hash for each key value to avoid a potentially expensive
217214 // redundant hashing for large dictionary elements (e.g. strings)
218- let dict_values = Arc :: clone ( array. values ( ) ) ;
215+ let dict_values = array. values ( ) ;
219216 let mut dict_hashes = vec ! [ 0 ; dict_values. len( ) ] ;
220- create_hashes ( & [ dict_values] , random_state, & mut dict_hashes) ?;
217+ create_hashes_from_arrays ( & [ dict_values. as_ref ( ) ] , random_state, & mut dict_hashes) ?;
221218
222219 // combine hash for each index in values
223- let dict_values = array. values ( ) ;
224220 for ( hash, key) in hashes_buffer. iter_mut ( ) . zip ( array. keys ( ) . iter ( ) ) {
225221 if let Some ( key) = key {
226222 let idx = key. as_usize ( ) ;
@@ -308,11 +304,11 @@ fn hash_list_array<OffsetSize>(
308304where
309305 OffsetSize : OffsetSizeTrait ,
310306{
311- let values = Arc :: clone ( array. values ( ) ) ;
307+ let values = array. values ( ) ;
312308 let offsets = array. value_offsets ( ) ;
313309 let nulls = array. nulls ( ) ;
314310 let mut values_hashes = vec ! [ 0u64 ; values. len( ) ] ;
315- create_hashes ( & [ values] , random_state, & mut values_hashes) ?;
311+ create_hashes_from_arrays ( & [ values. as_ref ( ) ] , random_state, & mut values_hashes) ?;
316312 if let Some ( nulls) = nulls {
317313 for ( i, ( start, stop) ) in offsets. iter ( ) . zip ( offsets. iter ( ) . skip ( 1 ) ) . enumerate ( ) {
318314 if nulls. is_valid ( i) {
@@ -339,11 +335,11 @@ fn hash_fixed_list_array(
339335 random_state : & RandomState ,
340336 hashes_buffer : & mut [ u64 ] ,
341337) -> Result < ( ) > {
342- let values = Arc :: clone ( array. values ( ) ) ;
338+ let values = array. values ( ) ;
343339 let value_length = array. value_length ( ) as usize ;
344340 let nulls = array. nulls ( ) ;
345341 let mut values_hashes = vec ! [ 0u64 ; values. len( ) ] ;
346- create_hashes ( & [ values] , random_state, & mut values_hashes) ?;
342+ create_hashes_from_arrays ( & [ values. as_ref ( ) ] , random_state, & mut values_hashes) ?;
347343 if let Some ( nulls) = nulls {
348344 for i in 0 ..array. len ( ) {
349345 if nulls. is_valid ( i) {
@@ -366,83 +362,113 @@ fn hash_fixed_list_array(
366362 Ok ( ( ) )
367363}
368364
369- /// Test version of `create_hashes` that produces the same value for
370- /// all hashes (to test collisions)
371- ///
372- /// See comments on `hashes_buffer` for more details
/// Internal helper function that hashes a single array and either initializes or combines
/// the hash values in the buffer.
///
/// The array is dispatched on its data type to the matching hasher:
/// primitive, null, boolean, string, binary and dictionary arrays receive the
/// `rehash` flag, while struct, list, map and fixed-size-list arrays are handed
/// to their dedicated hashers without it.
///
/// # Arguments
///
/// * `array` - the column to hash
/// * `random_state` - hasher state forwarded to every per-type hasher
/// * `hashes_buffer` - one slot per row; written by the per-type hasher
/// * `rehash` - forwarded to the hashers that accept it
///   (NOTE(review): presumably `false` means "initialize" and `true` means
///   "combine with the existing value" -- confirm against the per-type hashers)
///
/// # Errors
///
/// Returns an internal error when the array's data type matches none of the
/// arms below, and propagates any error from the downcast helpers or the
/// nested hashers that are invoked with `?`.
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_single_array(
    array: &dyn Array,
    random_state: &RandomState,
    hashes_buffer: &mut [u64],
    rehash: bool,
) -> Result<()> {
    downcast_primitive_array! {
        array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
        DataType::Null => hash_null(random_state, hashes_buffer, rehash),
        DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash),
        DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash),
        DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
        DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash),
        DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
        DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
        DataType::LargeBinary => hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
        DataType::FixedSizeBinary(_) => {
            // The data type was matched by this arm, so the downcast is expected to succeed.
            let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
            hash_array(array, random_state, hashes_buffer, rehash)
        }
        // Dictionary arrays are dispatched a second time on their key type.
        DataType::Dictionary(_, _) => downcast_dictionary_array! {
            array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
            _ => unreachable!()
        }
        // The nested-type hashers below do not take the `rehash` flag.
        DataType::Struct(_) => {
            let array = as_struct_array(array)?;
            hash_struct_array(array, random_state, hashes_buffer)?;
        }
        DataType::List(_) => {
            let array = as_list_array(array)?;
            hash_list_array(array, random_state, hashes_buffer)?;
        }
        DataType::LargeList(_) => {
            let array = as_large_list_array(array)?;
            hash_list_array(array, random_state, hashes_buffer)?;
        }
        DataType::Map(_, _) => {
            let array = as_map_array(array)?;
            hash_map_array(array, random_state, hashes_buffer)?;
        }
        DataType::FixedSizeList(_, _) => {
            let array = as_fixed_size_list_array(array)?;
            hash_fixed_list_array(array, random_state, hashes_buffer)?;
        }
        _ => {
            // This is internal because we should have caught this before.
            return _internal_err!(
                "Unsupported data type in hasher: {}",
                array.data_type()
            );
        }
    }
    Ok(())
}
422+
/// Test version of `hash_single_array` that forces all hashes to collide to zero.
///
/// Compiled only when the `force_hash_collisions` feature is enabled. The
/// array, the random state and the `rehash` flag are all ignored: every slot
/// of `hashes_buffer` is overwritten with `0`, so all rows collide.
///
/// Always returns `Ok(())`.
#[cfg(feature = "force_hash_collisions")]
fn hash_single_array(
    _array: &dyn Array,
    _random_state: &RandomState,
    hashes_buffer: &mut [u64],
    _rehash: bool,
) -> Result<()> {
    hashes_buffer.fill(0);
    // This helper returns unit; the public entry points hand the buffer back
    // to the caller themselves.
    Ok(())
}
384436
385- /// Creates hash values for every row, based on the values in the
386- /// columns.
437+ /// Creates hash values for every row, based on the values in the columns.
387438///
388439/// The number of rows to hash is determined by `hashes_buffer.len()`.
389440/// `hashes_buffer` should be pre-sized appropriately
390- #[ cfg( not( feature = "force_hash_collisions" ) ) ]
441+ ///
442+ /// This is the same as [`create_hashes`] but accepts `&dyn Array`s instead of requiring
443+ /// `ArrayRef`s.
444+ pub fn create_hashes_from_arrays < ' a > (
445+ arrays : & [ & dyn Array ] ,
446+ random_state : & RandomState ,
447+ hashes_buffer : & ' a mut Vec < u64 > ,
448+ ) -> Result < & ' a mut Vec < u64 > > {
449+ for ( i, & array) in arrays. iter ( ) . enumerate ( ) {
450+ // combine hashes with `combine_hashes` for all columns besides the first
451+ let rehash = i >= 1 ;
452+ hash_single_array ( array, random_state, hashes_buffer, rehash) ?;
453+ }
454+ Ok ( hashes_buffer)
455+ }
456+
457+ /// Creates hash values for every row, based on the values in the columns.
458+ ///
459+ /// The number of rows to hash is determined by `hashes_buffer.len()`.
460+ /// `hashes_buffer` should be pre-sized appropriately.
461+ ///
462+ /// This is the same as [`create_hashes_from_arrays`] but accepts `ArrayRef`s.
391463pub fn create_hashes < ' a > (
392464 arrays : & [ ArrayRef ] ,
393465 random_state : & RandomState ,
394466 hashes_buffer : & ' a mut Vec < u64 > ,
395467) -> Result < & ' a mut Vec < u64 > > {
396- for ( i, col) in arrays. iter ( ) . enumerate ( ) {
397- let array = col. as_ref ( ) ;
468+ for ( i, array) in arrays. iter ( ) . enumerate ( ) {
398469 // combine hashes with `combine_hashes` for all columns besides the first
399470 let rehash = i >= 1 ;
400- downcast_primitive_array ! {
401- array => hash_array_primitive( array, random_state, hashes_buffer, rehash) ,
402- DataType :: Null => hash_null( random_state, hashes_buffer, rehash) ,
403- DataType :: Boolean => hash_array( as_boolean_array( array) ?, random_state, hashes_buffer, rehash) ,
404- DataType :: Utf8 => hash_array( as_string_array( array) ?, random_state, hashes_buffer, rehash) ,
405- DataType :: Utf8View => hash_array( as_string_view_array( array) ?, random_state, hashes_buffer, rehash) ,
406- DataType :: LargeUtf8 => hash_array( as_largestring_array( array) , random_state, hashes_buffer, rehash) ,
407- DataType :: Binary => hash_array( as_generic_binary_array:: <i32 >( array) ?, random_state, hashes_buffer, rehash) ,
408- DataType :: BinaryView => hash_array( as_binary_view_array( array) ?, random_state, hashes_buffer, rehash) ,
409- DataType :: LargeBinary => hash_array( as_generic_binary_array:: <i64 >( array) ?, random_state, hashes_buffer, rehash) ,
410- DataType :: FixedSizeBinary ( _) => {
411- let array: & FixedSizeBinaryArray = array. as_any( ) . downcast_ref( ) . unwrap( ) ;
412- hash_array( array, random_state, hashes_buffer, rehash)
413- }
414- DataType :: Dictionary ( _, _) => downcast_dictionary_array! {
415- array => hash_dictionary( array, random_state, hashes_buffer, rehash) ?,
416- _ => unreachable!( )
417- }
418- DataType :: Struct ( _) => {
419- let array = as_struct_array( array) ?;
420- hash_struct_array( array, random_state, hashes_buffer) ?;
421- }
422- DataType :: List ( _) => {
423- let array = as_list_array( array) ?;
424- hash_list_array( array, random_state, hashes_buffer) ?;
425- }
426- DataType :: LargeList ( _) => {
427- let array = as_large_list_array( array) ?;
428- hash_list_array( array, random_state, hashes_buffer) ?;
429- }
430- DataType :: Map ( _, _) => {
431- let array = as_map_array( array) ?;
432- hash_map_array( array, random_state, hashes_buffer) ?;
433- }
434- DataType :: FixedSizeList ( _, _) => {
435- let array = as_fixed_size_list_array( array) ?;
436- hash_fixed_list_array( array, random_state, hashes_buffer) ?;
437- }
438- _ => {
439- // This is internal because we should have caught this before.
440- return _internal_err!(
441- "Unsupported data type in hasher: {}" ,
442- col. data_type( )
443- ) ;
444- }
445- }
471+ hash_single_array ( array. as_ref ( ) , random_state, hashes_buffer, rehash) ?;
446472 }
447473 Ok ( hashes_buffer)
448474}
@@ -896,4 +922,20 @@ mod tests {
896922
897923 assert_ne ! ( one_col_hashes, two_col_hashes) ;
898924 }
925+
/// `create_hashes_from_arrays` must produce exactly the hashes that
/// `create_hashes` produces for the same columns.
#[test]
fn test_create_hashes_from_arrays() {
    let int_array = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
    let float_array = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));

    let random_state = RandomState::with_seeds(0, 0, 0, 0);

    // Reference result from the `ArrayRef`-based entry point.
    let mut expected = vec![0; int_array.len()];
    create_hashes(
        &[
            Arc::clone(&int_array) as ArrayRef,
            Arc::clone(&float_array) as ArrayRef,
        ],
        &random_state,
        &mut expected,
    )
    .unwrap();

    let hashes_buff = &mut vec![0; int_array.len()];
    let hashes = create_hashes_from_arrays(
        &[int_array.as_ref(), float_array.as_ref()],
        &random_state,
        hashes_buff,
    )
    .unwrap();

    assert_eq!(hashes.len(), 4);
    // The length check alone holds for any pre-sized buffer; comparing the
    // contents is what shows the columns were actually hashed.
    assert_eq!(*hashes, expected);
}
899941}
0 commit comments