1717
1818use std:: sync:: Arc ;
1919
20- use arrow:: array:: { ArrayRef , Int32Array } ;
20+ use arrow:: array:: { ArrayRef , Int32Array , StringArray } ;
2121use arrow:: compute:: { concat_batches, SortOptions } ;
2222use arrow:: datatypes:: SchemaRef ;
2323use arrow:: record_batch:: RecordBatch ;
@@ -45,6 +45,7 @@ use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
4545use test_utils:: add_empty_batches;
4646
4747use hashbrown:: HashMap ;
48+ use rand:: distributions:: Alphanumeric ;
4849use rand:: rngs:: StdRng ;
4950use rand:: { Rng , SeedableRng } ;
5051
@@ -607,25 +608,6 @@ fn convert_bound_to_current_row_if_applicable(
607608 }
608609}
609610
610- /// This utility determines whether a given window frame can be executed with
611- /// multiple ORDER BY expressions. As an example, range frames with offset (such
612- /// as `RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING`) cannot have ORDER BY clauses
613- /// of the form `\[ORDER BY a ASC, b ASC, ...]`
614- fn can_accept_multi_orderby ( window_frame : & WindowFrame ) -> bool {
615- match window_frame. units {
616- WindowFrameUnits :: Rows => true ,
617- WindowFrameUnits :: Range => {
618- // Range can only accept multi ORDER BY clauses when bounds are
619- // CURRENT ROW or UNBOUNDED PRECEDING/FOLLOWING:
620- ( window_frame. start_bound . is_unbounded ( )
621- || window_frame. start_bound == WindowFrameBound :: CurrentRow )
622- && ( window_frame. end_bound . is_unbounded ( )
623- || window_frame. end_bound == WindowFrameBound :: CurrentRow )
624- }
625- WindowFrameUnits :: Groups => true ,
626- }
627- }
628-
629611/// Perform batch and running window same input
630612/// and verify outputs of `WindowAggExec` and `BoundedWindowAggExec` are equal
631613async fn run_window_test (
@@ -649,7 +631,7 @@ async fn run_window_test(
649631 options : SortOptions :: default ( ) ,
650632 } )
651633 }
652- if orderby_exprs. len ( ) > 1 && !can_accept_multi_orderby ( & window_frame ) {
634+ if orderby_exprs. len ( ) > 1 && !window_frame . can_accept_multi_orderby ( ) {
653635 orderby_exprs = orderby_exprs[ 0 ..1 ] . to_vec ( ) ;
654636 }
655637 let mut partitionby_exprs = vec ! [ ] ;
@@ -733,11 +715,30 @@ async fn run_window_test(
733715 ) ?) as _ ;
734716 let task_ctx = ctx. task_ctx ( ) ;
735717 let collected_usual = collect ( usual_window_exec, task_ctx. clone ( ) ) . await ?;
736- let collected_running = collect ( running_window_exec, task_ctx) . await ?;
718+ let collected_running = collect ( running_window_exec, task_ctx)
719+ . await ?
720+ . into_iter ( )
721+ . filter ( |b| b. num_rows ( ) > 0 )
722+ . collect :: < Vec < _ > > ( ) ;
737723
738724 // BoundedWindowAggExec should produce more chunks than the usual WindowAggExec.
739725 // Otherwise it means that we cannot generate results in running mode.
740- assert ! ( collected_running. len( ) > collected_usual. len( ) ) ;
726+ let err_msg = format ! ( "Inconsistent result for window_frame: {window_frame:?}, window_fn: {window_fn:?}, args:{args:?}, random_seed: {random_seed:?}, search_mode: {search_mode:?}, partition_by_columns:{partition_by_columns:?}, orderby_columns: {orderby_columns:?}" ) ;
727+ // The check below makes sure that streaming execution generates more chunks than bulk execution,
728+ // since algorithms and operators work on sliding windows in streaming execution.
729+ // However, in the current test setup, for some randomly generated window frame clauses it is not guaranteed
730+ // that streaming execution generates more chunks than its non-streaming counterpart in Linear mode.
731+ // As an example, the window frame `OVER(PARTITION BY d ORDER BY a RANGE BETWEEN CURRENT ROW AND 9 FOLLOWING)`
732+ // needs to receive a=10 to generate results for the rows where a=0. If the generated input data lies in the range [0, 9],
733+ // then even in streaming mode the result will be generated as a single bulk, as in the non-streaming version.
734+ if search_mode != Linear {
735+ assert ! (
736+ collected_running. len( ) > collected_usual. len( ) ,
737+ "{}" ,
738+ err_msg
739+ ) ;
740+ }
741+
741742 // compare
742743 let usual_formatted = pretty_format_batches ( & collected_usual) ?. to_string ( ) ;
743744 let running_formatted = pretty_format_batches ( & collected_running) ?. to_string ( ) ;
@@ -767,10 +768,17 @@ async fn run_window_test(
767768 Ok ( ( ) )
768769}
769770
771+ fn generate_random_string ( rng : & mut StdRng , length : usize ) -> String {
772+ rng. sample_iter ( & Alphanumeric )
773+ . take ( length)
774+ . map ( char:: from)
775+ . collect ( )
776+ }
777+
770778/// Return randomly sized record batches with:
771779/// three sorted int32 columns 'a', 'b', 'c' ranged from 0..DISTINCT as columns,
772780/// one random int32 column 'x', and one sorted single-character random string column 'string_field'
773- fn make_staggered_batches < const STREAM : bool > (
781+ pub ( crate ) fn make_staggered_batches < const STREAM : bool > (
774782 len : usize ,
775783 n_distinct : usize ,
776784 random_seed : u64 ,
@@ -779,6 +787,7 @@ fn make_staggered_batches<const STREAM: bool>(
779787 let mut rng = StdRng :: seed_from_u64 ( random_seed) ;
780788 let mut input123: Vec < ( i32 , i32 , i32 ) > = vec ! [ ( 0 , 0 , 0 ) ; len] ;
781789 let mut input4: Vec < i32 > = vec ! [ 0 ; len] ;
790+ let mut input5: Vec < String > = vec ! [ "" . to_string( ) ; len] ;
782791 input123. iter_mut ( ) . for_each ( |v| {
783792 * v = (
784793 rng. gen_range ( 0 ..n_distinct) as i32 ,
@@ -788,17 +797,23 @@ fn make_staggered_batches<const STREAM: bool>(
788797 } ) ;
789798 input123. sort ( ) ;
790799 rng. fill ( & mut input4[ ..] ) ;
800+ input5. iter_mut ( ) . for_each ( |v| {
801+ * v = generate_random_string ( & mut rng, 1 ) ;
802+ } ) ;
803+ input5. sort ( ) ;
791804 let input1 = Int32Array :: from_iter_values ( input123. iter ( ) . map ( |k| k. 0 ) ) ;
792805 let input2 = Int32Array :: from_iter_values ( input123. iter ( ) . map ( |k| k. 1 ) ) ;
793806 let input3 = Int32Array :: from_iter_values ( input123. iter ( ) . map ( |k| k. 2 ) ) ;
794807 let input4 = Int32Array :: from_iter_values ( input4) ;
808+ let input5 = StringArray :: from_iter_values ( input5) ;
795809
796810 // split into several record batches
797811 let mut remainder = RecordBatch :: try_from_iter ( vec ! [
798812 ( "a" , Arc :: new( input1) as ArrayRef ) ,
799813 ( "b" , Arc :: new( input2) as ArrayRef ) ,
800814 ( "c" , Arc :: new( input3) as ArrayRef ) ,
801815 ( "x" , Arc :: new( input4) as ArrayRef ) ,
816+ ( "string_field" , Arc :: new( input5) as ArrayRef ) ,
802817 ] )
803818 . unwrap ( ) ;
804819
@@ -807,6 +822,7 @@ fn make_staggered_batches<const STREAM: bool>(
807822 while remainder. num_rows ( ) > 0 {
808823 let batch_size = rng. gen_range ( 0 ..50 ) ;
809824 if remainder. num_rows ( ) < batch_size {
825+ batches. push ( remainder) ;
810826 break ;
811827 }
812828 batches. push ( remainder. slice ( 0 , batch_size) ) ;
0 commit comments