@@ -188,21 +188,11 @@ impl ExecutionPlan for GlobalLimitExec {
188188 fn statistics ( & self ) -> Result < Statistics > {
189189 let input_stats = self . input . statistics ( ) ?;
190190 let skip = self . skip ;
191- // the maximum row number needs to be fetched
192- let max_row_num = self
193- . fetch
194- . map ( |fetch| {
195- if fetch >= usize:: MAX - skip {
196- usize:: MAX
197- } else {
198- fetch + skip
199- }
200- } )
201- . unwrap_or ( usize:: MAX ) ;
202191 let col_stats = Statistics :: unknown_column ( & self . schema ( ) ) ;
192+ let fetch = self . fetch . unwrap_or ( usize:: MAX ) ;
203193
204- let fetched_row_number_stats = Statistics {
205- num_rows : Precision :: Exact ( max_row_num ) ,
194+ let mut fetched_row_number_stats = Statistics {
195+ num_rows : Precision :: Exact ( fetch ) ,
206196 column_statistics : col_stats. clone ( ) ,
207197 total_byte_size : Precision :: Absent ,
208198 } ;
@@ -218,23 +208,55 @@ impl ExecutionPlan for GlobalLimitExec {
218208 } => {
219209 if nr <= skip {
220210 // if all input data will be skipped, return 0
221- Statistics {
211+ let mut skip_all_rows_stats = Statistics {
222212 num_rows : Precision :: Exact ( 0 ) ,
223213 column_statistics : col_stats,
224214 total_byte_size : Precision :: Absent ,
215+ } ;
216+ if !input_stats. num_rows . is_exact ( ) . unwrap_or ( false ) {
217+ // The input stats are inexact, so the output stats must be too.
218+ skip_all_rows_stats = skip_all_rows_stats. into_inexact ( ) ;
225219 }
226- } else if nr <= max_row_num {
227- // if the input does not reach the "fetch" globally, return input stats
220+ skip_all_rows_stats
221+ } else if nr <= fetch && self . skip == 0 {
222+ // if the input does not reach the "fetch" globally, and "skip" is zero
223+ // (meaning the input and output are identical), return input stats.
224+ // Can input_stats still be used, but adjusted, in the "skip != 0" case?
228225 input_stats
226+ } else if nr - skip <= fetch {
227+ // after "skip" input rows are skipped, the remaining rows are less than or equal to the
228+ // "fetch" values, so `num_rows` must equal the remaining rows
229+ let remaining_rows: usize = nr - skip;
230+ let mut skip_some_rows_stats = Statistics {
231+ num_rows : Precision :: Exact ( remaining_rows) ,
232+ column_statistics : col_stats. clone ( ) ,
233+ total_byte_size : Precision :: Absent ,
234+ } ;
235+ if !input_stats. num_rows . is_exact ( ) . unwrap_or ( false ) {
236+ // The input stats are inexact, so the output stats must be too.
237+ skip_some_rows_stats = skip_some_rows_stats. into_inexact ( ) ;
238+ }
239+ skip_some_rows_stats
229240 } else {
230- // if the input is greater than the "fetch", the num_row will be the "fetch",
241+ // if the input is greater than "fetch + skip", the num_rows will be the "fetch",
231242 // but we won't be able to predict the other statistics
243+ if !input_stats. num_rows . is_exact ( ) . unwrap_or ( false )
244+ || self . fetch . is_none ( )
245+ {
246+ // If the input stats are inexact, the output stats must be too.
247+ // If the fetch value is `usize::MAX` because no LIMIT was specified,
248+ // we also can't represent it as an exact value.
249+ fetched_row_number_stats =
250+ fetched_row_number_stats. into_inexact ( ) ;
251+ }
232252 fetched_row_number_stats
233253 }
234254 }
235255 _ => {
236- // the result output row number will always be no greater than the limit number
237- fetched_row_number_stats
256+ // The result output `num_rows` will always be no greater than the limit number.
257+ // Should `num_rows` be marked as `Absent` here when the `fetch` value is large,
258+ // as the actual `num_rows` may be far away from the `fetch` value?
259+ fetched_row_number_stats. into_inexact ( )
238260 }
239261 } ;
240262 Ok ( stats)
@@ -552,7 +574,10 @@ mod tests {
552574 use crate :: common:: collect;
553575 use crate :: { common, test} ;
554576
577+ use crate :: aggregates:: { AggregateExec , AggregateMode , PhysicalGroupBy } ;
555578 use arrow_schema:: Schema ;
579+ use datafusion_physical_expr:: expressions:: col;
580+ use datafusion_physical_expr:: PhysicalExpr ;
556581
557582 #[ tokio:: test]
558583 async fn limit ( ) -> Result < ( ) > {
@@ -712,7 +737,7 @@ mod tests {
712737 }
713738
714739 #[ tokio:: test]
715- async fn skip_3_fetch_10 ( ) -> Result < ( ) > {
740+ async fn skip_3_fetch_10_stats ( ) -> Result < ( ) > {
716741 // there are total of 100 rows, we skipped 3 rows (offset = 3)
717742 let row_count = skip_and_fetch ( 3 , Some ( 10 ) ) . await ?;
718743 assert_eq ! ( row_count, 10 ) ;
@@ -748,7 +773,58 @@ mod tests {
748773 assert_eq ! ( row_count, Precision :: Exact ( 10 ) ) ;
749774
750775 let row_count = row_number_statistics_for_global_limit ( 5 , Some ( 10 ) ) . await ?;
751- assert_eq ! ( row_count, Precision :: Exact ( 15 ) ) ;
776+ assert_eq ! ( row_count, Precision :: Exact ( 10 ) ) ;
777+
778+ let row_count = row_number_statistics_for_global_limit ( 400 , Some ( 10 ) ) . await ?;
779+ assert_eq ! ( row_count, Precision :: Exact ( 0 ) ) ;
780+
781+ let row_count = row_number_statistics_for_global_limit ( 398 , Some ( 10 ) ) . await ?;
782+ assert_eq ! ( row_count, Precision :: Exact ( 2 ) ) ;
783+
784+ let row_count = row_number_statistics_for_global_limit ( 398 , Some ( 1 ) ) . await ?;
785+ assert_eq ! ( row_count, Precision :: Exact ( 1 ) ) ;
786+
787+ let row_count = row_number_statistics_for_global_limit ( 398 , None ) . await ?;
788+ assert_eq ! ( row_count, Precision :: Exact ( 2 ) ) ;
789+
790+ let row_count =
791+ row_number_statistics_for_global_limit ( 0 , Some ( usize:: MAX ) ) . await ?;
792+ assert_eq ! ( row_count, Precision :: Exact ( 400 ) ) ;
793+
794+ let row_count =
795+ row_number_statistics_for_global_limit ( 398 , Some ( usize:: MAX ) ) . await ?;
796+ assert_eq ! ( row_count, Precision :: Exact ( 2 ) ) ;
797+
798+ let row_count =
799+ row_number_inexact_statistics_for_global_limit ( 0 , Some ( 10 ) ) . await ?;
800+ assert_eq ! ( row_count, Precision :: Inexact ( 10 ) ) ;
801+
802+ let row_count =
803+ row_number_inexact_statistics_for_global_limit ( 5 , Some ( 10 ) ) . await ?;
804+ assert_eq ! ( row_count, Precision :: Inexact ( 10 ) ) ;
805+
806+ let row_count =
807+ row_number_inexact_statistics_for_global_limit ( 400 , Some ( 10 ) ) . await ?;
808+ assert_eq ! ( row_count, Precision :: Inexact ( 0 ) ) ;
809+
810+ let row_count =
811+ row_number_inexact_statistics_for_global_limit ( 398 , Some ( 10 ) ) . await ?;
812+ assert_eq ! ( row_count, Precision :: Inexact ( 2 ) ) ;
813+
814+ let row_count =
815+ row_number_inexact_statistics_for_global_limit ( 398 , Some ( 1 ) ) . await ?;
816+ assert_eq ! ( row_count, Precision :: Inexact ( 1 ) ) ;
817+
818+ let row_count = row_number_inexact_statistics_for_global_limit ( 398 , None ) . await ?;
819+ assert_eq ! ( row_count, Precision :: Inexact ( 2 ) ) ;
820+
821+ let row_count =
822+ row_number_inexact_statistics_for_global_limit ( 0 , Some ( usize:: MAX ) ) . await ?;
823+ assert_eq ! ( row_count, Precision :: Inexact ( 400 ) ) ;
824+
825+ let row_count =
826+ row_number_inexact_statistics_for_global_limit ( 398 , Some ( usize:: MAX ) ) . await ?;
827+ assert_eq ! ( row_count, Precision :: Inexact ( 2 ) ) ;
752828
753829 Ok ( ( ) )
754830 }
@@ -776,6 +852,47 @@ mod tests {
776852 Ok ( offset. statistics ( ) ?. num_rows )
777853 }
778854
855+ pub fn build_group_by (
856+ input_schema : & SchemaRef ,
857+ columns : Vec < String > ,
858+ ) -> PhysicalGroupBy {
859+ let mut group_by_expr: Vec < ( Arc < dyn PhysicalExpr > , String ) > = vec ! [ ] ;
860+ for column in columns. iter ( ) {
861+ group_by_expr. push ( ( col ( column, input_schema) . unwrap ( ) , column. to_string ( ) ) ) ;
862+ }
863+ PhysicalGroupBy :: new_single ( group_by_expr. clone ( ) )
864+ }
865+
866+ async fn row_number_inexact_statistics_for_global_limit (
867+ skip : usize ,
868+ fetch : Option < usize > ,
869+ ) -> Result < Precision < usize > > {
870+ let num_partitions = 4 ;
871+ let csv = test:: scan_partitioned ( num_partitions) ;
872+
873+ assert_eq ! ( csv. output_partitioning( ) . partition_count( ) , num_partitions) ;
874+
875+ // Adding a "GROUP BY i" changes the input stats from Exact to Inexact.
876+ let agg = AggregateExec :: try_new (
877+ AggregateMode :: Final ,
878+ build_group_by ( & csv. schema ( ) . clone ( ) , vec ! [ "i" . to_string( ) ] ) ,
879+ vec ! [ ] ,
880+ vec ! [ None ] ,
881+ vec ! [ None ] ,
882+ csv. clone ( ) ,
883+ csv. schema ( ) . clone ( ) ,
884+ ) ?;
885+ let agg_exec: Arc < dyn ExecutionPlan > = Arc :: new ( agg) ;
886+
887+ let offset = GlobalLimitExec :: new (
888+ Arc :: new ( CoalescePartitionsExec :: new ( agg_exec) ) ,
889+ skip,
890+ fetch,
891+ ) ;
892+
893+ Ok ( offset. statistics ( ) ?. num_rows )
894+ }
895+
779896 async fn row_number_statistics_for_local_limit (
780897 num_partitions : usize ,
781898 fetch : usize ,
0 commit comments