From 74fdebc805bedba15d5ab8442c4c10068b863643 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Sun, 18 Aug 2024 08:31:29 -0400
Subject: [PATCH] refine docs

---
 datafusion/physical-plan/src/coalesce/mod.rs | 37 ++++++++++---------
 .../physical-plan/src/coalesce_batches.rs    |  2 +-
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/datafusion/physical-plan/src/coalesce/mod.rs b/datafusion/physical-plan/src/coalesce/mod.rs
index 6c89780311648..d5b68560fdf95 100644
--- a/datafusion/physical-plan/src/coalesce/mod.rs
+++ b/datafusion/physical-plan/src/coalesce/mod.rs
@@ -22,26 +22,18 @@ use arrow_array::{Array, ArrayRef, RecordBatch};
 use arrow_schema::SchemaRef;
 use std::sync::Arc;
 
-/// Concatenate multiple record batches into larger batches
+/// Concatenate multiple [`RecordBatch`]es
 ///
-/// See [`CoalesceBatchesExec`] for more details.
-///
-/// Notes:
-///
-/// 1. The output rows is the same order as the input rows
-///
-/// 2. The output is a sequence of batches, with all but the last being at least
-///    `target_batch_size` rows.
-///
-/// 3. Eventually this may also be able to handle other optimizations such as a
-///    combined filter/coalesce operation.
+/// `BatchCoalescer` concatenates multiple small [`RecordBatch`]es, produced by
+/// operations such as `FilterExec` and `RepartitionExec`, into larger ones for
+/// more efficient processing by subsequent operations.
 ///
 /// # Background
 ///
-/// Generally speaking, larger RecordBatches are more efficient to process than
-/// smaller record batches (until the CPU cache is exceeded) because there is
-/// fixed processing overhead per batch. This code concatenates multiple small
-/// record batches into larger ones to amortize this overhead.
+/// Generally speaking, larger [`RecordBatch`]es are more efficient to process
+/// than smaller record batches (until the CPU cache is exceeded) because there
+/// is fixed processing overhead per batch. DataFusion tries to operate on
+/// batches of `target_batch_size` rows to amortize this overhead
 ///
 /// ```text
 /// ┌────────────────────┐
@@ -64,6 +56,17 @@ use std::sync::Arc;
 /// │                    │
 /// └────────────────────┘
 /// ```
+///
+/// # Notes:
+///
+/// 1. Output rows are produced in the same order as the input rows
+///
+/// 2. The output is a sequence of batches, with all but the last being at least
+///    `target_batch_size` rows.
+///
+/// 3. Eventually this may also be able to handle other optimizations such as a
+///    combined filter/coalesce operation.
+///
 #[derive(Debug)]
 pub struct BatchCoalescer {
     /// The input schema
@@ -76,7 +79,7 @@ pub struct BatchCoalescer {
     buffer: Vec<RecordBatch>,
     /// Buffered row count
     buffered_rows: usize,
-    /// Maximum number of rows to fetch, `None` means fetching all rows
+    /// Limit: maximum number of rows to fetch, `None` means fetch all rows
     fetch: Option<usize>,
 }
 
diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs
index 6dc27c4eae756..7caf5b8ab65a3 100644
--- a/datafusion/physical-plan/src/coalesce_batches.rs
+++ b/datafusion/physical-plan/src/coalesce_batches.rs
@@ -38,7 +38,7 @@ use futures::ready;
 use futures::stream::{Stream, StreamExt};
 
 /// `CoalesceBatchesExec` combines small batches into larger batches for more
-/// efficient use of vectorized processing by later operators.
+/// efficient vectorized processing by later operators.
 ///
 /// The operator buffers batches until it collects `target_batch_size` rows and
 /// then emits a single concatenated batch. When only a limited number of rows