refine docs
alamb committed Aug 18, 2024
1 parent b8f9bda commit 74fdebc
Showing 2 changed files with 21 additions and 18 deletions.
37 changes: 20 additions & 17 deletions datafusion/physical-plan/src/coalesce/mod.rs
@@ -22,26 +22,18 @@ use arrow_array::{Array, ArrayRef, RecordBatch};
use arrow_schema::SchemaRef;
use std::sync::Arc;

/// Concatenate multiple record batches into larger batches
/// Concatenate multiple [`RecordBatch`]es
///
/// See [`CoalesceBatchesExec`] for more details.
///
/// Notes:
///
/// 1. The output rows is the same order as the input rows
///
/// 2. The output is a sequence of batches, with all but the last being at least
/// `target_batch_size` rows.
///
/// 3. Eventually this may also be able to handle other optimizations such as a
/// combined filter/coalesce operation.
/// `BatchCoalescer` concatenates multiple small [`RecordBatch`]es, produced by
/// operations such as `FilterExec` and `RepartitionExec`, into larger ones for
/// more efficient processing by subsequent operations.
///
/// # Background
///
/// Generally speaking, larger RecordBatches are more efficient to process than
/// smaller record batches (until the CPU cache is exceeded) because there is
/// fixed processing overhead per batch. This code concatenates multiple small
/// record batches into larger ones to amortize this overhead.
/// Generally speaking, larger [`RecordBatch`]es are more efficient to process
/// than smaller record batches (until the CPU cache is exceeded) because there
/// is fixed processing overhead per batch. DataFusion tries to operate on
/// batches of `target_batch_size` rows to amortize this overhead
///
/// ```text
/// ┌────────────────────┐
@@ -64,6 +56,17 @@ use std::sync::Arc;
/// │ │
/// └────────────────────┘
/// ```
///
/// # Notes:
///
/// 1. Output rows are produced in the same order as the input rows
///
/// 2. The output is a sequence of batches, with all but the last being at least
/// `target_batch_size` rows.
///
/// 3. Eventually this may also be able to handle other optimizations such as a
/// combined filter/coalesce operation.
///
#[derive(Debug)]
pub struct BatchCoalescer {
/// The input schema
@@ -76,7 +79,7 @@ pub struct BatchCoalescer {
buffer: Vec<RecordBatch>,
/// Buffered row count
buffered_rows: usize,
/// Maximum number of rows to fetch, `None` means fetching all rows
/// Limit: maximum number of rows to fetch, `None` means fetch all rows
fetch: Option<usize>,
}

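The doc comment above describes the coalescing strategy: buffer small batches, then emit one concatenated batch once at least `target_batch_size` rows accumulate (or the optional `fetch` row limit is reached). A minimal, self-contained sketch of that strategy is below; `Batch` stands in for arrow's `RecordBatch`, and all names here are illustrative, not DataFusion's actual API.

```rust
// Simplified sketch of the buffer-then-concatenate strategy described in the
// doc comment. This is NOT DataFusion's implementation; `Batch` is a stand-in
// for arrow's `RecordBatch`.
type Batch = Vec<i32>;

struct Coalescer {
    target_batch_size: usize,
    /// Limit: maximum number of rows to fetch, `None` means fetch all rows
    fetch: Option<usize>,
    buffer: Vec<Batch>,
    buffered_rows: usize,
    fetched_rows: usize,
}

impl Coalescer {
    fn new(target_batch_size: usize, fetch: Option<usize>) -> Self {
        Self {
            target_batch_size,
            fetch,
            buffer: Vec::new(),
            buffered_rows: 0,
            fetched_rows: 0,
        }
    }

    /// Buffer `batch`, returning a concatenated batch once enough rows have
    /// accumulated; all but the final output reaches `target_batch_size` rows.
    fn push(&mut self, mut batch: Batch) -> Option<Batch> {
        // Respect the fetch limit by truncating the incoming batch.
        if let Some(fetch) = self.fetch {
            let remaining = fetch.saturating_sub(self.fetched_rows + self.buffered_rows);
            batch.truncate(remaining);
        }
        self.buffered_rows += batch.len();
        self.buffer.push(batch);

        let limit_reached = self
            .fetch
            .map_or(false, |f| self.fetched_rows + self.buffered_rows >= f);
        if self.buffered_rows >= self.target_batch_size || limit_reached {
            Some(self.flush())
        } else {
            None
        }
    }

    /// Concatenate and drain the buffer, preserving input row order.
    fn flush(&mut self) -> Batch {
        let out: Batch = self.buffer.drain(..).flatten().collect();
        self.fetched_rows += out.len();
        self.buffered_rows = 0;
        out
    }
}

fn main() {
    let mut c = Coalescer::new(5, None);
    assert_eq!(c.push(vec![1, 2]), None); // 2 rows buffered, below target
    assert_eq!(c.push(vec![3, 4]), None); // 4 rows buffered, still below
    let big = c.push(vec![5, 6]).unwrap(); // 6 >= 5: emit concatenated batch
    assert_eq!(big, vec![1, 2, 3, 4, 5, 6]);
    println!("coalesced {} rows", big.len());
}
```

Note how the sketch mirrors the struct fields shown in the diff (`buffer`, `buffered_rows`, `fetch`): output rows keep input order, and a `fetch` limit can cause early, truncated emission.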
2 changes: 1 addition & 1 deletion datafusion/physical-plan/src/coalesce_batches.rs
@@ -38,7 +38,7 @@ use futures::ready;
use futures::stream::{Stream, StreamExt};

/// `CoalesceBatchesExec` combines small batches into larger batches for more
/// efficient use of vectorized processing by later operators.
/// efficient vectorized processing by later operators.
///
/// The operator buffers batches until it collects `target_batch_size` rows and
/// then emits a single concatenated batch. When only a limited number of rows
