
feat: Add Aggregate UDF to FFI crate #14775


Merged: 33 commits merged into main from feat/aggregate-udf-ffi on Jun 5, 2025
Changes from all commits (33 commits)
83bce67
Work in progress adding user defined aggregate function FFI support
timsaucer Feb 15, 2025
efa2e5c
Intermediate work. Going through groups accumulator
timsaucer Feb 18, 2025
53fafc9
MVP for aggregate udf via FFI
timsaucer Feb 19, 2025
95c3c79
Clean up after rebase
timsaucer Feb 20, 2025
a91ee5b
Add unit test for FFI Accumulator Args
timsaucer Feb 22, 2025
d15b3a1
Adding unit tests and fixing memory errors in aggregate ffi udf
timsaucer Feb 23, 2025
eb6a072
Working through additional unit and integration tests for UDAF ffi
timsaucer Feb 23, 2025
9d31d1f
Switch to an accumulator that supports convert to state to get a littl…
timsaucer Feb 23, 2025
217bb8e
Set feature so we do not get an error warning in stable rustc
timsaucer Feb 23, 2025
b5b11d4
Add more options to test
timsaucer Feb 24, 2025
ae61d88
Add unit test for FFI RecordBatchStream
timsaucer Feb 25, 2025
dfd3268
Add a few more args to ffi accumulator test fn
timsaucer Feb 25, 2025
aa4b7ce
Adding more unit tests on ffi aggregate udaf
timsaucer Feb 27, 2025
2de6d0a
taplo format
timsaucer Feb 27, 2025
11f88de
Update code comment
timsaucer Feb 27, 2025
9300ba5
Correct function name
timsaucer Feb 27, 2025
ec05091
Temp fix record batch test dependencies
crystalxyz Apr 1, 2025
fc40bc0
Address some comments
crystalxyz Apr 2, 2025
4d164cc
Revise comments and address PR comments
crystalxyz Apr 8, 2025
173d7c0
Remove commented code
crystalxyz Apr 8, 2025
45ea283
Refactor GroupsAccumulator
crystalxyz Apr 8, 2025
b6da0e9
Add documentation
crystalxyz Apr 12, 2025
a2fbda8
Split integration tests
crystalxyz Apr 12, 2025
0b4a8f5
Address comments to refactor error handling for opt filter
crystalxyz Apr 12, 2025
4b3c533
Fix linting errors
crystalxyz Apr 12, 2025
1b85dd9
Fix linting and add deref
crystalxyz Apr 12, 2025
8a4de4a
Remove extra tests and unnecessary code
crystalxyz Jun 3, 2025
5e02c72
Adjustments to FFI aggregate functions after rebase on main
timsaucer Jun 4, 2025
d128b85
cargo fmt
timsaucer Jun 4, 2025
4282c2a
cargo clippy
timsaucer Jun 4, 2025
bb84d08
Re-implement cleaned up code that was removed in last push
timsaucer Jun 4, 2025
1c3fad9
Minor review comments
timsaucer Jun 4, 2025
e63af82
Merge branch 'main' into feat/aggregate-udf-ffi
alamb Jun 5, 2025
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -216,5 +216,5 @@ unnecessary_lazy_evaluations = "warn"
uninlined_format_args = "warn"

[workspace.lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] }
unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)", "cfg(tarpaulin_include)"] }
unused_qualifications = "deny"
49 changes: 5 additions & 44 deletions datafusion/core/src/datasource/file_format/parquet.rs
@@ -107,10 +107,8 @@ pub(crate) mod test_util {
mod tests {

use std::fmt::{self, Display, Formatter};
use std::pin::Pin;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::task::{Context, Poll};
use std::time::Duration;

use crate::datasource::file_format::parquet::test_util::store_parquet;
@@ -120,7 +118,7 @@ mod tests {
use crate::prelude::{ParquetReadOptions, SessionConfig, SessionContext};

use arrow::array::RecordBatch;
use arrow_schema::{Schema, SchemaRef};
use arrow_schema::Schema;
use datafusion_catalog::Session;
use datafusion_common::cast::{
as_binary_array, as_binary_view_array, as_boolean_array, as_float32_array,
@@ -140,7 +138,7 @@
};
use datafusion_execution::object_store::ObjectStoreUrl;
use datafusion_execution::runtime_env::RuntimeEnv;
use datafusion_execution::{RecordBatchStream, TaskContext};
use datafusion_execution::TaskContext;
use datafusion_expr::dml::InsertOp;
use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
use datafusion_physical_plan::{collect, ExecutionPlan};
@@ -153,7 +151,7 @@
use async_trait::async_trait;
use datafusion_datasource::file_groups::FileGroup;
use futures::stream::BoxStream;
use futures::{Stream, StreamExt};
use futures::StreamExt;
use insta::assert_snapshot;
use log::error;
use object_store::local::LocalFileSystem;
@@ -169,6 +167,8 @@
use parquet::format::FileMetaData;
use tokio::fs::File;

use crate::test_util::bounded_stream;

enum ForceViews {
Yes,
No,
@@ -1662,43 +1662,4 @@

Ok(())
}

/// Creates an bounded stream for testing purposes.
fn bounded_stream(
batch: RecordBatch,
limit: usize,
) -> datafusion_execution::SendableRecordBatchStream {
Box::pin(BoundedStream {
count: 0,
limit,
batch,
})
}

struct BoundedStream {
limit: usize,
count: usize,
batch: RecordBatch,
}

impl Stream for BoundedStream {
type Item = Result<RecordBatch>;

fn poll_next(
mut self: Pin<&mut Self>,
_cx: &mut Context<'_>,
) -> Poll<Option<Self::Item>> {
if self.count >= self.limit {
return Poll::Ready(None);
}
self.count += 1;
Poll::Ready(Some(Ok(self.batch.clone())))
}
}

impl RecordBatchStream for BoundedStream {
fn schema(&self) -> SchemaRef {
self.batch.schema()
}
}
}
47 changes: 47 additions & 0 deletions datafusion/core/src/test_util/mod.rs
@@ -22,12 +22,14 @@ pub mod parquet;

pub mod csv;

use futures::Stream;
use std::any::Any;
use std::collections::HashMap;
use std::fs::File;
use std::io::Write;
use std::path::Path;
use std::sync::Arc;
use std::task::{Context, Poll};

use crate::catalog::{TableProvider, TableProviderFactory};
use crate::dataframe::DataFrame;
@@ -38,11 +40,13 @@ use crate::logical_expr::{LogicalPlanBuilder, UNNAMED_TABLE};
use crate::physical_plan::ExecutionPlan;
use crate::prelude::{CsvReadOptions, SessionContext};

use crate::execution::SendableRecordBatchStream;
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use arrow::record_batch::RecordBatch;
use datafusion_catalog::Session;
use datafusion_common::TableReference;
use datafusion_expr::{CreateExternalTable, Expr, SortExpr, TableType};
use std::pin::Pin;

use async_trait::async_trait;

@@ -52,6 +56,8 @@ use tempfile::TempDir;
pub use datafusion_common::test_util::parquet_test_data;
pub use datafusion_common::test_util::{arrow_test_data, get_data_dir};

use crate::execution::RecordBatchStream;

/// Scan an empty data source, mainly used in tests
pub fn scan_empty(
name: Option<&str>,
@@ -234,3 +240,44 @@ pub fn register_unbounded_file_with_ordering(
ctx.register_table(table_name, Arc::new(StreamTable::new(Arc::new(config))))?;
Ok(())
}

/// Creates a bounded stream that emits the same record batch a specified number of times.
/// This is useful for testing purposes.
pub fn bounded_stream(
record_batch: RecordBatch,
limit: usize,
) -> SendableRecordBatchStream {
Box::pin(BoundedStream {
record_batch,
count: 0,
limit,
})
}

struct BoundedStream {
record_batch: RecordBatch,
count: usize,
limit: usize,
}

impl Stream for BoundedStream {
type Item = Result<RecordBatch, crate::error::DataFusionError>;

fn poll_next(
mut self: Pin<&mut Self>,
_cx: &mut Context<'_>,
) -> Poll<Option<Self::Item>> {
if self.count >= self.limit {
Poll::Ready(None)
} else {
self.count += 1;
Poll::Ready(Some(Ok(self.record_batch.clone())))
}
}
}

impl RecordBatchStream for BoundedStream {
fn schema(&self) -> SchemaRef {
self.record_batch.schema()
}
}
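
Note (not part of the diff): the helper above is exported from datafusion::test_util and is used by the FFI record batch stream test later in this PR. A minimal usage sketch; the test name and collection loop are illustrative:

use datafusion::common::record_batch;
use datafusion::error::Result;
use datafusion::test_util::bounded_stream;
use futures::StreamExt;

#[tokio::test]
async fn emits_the_batch_a_fixed_number_of_times() -> Result<()> {
    let batch = record_batch!(("a", Int32, vec![1, 2, 3]))?;
    // Stream that yields clones of `batch` exactly 5 times, then ends.
    let mut stream = bounded_stream(batch.clone(), 5);

    let mut count = 0;
    while let Some(next) = stream.next().await {
        assert_eq!(next?, batch);
        count += 1;
    }
    assert_eq!(count, 5);
    Ok(())
}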
2 changes: 1 addition & 1 deletion datafusion/expr-common/src/groups_accumulator.rs
@@ -21,7 +21,7 @@ use arrow::array::{ArrayRef, BooleanArray};
use datafusion_common::{not_impl_err, Result};

/// Describes how many rows should be emitted during grouping.
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmitTo {
/// Emit all groups
All,
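
Note (not part of the diff): deriving PartialEq and Eq lets callers compare EmitTo values directly, for example in round-trip assertions. A small sketch, assuming the usual re-export of EmitTo through datafusion_expr:

use datafusion_expr::EmitTo;

// Possible only now that EmitTo derives PartialEq/Eq.
fn assert_same_emit(original: EmitTo, round_tripped: EmitTo) {
    assert_eq!(original, round_tripped);
}

fn main() {
    assert_same_emit(EmitTo::All, EmitTo::All);
    assert_same_emit(EmitTo::First(10), EmitTo::First(10));
}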
3 changes: 3 additions & 0 deletions datafusion/ffi/Cargo.toml
@@ -44,7 +44,9 @@ arrow-schema = { workspace = true }
async-ffi = { version = "0.5.0", features = ["abi_stable"] }
async-trait = { workspace = true }
datafusion = { workspace = true, default-features = false }
datafusion-functions-aggregate-common = { workspace = true }
datafusion-proto = { workspace = true }
datafusion-proto-common = { workspace = true }
futures = { workspace = true }
log = { workspace = true }
prost = { workspace = true }
@@ -56,3 +58,4 @@ doc-comment = { workspace = true }

[features]
integration-tests = []
tarpaulin_include = [] # Exists only to prevent warnings on stable and still have accurate coverage
33 changes: 24 additions & 9 deletions datafusion/ffi/src/arrow_wrappers.rs
@@ -21,7 +21,8 @@ use abi_stable::StableAbi;
use arrow::{
array::{make_array, ArrayRef},
datatypes::{Schema, SchemaRef},
ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
error::ArrowError,
ffi::{from_ffi, to_ffi, FFI_ArrowArray, FFI_ArrowSchema},
};
use log::error;

@@ -44,16 +45,19 @@ impl From<SchemaRef> for WrappedSchema {
WrappedSchema(ffi_schema)
}
}
/// Some functions are expected to always succeed, like getting the schema from a TableProvider.
/// Since going through the FFI always has the potential to fail, we need to catch these errors,
/// give the user a warning, and return some kind of result. In this case we default to an
/// empty schema.
#[cfg(not(tarpaulin_include))]
fn catch_df_schema_error(e: ArrowError) -> Schema {
error!("Unable to convert from FFI_ArrowSchema to DataFusion Schema in FFI_PlanProperties. {e}");
Schema::empty()
}

impl From<WrappedSchema> for SchemaRef {
fn from(value: WrappedSchema) -> Self {
let schema = match Schema::try_from(&value.0) {
Ok(s) => s,
Err(e) => {
error!("Unable to convert from FFI_ArrowSchema to DataFusion Schema in FFI_PlanProperties. {e}");
Schema::empty()
}
};
let schema = Schema::try_from(&value.0).unwrap_or_else(catch_df_schema_error);
Arc::new(schema)
}
}
@@ -71,11 +75,22 @@ pub struct WrappedArray {
}

impl TryFrom<WrappedArray> for ArrayRef {
type Error = arrow::error::ArrowError;
type Error = ArrowError;

fn try_from(value: WrappedArray) -> Result<Self, Self::Error> {
let data = unsafe { from_ffi(value.array, &value.schema.0)? };

Ok(make_array(data))
}
}

impl TryFrom<&ArrayRef> for WrappedArray {
type Error = ArrowError;

fn try_from(array: &ArrayRef) -> Result<Self, Self::Error> {
let (array, schema) = to_ffi(&array.to_data())?;
let schema = WrappedSchema(schema);

Ok(WrappedArray { array, schema })
}
}
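
Note (not part of the diff): with the new TryFrom<&ArrayRef> impl, WrappedArray now converts in both directions. A minimal round-trip sketch, assuming the arrow_wrappers module is public in datafusion_ffi:

use std::sync::Arc;
use arrow::array::{ArrayRef, Int32Array};
use arrow::error::ArrowError;
use datafusion_ffi::arrow_wrappers::WrappedArray;

fn round_trip_array() -> Result<(), ArrowError> {
    let original: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));

    // Native -> FFI-safe wrapper (uses arrow's to_ffi internally).
    let wrapped = WrappedArray::try_from(&original)?;

    // FFI-safe wrapper -> native array (uses arrow's from_ffi internally).
    let recovered = ArrayRef::try_from(wrapped)?;

    assert_eq!(original.to_data(), recovered.to_data());
    Ok(())
}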
1 change: 1 addition & 0 deletions datafusion/ffi/src/lib.rs
@@ -34,6 +34,7 @@ pub mod schema_provider;
pub mod session_config;
pub mod table_provider;
pub mod table_source;
pub mod udaf;
pub mod udf;
pub mod udtf;
pub mod util;
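
Note (not part of the diff): the new udaf module is the core of this PR. A hypothetical round-trip sketch, assuming it mirrors the existing udf module's pattern with an FFI_AggregateUDF wrapper on the provider side and a ForeignAggregateUDF on the consuming side; these names and signatures are illustrative, not confirmed by this hunk:

use std::sync::Arc;
use datafusion::error::Result;
use datafusion::functions_aggregate::sum::sum_udaf;
use datafusion::logical_expr::AggregateUDF;
use datafusion_ffi::udaf::{FFI_AggregateUDF, ForeignAggregateUDF};

fn round_trip_aggregate_udf() -> Result<()> {
    // Provider side: wrap a native aggregate UDF so it can cross the FFI boundary.
    let original: Arc<AggregateUDF> = sum_udaf();
    let ffi_udaf: FFI_AggregateUDF = Arc::clone(&original).into();

    // Consumer side: rebuild a usable AggregateUDF from the FFI wrapper.
    let foreign = ForeignAggregateUDF::try_from(&ffi_udaf)?;
    let round_tripped = AggregateUDF::from(foreign);

    assert_eq!(original.name(), round_tripped.name());
    Ok(())
}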
14 changes: 11 additions & 3 deletions datafusion/ffi/src/plan_properties.rs
@@ -300,7 +300,10 @@ impl From<FFI_EmissionType> for EmissionType {

#[cfg(test)]
mod tests {
use datafusion::physical_plan::Partitioning;
use datafusion::{
physical_expr::{LexOrdering, PhysicalSortExpr},
physical_plan::Partitioning,
};

use super::*;

@@ -311,8 +314,13 @@
Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)]));

let original_props = PlanProperties::new(
EquivalenceProperties::new(schema),
Partitioning::UnknownPartitioning(3),
EquivalenceProperties::new(Arc::clone(&schema)).with_reorder(
LexOrdering::new(vec![PhysicalSortExpr {
expr: datafusion::physical_plan::expressions::col("a", &schema)?,
options: Default::default(),
}]),
),
Partitioning::RoundRobinBatch(3),
EmissionType::Incremental,
Boundedness::Bounded,
);
46 changes: 46 additions & 0 deletions datafusion/ffi/src/record_batch_stream.rs
@@ -196,3 +196,49 @@ impl Stream for FFI_RecordBatchStream {
}
}
}

#[cfg(test)]
mod tests {
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::{
common::record_batch, error::Result, execution::SendableRecordBatchStream,
test_util::bounded_stream,
};

use super::FFI_RecordBatchStream;
use futures::StreamExt;

#[tokio::test]
async fn test_round_trip_record_batch_stream() -> Result<()> {
let record_batch = record_batch!(
("a", Int32, vec![1, 2, 3]),
("b", Float64, vec![Some(4.0), None, Some(5.0)])
)?;
let original_rbs = bounded_stream(record_batch.clone(), 1);

let ffi_rbs: FFI_RecordBatchStream = original_rbs.into();
let mut ffi_rbs: SendableRecordBatchStream = Box::pin(ffi_rbs);

let schema = ffi_rbs.schema();
assert_eq!(
schema,
Arc::new(Schema::new(vec![
Field::new("a", DataType::Int32, true),
Field::new("b", DataType::Float64, true)
]))
);

let batch = ffi_rbs.next().await;
assert!(batch.is_some());
assert!(batch.as_ref().unwrap().is_ok());
assert_eq!(batch.unwrap().unwrap(), record_batch);

// There should only be one batch
let no_batch = ffi_rbs.next().await;
assert!(no_batch.is_none());

Ok(())
}
}