Commit d3ae1a7

cj-zhukov and Sergey Zhukov authored
Consolidate dataframe examples (apache#18142) (apache#18862)
## Which issue does this PR close?

- Part of apache#18142.

## Rationale for this change

This PR consolidates all the `dataframe` examples (dataframe, default_column_values, deserialize_to_struct) into a single example binary. We agreed on this pattern and can now apply it to the remaining examples.

---------

Co-authored-by: Sergey Zhukov <szhukov@aligntech.com>
1 parent fbe1ae6 commit d3ae1a7
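For orientation, the pattern this PR standardizes is: each example file exposes a `pub async fn`, and a per-directory `main.rs` defines an `ExampleKind` enum, parses the first CLI argument via `FromStr`, and dispatches to the matching function. The sketch below condenses that shape into a dependency-free, synchronous program; the names `Alpha`, `Beta`, `run_alpha`, and `run_beta` are illustrative, not from the PR, and the real examples are `async` functions awaited under `#[tokio::main]`:

```rust
use std::str::FromStr;

// Illustrative subcommand set; each consolidated binary defines its own.
enum ExampleKind {
    Alpha,
    Beta,
}

impl FromStr for ExampleKind {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "alpha" => Ok(Self::Alpha),
            "beta" => Ok(Self::Beta),
            _ => Err(format!("Unknown example: {s}")),
        }
    }
}

// Each consolidated example becomes a function the dispatcher calls.
fn run_alpha() {
    println!("running alpha");
}

fn run_beta() {
    println!("running beta");
}

fn main() -> Result<(), String> {
    // First CLI argument selects the example to run.
    let arg = std::env::args()
        .nth(1)
        .ok_or_else(|| "Missing argument".to_string())?;
    match arg.parse::<ExampleKind>()? {
        ExampleKind::Alpha => run_alpha(),
        ExampleKind::Beta => run_beta(),
    }
    Ok(())
}
```

The full version lives in the new `main.rs` shown at the end of this diff; with it in place, the invocation becomes `cargo run --example dataframe -- dataframe`.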

File tree

6 files changed: +140 additions, −39 deletions

datafusion-examples/README.md

Lines changed: 4 additions & 4 deletions

````diff
@@ -41,7 +41,7 @@ cd datafusion-examples/examples
 
 # Run the `dataframe` example:
 # ... use the equivalent for other examples
-cargo run --example dataframe
+cargo run --example dataframe -- dataframe
 ```
 
 ## Single Process
@@ -61,10 +61,10 @@ cargo run --example dataframe
 - [`examples/custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs): Implement custom casting rules to adapt file schemas
 - [`examples/custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs): Write data to a custom file format
 - [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3
-- [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file.
+- [`examples/dataframe/dataframe.rs`](examples/dataframe/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries. Also demonstrates the various methods to write out a DataFrame to a table, parquet file, csv file, and json file.
 - [`examples/builtin_functions/date_time`](examples/builtin_functions/date_time.rs): Examples of date-time related functions and queries
-- [`default_column_values.rs`](examples/default_column_values.rs): Implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter
-- [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results (Arrow ArrayRefs) into Rust structs
+- [`examples/custom_data_source/default_column_values.rs`](examples/custom_data_source/default_column_values.rs): Implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter
+- [`examples/dataframe/deserialize_to_struct.rs`](examples/dataframe/deserialize_to_struct.rs): Convert query results (Arrow ArrayRefs) into Rust structs
 - [`examples/query_planning/expr_api.rs`](examples/query_planning/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s
 - [`examples/custom_data_source/file_stream_provider.rs`](examples/custom_data_source/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks.
 - [`flight/sql_server.rs`](examples/flight/sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from Flight and and FlightSQL (e.g. JDBC) clients
````

datafusion-examples/examples/default_column_values.rs renamed to datafusion-examples/examples/custom_data_source/default_column_values.rs

Lines changed: 22 additions & 23 deletions

````diff
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use std::any::Any;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -52,25 +54,23 @@ use object_store::{ObjectStore, PutPayload};
 // Metadata key for storing default values in field metadata
 const DEFAULT_VALUE_METADATA_KEY: &str = "example.default_value";
 
-// Example showing how to implement custom default value handling for missing columns
-// using field metadata and PhysicalExprAdapter.
-//
-// This example demonstrates how to:
-// 1. Store default values in field metadata using a constant key
-// 2. Create a custom PhysicalExprAdapter that reads these defaults
-// 3. Inject default values for missing columns in filter predicates
-// 4. Use the DefaultPhysicalExprAdapter as a fallback for standard schema adaptation
-// 5. Wrap string default values in cast expressions for proper type conversion
-//
-// Important: PhysicalExprAdapter is specifically designed for rewriting filter predicates
-// that get pushed down to file scans. For handling missing columns in projections,
-// other mechanisms in DataFusion are used (like SchemaAdapter).
-//
-// The metadata-based approach provides a flexible way to store default values as strings
-// and cast them to the appropriate types at query time.
-
-#[tokio::main]
-async fn main() -> Result<()> {
+/// Example showing how to implement custom default value handling for missing columns
+/// using field metadata and PhysicalExprAdapter.
+///
+/// This example demonstrates how to:
+/// 1. Store default values in field metadata using a constant key
+/// 2. Create a custom PhysicalExprAdapter that reads these defaults
+/// 3. Inject default values for missing columns in filter predicates
+/// 4. Use the DefaultPhysicalExprAdapter as a fallback for standard schema adaptation
+/// 5. Wrap string default values in cast expressions for proper type conversion
+///
+/// Important: PhysicalExprAdapter is specifically designed for rewriting filter predicates
+/// that get pushed down to file scans. For handling missing columns in projections,
+/// other mechanisms in DataFusion are used (like SchemaAdapter).
+///
+/// The metadata-based approach provides a flexible way to store default values as strings
+/// and cast them to the appropriate types at query time.
+pub async fn default_column_values() -> Result<()> {
     println!("=== Creating example data with missing columns and default values ===");
 
     // Create sample data where the logical schema has more columns than the physical schema
@@ -85,11 +85,10 @@ async fn main() -> Result<()> {
             .build();
 
         let mut writer =
-            ArrowWriter::try_new(&mut buf, physical_schema.clone(), Some(props))
-                .expect("creating writer");
+            ArrowWriter::try_new(&mut buf, physical_schema.clone(), Some(props))?;
 
-        writer.write(&batch).expect("Writing batch");
-        writer.close().unwrap();
+        writer.write(&batch)?;
+        writer.close()?;
         buf
     };
     let path = Path::from("example.parquet");
````
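One detail of the move worth noting: because the example now returns its errors to the dispatcher instead of panicking, the `.expect(...)`/`.unwrap()` calls around the Parquet writer become `?`. That works whenever the enclosing function returns a `Result` whose error type implements `From` for the callee's error type. A minimal, dependency-free sketch of the same transformation, where `ExampleError` and `step` are illustrative stand-ins rather than names from the PR:

```rust
// Hypothetical error type standing in for DataFusionError.
#[derive(Debug)]
struct ExampleError(String);

// The conversion that lets `?` translate the callee's error automatically.
impl From<std::num::ParseIntError> for ExampleError {
    fn from(e: std::num::ParseIntError) -> Self {
        ExampleError(e.to_string())
    }
}

// Before: `let n: i32 = "42".parse().expect("parsing");` panics on failure.
// After: the failure converts via `From` and bubbles up to the caller.
fn step() -> Result<i32, ExampleError> {
    let n: i32 = "42".parse()?;
    Ok(n)
}

fn main() -> Result<(), ExampleError> {
    println!("parsed {}", step()?);
    Ok(())
}
```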

datafusion-examples/examples/custom_data_source/main.rs

Lines changed: 11 additions & 2 deletions

````diff
@@ -21,7 +21,7 @@
 //!
 //! ## Usage
 //! ```bash
-//! cargo run --example custom_data_source -- [csv_json_opener|csv_sql_streaming|custom_datasource|custom_file_casts|custom_file_format|file_stream_provider]
+//! cargo run --example custom_data_source -- [csv_json_opener|csv_sql_streaming|custom_datasource|custom_file_casts|custom_file_format|default_column_values|file_stream_provider]
 //! ```
 //!
 //! Each subcommand runs a corresponding example:
@@ -30,13 +30,15 @@
 //! - `custom_datasource` — run queries against a custom datasource (TableProvider)
 //! - `custom_file_casts` — implement custom casting rules to adapt file schemas
 //! - `custom_file_format` — write data to a custom file format
+//! - `default_column_values` — implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter
 //! - `file_stream_provider` — run a query on FileStreamProvider which implements StreamProvider for reading and writing to arbitrary stream sources/sinks
 
 mod csv_json_opener;
 mod csv_sql_streaming;
 mod custom_datasource;
 mod custom_file_casts;
 mod custom_file_format;
+mod default_column_values;
 mod file_stream_provider;
 
 use std::str::FromStr;
@@ -49,6 +51,7 @@ enum ExampleKind {
     CustomDatasource,
     CustomFileCasts,
     CustomFileFormat,
+    DefaultColumnValues,
     FileFtreamProvider,
 }
 
@@ -60,6 +63,7 @@ impl AsRef<str> for ExampleKind {
             Self::CustomDatasource => "custom_datasource",
             Self::CustomFileCasts => "custom_file_casts",
             Self::CustomFileFormat => "custom_file_format",
+            Self::DefaultColumnValues => "default_column_values",
             Self::FileFtreamProvider => "file_stream_provider",
         }
     }
@@ -75,19 +79,21 @@ impl FromStr for ExampleKind {
             "custom_datasource" => Ok(Self::CustomDatasource),
             "custom_file_casts" => Ok(Self::CustomFileCasts),
             "custom_file_format" => Ok(Self::CustomFileFormat),
+            "default_column_values" => Ok(Self::DefaultColumnValues),
             "file_stream_provider" => Ok(Self::FileFtreamProvider),
             _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))),
         }
     }
 }
 
 impl ExampleKind {
-    const ALL: [Self; 6] = [
+    const ALL: [Self; 7] = [
         Self::CsvJsonOpener,
         Self::CsvSqlStreaming,
         Self::CustomDatasource,
         Self::CustomFileCasts,
         Self::CustomFileFormat,
+        Self::DefaultColumnValues,
         Self::FileFtreamProvider,
     ];
 
@@ -117,6 +123,9 @@ async fn main() -> Result<()> {
         ExampleKind::CustomDatasource => custom_datasource::custom_datasource().await?,
         ExampleKind::CustomFileCasts => custom_file_casts::custom_file_casts().await?,
         ExampleKind::CustomFileFormat => custom_file_format::custom_file_format().await?,
+        ExampleKind::DefaultColumnValues => {
+            default_column_values::default_column_values().await?
+        }
         ExampleKind::FileFtreamProvider => {
             file_stream_provider::file_stream_provider().await?
         }
````
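After this change, the new subcommand runs with `cargo run --example custom_data_source -- default_column_values`. As the hunks above show, registering a module in one of these consolidated binaries touches several places: the usage string and subcommand list in the crate docs, a `mod` declaration, the `ExampleKind` enum with its `AsRef<str>` and `FromStr` impls, the `ALL` array (bumped here from 6 to 7 entries), and the dispatch `match` in `main`.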

datafusion-examples/examples/dataframe.rs renamed to datafusion-examples/examples/dataframe/dataframe.rs

Lines changed: 5 additions & 8 deletions

````diff
@@ -15,12 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray, StringViewArray};
 use datafusion::arrow::datatypes::{DataType, Field, Schema};
 use datafusion::catalog::MemTable;
 use datafusion::common::config::CsvOptions;
 use datafusion::common::parsers::CompressionTypeVariant;
-use datafusion::common::DataFusionError;
 use datafusion::common::ScalarValue;
 use datafusion::dataframe::DataFrameWriteOptions;
 use datafusion::error::Result;
@@ -39,6 +40,7 @@ use tempfile::tempdir;
 /// * [read_parquet]: execute queries against parquet files
 /// * [read_csv]: execute queries against csv files
 /// * [read_memory]: execute queries against in-memory arrow data
+/// * [read_memory_macro]: execute queries against in-memory arrow data using macro
 ///
 /// # Writing out to local storage
 ///
@@ -53,12 +55,7 @@ use tempfile::tempdir;
 /// * [where_scalar_subquery]: execute a scalar subquery
 /// * [where_in_subquery]: execute a subquery with an IN clause
 /// * [where_exist_subquery]: execute a subquery with an EXISTS clause
-///
-/// # Querying data
-///
-/// * [query_to_date]: execute queries against parquet files
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn dataframe_example() -> Result<()> {
     env_logger::init();
     // The SessionContext is the main high level API for interacting with DataFusion
     let ctx = SessionContext::new();
@@ -199,7 +196,7 @@ async fn read_memory_macro() -> Result<()> {
 /// 2. Write out a DataFrame to a parquet file
 /// 3. Write out a DataFrame to a csv file
 /// 4. Write out a DataFrame to a json file
-async fn write_out(ctx: &SessionContext) -> std::result::Result<(), DataFusionError> {
+async fn write_out(ctx: &SessionContext) -> Result<()> {
     let array = StringViewArray::from(vec!["a", "b", "c"]);
     let schema = Arc::new(Schema::new(vec![Field::new(
         "tablecol1",
````

datafusion-examples/examples/deserialize_to_struct.rs renamed to datafusion-examples/examples/dataframe/deserialize_to_struct.rs

Lines changed: 3 additions & 2 deletions

````diff
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! See `main.rs` for how to run it.
+
 use arrow::array::{AsArray, PrimitiveArray};
 use arrow::datatypes::{Float64Type, Int32Type};
 use datafusion::common::assert_batches_eq;
@@ -29,8 +31,7 @@ use futures::StreamExt;
 /// as [ArrayRef]
 ///
 /// [ArrayRef]: arrow::array::ArrayRef
-#[tokio::main]
-async fn main() -> Result<()> {
+pub async fn deserialize_to_struct() -> Result<()> {
     // Run a query that returns two columns of data
     let ctx = SessionContext::new();
     let testdata = datafusion::test_util::parquet_test_data();
````
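This is the same mechanical conversion as in `dataframe.rs`: drop `#[tokio::main]`, rename `main` to a `pub async fn` named after the example, and let the new dispatcher in `main.rs` await it.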
datafusion-examples/examples/dataframe/main.rs (new file)

Lines changed: 95 additions & 0 deletions

````rust
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! # These are core DataFrame API usage
//!
//! These examples demonstrate core DataFrame API usage.
//!
//! ## Usage
//! ```bash
//! cargo run --example dataframe -- [dataframe|deserialize_to_struct]
//! ```
//!
//! Each subcommand runs a corresponding example:
//! - `dataframe` — run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries
//! - `deserialize_to_struct` — convert query results (Arrow ArrayRefs) into Rust structs

mod dataframe;
mod deserialize_to_struct;

use std::str::FromStr;

use datafusion::error::{DataFusionError, Result};

enum ExampleKind {
    Dataframe,
    DeserializeToStruct,
}

impl AsRef<str> for ExampleKind {
    fn as_ref(&self) -> &str {
        match self {
            Self::Dataframe => "dataframe",
            Self::DeserializeToStruct => "deserialize_to_struct",
        }
    }
}

impl FromStr for ExampleKind {
    type Err = DataFusionError;

    fn from_str(s: &str) -> Result<Self> {
        match s {
            "dataframe" => Ok(Self::Dataframe),
            "deserialize_to_struct" => Ok(Self::DeserializeToStruct),
            _ => Err(DataFusionError::Execution(format!("Unknown example: {s}"))),
        }
    }
}

impl ExampleKind {
    const ALL: [Self; 2] = [Self::Dataframe, Self::DeserializeToStruct];

    const EXAMPLE_NAME: &str = "dataframe";

    fn variants() -> Vec<&'static str> {
        Self::ALL.iter().map(|x| x.as_ref()).collect()
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    let usage = format!(
        "Usage: cargo run --example {} -- [{}]",
        ExampleKind::EXAMPLE_NAME,
        ExampleKind::variants().join("|")
    );

    let arg = std::env::args().nth(1).ok_or_else(|| {
        eprintln!("{usage}");
        DataFusionError::Execution("Missing argument".to_string())
    })?;

    match arg.parse::<ExampleKind>()? {
        ExampleKind::Dataframe => dataframe::dataframe_example().await?,
        ExampleKind::DeserializeToStruct => {
            deserialize_to_struct::deserialize_to_struct().await?
        }
    }

    Ok(())
}
````
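With this dispatcher in place, `cargo run --example dataframe -- dataframe` runs the consolidated DataFrame walkthrough and `cargo run --example dataframe -- deserialize_to_struct` runs the struct-deserialization example; a missing argument prints the usage string, and both missing and unknown arguments surface as `DataFusionError::Execution` rather than panics.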
