
Commit d9f1f31

Update DataFusion architecture documentation
1 parent 9798fbc commit d9f1f31

File tree

4 files changed

+166
-78
lines changed


datafusion/core/src/datasource/empty.rs

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
//! An empty plan that is usefull for testing and generating plans without mapping them to actual data.
18+
//! [`EmptyTable`], useful for testing.
1919
2020
use std::any::Any;
2121
use std::sync::Arc;
@@ -30,7 +30,8 @@ use crate::logical_expr::Expr;
3030
use crate::physical_plan::project_schema;
3131
use crate::physical_plan::{empty::EmptyExec, ExecutionPlan};
3232

33-
/// A table with a schema but no data.
33+
/// An empty plan that is useful for testing and generating plans
34+
/// without mapping them to actual data.
3435
pub struct EmptyTable {
3536
schema: SchemaRef,
3637
partitions: usize,

datafusion/core/src/datasource/memory.rs

Lines changed: 5 additions & 4 deletions
@@ -15,9 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
//! In-memory data source for presenting a `Vec<RecordBatch>` as a data source that can be
19-
//! queried by DataFusion. This allows data to be pre-loaded into memory and then
20-
//! repeatedly queried without incurring additional file I/O overhead.
18+
//! [`MemTable`] for querying `Vec<RecordBatch>` by DataFusion.
2119
2220
use futures::{StreamExt, TryStreamExt};
2321
use std::any::Any;
@@ -41,7 +39,10 @@ use crate::physical_plan::memory::MemoryExec;
4139
use crate::physical_plan::ExecutionPlan;
4240
use crate::physical_plan::{repartition::RepartitionExec, Partitioning};
4341

44-
/// In-memory table
42+
/// In-memory data source for presenting a `Vec<RecordBatch>` as a
43+
/// data source that can be queried by DataFusion. This allows data to
44+
/// be pre-loaded into memory and then repeatedly queried without
45+
/// incurring additional file I/O overhead.
4546
#[derive(Debug)]
4647
pub struct MemTable {
4748
schema: SchemaRef,

datafusion/core/src/lib.rs

Lines changed: 157 additions & 71 deletions
@@ -26,15 +26,16 @@
2626
//! multi-threaded, vectorized execution engine, and partitioned data
2727
//! sources (Parquet, CSV, JSON, and Avro).
2828
//!
29-
//! DataFusion can also be easily customized to support additional
30-
//! data sources, query languages, functions, custom operators and
31-
//! more.
29+
//! DataFusion is designed for easy customization, such as supporting
30+
//! additional data sources, query languages, functions, custom
31+
//! operators and more. See the [Architecture] section for more details.
3232
//!
3333
//! [DataFusion]: https://arrow.apache.org/datafusion/
3434
//! [Apache Arrow]: https://arrow.apache.org
3535
//! [use cases]: https://arrow.apache.org/datafusion/user-guide/introduction.html#use-cases
3636
//! [SQL]: https://arrow.apache.org/datafusion/user-guide/sql/index.html
3737
//! [`DataFrame`]: dataframe::DataFrame
38+
//! [Architecture]: #architecture
3839
//!
3940
//! # Examples
4041
//!
@@ -150,9 +151,13 @@
150151
//! [`AggregateUDF`]: physical_plan::udaf::AggregateUDF
151152
//! [`QueryPlanner`]: execution::context::QueryPlanner
152153
//! [`OptimizerRule`]: datafusion_optimizer::optimizer::OptimizerRule
153-
//! [`PhysicalOptimizerRule`]: datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule
154+
//! [`PhysicalOptimizerRule`]: crate::physical_optimizer::optimizer::PhysicalOptimizerRule
155+
//!
156+
//! # Architecture
154157
//!
155-
//! # Code Organization
158+
//! <!-- NOTE: The goal of this section is to provide a high level
159+
//! overview of how DataFusion is organized and then link to other
160+
//! sections of the docs with more details -->
156161
//!
157162
//! ## Overview Presentations
158163
//!
@@ -168,104 +173,185 @@
168173
//! - [March 2021]: The DataFusion architecture is described in _Query Engine Design and the Rust-Based DataFusion in Apache Arrow_: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts [~ 15 minutes in](https://www.youtube.com/watch?v=K6eCAVEk4kU&t=875s)) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934)
169174
//! - [February 2021]: How DataFusion is used within the Ballista Project is described in _Ballista: Distributed Compute with Rust and Apache Arrow_: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ)
170175
//!
171-
//! ## Architecture
176+
//! ## Query Planning and Execution Overview
177+
//!
178+
//! ### SQL
179+
//!
180+
//! 1. The query string is parsed to an Abstract Syntax Tree (AST)
181+
//! using [sqlparser].
182+
//!
183+
//! 2. The AST is converted to a [`LogicalPlan`] and logical
184+
//! expressions ([`Expr`]s) that compute the desired result, by the
185+
//! [`SqlToRel`] planner.
186+
//!
187+
//! 3. The [`LogicalPlan`] is checked and rewritten to enforce
188+
//! semantic rules, such as type coercion, by [`AnalyzerRule`]s.
189+
//!
190+
//! 4. The [`LogicalPlan`] is rewritten by [`OptimizerRule`]s, such as
191+
//! projection and filter pushdown, to improve its efficiency.
192+
//!
193+
//! 5. The [`LogicalPlan`] is converted to an [`ExecutionPlan`] by a
194+
//! [`PhysicalPlanner`].
195+
//!
196+
//! 6. The [`ExecutionPlan`] is rewritten by
197+
//! [`PhysicalOptimizerRule`]s, such as sort and join selection, to
198+
//! improve its efficiency.
199+
//!
200+
//! 7. The [`ExecutionPlan`] is executed, producing one or more
201+
//! [`RecordBatchStream`]s, which yield Arrow [`RecordBatch`]es.
202+
//!
203+
//! To process large datasets with many rows as efficiently as
204+
//! possible, DataFusion expends significant effort in steps 1-6,
205+
//! which are done once per plan.
172206
//!
173-
//! DataFusion is a fully fledged query engine capable of performing complex operations.
174-
//! Specifically, when DataFusion receives an SQL query, there are different steps
175-
//! that it passes through until a result is obtained. Broadly, they are:
207+
//! ### DataFrame
176208
//!
177-
//! 1. The string is parsed to an Abstract syntax tree (AST) using [sqlparser].
178-
//! 2. The planner [`SqlToRel`] converts logical expressions on the AST to logical expressions [`Expr`]s.
179-
//! 3. The planner [`SqlToRel`] converts logical nodes on the AST to a [`LogicalPlan`].
180-
//! 4. [`OptimizerRule`]s are applied to the [`LogicalPlan`] to optimize it.
181-
//! 5. The [`LogicalPlan`] is converted to an [`ExecutionPlan`] by a [`PhysicalPlanner`]
182-
//! 6. The [`ExecutionPlan`]is executed against data through the [`SessionContext`]
209+
//! When executing plans using the [`DataFrame`] API, the process is
210+
//! identical to that of SQL, except that steps 1 and 2 are
211+
//! omitted. Instead, the DataFrame API builds the [`LogicalPlan`]
212+
//! directly using [`LogicalPlanBuilder`]. Systems that have their own
213+
//! custom query languages also typically build the desired
214+
//! [`LogicalPlan`] directly.
183215
//!
184-
//! With the [`DataFrame`] API, steps 1-3 are not used as the DataFrame builds the [`LogicalPlan`] directly.
216+
//! ## Data Sources
185217
//!
186-
//! Phases 1-5 are typically cheap when compared to phase 6, and thus DataFusion puts a
187-
//! lot of effort to ensure that phase 6 runs efficiently and without errors.
218+
//! DataFusion includes several built-in data sources for common use
219+
//! cases, and can be extended with any source that implements the
220+
//! [`TableProvider`] trait and can provide a stream of
221+
//! [`RecordBatch`]es.
188222
//!
189-
//! DataFusion's planning is divided in two main parts: logical planning and physical planning.
223+
//! 1. [`ListingTable`]: Read data from Parquet, JSON, CSV, or AVRO
224+
//! files. Supports single files or multiple files with Hive-style
225+
//! partitioning, optional compression, directly reading from remote
226+
//! object store and more.
190227
//!
191-
//! ### Logical planning
228+
//! 2. [`MemTable`]: Reads data from in-memory [`RecordBatch`]es.
229+
//!
230+
//! 3. [`StreamingTable`]: Reads data from potentially unbounded inputs.
231+
//!
232+
//! [`ListingTable`]: crate::datasource::listing::ListingTable
233+
//! [`MemTable`]: crate::datasource::memory::MemTable
234+
//! [`StreamingTable`]: crate::datasource::streaming::StreamingTable
235+
//!
236+
//! ## Plans
237+
//!
238+
//! ### [`LogicalPlan`]
192239
//!
193240
//! Logical planning yields [`LogicalPlan`]s and logical [`Expr`]
194-
//! expressions which are [`Schema`]aware and represent statements
241+
//! expressions which are [`Schema`] aware and represent statements
195242
//! whose result is independent of how it should physically be
196243
//! executed.
197244
//!
198245
//! A [`LogicalPlan`] is a Directed Acyclic Graph (DAG) of other
199246
//! [`LogicalPlan`]s, and each node contains [`Expr`]s. All of these
200247
//! are located in [`datafusion_expr`] module.
201248
//!
202-
//! ### Physical planning
249+
//! ### [`ExecutionPlan`] / Physical Plans
203250
//!
204251
//! An [`ExecutionPlan`] (sometimes referred to as a "physical plan")
205252
//! is a plan that can be executed against data. Compared to a
206-
//! logical plan, the physical plan has concrete information about how
207-
//! calculations should be performed (e.g. what Rust functions are
208-
//! used) and how data should be loaded into memory.
209-
//!
210-
//! [`ExecutionPlan`]s uses the [Apache Arrow] format as its in-memory
211-
//! representation of data, through the [arrow] crate. The [arrow]
212-
//! crate documents how the memory is physically represented.
213-
//!
214-
//! A [`ExecutionPlan`] is composed by nodes (which each implement the
215-
//! [`ExecutionPlan`] trait). Each node can contain physical
216-
//! expressions ([`PhysicalExpr`]) or aggreagate expressions
217-
//! ([`AggregateExpr`]). All of these are located in the
218-
//! [`physical_plan`] module.
219-
//!
220-
//! Broadly speaking,
221-
//!
222-
//! * an [`ExecutionPlan`] receives a partition number and
223-
//! asynchronously returns an iterator over [`RecordBatch`] (a
224-
//! node-specific struct that implements [`RecordBatchReader`])
225-
//! * a [`PhysicalExpr`] receives a [`RecordBatch`]
226-
//! and returns an [`Array`]
227-
//! * an [`AggregateExpr`] receives a series of [`RecordBatch`]es
228-
//! and returns a [`RecordBatch`] of a single row(*)
229-
//!
230-
//! (*) Technically, it aggregates the results on each partition and then merges the results into a single partition.
231-
//!
232-
//! The following physical nodes are currently implemented:
233-
//!
234-
//! * Projection: [`ProjectionExec`](physical_plan::projection::ProjectionExec)
235-
//! * Filter: [`FilterExec`](physical_plan::filter::FilterExec)
236-
//! * Grouped and non-grouped aggregations: [`AggregateExec`](physical_plan::aggregates::AggregateExec)
237-
//! * Hash Join: [`HashJoinExec`](physical_plan::joins::HashJoinExec)
238-
//! * Cross Join: [`CrossJoinExec`](physical_plan::joins::CrossJoinExec)
239-
//! * Sort Merge Join: [`SortMergeJoinExec`](physical_plan::joins::SortMergeJoinExec)
240-
//! * Union: [`UnionExec`](physical_plan::union::UnionExec)
241-
//! * Sort: [`SortExec`](physical_plan::sorts::sort::SortExec)
242-
//! * Coalesce partitions: [`CoalescePartitionsExec`](physical_plan::coalesce_partitions::CoalescePartitionsExec)
243-
//! * Limit: [`LocalLimitExec`](physical_plan::limit::LocalLimitExec) and [`GlobalLimitExec`](physical_plan::limit::GlobalLimitExec)
244-
//! * Scan CSV: [`CsvExec`](physical_plan::file_format::CsvExec)
245-
//! * Scan Parquet: [`ParquetExec`](physical_plan::file_format::ParquetExec)
246-
//! * Scan Avro: [`AvroExec`](physical_plan::file_format::AvroExec)
247-
//! * Scan newline-delimited JSON: [`NdJsonExec`](physical_plan::file_format::NdJsonExec)
248-
//! * Scan from memory: [`MemoryExec`](physical_plan::memory::MemoryExec)
249-
//! * Explain the plan: [`ExplainExec`](physical_plan::explain::ExplainExec)
250-
//!
251-
//! Future topics (coming soon):
252-
//! * Analyzer Rules
253-
//! * Resource management (memory and disk)
253+
//! [`LogicalPlan`], an [`ExecutionPlan`] has concrete information
254+
//! about how to perform calculations (e.g. algorithms to use), and
255+
//! how data flows during execution (e.g. partitioning and
256+
//! sortedness). It is a DAG of other [`ExecutionPlan`]s
257+
//! and each node can contain:
258+
//!
259+
//! 1. [`PhysicalExpr`]: Scalar functions
260+
//!
261+
//! 2. [`AggregateExpr`]: Aggregate functions
262+
//!
263+
//! 3. [`WindowExpr`]: Window functions
264+
//!
265+
//! [`PhysicalExpr`]: crate::physical_plan::PhysicalExpr
266+
//! [`AggregateExpr`]: crate::physical_plan::AggregateExpr
267+
//! [`WindowExpr`]: crate::physical_plan::WindowExpr
268+
//!
269+
//!
270+
//! ### Execution
271+
//!
272+
//! [`ExecutionPlan`]s process data using the [Apache Arrow] memory
273+
//! format, largely with functions from the [arrow] crate. Values are
274+
//! represented with [`ColumnarValue`], which are either:
275+
//!
276+
//! [`ScalarValue`]: single constant values
277+
//! [`ArrayRef`]: Arrow arrays
278+
//!
279+
//! [`ColumnarValue`]: datafusion_expr::ColumnarValue
280+
//! [`ScalarValue`]: crate::scalar::ScalarValue
281+
//! [`ArrayRef`]: arrow::array::ArrayRef
282+
//!
283+
//!
284+
//! See the [implementors of `ExecutionPlan`] for a list of the physical nodes that are currently implemented.
285+
//!
286+
//! [implementors of `ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#implementors
287+
//!
288+
//! ## State Management and Configuration
289+
//!
290+
//! ### Configuration
291+
//!
292+
//! [`ConfigOptions`] contain options to control DataFusion's
293+
//! execution.
294+
//!
295+
//! [`ConfigOptions`]: datafusion_common::config::ConfigOptions
296+
//!
297+
//! ### State Management
298+
//!
299+
//! The state required to execute queries is managed by several structures:
300+
//!
301+
//! 1. [`SessionContext`]: State needed to create logical plans, such
302+
//! as table definitions and function registries. In general,
303+
//! a [`SessionContext`] is required to create [`LogicalPlan`]s.
304+
//!
305+
//! 2. [`TaskContext`]: State needed for execution such as the
306+
//! [`MemoryPool`], [`DiskManager`], and [`ObjectStoreRegistry`].
307+
//!
308+
//! 3. [`ExecutionProps`]: Per-execution properties and data (such as
309+
//! starting timestamps, etc).
310+
//!
311+
//! [`SessionContext`]: crate::execution::context::SessionContext
312+
//! [`TaskContext`]: crate::execution::context::TaskContext
313+
//! [`ExecutionProps`]: crate::execution::context::ExecutionProps
314+
//!
315+
//! ### Resource Management
316+
//!
317+
//! The amount of memory and temporary local disk space used by
318+
//! DataFusion when running a plan can be controlled using the
319+
//! [`MemoryPool`] and [`DiskManager`].
320+
//!
321+
//! [`DiskManager`]: crate::execution::DiskManager
322+
//! [`MemoryPool`]: crate::execution::memory_pool::MemoryPool
323+
//! [`ObjectStoreRegistry`]: crate::datasource::object_store::ObjectStoreRegistry
324+
//!
325+
//! ## Crate Organization
326+
//!
327+
//! DataFusion is organized into multiple crates to enforce modularity
328+
//! and improve compilation times. The crates are:
329+
//!
330+
//! * [datafusion_common]: Common traits and types
331+
//! * [datafusion_execution]: State needed for execution
332+
//! * [datafusion_expr]: [`LogicalPlan`], [`Expr`] and related logical planning structures
333+
//! * [datafusion-jit]: Just-In-Time (JIT) compilation support
334+
//! * [datafusion_optimizer]: [`OptimizerRule`]s and [`AnalyzerRule`]s
335+
//! * [datafusion-physical-expr]: [`PhysicalExpr`] and related expressions
336+
//! * [datafusion-row]: Specialized row format
337+
//! * [datafusion_sql]: [`SqlToRel`] SQL planner
254338
//!
255339
//! [sqlparser]: https://docs.rs/sqlparser/latest/sqlparser
256340
//! [`SqlToRel`]: sql::planner::SqlToRel
257341
//! [`Expr`]: datafusion_expr::Expr
258342
//! [`LogicalPlan`]: datafusion_expr::LogicalPlan
343+
//! [`AnalyzerRule`]: datafusion_optimizer::analyzer::AnalyzerRule
259344
//! [`OptimizerRule`]: optimizer::optimizer::OptimizerRule
260345
//! [`ExecutionPlan`]: physical_plan::ExecutionPlan
261346
//! [`PhysicalPlanner`]: physical_plan::PhysicalPlanner
347+
//! [`PhysicalOptimizerRule`]: datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule
262348
//! [`Schema`]: arrow::datatypes::Schema
263-
//! [`datafusion_expr`]: datafusion_expr
264349
//! [`PhysicalExpr`]: physical_plan::PhysicalExpr
265350
//! [`AggregateExpr`]: physical_plan::AggregateExpr
266351
//! [`RecordBatch`]: arrow::record_batch::RecordBatch
267352
//! [`RecordBatchReader`]: arrow::record_batch::RecordBatchReader
268353
//! [`Array`]: arrow::array::Array
354+
//! [`RecordBatchStream`]: crate::physical_plan::SendableRecordBatchStream
269355
270356
/// DataFusion crate version
271357
pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION");

docs/source/contributor-guide/architecture.md

Lines changed: 1 addition & 1 deletion
@@ -23,5 +23,5 @@ DataFusion's code structure and organization is described in the
2323
[crates.io documentation], to keep it as close to the source as
2424
possible. You can find the most up to date version in the [source code].
2525

26-
[crates.io documentation]: https://docs.rs/datafusion/latest/datafusion/index.html#code-organization
26+
[crates.io documentation]: https://docs.rs/datafusion/latest/datafusion/index.html#architecture
2727
[source code]: https://github.com/apache/arrow-datafusion/blob/main/datafusion/core/src/lib.rs
