Skip to content

Commit d43ddf1

Browse files
blaginin and AdamGS authored
[branch-48] Set the default value of datafusion.execution.collect_statistics to true #16447 (#16659)
* Set the default value of `datafusion.execution.collect_statistics` to `true` (#16447) * fix sqllogicaltests * Add upgrade note (cherry picked from commit 2d7ae09) * Update row group pruning --------- Co-authored-by: Adam Gutglick <adam@spiraldb.com>
1 parent 7b31676 commit d43ddf1

File tree

10 files changed

+224
-295
lines changed

10 files changed

+224
-295
lines changed

datafusion/common/src/config.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,8 +294,8 @@ config_namespace! {
294294

295295
/// Should DataFusion collect statistics when first creating a table.
296296
/// Has no effect after the table is created. Applies to the default
297-
/// `ListingTableProvider` in DataFusion. Defaults to false.
298-
pub collect_statistics: bool, default = false
297+
/// `ListingTableProvider` in DataFusion. Defaults to true.
298+
pub collect_statistics: bool, default = true
299299

300300
/// Number of partitions for query execution. Increasing partitions can increase
301301
/// concurrency.

datafusion/core/src/execution/context/parquet.rs

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,12 @@ impl SessionContext {
3434
///
3535
/// # Note: Statistics
3636
///
37-
/// NOTE: by default, statistics are not collected when reading the Parquet
38-
/// files as this can slow down the initial DataFrame creation. However,
39-
/// collecting statistics can greatly accelerate queries with certain
40-
/// filters.
37+
/// NOTE: by default, statistics are collected when reading the Parquet
38+
/// files. This can slow down the initial DataFrame creation while
39+
/// greatly accelerating queries with certain filters.
4140
///
42-
/// To enable collect statistics, set the [config option]
43-
/// `datafusion.execution.collect_statistics` to `true`. See
41+
/// To disable statistics collection, set the [config option]
42+
/// `datafusion.execution.collect_statistics` to `false`. See
4443
/// [`ConfigOptions`] and [`ExecutionOptions::collect_statistics`] for more
4544
/// details.
4645
///
@@ -171,28 +170,28 @@ mod tests {
171170

172171
#[tokio::test]
173172
async fn register_parquet_respects_collect_statistics_config() -> Result<()> {
174-
// The default is false
173+
// The default is true
175174
let mut config = SessionConfig::new();
176175
config.options_mut().explain.physical_plan_only = true;
177176
config.options_mut().explain.show_statistics = true;
178177
let content = explain_query_all_with_config(config).await?;
179-
assert_contains!(content, "statistics=[Rows=Absent,");
178+
assert_contains!(content, "statistics=[Rows=Exact(");
180179

181-
// Explicitly set to false
180+
// Explicitly set to true
182181
let mut config = SessionConfig::new();
183182
config.options_mut().explain.physical_plan_only = true;
184183
config.options_mut().explain.show_statistics = true;
185-
config.options_mut().execution.collect_statistics = false;
184+
config.options_mut().execution.collect_statistics = true;
186185
let content = explain_query_all_with_config(config).await?;
187-
assert_contains!(content, "statistics=[Rows=Absent,");
186+
assert_contains!(content, "statistics=[Rows=Exact(");
188187

189-
// Explicitly set to true
188+
// Explicitly set to false
190189
let mut config = SessionConfig::new();
191190
config.options_mut().explain.physical_plan_only = true;
192191
config.options_mut().explain.show_statistics = true;
193-
config.options_mut().execution.collect_statistics = true;
192+
config.options_mut().execution.collect_statistics = false;
194193
let content = explain_query_all_with_config(config).await?;
195-
assert_contains!(content, "statistics=[Rows=Exact(10),");
194+
assert_contains!(content, "statistics=[Rows=Absent,");
196195

197196
Ok(())
198197
}

datafusion/sqllogictest/test_files/explain_tree.slt

Lines changed: 192 additions & 261 deletions
Large diffs are not rendered by default.

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ datafusion.catalog.location NULL
216216
datafusion.catalog.newlines_in_values false
217217
datafusion.execution.batch_size 8192
218218
datafusion.execution.coalesce_batches true
219-
datafusion.execution.collect_statistics false
219+
datafusion.execution.collect_statistics true
220220
datafusion.execution.enable_recursive_ctes true
221221
datafusion.execution.enforce_batch_size_in_joins false
222222
datafusion.execution.keep_partition_by_columns false
@@ -326,7 +326,7 @@ datafusion.catalog.location NULL Location scanned to load tables for `default` s
326326
datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance.
327327
datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption
328328
datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting
329-
datafusion.execution.collect_statistics false Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to false.
329+
datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true.
330330
datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs
331331
datafusion.execution.enforce_batch_size_in_joins false Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower.
332332
datafusion.execution.keep_partition_by_columns false Should DataFusion keep the columns used for partition_by in the output RecordBatches

datafusion/sqllogictest/test_files/limit.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,7 @@ physical_plan
854854
02)--SortExec: TopK(fetch=1000), expr=[part_key@1 ASC NULLS LAST], preserve_partitioning=[false]
855855
03)----ProjectionExec: expr=[1 as foo, part_key@0 as part_key]
856856
04)------CoalescePartitionsExec: fetch=1
857-
05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:0..794]]}, projection=[part_key], limit=1, file_type=parquet
857+
05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:0..265], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:265..530], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:530..794]]}, projection=[part_key], limit=1, file_type=parquet
858858

859859
query I
860860
with selection as (

datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,7 @@ logical_plan
335335
physical_plan
336336
01)CoalesceBatchesExec: target_batch_size=8192
337337
02)--FilterExec: val@0 != part@1
338-
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3
339-
04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet
338+
03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet
340339

341340
# If we reference only a partition column it gets evaluted during the listing phase
342341
query TT

datafusion/sqllogictest/test_files/parquet_statistics.slt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ statement ok
4646
set datafusion.explain.show_statistics = true;
4747

4848
######
49-
# By default, the statistics are not gathered
49+
# By default, the statistics are gathered
5050
######
5151

5252
# Recreate the table to pick up the current setting
@@ -59,18 +59,18 @@ query TT
5959
EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
6060
----
6161
physical_plan
62-
01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
63-
02)--FilterExec: column1@0 = 1, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)))]]
64-
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
62+
01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
63+
02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
64+
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
6565
04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
66-
05), statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
66+
05), statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
6767

6868
# cleanup
6969
statement ok
7070
DROP TABLE test_table;
7171

7272
######
73-
# When the setting is true, the statistics are gathered
73+
# When the setting is true, statistics are gathered
7474
######
7575

7676
statement ok

datafusion/sqllogictest/test_files/repartition.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ physical_plan
4646
01)AggregateExec: mode=FinalPartitioned, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
4747
02)--CoalesceBatchesExec: target_batch_size=8192
4848
03)----RepartitionExec: partitioning=Hash([column1@0], 4), input_partitions=4
49-
04)------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
50-
05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
49+
04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
50+
05)--------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
5151
06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition/parquet_table/2.parquet]]}, projection=[column1, column2], file_type=parquet
5252

5353
# disable round robin repartitioning

docs/source/user-guide/configs.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
4747
| datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. |
4848
| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption |
4949
| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting |
50-
| datafusion.execution.collect_statistics | false | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to false. |
50+
| datafusion.execution.collect_statistics | true | Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. |
5151
| datafusion.execution.target_partitions | 0 | Number of partitions for query execution. Increasing partitions can increase concurrency. Defaults to the number of CPU cores on the system |
5252
| datafusion.execution.time_zone | +00:00 | The default time zone Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this time zone, and then extract the hour |
5353
| datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. |

docs/source/user-guide/sql/ddl.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,14 +95,14 @@ LOCATION '/mnt/nyctaxi/tripdata.parquet';
9595

9696
:::{note}
9797
Statistics
98-
: By default, when a table is created, DataFusion will _NOT_ read the files
98+
: By default, when a table is created, DataFusion will read the files
9999
to gather statistics, which can be expensive but can accelerate subsequent
100-
queries substantially. If you want to gather statistics
100+
queries substantially. If you don't want to gather statistics
101101
when creating a table, set the `datafusion.execution.collect_statistics`
102-
configuration option to `true` before creating the table. For example:
102+
configuration option to `false` before creating the table. For example:
103103

104104
```sql
105-
SET datafusion.execution.collect_statistics = true;
105+
SET datafusion.execution.collect_statistics = false;
106106
```
107107

108108
See the [config settings docs](../configs.md) for more details.

0 commit comments

Comments
 (0)