Skip to content

Commit 8d8732c

Browse files
XiangpengHao and alamb
authored
Add a config to force using string view in benchmark (#11514)
* add a knob to force string view in benchmark
* fix sql logic test
* update doc
* fix ci
* fix ci only test
* Update benchmarks/src/util/options.rs

  Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
* Update datafusion/common/src/config.rs

  Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
* update tests

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 2c808fb commit 8d8732c

File tree

8 files changed

+44
-3
lines changed

8 files changed

+44
-3
lines changed

Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,6 @@ large_futures = "warn"
154154
[workspace.lints.rust]
155155
unused_imports = "deny"
156156

157-
158157
## Temporary arrow-rs patch until 52.2.0 is released
159158

160159
[patch.crates-io]

benchmarks/src/clickbench.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ impl RunOpt {
116116
None => queries.min_query_id()..=queries.max_query_id(),
117117
};
118118

119-
let config = self.common.config();
119+
let mut config = self.common.config();
120+
config.options_mut().execution.schema_force_string_view = self.common.string_view;
121+
120122
let ctx = SessionContext::new_with_config(config);
121123
self.register_hits(&ctx).await?;
122124

benchmarks/src/tpch/run.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ impl RunOpt {
120120
.config()
121121
.with_collect_statistics(!self.disable_statistics);
122122
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
123+
config.options_mut().execution.schema_force_string_view = self.common.string_view;
123124
let ctx = SessionContext::new_with_config(config);
124125

125126
// register tables
@@ -339,6 +340,7 @@ mod tests {
339340
partitions: Some(2),
340341
batch_size: 8192,
341342
debug: false,
343+
string_view: false,
342344
};
343345
let opt = RunOpt {
344346
query: Some(query),
@@ -372,6 +374,7 @@ mod tests {
372374
partitions: Some(2),
373375
batch_size: 8192,
374376
debug: false,
377+
string_view: false,
375378
};
376379
let opt = RunOpt {
377380
query: Some(query),

benchmarks/src/util/options.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ pub struct CommonOpt {
3737
/// Activate debug mode to see more details
3838
#[structopt(short, long)]
3939
pub debug: bool,
40+
41+
/// If true, will use StringViewArray/BinaryViewArray instead of StringArray/BinaryArray
42+
/// when reading Parquet files
43+
#[structopt(long)]
44+
pub string_view: bool,
4045
}
4146

4247
impl CommonOpt {

datafusion/common/src/config.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,10 @@ config_namespace! {
311311

312312
/// Should DataFusion keep the columns used for partition_by in the output RecordBatches
313313
pub keep_partition_by_columns: bool, default = false
314+
315+
/// If true, listing tables will read columns of `Utf8/LargeUtf8` with `Utf8View`,
316+
/// and `Binary/LargeBinary` with `BinaryView`.
317+
pub schema_force_string_view: bool, default = false
314318
}
315319
}
316320

datafusion/core/src/datasource/listing/table.rs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,32 @@ impl ListingOptions {
410410
.try_collect()
411411
.await?;
412412

413-
self.format.infer_schema(state, &store, &files).await
413+
let mut schema = self.format.infer_schema(state, &store, &files).await?;
414+
415+
if state.config_options().execution.schema_force_string_view {
416+
let transformed_fields: Vec<Arc<Field>> = schema
417+
.fields
418+
.iter()
419+
.map(|field| match field.data_type() {
420+
DataType::Utf8 | DataType::LargeUtf8 => Arc::new(Field::new(
421+
field.name(),
422+
DataType::Utf8View,
423+
field.is_nullable(),
424+
)),
425+
DataType::Binary | DataType::LargeBinary => Arc::new(Field::new(
426+
field.name(),
427+
DataType::BinaryView,
428+
field.is_nullable(),
429+
)),
430+
_ => field.clone(),
431+
})
432+
.collect();
433+
schema = Arc::new(Schema::new_with_metadata(
434+
transformed_fields,
435+
schema.metadata.clone(),
436+
));
437+
}
438+
Ok(schema)
414439
}
415440

416441
/// Infers the partition columns stored in `LOCATION` and compares

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ datafusion.execution.parquet.statistics_enabled NULL
205205
datafusion.execution.parquet.write_batch_size 1024
206206
datafusion.execution.parquet.writer_version 1.0
207207
datafusion.execution.planning_concurrency 13
208+
datafusion.execution.schema_force_string_view false
208209
datafusion.execution.soft_max_rows_per_output_file 50000000
209210
datafusion.execution.sort_in_place_threshold_bytes 1048576
210211
datafusion.execution.sort_spill_reservation_bytes 10485760
@@ -289,6 +290,7 @@ datafusion.execution.parquet.statistics_enabled NULL Sets if statistics are enab
289290
datafusion.execution.parquet.write_batch_size 1024 Sets write_batch_size in bytes
290291
datafusion.execution.parquet.writer_version 1.0 Sets parquet writer version valid values are "1.0" and "2.0"
291292
datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system
293+
datafusion.execution.schema_force_string_view false If true, listing tables will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
292294
datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max
293295
datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged.
294296
datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured).

docs/source/user-guide/configs.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
8787
| datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs |
8888
| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental |
8989
| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches |
90+
| datafusion.execution.schema_force_string_view | false | If true, listing tables will read columns of `Utf8/LargeUtf8` with `Utf8View`, and `Binary/LargeBinary` with `BinaryView`. |
9091
| datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. |
9192
| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores |
9293
| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible |

0 commit comments

Comments
 (0)