Skip to content

Update arrow_reader_row_filter benchmark to reflect ClickBench distribution #7461

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 15, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 34 additions & 10 deletions parquet/benches/arrow_reader_row_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ use arrow::compute::and;
use arrow::compute::kernels::cmp::{eq, gt, lt, neq};
use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
use arrow::record_batch::RecordBatch;
use arrow_array::builder::StringViewBuilder;
use arrow_array::builder::{ArrayBuilder, StringViewBuilder};
use arrow_array::StringViewArray;
use arrow_cast::pretty::pretty_format_batches;
use bytes::Bytes;
Expand Down Expand Up @@ -109,18 +109,42 @@ fn create_float64_array(size: usize) -> ArrayRef {
}

/// Creates a `Utf8View` array of a given size with random strings.
/// Now, this column is used in one filter case.
fn create_utf8_view_array(size: usize, null_density: f32) -> ArrayRef {
///
/// This is modeled after the "SearchPhrase" column in the ClickBench benchmark.
///
/// See <https://github.com/apache/arrow-rs/issues/7460> for calculations.
///
/// The important ClickBench data properties are:
/// * Selectivity is: 13172392 / 99997497 = 0.132
/// * Number of RowSelections = 14054784
/// * Average run length of each RowSelection: 99997497 / 14054784 = 7.114
///
/// The properties of this array are:
/// * Selectivity is: 15144 / 100000 = 0.15144
/// * Number of RowSelections = 12904
/// * Average run length of each RowSelection: 100000 / 12904 = 7.75
fn create_utf8_view_array(size: usize) -> ArrayRef {
const AVG_RUN_LENGTH: usize = 4; // exclusive upper bound on the length of each run of empty/non-empty strings (runs are 1..AVG_RUN_LENGTH rows long)
const EMPTY_DENSITY: u32 = 85; // percent chance that each run is an empty string

let mut builder = StringViewBuilder::with_capacity(size);
let mut rng = StdRng::seed_from_u64(44);
for _ in 0..size {
while builder.len() < size {
let mut run_length = rng.random_range(1..AVG_RUN_LENGTH);
if builder.len() + run_length > size {
// cap the final run so the array ends up with exactly `size` rows
run_length = size - builder.len();
}

let choice = rng.random_range(0..100);
if choice < (null_density * 100.0) as u32 {
builder.append_value("");
} else if choice < 25 {
builder.append_value("const");
if choice < EMPTY_DENSITY {
for _ in 0..run_length {
builder.append_value("");
}
} else {
builder.append_value(random_string(&mut rng));
for _ in 0..run_length {
builder.append_value(random_string(&mut rng));
}
}
}
Arc::new(builder.finish()) as ArrayRef
Expand Down Expand Up @@ -149,7 +173,7 @@ fn create_record_batch(size: usize) -> RecordBatch {

let int64_array = create_int64_array(size);
let float64_array = create_float64_array(size);
let utf8_array = create_utf8_view_array(size, 0.2);
let utf8_array = create_utf8_view_array(size);
let ts_array = create_ts_array(size);

let arrays: Vec<ArrayRef> = vec![int64_array, float64_array, utf8_array, ts_array];
Expand Down
Loading