feat: avoid pulling unnecessary columns when querying append mode table #1307

Merged · 14 commits · Dec 22, 2023
60 changes: 40 additions & 20 deletions analytic_engine/src/instance/flush_compaction.rs
@@ -22,8 +22,8 @@ use std::{
};

use common_types::{
projected_schema::ProjectedSchema,
record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder},
projected_schema::{ProjectedSchema, RowProjectorBuilder},
record_batch::{FetchedRecordBatch, FetchedRecordBatchBuilder},
request_id::RequestId,
row::RowViewOnBatch,
time::TimeRange,
@@ -46,8 +46,8 @@ use wal::manager::WalLocation;
use crate::{
compaction::{CompactionInputFiles, CompactionTask, ExpiredFiles},
instance::{
self, create_sst_read_option, reorder_memtable::Reorder,
serial_executor::TableFlushScheduler, ScanType, SpaceStore, SpaceStoreRef,
self, reorder_memtable::Reorder, serial_executor::TableFlushScheduler, ScanType,
SpaceStore, SpaceStoreRef, SstReadOptionsBuilder,
},
manifest::meta_edit::{
AlterOptionsMeta, AlterSchemaMeta, MetaEdit, MetaEditRequest, MetaUpdate, VersionEditMeta,
@@ -593,7 +593,7 @@ impl FlushTask {

for time_range in &time_ranges {
let (batch_record_sender, batch_record_receiver) =
channel::<Result<RecordBatchWithKey>>(DEFAULT_CHANNEL_SIZE);
channel::<Result<FetchedRecordBatch>>(DEFAULT_CHANNEL_SIZE);
let file_id = self
.table_data
.alloc_file_id(&self.space_store.manifest)
@@ -933,20 +933,26 @@ impl SpaceStore {
let table_options = table_data.table_options();
let projected_schema = ProjectedSchema::no_projection(schema.clone());
let predicate = Arc::new(Predicate::empty());
let sst_read_options = create_sst_read_option(
let maybe_table_level_metrics = table_data
.metrics
.maybe_table_level_metrics()
.sst_metrics
.clone();
let sst_read_options_builder = SstReadOptionsBuilder::new(
ScanType::Compaction,
scan_options,
table_data
.metrics
.maybe_table_level_metrics()
.sst_metrics
.clone(),
maybe_table_level_metrics,
table_options.num_rows_per_row_group,
projected_schema.clone(),
predicate,
self.meta_cache.clone(),
runtime,
);
let fetched_schema = projected_schema.to_record_schema_with_key();
let primary_key_indexes = fetched_schema.primary_key_idx().to_vec();
let fetched_schema = fetched_schema.into_record_schema();
let table_schema = projected_schema.table_schema().clone();
let row_projector_builder =
RowProjectorBuilder::new(fetched_schema, table_schema, Some(primary_key_indexes));

let iter_options = IterOptions {
batch_size: table_options.num_rows_per_row_group,
@@ -966,8 +972,8 @@ impl SpaceStore {
sequence,
projected_schema,
predicate: Arc::new(Predicate::empty()),
sst_read_options_builder: sst_read_options_builder.clone(),
sst_factory: &self.sst_factory,
sst_read_options: sst_read_options.clone(),
store_picker: self.store_picker(),
merge_iter_options: iter_options.clone(),
need_dedup: table_options.need_dedup(),
@@ -992,6 +998,8 @@ impl SpaceStore {
row_iter::record_batch_with_key_iter_to_stream(merge_iter)
};

// TODO: eliminate the duplicated building of `SstReadOptions`.
let sst_read_options = sst_read_options_builder.build(row_projector_builder);
let (sst_meta, column_stats) = {
let meta_reader = SstMetaReader {
space_id: table_data.space_id,
@@ -1157,12 +1165,17 @@ fn collect_column_stats_from_meta_datas(metas: &[SstMetaData]) -> HashMap<String
}

fn split_record_batch_with_time_ranges(
record_batch: RecordBatchWithKey,
record_batch: FetchedRecordBatch,
time_ranges: &[TimeRange],
timestamp_idx: usize,
) -> Result<Vec<RecordBatchWithKey>> {
let mut builders: Vec<RecordBatchWithKeyBuilder> = (0..time_ranges.len())
.map(|_| RecordBatchWithKeyBuilder::new(record_batch.schema_with_key().clone()))
) -> Result<Vec<FetchedRecordBatch>> {
let fetched_schema = record_batch.schema();
let primary_key_indexes = record_batch.primary_key_indexes();
let mut builders: Vec<FetchedRecordBatchBuilder> = (0..time_ranges.len())
.map(|_| {
let primary_key_indexes = primary_key_indexes.map(|idxs| idxs.to_vec());
FetchedRecordBatchBuilder::new(fetched_schema.clone(), primary_key_indexes)
})
.collect();

for row_idx in 0..record_batch.num_rows() {
@@ -1203,11 +1216,18 @@ fn build_mem_table_iter(
table_data: &TableDataRef,
) -> Result<ColumnarIterPtr> {
let scan_ctx = ScanContext::default();
let projected_schema = ProjectedSchema::no_projection(table_data.schema());
let fetched_schema = projected_schema.to_record_schema_with_key();
let primary_key_indexes = fetched_schema.primary_key_idx().to_vec();
let fetched_schema = fetched_schema.into_record_schema();
let table_schema = projected_schema.table_schema().clone();
let row_projector_builder =
RowProjectorBuilder::new(fetched_schema, table_schema, Some(primary_key_indexes));
let scan_req = ScanRequest {
start_user_key: Bound::Unbounded,
end_user_key: Bound::Unbounded,
sequence: common_types::MAX_SEQUENCE_NUMBER,
projected_schema: ProjectedSchema::no_projection(table_data.schema()),
row_projector_builder,
need_dedup: table_data.dedup(),
reverse: false,
metrics_collector: None,
@@ -1226,7 +1246,7 @@ mod tests {
use common_types::{
schema::Schema,
tests::{
build_record_batch_with_key_by_rows, build_row, build_row_opt, build_schema,
build_fetched_record_batch_by_rows, build_row, build_row_opt, build_schema,
check_record_batch_with_key_with_rows,
},
time::TimeRange,
@@ -1275,7 +1295,7 @@
.into_iter()
.flatten()
.collect();
let record_batch_with_key = build_record_batch_with_key_by_rows(rows);
let record_batch_with_key = build_fetched_record_batch_by_rows(rows);
let column_num = record_batch_with_key.num_columns();
let time_ranges = vec![
TimeRange::new_unchecked_for_test(0, 100),
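The core of the change in this file is the projection wiring: instead of handing a full `ProjectedSchema` to `SstReadOptions`, the compaction path derives a `RowProjectorBuilder` from the fetched record schema and the primary key indexes, and only then builds the read options. A condensed sketch of that wiring, assuming the crate-internal types (`ProjectedSchema`, `RowProjectorBuilder`, `SstReadOptionsBuilder`) are in scope and `schema` is the schema of the table being compacted:

```rust
// Condensed sketch of the projection wiring used in the compaction path above.
// `schema` and `sst_read_options_builder` are assumed to be in scope.
let projected_schema = ProjectedSchema::no_projection(schema.clone());

// Derive the fetched record schema and the primary key indexes that drive row
// projection, instead of passing the whole `ProjectedSchema` downstream.
let fetched_schema = projected_schema.to_record_schema_with_key();
let primary_key_indexes = fetched_schema.primary_key_idx().to_vec();
let fetched_schema = fetched_schema.into_record_schema();
let table_schema = projected_schema.table_schema().clone();
let row_projector_builder =
    RowProjectorBuilder::new(fetched_schema, table_schema, Some(primary_key_indexes));

// `SstReadOptions` is only materialized once the projector builder is known.
let sst_read_options = sst_read_options_builder.build(row_projector_builder);
```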
55 changes: 39 additions & 16 deletions analytic_engine/src/instance/mod.rs
@@ -33,7 +33,7 @@ pub(crate) mod write;

use std::sync::Arc;

use common_types::{projected_schema::ProjectedSchema, table::TableId};
use common_types::{projected_schema::RowProjectorBuilder, table::TableId};
use generic_error::{BoxError, GenericError};
use logger::{error, info};
use macros::define_result;
@@ -327,32 +327,55 @@ impl Instance {
}
}

// TODO: make it a builder
#[allow(clippy::too_many_arguments)]
fn create_sst_read_option(
#[derive(Debug, Clone)]
pub struct SstReadOptionsBuilder {
scan_type: ScanType,
scan_options: ScanOptions,
maybe_table_level_metrics: Arc<MaybeTableLevelMetrics>,
num_rows_per_row_group: usize,
projected_schema: ProjectedSchema,
predicate: PredicateRef,
meta_cache: Option<MetaCacheRef>,
runtime: Arc<Runtime>,
) -> SstReadOptions {
SstReadOptions {
maybe_table_level_metrics,
num_rows_per_row_group,
frequency: scan_type.into(),
projected_schema,
predicate,
meta_cache,
scan_options,
runtime,
}

impl SstReadOptionsBuilder {
pub fn new(
scan_type: ScanType,
scan_options: ScanOptions,
maybe_table_level_metrics: Arc<MaybeTableLevelMetrics>,
num_rows_per_row_group: usize,
predicate: PredicateRef,
meta_cache: Option<MetaCacheRef>,
runtime: Arc<Runtime>,
) -> Self {
Self {
scan_type,
scan_options,
maybe_table_level_metrics,
num_rows_per_row_group,
predicate,
meta_cache,
runtime,
}
}

pub fn build(self, row_projector_builder: RowProjectorBuilder) -> SstReadOptions {
SstReadOptions {
maybe_table_level_metrics: self.maybe_table_level_metrics,
num_rows_per_row_group: self.num_rows_per_row_group,
frequency: self.scan_type.into(),
row_projector_builder,
predicate: self.predicate,
meta_cache: self.meta_cache,
scan_options: self.scan_options,
runtime: self.runtime,
}
}
}

/// Scan type which mapped to the low level `ReadFrequency` in sst reader.
enum ScanType {
#[derive(Debug, Clone, Copy)]
pub enum ScanType {
Query,
Compaction,
}
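For reference, the intended call pattern for the new builder looks roughly like the sketch below; the argument order mirrors `SstReadOptionsBuilder::new` above, while the surrounding variables (`scan_options`, `predicate`, `row_projector_builder`, and so on) are assumed to be in scope at the call site:

```rust
// Sketch of how call sites use the builder (see flush_compaction.rs and
// read.rs in this diff). All variables besides the builder itself are assumed
// to be available in the surrounding scope.
let sst_read_options_builder = SstReadOptionsBuilder::new(
    ScanType::Query,
    scan_options,
    maybe_table_level_metrics,
    num_rows_per_row_group,
    predicate,
    meta_cache,
    runtime,
);

// The projection is supplied later, once the caller knows which columns the
// query actually needs; only then are the full `SstReadOptions` built.
let sst_read_options = sst_read_options_builder.build(row_projector_builder);
```

Deferring the projection to the point where the column set is known is what lets the append-mode query path avoid pulling columns it never reads, per the PR title.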
40 changes: 24 additions & 16 deletions analytic_engine/src/instance/read.rs
@@ -23,7 +23,7 @@ use std::{
use async_stream::try_stream;
use common_types::{
projected_schema::ProjectedSchema,
record_batch::{RecordBatch, RecordBatchWithKey},
record_batch::{FetchedRecordBatch, RecordBatch},
schema::RecordSchema,
time::TimeRange,
};
@@ -42,15 +42,14 @@ use time_ext::current_time_millis;
use trace_metric::Metric;

use crate::{
instance::{create_sst_read_option, Instance, ScanType},
instance::{Instance, ScanType, SstReadOptionsBuilder},
row_iter::{
chain,
chain::{ChainConfig, ChainIterator},
dedup::DedupIterator,
merge::{MergeBuilder, MergeConfig, MergeIterator},
IterOptions, RecordBatchWithKeyIterator,
FetchedRecordBatchIterator, IterOptions,
},
sst::factory::SstReadOptions,
table::{
data::TableData,
version::{ReadView, TableVersion},
@@ -123,25 +122,34 @@
None,
));

let sst_read_options = create_sst_read_option(
let sst_read_options_builder = SstReadOptionsBuilder::new(
ScanType::Query,
self.scan_options.clone(),
table_metrics.sst_metrics.clone(),
table_options.num_rows_per_row_group,
request.projected_schema.clone(),
request.predicate.clone(),
self.meta_cache.clone(),
self.read_runtime().clone(),
);

if need_merge_sort {
let merge_iters = self
.build_merge_iters(table_data, &request, &table_options, sst_read_options)
.build_merge_iters(
table_data,
&request,
&table_options,
sst_read_options_builder,
)
.await?;
self.build_partitioned_streams(&request, merge_iters)
} else {
let chain_iters = self
.build_chain_iters(table_data, &request, &table_options, sst_read_options)
.build_chain_iters(
table_data,
&request,
&table_options,
sst_read_options_builder,
)
.await?;
self.build_partitioned_streams(&request, chain_iters)
}
@@ -150,7 +158,7 @@
fn build_partitioned_streams(
&self,
request: &ReadRequest,
partitioned_iters: Vec<impl RecordBatchWithKeyIterator + 'static>,
partitioned_iters: Vec<impl FetchedRecordBatchIterator + 'static>,
) -> Result<PartitionedStreams> {
let read_parallelism = request.opts.read_parallelism;

@@ -179,7 +187,7 @@
table_data: &TableData,
request: &ReadRequest,
table_options: &TableOptions,
sst_read_options: SstReadOptions,
sst_read_options_builder: SstReadOptionsBuilder,
) -> Result<Vec<DedupIterator<MergeIterator>>> {
// Current visible sequence
let sequence = table_data.last_sequence();
@@ -203,7 +211,7 @@
projected_schema: request.projected_schema.clone(),
predicate: request.predicate.clone(),
sst_factory: &self.space_store.sst_factory,
sst_read_options: sst_read_options.clone(),
sst_read_options_builder: sst_read_options_builder.clone(),
store_picker: self.space_store.store_picker(),
merge_iter_options: iter_options.clone(),
need_dedup: table_options.need_dedup(),
@@ -239,7 +247,7 @@
table_data: &TableData,
request: &ReadRequest,
table_options: &TableOptions,
sst_read_options: SstReadOptions,
sst_read_options_builder: SstReadOptionsBuilder,
) -> Result<Vec<ChainIterator>> {
let projected_schema = request.projected_schema.clone();

@@ -261,7 +269,7 @@
table_id: table_data.id,
projected_schema: projected_schema.clone(),
predicate: request.predicate.clone(),
sst_read_options: sst_read_options.clone(),
sst_read_options_builder: sst_read_options_builder.clone(),
sst_factory: &self.space_store.sst_factory,
store_picker: self.space_store.store_picker(),
};
@@ -347,7 +355,7 @@ struct StreamStateOnMultiIters<I> {
projected_schema: ProjectedSchema,
}

impl<I: RecordBatchWithKeyIterator + 'static> StreamStateOnMultiIters<I> {
impl<I: FetchedRecordBatchIterator + 'static> StreamStateOnMultiIters<I> {
fn is_exhausted(&self) -> bool {
self.curr_iter_idx >= self.iters.len()
}
@@ -362,7 +370,7 @@

async fn fetch_next_batch(
&mut self,
) -> Option<std::result::Result<RecordBatchWithKey, I::Error>> {
) -> Option<std::result::Result<FetchedRecordBatch, I::Error>> {
loop {
if self.is_exhausted() {
return None;
@@ -379,7 +387,7 @@
}

fn iters_to_stream(
iters: Vec<impl RecordBatchWithKeyIterator + 'static>,
iters: Vec<impl FetchedRecordBatchIterator + 'static>,
projected_schema: ProjectedSchema,
) -> SendableRecordBatchStream {
let mut state = StreamStateOnMultiIters {
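Both the merge and chain iterators now implement the renamed `FetchedRecordBatchIterator` trait, so code that turns a set of iterators into streams can stay generic over either kind. A hypothetical helper built on the `iters_to_stream` function above illustrates this; the round-robin split across `read_parallelism` is illustrative only and is not necessarily how `build_partitioned_streams` distributes its iterators:

```rust
// Hypothetical helper showing how the renamed trait keeps stream construction
// generic. `iters_to_stream`, `SendableRecordBatchStream`, `ProjectedSchema`,
// and the trait bound come from the code above; the distribution strategy is
// an assumption.
fn partitioned_streams<I>(
    iters: Vec<I>,
    projected_schema: ProjectedSchema,
    read_parallelism: usize,
) -> Vec<SendableRecordBatchStream>
where
    I: FetchedRecordBatchIterator + 'static,
{
    // Distribute iterators over `read_parallelism` buckets (assumed > 0),
    // then turn each bucket into one stream.
    let mut buckets: Vec<Vec<I>> = (0..read_parallelism).map(|_| Vec::new()).collect();
    for (idx, iter) in iters.into_iter().enumerate() {
        buckets[idx % read_parallelism].push(iter);
    }
    buckets
        .into_iter()
        .map(|bucket| iters_to_stream(bucket, projected_schema.clone()))
        .collect()
}
```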