From 22b724ca33f92ea92893c5f389f1c8fc5f504484 Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Wed, 6 Dec 2023 17:51:46 +0800 Subject: [PATCH 01/38] fix: ensure channel size non zero (#1345) ## Rationale When channel capacity < read_parallelism, we will pass 0 to channel, which will cause panic ``` 2023-12-05 20:31:32.974 ERRO [components/panic_ext/src/lib.rs:54] thread 'ceres-read' panicked 'mpsc bounded channel requires buffer > 0' at "analytic_engine/src/sst/parquet/async_reader.rs:736" ``` ## Detailed Changes - Ensure channel size non zero ## Test Plan No need. --- analytic_engine/src/sst/parquet/async_reader.rs | 1 + integration_tests/build_meta.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index f8db79a332..d4d378adef 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -732,6 +732,7 @@ impl<'a> SstReader for ThreadedReader<'a> { ); let channel_cap_per_sub_reader = self.channel_cap / sub_readers.len(); + let channel_cap_per_sub_reader = channel_cap_per_sub_reader.max(1); let (tx_group, rx_group): (Vec<_>, Vec<_>) = (0..read_parallelism) .map(|_| mpsc::channel::>(channel_cap_per_sub_reader)) .unzip(); diff --git a/integration_tests/build_meta.sh b/integration_tests/build_meta.sh index b75e36a0da..2f0330951c 100755 --- a/integration_tests/build_meta.sh +++ b/integration_tests/build_meta.sh @@ -8,7 +8,7 @@ META_BIN_PATH=${META_BIN_PATH:-""} if [[ -z "${META_BIN_PATH}" ]]; then echo "Fetch and install ceresmeta-server..." - go install -a github.com/CeresDB/horaemeta/cmd/ceresmeta-server@main + go install -v -a github.com/CeresDB/ceresmeta/cmd/ceresmeta-server@dev META_BIN_PATH="$(go env GOPATH)/bin/ceresmeta-server" fi From 46f54f4d32e6b2cdeb5ca1b47491b69da241ac2c Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Thu, 7 Dec 2023 10:39:54 +0800 Subject: [PATCH 02/38] feat: use string for request id (#1349) ## Rationale Close #1178 ## Detailed Changes - Use string to represent request_id, which is uuid v4, random string like `575c02e1-cd92-4c35-a5f3-353781163e93` - https://docs.rs/uuid/1.6.1/uuid/struct.Uuid.html#method.new_v4 ## Test Plan Manually. 
--- Cargo.lock | 57 ++++++++++++------- Cargo.toml | 3 +- analytic_engine/src/compaction/scheduler.rs | 2 +- .../src/instance/flush_compaction.rs | 20 ++++--- analytic_engine/src/instance/read.rs | 6 +- benchmarks/src/merge_memtable_bench.rs | 5 +- benchmarks/src/merge_sst_bench.rs | 2 +- benchmarks/src/sst_tools.rs | 3 +- common_types/Cargo.toml | 1 + common_types/src/request_id.rs | 34 ++++++----- components/logger/src/lib.rs | 4 +- .../src/dist_sql_query/test_util.rs | 4 +- interpreters/src/context.rs | 4 +- proxy/src/grpc/prom_query.rs | 4 +- proxy/src/http/prom.rs | 12 +++- proxy/src/influxdb/mod.rs | 10 +++- proxy/src/read.rs | 18 ++++-- proxy/src/write.rs | 38 ++++++------- query_engine/src/datafusion_impl/mod.rs | 2 +- .../src/datafusion_impl/task_context.rs | 6 +- query_frontend/src/frontend.rs | 10 ++-- server/src/grpc/remote_engine_service/mod.rs | 14 ++--- table_engine/src/provider.rs | 12 +--- table_engine/src/remote/model.rs | 7 ++- table_engine/src/table.rs | 4 +- 25 files changed, 153 insertions(+), 129 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 81b0fcfa41..95b429a73a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,7 +96,7 @@ dependencies = [ "atomic_enum", "base64 0.13.1", "bytes_ext", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "codec", "common_types", "datafusion", @@ -1303,7 +1303,7 @@ checksum = "8ef195bacb1ca0eb02d6a0562b09852941d01de2b962c7066c922115fab7dcb7" dependencies = [ "arrow 38.0.0", "async-trait", - "ceresdbproto", + "ceresdbproto 1.0.23 (registry+https://github.com/rust-lang/crates.io-index)", "dashmap 5.4.0", "futures 0.3.28", "paste 1.0.12", @@ -1341,6 +1341,18 @@ dependencies = [ "walkdir", ] +[[package]] +name = "ceresdbproto" +version = "1.0.23" +source = "git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05#2c60e0591b6066957c80e7d6ae97cf53ccd591e1" +dependencies = [ + "prost", + "protoc-bin-vendored", + "tonic 0.8.3", + "tonic-build", + "walkdir", +] + [[package]] name = "cexpr" version = "0.6.0" @@ -1515,7 +1527,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "common_types", "etcd-client", "future_ext", @@ -1593,7 +1605,7 @@ dependencies = [ "arrow 43.0.0", "arrow_ext", "bytes_ext", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "chrono", "datafusion", "hash_ext", @@ -1606,6 +1618,7 @@ dependencies = [ "serde_json", "snafu 0.6.10", "sqlparser", + "uuid", ] [[package]] @@ -2348,7 +2361,7 @@ dependencies = [ "async-recursion", "async-trait", "catalog", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "common_types", "datafusion", "datafusion-proto", @@ -3902,7 +3915,7 @@ name = "meta_client" version = "1.2.6-alpha" dependencies = [ "async-trait", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "common_types", "futures 0.3.28", "generic_error", @@ -4427,7 +4440,7 @@ version = "1.2.6-alpha" dependencies = [ "async-trait", "bytes", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "chrono", "clru", "crc", @@ -5304,7 +5317,7 @@ dependencies = [ "async-trait", "bytes", "catalog", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "clru", "cluster", "common_types", @@ -5431,7 +5444,7 @@ 
dependencies = [ "arrow 43.0.0", "async-trait", "catalog", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "cluster", "codec", "common_types", @@ -5742,7 +5755,7 @@ version = "1.2.6-alpha" dependencies = [ "arrow_ext", "async-trait", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "common_types", "futures 0.3.28", "generic_error", @@ -5871,7 +5884,7 @@ name = "router" version = "1.2.6-alpha" dependencies = [ "async-trait", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "cluster", "common_types", "generic_error", @@ -6246,7 +6259,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "clru", "cluster", "common_types", @@ -6772,7 +6785,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "codec", "common_types", "futures 0.3.28", @@ -6794,7 +6807,7 @@ dependencies = [ "arrow_ext", "async-trait", "bytes_ext", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "common_types", "datafusion", "datafusion-proto", @@ -6997,7 +7010,7 @@ dependencies = [ name = "time_ext" version = "1.2.6-alpha" dependencies = [ - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "chrono", "common_types", "macros", @@ -7509,8 +7522,8 @@ version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ - "cfg-if 0.1.10", - "rand 0.3.23", + "cfg-if 1.0.0", + "rand 0.8.5", "static_assertions", ] @@ -7587,9 +7600,9 @@ checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" [[package]] name = "uuid" -version = "1.3.3" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2" +checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" dependencies = [ "getrandom 0.2.8", "rand 0.8.5", @@ -7598,9 +7611,9 @@ dependencies = [ [[package]] name = "uuid-macro-internal" -version = "1.3.3" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f67b459f42af2e6e1ee213cb9da4dbd022d3320788c3fb3e1b893093f1e45da" +checksum = "f49e7f3f3db8040a100710a11932239fd30697115e2ba4107080d8252939845e" dependencies = [ "proc-macro2", "quote", @@ -7649,7 +7662,7 @@ version = "1.2.6-alpha" dependencies = [ "async-trait", "bytes_ext", - "ceresdbproto", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", "chrono", "codec", "common_types", diff --git a/Cargo.toml b/Cargo.toml index 01af922e94..50875c3cff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,7 +94,7 @@ bytes = "1" bytes_ext = { path = "components/bytes_ext" } catalog = { path = "catalog" } catalog_impls = { path = "catalog_impls" } -ceresdbproto = "1.0.23" +ceresdbproto = { git = "https://github.com/CeresDB/horaedbproto.git", rev = "2c60e05" } codec = { path = "components/codec" } chrono = "0.4" clap = "3.0" @@ -183,6 +183,7 @@ tokio = { version = "1.29", features = ["full"] } wal = { path = "src/wal" } xorfilter-rs = { git = "https://github.com/CeresDB/xorfilter", rev = 
"ac8ef01" } zstd = { version = "0.12", default-features = false } +uuid = "1.6.1" regex = "1" # This profile optimizes for good runtime performance. diff --git a/analytic_engine/src/compaction/scheduler.rs b/analytic_engine/src/compaction/scheduler.rs index dea77022d2..1496ef514c 100644 --- a/analytic_engine/src/compaction/scheduler.rs +++ b/analytic_engine/src/compaction/scheduler.rs @@ -523,7 +523,7 @@ impl ScheduleWorker { } let res = space_store .compact_table( - request_id, + request_id.clone(), &table_data, &compaction_task, scan_options, diff --git a/analytic_engine/src/instance/flush_compaction.rs b/analytic_engine/src/instance/flush_compaction.rs index 35dcca8caf..81ac53d7a8 100644 --- a/analytic_engine/src/instance/flush_compaction.rs +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -291,7 +291,7 @@ impl FlushTask { // Start flush duration timer. let local_metrics = self.table_data.metrics.local_flush_metrics(); let _timer = local_metrics.start_flush_timer(); - self.dump_memtables(request_id, &mems_to_flush, flush_req.need_reorder) + self.dump_memtables(request_id.clone(), &mems_to_flush, flush_req.need_reorder) .await .box_err() .context(FlushJobWithCause { @@ -421,7 +421,7 @@ impl FlushTask { if let Some(sampling_mem) = &mems_to_flush.sampling_mem { if let Some(seq) = self .dump_sampling_memtable( - request_id, + request_id.clone(), sampling_mem, &mut files_to_level0, need_reorder, @@ -436,7 +436,7 @@ impl FlushTask { } } for mem in &mems_to_flush.memtables { - let file = self.dump_normal_memtable(request_id, mem).await?; + let file = self.dump_normal_memtable(request_id.clone(), mem).await?; if let Some(file) = file { let sst_size = file.size; files_to_level0.push(AddFile { @@ -565,6 +565,7 @@ impl FlushTask { let store = self.space_store.clone(); let storage_format_hint = self.table_data.table_options().storage_format_hint; let sst_write_options = sst_write_options.clone(); + let request_id = request_id.clone(); // spawn build sst let handler = self.runtime.spawn(async move { @@ -785,7 +786,7 @@ impl SpaceStore { } for files in task.expired() { - self.delete_expired_files(table_data, request_id, files, &mut edit_meta); + self.delete_expired_files(table_data, &request_id, files, &mut edit_meta); } info!( @@ -798,7 +799,7 @@ impl SpaceStore { for input in inputs { self.compact_input_files( - request_id, + request_id.clone(), table_data, input, scan_options.clone(), @@ -874,7 +875,7 @@ impl SpaceStore { info!( "Instance try to compact table, table:{}, table_id:{}, request_id:{}, input_files:{:?}", - table_data.name, table_data.id, request_id, input.files, + table_data.name, table_data.id, &request_id, input.files, ); // The schema may be modified during compaction, so we acquire it first and use @@ -905,6 +906,7 @@ impl SpaceStore { let space_id = table_data.space_id; let table_id = table_data.id; let sequence = table_data.last_sequence(); + let request_id = request_id.clone(); let mut builder = MergeBuilder::new(MergeConfig { request_id, metrics_collector: None, @@ -933,7 +935,7 @@ impl SpaceStore { let record_batch_stream = if table_options.need_dedup() { row_iter::record_batch_with_key_iter_to_stream(DedupIterator::new( - request_id, + request_id.clone(), merge_iter, iter_options, )) @@ -978,7 +980,7 @@ impl SpaceStore { })?; let sst_info = sst_writer - .write(request_id, &sst_meta, record_batch_stream) + .write(request_id.clone(), &sst_meta, record_batch_stream) .await .box_err() .with_context(|| WriteSst { @@ -1038,7 +1040,7 @@ impl SpaceStore { pub(crate) fn 
delete_expired_files( &self, table_data: &TableData, - request_id: RequestId, + request_id: &RequestId, expired: &ExpiredFiles, edit_meta: &mut VersionEditMeta, ) { diff --git a/analytic_engine/src/instance/read.rs b/analytic_engine/src/instance/read.rs index 251d8168d3..09f261fd18 100644 --- a/analytic_engine/src/instance/read.rs +++ b/analytic_engine/src/instance/read.rs @@ -204,7 +204,7 @@ impl Instance { .metrics_collector .span(format!("{MERGE_ITER_METRICS_COLLECTOR_NAME_PREFIX}_{idx}")); let merge_config = MergeConfig { - request_id: request.request_id, + request_id: request.request_id.clone(), metrics_collector: Some(metrics_collector), deadline: request.opts.deadline, space_id: table_data.space_id, @@ -230,7 +230,7 @@ impl Instance { table: &table_data.name, })?; let dedup_iter = - DedupIterator::new(request.request_id, merge_iter, iter_options.clone()); + DedupIterator::new(request.request_id.clone(), merge_iter, iter_options.clone()); iters.push(dedup_iter); } @@ -263,7 +263,7 @@ impl Instance { .metrics_collector .span(format!("{CHAIN_ITER_METRICS_COLLECTOR_NAME_PREFIX}_{idx}")); let chain_config = ChainConfig { - request_id: request.request_id, + request_id: request.request_id.clone(), metrics_collector: Some(metrics_collector), deadline: request.opts.deadline, num_streams_to_prefetch: self.scan_options.num_streams_to_prefetch, diff --git a/benchmarks/src/merge_memtable_bench.rs b/benchmarks/src/merge_memtable_bench.rs index 3167673485..1ca655e50b 100644 --- a/benchmarks/src/merge_memtable_bench.rs +++ b/benchmarks/src/merge_memtable_bench.rs @@ -155,7 +155,7 @@ impl MergeMemTableBench { let request_id = RequestId::next_id(); let store_picker: ObjectStorePickerRef = Arc::new(self.store.clone()); let mut builder = MergeBuilder::new(MergeConfig { - request_id, + request_id: request_id.clone(), metrics_collector: None, deadline: None, space_id, @@ -181,7 +181,8 @@ impl MergeMemTableBench { let mut batch_num = 0; if self.dedup { - let mut dedup_iter = DedupIterator::new(request_id, merge_iter, iter_options); + let mut dedup_iter = + DedupIterator::new(request_id.clone(), merge_iter, iter_options); while let Some(batch) = dedup_iter.next_batch().await.unwrap() { let num_rows = batch.num_rows(); total_rows += num_rows; diff --git a/benchmarks/src/merge_sst_bench.rs b/benchmarks/src/merge_sst_bench.rs index 47d80ac83b..434f452b70 100644 --- a/benchmarks/src/merge_sst_bench.rs +++ b/benchmarks/src/merge_sst_bench.rs @@ -143,7 +143,7 @@ impl MergeSstBench { let request_id = RequestId::next_id(); let store_picker: ObjectStorePickerRef = Arc::new(self.store.clone()); let mut builder = MergeBuilder::new(MergeConfig { - request_id, + request_id: request_id.clone(), metrics_collector: None, deadline: None, space_id, diff --git a/benchmarks/src/sst_tools.rs b/benchmarks/src/sst_tools.rs index 1d4ca8044b..b9d41cbe83 100644 --- a/benchmarks/src/sst_tools.rs +++ b/benchmarks/src/sst_tools.rs @@ -247,6 +247,7 @@ pub async fn merge_sst(config: MergeSstConfig, runtime: Arc) { let space_id = config.space_id; let table_id = config.table_id; let sequence = max_sequence + 1; + let request_id = request_id.clone(); let mut builder = MergeBuilder::new(MergeConfig { request_id, @@ -272,7 +273,7 @@ pub async fn merge_sst(config: MergeSstConfig, runtime: Arc) { }; let record_batch_stream = if config.dedup { - let iter = DedupIterator::new(request_id, iter, iter_options); + let iter = DedupIterator::new(request_id.clone(), iter, iter_options); row_iter::record_batch_with_key_iter_to_stream(iter) } else { 
row_iter::record_batch_with_key_iter_to_stream(iter) diff --git a/common_types/Cargo.toml b/common_types/Cargo.toml index d0727a082b..c676ce9a14 100644 --- a/common_types/Cargo.toml +++ b/common_types/Cargo.toml @@ -48,3 +48,4 @@ serde = { workspace = true } serde_json = { workspace = true } snafu = { workspace = true } sqlparser = { workspace = true } +uuid = { workspace = true, features = ["fast-rng"] } diff --git a/common_types/src/request_id.rs b/common_types/src/request_id.rs index 9b9b6be98e..04fc10d35f 100644 --- a/common_types/src/request_id.rs +++ b/common_types/src/request_id.rs @@ -14,27 +14,21 @@ //! Request id. -use std::{ - fmt, - sync::atomic::{AtomicU64, Ordering}, -}; +use std::fmt; -#[derive(Debug, Clone, Copy)] -pub struct RequestId(u64); +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct RequestId(String); impl RequestId { /// Acquire next request id. pub fn next_id() -> Self { - static NEXT_ID: AtomicU64 = AtomicU64::new(1); - - let id = NEXT_ID.fetch_add(1, Ordering::Relaxed); - + let id = uuid::Uuid::new_v4().to_string(); Self(id) } #[inline] - pub fn as_u64(&self) -> u64 { - self.0 + pub fn as_str(&self) -> &str { + &self.0 } } @@ -44,12 +38,18 @@ impl fmt::Display for RequestId { } } -impl From for RequestId { - fn from(id: u64) -> Self { +impl From for RequestId { + fn from(id: String) -> Self { Self(id) } } +impl From<&str> for RequestId { + fn from(id: &str) -> Self { + Self(id.to_string()) + } +} + #[cfg(test)] mod tests { use super::*; @@ -57,10 +57,8 @@ mod tests { #[test] fn test_request_id() { let id = RequestId::next_id(); - assert_eq!(1, id.0); - let id = RequestId::next_id(); - assert_eq!(2, id.0); + let id2 = RequestId::next_id(); - assert_eq!("2", id.to_string()); + assert_ne!(id, id2); } } diff --git a/components/logger/src/lib.rs b/components/logger/src/lib.rs index d3780021f1..1dc78b0cb8 100644 --- a/components/logger/src/lib.rs +++ b/components/logger/src/lib.rs @@ -467,7 +467,7 @@ pub fn init_test_logger() { /// Timer for collecting slow query #[derive(Debug)] pub struct SlowTimer<'a> { - request_id: u64, + request_id: &'a str, sql: &'a str, slow_threshold: Duration, start_time: Instant, @@ -488,7 +488,7 @@ impl<'a> Drop for SlowTimer<'a> { } impl<'a> SlowTimer<'a> { - pub fn new(request_id: u64, sql: &'a str, threshold: Duration) -> SlowTimer { + pub fn new(request_id: &'a str, sql: &'a str, threshold: Duration) -> SlowTimer<'a> { SlowTimer { request_id, sql, diff --git a/df_engine_extensions/src/dist_sql_query/test_util.rs b/df_engine_extensions/src/dist_sql_query/test_util.rs index 9a1e6d3740..77584a9fca 100644 --- a/df_engine_extensions/src/dist_sql_query/test_util.rs +++ b/df_engine_extensions/src/dist_sql_query/test_util.rs @@ -199,7 +199,7 @@ impl TestContext { .extract_time_range(&test_schema, &logical_filters) .build(); let read_request = ReadRequest { - request_id: 42.into(), + request_id: "42".into(), opts: ReadOptions::default(), projected_schema, predicate, @@ -424,7 +424,7 @@ impl ExecutableScanBuilder for MockScanBuilder { ctx: TableScanContext, ) -> datafusion::error::Result> { let request = ReadRequest { - request_id: RequestId::from(42), + request_id: RequestId::from("test"), opts: ReadOptions { batch_size: ctx.batch_size, read_parallelism: ctx.read_parallelism, diff --git a/interpreters/src/context.rs b/interpreters/src/context.rs index cbc64612db..34e76d44c9 100644 --- a/interpreters/src/context.rs +++ b/interpreters/src/context.rs @@ -52,7 +52,7 @@ impl Context { /// Create a new context of query executor pub fn 
new_query_context(&self) -> Result { let ctx = QueryContext { - request_id: self.request_id, + request_id: self.request_id.clone(), deadline: self.deadline, default_catalog: self.default_catalog.clone(), default_schema: self.default_schema.clone(), @@ -72,7 +72,7 @@ impl Context { #[inline] pub fn request_id(&self) -> RequestId { - self.request_id + self.request_id.clone() } #[inline] diff --git a/proxy/src/grpc/prom_query.rs b/proxy/src/grpc/prom_query.rs index 70094ba943..a600c62cc4 100644 --- a/proxy/src/grpc/prom_query.rs +++ b/proxy/src/grpc/prom_query.rs @@ -93,7 +93,7 @@ impl Proxy { }; let frontend = Frontend::new(provider, self.instance.dyn_config.fronted.clone()); - let mut sql_ctx = SqlContext::new(request_id, deadline); + let mut sql_ctx = SqlContext::new(request_id.clone(), deadline); let expr = frontend .parse_promql(&mut sql_ctx, req.expr) .box_err() @@ -125,7 +125,7 @@ impl Proxy { })?; let output = self - .execute_plan(request_id, catalog, &schema, plan, deadline) + .execute_plan(request_id.clone(), catalog, &schema, plan, deadline) .await .box_err() .with_context(|| ErrWithCause { diff --git a/proxy/src/http/prom.rs b/proxy/src/http/prom.rs index 68190174a3..2fc520f43d 100644 --- a/proxy/src/http/prom.rs +++ b/proxy/src/http/prom.rs @@ -117,7 +117,7 @@ impl Proxy { metric: String, query: Query, ) -> Result { - let request_id = ctx.request_id; + let request_id = &ctx.request_id; let begin_instant = Instant::now(); let deadline = ctx.timeout.map(|t| begin_instant + t); info!("Handle prom remote query begin, ctx:{ctx:?}, metric:{metric}, request:{query:?}"); @@ -133,7 +133,7 @@ impl Proxy { function_registry: &*self.instance.function_registry, }; let frontend = Frontend::new(provider, self.instance.dyn_config.fronted.clone()); - let plan_ctx = Context::new(request_id, deadline); + let plan_ctx = Context::new(request_id.clone(), deadline); let RemoteQueryPlan { plan, @@ -156,7 +156,13 @@ impl Proxy { msg: "Query is blocked", })?; let output = self - .execute_plan(request_id, &ctx.catalog, &ctx.schema, plan, deadline) + .execute_plan( + request_id.clone(), + &ctx.catalog, + &ctx.schema, + plan, + deadline, + ) .await?; let cost = begin_instant.saturating_elapsed().as_millis(); diff --git a/proxy/src/influxdb/mod.rs b/proxy/src/influxdb/mod.rs index ec61cbb743..292bf1a93f 100644 --- a/proxy/src/influxdb/mod.rs +++ b/proxy/src/influxdb/mod.rs @@ -137,7 +137,7 @@ impl Proxy { function_registry: &*self.instance.function_registry, }; let frontend = Frontend::new(provider, self.instance.dyn_config.fronted.clone()); - let sql_ctx = SqlContext::new(request_id, deadline); + let sql_ctx = SqlContext::new(request_id.clone(), deadline); let mut stmts = frontend .parse_influxql(&sql_ctx, &req.query) @@ -180,7 +180,13 @@ impl Proxy { msg: "Query is blocked", })?; let output = self - .execute_plan(request_id, &ctx.catalog, &ctx.schema, plan, deadline) + .execute_plan( + request_id.clone(), + &ctx.catalog, + &ctx.schema, + plan, + deadline, + ) .await?; info!( diff --git a/proxy/src/read.rs b/proxy/src/read.rs index 9102be916b..ba9b33c7d6 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -161,14 +161,14 @@ impl Proxy { sql: &str, enable_partition_table_access: bool, ) -> Result { - let request_id = ctx.request_id; + let request_id = &ctx.request_id; let slow_threshold_secs = self .instance() .dyn_config .slow_threshold .load(std::sync::atomic::Ordering::Relaxed); let slow_threshold = Duration::from_secs(slow_threshold_secs); - let slow_timer = SlowTimer::new(ctx.request_id.as_u64(), 
sql, slow_threshold); + let slow_timer = SlowTimer::new(request_id.as_str(), sql, slow_threshold); let deadline = ctx.timeout.map(|t| slow_timer.start_time() + t); let catalog = self.instance.catalog_manager.default_catalog_name(); @@ -185,7 +185,7 @@ impl Proxy { }; let frontend = Frontend::new(provider, instance.dyn_config.fronted.clone()); - let mut sql_ctx = SqlContext::new(request_id, deadline); + let mut sql_ctx = SqlContext::new(request_id.clone(), deadline); // Parse sql, frontend error of invalid sql already contains sql // TODO(yingwen): Maybe move sql from frontend error to outer error let mut stmts = frontend @@ -236,10 +236,16 @@ impl Proxy { } let output = if enable_partition_table_access { - self.execute_plan_involving_partition_table(request_id, catalog, schema, plan, deadline) - .await + self.execute_plan_involving_partition_table( + request_id.clone(), + catalog, + schema, + plan, + deadline, + ) + .await } else { - self.execute_plan(request_id, catalog, schema, plan, deadline) + self.execute_plan(request_id.clone(), catalog, schema, plan, deadline) .await }; let output = output.box_err().with_context(|| ErrWithCause { diff --git a/proxy/src/write.rs b/proxy/src/write.rs index c455078ddf..991173205e 100644 --- a/proxy/src/write.rs +++ b/proxy/src/write.rs @@ -102,13 +102,13 @@ impl Proxy { ctx: Context, req: WriteRequest, ) -> Result { - let request_id = ctx.request_id; + let request_id = &ctx.request_id; let write_context = req.context.clone().context(ErrNoCause { msg: "Missing context", code: StatusCode::BAD_REQUEST, })?; - self.handle_auto_create_table_with_meta(request_id, &write_context.database, &req) + self.handle_auto_create_table_with_meta(request_id.clone(), &write_context.database, &req) .await?; let (write_request_to_local, write_requests_to_forward) = @@ -121,7 +121,7 @@ impl Proxy { .await; // Write to local. - self.collect_write_to_local_future(&mut futures, ctx, request_id, write_request_to_local) + self.collect_write_to_local_future(&mut futures, ctx, write_request_to_local) .await; self.collect_write_response(futures).await @@ -136,7 +136,7 @@ impl Proxy { ctx: Context, req: WriteRequest, ) -> Result { - let request_id = ctx.request_id; + let request_id = &ctx.request_id; let write_context = req.context.clone().context(ErrNoCause { msg: "Missing context", code: StatusCode::BAD_REQUEST, @@ -152,14 +152,14 @@ impl Proxy { // Create table. self.handle_auto_create_table_without_meta( - request_id, + request_id.clone(), &write_request_to_local, &write_context.database, ) .await?; // Write to local. 
- self.collect_write_to_local_future(&mut futures, ctx, request_id, write_request_to_local) + self.collect_write_to_local_future(&mut futures, ctx, write_request_to_local) .await; self.collect_write_response(futures).await @@ -197,7 +197,7 @@ impl Proxy { continue; } self.create_table( - request_id, + request_id.clone(), self.instance.catalog_manager.default_catalog_name(), schema, write_table_req, @@ -238,7 +238,7 @@ impl Proxy { let table = self.try_get_table(catalog, schema, table_name)?; if table.is_none() { self.create_table( - request_id, + request_id.clone(), catalog, schema, &write_request.table_requests[idx], @@ -268,7 +268,7 @@ impl Proxy { function_registry: &*self.instance.function_registry, }; let frontend = Frontend::new(provider, self.instance.dyn_config.fronted.clone()); - let ctx = FrontendContext::new(request_id, deadline); + let ctx = FrontendContext::new(request_id.clone(), deadline); let plan = frontend .write_req_to_plan(&ctx, schema_config, write_table_req) .box_err() @@ -385,15 +385,13 @@ impl Proxy { &'a self, futures: &mut WriteResponseFutures<'a>, ctx: Context, - request_id: RequestId, write_request: WriteRequest, ) { if write_request.table_requests.is_empty() { return; } - let local_handle = - async move { Ok(self.write_to_local(ctx, request_id, write_request).await) }; + let local_handle = async move { Ok(self.write_to_local(ctx, write_request).await) }; futures.push(local_handle.boxed()); } @@ -473,12 +471,8 @@ impl Proxy { } } - async fn write_to_local( - &self, - ctx: Context, - request_id: RequestId, - req: WriteRequest, - ) -> Result { + async fn write_to_local(&self, ctx: Context, req: WriteRequest) -> Result { + let request_id = ctx.request_id; let begin_instant = Instant::now(); let deadline = ctx.timeout.map(|t| begin_instant + t); let catalog_name = self.instance.catalog_manager.default_catalog_name(); @@ -497,7 +491,7 @@ impl Proxy { ); let write_context = WriteContext { - request_id, + request_id: request_id.clone(), deadline, catalog: catalog_name.to_string(), schema: schema_name.clone(), @@ -513,7 +507,7 @@ impl Proxy { let table = insert_plan.table.clone(); match self .execute_insert_plan( - request_id, + request_id.clone(), catalog_name, &schema_name, insert_plan, @@ -580,7 +574,7 @@ impl Proxy { let columns = find_new_columns(&table_schema, &write_table_req)?; if !columns.is_empty() { self.execute_add_columns_plan( - request_id, + request_id.clone(), &catalog, &schema, table.clone(), @@ -693,7 +687,7 @@ impl Proxy { operations: AlterTableOperation::AddColumn(columns), }); let _ = self - .execute_plan(request_id, catalog, schema, plan, deadline) + .execute_plan(request_id.clone(), catalog, schema, plan, deadline) .await?; info!("Add columns success, request_id:{request_id}, table:{table_name}"); diff --git a/query_engine/src/datafusion_impl/mod.rs b/query_engine/src/datafusion_impl/mod.rs index d4109d3219..ed8d963ffe 100644 --- a/query_engine/src/datafusion_impl/mod.rs +++ b/query_engine/src/datafusion_impl/mod.rs @@ -140,7 +140,7 @@ impl DfContextBuilder { .deadline .map(|deadline| deadline.duration_since(Instant::now()).as_millis() as u64); let ceresdb_options = CeresdbOptions { - request_id: ctx.request_id.as_u64(), + request_id: ctx.request_id.clone().to_string(), request_timeout: timeout, default_catalog: ctx.default_catalog.clone(), default_schema: ctx.default_schema.clone(), diff --git a/query_engine/src/datafusion_impl/task_context.rs b/query_engine/src/datafusion_impl/task_context.rs index b5eb7856cd..5946f3d22b 100644 --- 
a/query_engine/src/datafusion_impl/task_context.rs +++ b/query_engine/src/datafusion_impl/task_context.rs @@ -184,7 +184,7 @@ impl RemotePhysicalPlanExecutor for RemotePhysicalPlanExecutorImpl { .get::(); assert!(ceresdb_options.is_some()); let ceresdb_options = ceresdb_options.unwrap(); - let request_id = RequestId::from(ceresdb_options.request_id); + let request_id = RequestId::from(ceresdb_options.request_id.clone()); let deadline = ceresdb_options .request_timeout .map(|n| Instant::now() + Duration::from_millis(n)); @@ -251,7 +251,7 @@ struct DistQueryResolverBuilder { impl DistQueryResolverBuilder { fn build(&self, ctx: &Context) -> Resolver { let scan_builder = Box::new(ExecutableScanBuilderImpl { - request_id: ctx.request_id, + request_id: ctx.request_id.clone(), deadline: ctx.deadline, }); @@ -284,7 +284,7 @@ impl ExecutableScanBuilder for ExecutableScanBuilderImpl { }; let read_request = ReadRequest { - request_id: self.request_id, + request_id: self.request_id.clone(), opts: read_opts, projected_schema: ctx.projected_schema, predicate: ctx.predicate, diff --git a/query_frontend/src/frontend.rs b/query_frontend/src/frontend.rs index 016df62eb7..7208cfbedf 100644 --- a/query_frontend/src/frontend.rs +++ b/query_frontend/src/frontend.rs @@ -144,7 +144,7 @@ impl Frontend
{ pub fn statement_to_plan(&self, ctx: &Context, stmt: Statement) -> Result { let planner = Planner::new( &self.provider, - ctx.request_id, + ctx.request_id.clone(), ctx.read_parallelism, self.dyn_config.as_ref(), ); @@ -160,7 +160,7 @@ impl Frontend
{ ) -> Result<(Plan, Arc)> { let planner = Planner::new( &self.provider, - ctx.request_id, + ctx.request_id.clone(), ctx.read_parallelism, self.dyn_config.as_ref(), ); @@ -176,7 +176,7 @@ impl Frontend
{ ) -> Result { let planner = Planner::new( &self.provider, - ctx.request_id, + ctx.request_id.clone(), ctx.read_parallelism, self.dyn_config.as_ref(), ); @@ -186,7 +186,7 @@ impl Frontend
{ pub fn influxql_stmt_to_plan(&self, ctx: &Context, stmt: InfluxqlStatement) -> Result { let planner = Planner::new( &self.provider, - ctx.request_id, + ctx.request_id.clone(), ctx.read_parallelism, self.dyn_config.as_ref(), ); @@ -201,7 +201,7 @@ impl Frontend
{ ) -> Result { let planner = Planner::new( &self.provider, - ctx.request_id, + ctx.request_id.clone(), ctx.read_parallelism, self.dyn_config.as_ref(), ); diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index 7fb3e48aec..b0abb62822 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -133,7 +133,7 @@ struct ExecutePlanMetricCollector { } impl ExecutePlanMetricCollector { - fn new(request_id: u64, query: String, slow_threshold_secs: u64) -> Self { + fn new(request_id: String, query: String, slow_threshold_secs: u64) -> Self { Self { start: Instant::now(), query, @@ -616,12 +616,12 @@ impl RemoteEngineServiceImpl { .load(std::sync::atomic::Ordering::Relaxed); let metric = ExecutePlanMetricCollector::new( - ctx.request_id, + ctx.request_id_str.clone(), ctx.displayable_query, slow_threshold_secs, ); let query_ctx = create_query_ctx( - ctx.request_id, + ctx.request_id_str, ctx.default_catalog, ctx.default_schema, ctx.timeout_ms, @@ -661,12 +661,12 @@ impl RemoteEngineServiceImpl { .slow_threshold .load(std::sync::atomic::Ordering::Relaxed); let metric = ExecutePlanMetricCollector::new( - ctx.request_id, + ctx.request_id_str.clone(), ctx.displayable_query, slow_threshold_secs, ); let query_ctx = create_query_ctx( - ctx.request_id, + ctx.request_id_str, ctx.default_catalog, ctx.default_schema, ctx.timeout_ms, @@ -890,7 +890,7 @@ async fn handle_stream_read( msg: "fail to convert read request", })?; - let request_id = read_request.request_id; + let request_id = &read_request.request_id; info!( "Handle stream read, request_id:{request_id}, table:{table_ident:?}, read_options:{:?}, predicate:{:?} ", read_request.opts, @@ -1097,7 +1097,7 @@ fn extract_plan_from_req(request: ExecutePlanRequest) -> Result<(ExecContext, Ve } fn create_query_ctx( - request_id: u64, + request_id: String, default_catalog: String, default_schema: String, timeout_ms: i64, diff --git a/table_engine/src/provider.rs b/table_engine/src/provider.rs index a8b3e7310f..75cd58b9bf 100644 --- a/table_engine/src/provider.rs +++ b/table_engine/src/provider.rs @@ -51,7 +51,7 @@ const SCAN_TABLE_METRICS_COLLECTOR_NAME: &str = "scan_table"; #[derive(Clone, Debug)] pub struct CeresdbOptions { - pub request_id: u64, + pub request_id: String, pub request_timeout: Option, pub default_schema: String, pub default_catalog: String, @@ -76,13 +76,7 @@ impl ExtensionOptions for CeresdbOptions { fn set(&mut self, key: &str, value: &str) -> Result<()> { match key { - "request_id" => { - self.request_id = value.parse::().map_err(|e| { - DataFusionError::External( - format!("could not parse request_id, input:{value}, err:{e:?}").into(), - ) - })? 
- } + "request_id" => self.request_id = value.to_string(), "request_timeout" => { self.request_timeout = Some(value.parse::().map_err(|e| { DataFusionError::External( @@ -182,7 +176,7 @@ impl TableProviderAdapter { let ceresdb_options = state.config_options().extensions.get::(); assert!(ceresdb_options.is_some()); let ceresdb_options = ceresdb_options.unwrap(); - let request_id = RequestId::from(ceresdb_options.request_id); + let request_id = RequestId::from(ceresdb_options.request_id.clone()); let deadline = ceresdb_options .request_timeout .map(|n| Instant::now() + Duration::from_millis(n)); diff --git a/table_engine/src/remote/model.rs b/table_engine/src/remote/model.rs index 842829d3c4..2fc6a297eb 100644 --- a/table_engine/src/remote/model.rs +++ b/table_engine/src/remote/model.rs @@ -463,7 +463,8 @@ impl From for ceresdbproto::remote_engine::ExecutePlanRequ }; let pb_context = ceresdbproto::remote_engine::ExecContext { - request_id: value.context.request_id.as_u64(), + request_id: 0, // not used any more + request_id_str: value.context.request_id.to_string(), default_catalog: value.context.default_catalog, default_schema: value.context.default_schema, timeout_ms: rest_duration_ms, @@ -504,7 +505,7 @@ impl TryFrom for RemoteExecuteR msg: "missing exec ctx", })?; let ceresdbproto::remote_engine::ExecContext { - request_id, + request_id_str, default_catalog, default_schema, timeout_ms, @@ -512,7 +513,7 @@ impl TryFrom for RemoteExecuteR .. } = pb_exec_ctx; - let request_id = RequestId::from(request_id); + let request_id = RequestId::from(request_id_str); let deadline = if timeout_ms >= 0 { Some(Instant::now() + Duration::from_millis(timeout_ms as u64)) } else { diff --git a/table_engine/src/table.rs b/table_engine/src/table.rs index 62f7c3437a..4f3e854f49 100644 --- a/table_engine/src/table.rs +++ b/table_engine/src/table.rs @@ -437,7 +437,7 @@ impl TryFrom for ceresdbproto::remote_engine::TableReadRequest { })?; Ok(Self { - request_id: request.request_id.as_u64(), + request_id: 0, // this field not used any more opts: Some(request.opts.into()), projected_schema: Some(request.projected_schema.into()), predicate: Some(predicate_pb), @@ -465,7 +465,7 @@ impl TryFrom for ReadRequest { .context(ConvertPredicate)?, ); Ok(Self { - request_id: pb.request_id.into(), + request_id: pb.request_id.to_string().into(), opts, projected_schema, predicate, From 9f9079f5862262e012685e5f3cc62ed5b5c461cd Mon Sep 17 00:00:00 2001 From: CooooolFrog Date: Thu, 7 Dec 2023 14:38:40 +0800 Subject: [PATCH 03/38] chore: update create table integration test result (#1344) ## Rationale Because the logic related to create table has been reconstructed in `horaemeta` https://github.com/CeresDB/horaemeta/pull/288, the results returned when reporting errors during create table have changed. Therefore, we need to correct the results of table creation in the integration test. ## Detailed Changes * Update create table result in integration test. ## Test Plan No need. 
--- .../cases/env/cluster/ddl/create_tables.result | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/integration_tests/cases/env/cluster/ddl/create_tables.result b/integration_tests/cases/env/cluster/ddl/create_tables.result index ad40c9b100..51cc171193 100644 --- a/integration_tests/cases/env/cluster/ddl/create_tables.result +++ b/integration_tests/cases/env/cluster/ddl/create_tables.result @@ -45,12 +45,12 @@ affected_rows: 0 CREATE TABLE IF NOT EXISTS `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to create table, msg:failed to create table by meta client, req:CreateTableRequest { schema_name: \"public\", name: \"05_create_tables_t\", engine: \"Analytic\", create_if_not_exist: true, options: {}, partition_table_info: None }, err:Missing table info, msg:created table is not found in the create table response. sql:CREATE TABLE IF NOT EXISTS `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;" }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to create table, msg:failed to create table by meta client, req:CreateTableRequest { schema_name: \"public\", name: \"05_create_tables_t\", engine: \"Analytic\", create_if_not_exist: true, options: {}, partition_table_info: None }, err:Bad response, resp code:500, msg:create table metadata: tableName:05_create_tables_t: (#500)table already exists, cause:. sql:CREATE TABLE IF NOT EXISTS `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;" }) -- table already exist CREATE TABLE `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to create table, msg:failed to create table by meta client, req:CreateTableRequest { schema_name: \"public\", name: \"05_create_tables_t\", engine: \"Analytic\", create_if_not_exist: false, options: {}, partition_table_info: None }, err:Missing table info, msg:created table is not found in the create table response. sql:CREATE TABLE `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;" }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to create table, msg:failed to create table by meta client, req:CreateTableRequest { schema_name: \"public\", name: \"05_create_tables_t\", engine: \"Analytic\", create_if_not_exist: false, options: {}, partition_table_info: None }, err:Bad response, resp code:500, msg:create table metadata: tableName:05_create_tables_t: (#500)table already exists, cause:. 
sql:CREATE TABLE `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;" }) create table `05_create_tables_t2`(a int, b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic with (enable_ttl='false'); @@ -70,12 +70,12 @@ Int32(4), -- table already exist create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to create table, msg:failed to create table by meta client, req:CreateTableRequest { schema_name: \"public\", name: \"05_create_tables_t2\", engine: \"Analytic\", create_if_not_exist: false, options: {}, partition_table_info: None }, err:Missing table info, msg:created table is not found in the create table response. sql:create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;" }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to create table, msg:failed to create table by meta client, req:CreateTableRequest { schema_name: \"public\", name: \"05_create_tables_t2\", engine: \"Analytic\", create_if_not_exist: false, options: {}, partition_table_info: None }, err:Bad response, resp code:500, msg:create table metadata: tableName:05_create_tables_t2: (#500)table already exists, cause:. sql:create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;" }) -- table already exist create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to create table, msg:failed to create table by meta client, req:CreateTableRequest { schema_name: \"public\", name: \"05_create_tables_t2\", engine: \"Analytic\", create_if_not_exist: false, options: {}, partition_table_info: None }, err:Missing table info, msg:created table is not found in the create table response. sql:create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;" }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to create table, msg:failed to create table by meta client, req:CreateTableRequest { schema_name: \"public\", name: \"05_create_tables_t2\", engine: \"Analytic\", create_if_not_exist: false, options: {}, partition_table_info: None }, err:Bad response, resp code:500, msg:create table metadata: tableName:05_create_tables_t2: (#500)table already exists, cause:. 
sql:create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;" }) create table `05_create_tables_t3`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; @@ -317,4 +317,3 @@ affected_rows: 0 DROP TABLE IF EXISTS `05_create_tables_t12`; affected_rows: 0 - From a498083bbe9f7b90f3cb18dcbef6f25b5ba998b6 Mon Sep 17 00:00:00 2001 From: CooooolFrog Date: Thu, 7 Dec 2023 15:01:01 +0800 Subject: [PATCH 04/38] fix: fix create table result (#1354) ## Rationale In this PR https://github.com/CeresDB/horaedb/pull/1344, the results of table creation in the integration test were updated, but it was found that the integration test failed to pass due to an extra blank row. Fix this problem. ## Detailed Changes * Fix create table result. ## Test Plan No need. --- integration_tests/cases/env/cluster/ddl/create_tables.result | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/cases/env/cluster/ddl/create_tables.result b/integration_tests/cases/env/cluster/ddl/create_tables.result index 51cc171193..0d24f5043f 100644 --- a/integration_tests/cases/env/cluster/ddl/create_tables.result +++ b/integration_tests/cases/env/cluster/ddl/create_tables.result @@ -316,4 +316,4 @@ affected_rows: 0 DROP TABLE IF EXISTS `05_create_tables_t12`; -affected_rows: 0 +affected_rows: 0 \ No newline at end of file From f164cdaab242e2c025e686004b0ea3878c3b0623 Mon Sep 17 00:00:00 2001 From: CooooolFrog Date: Thu, 7 Dec 2023 16:04:31 +0800 Subject: [PATCH 05/38] Revert "fix: fix create table result" (#1355) Reverts CeresDB/horaedb#1354 --- integration_tests/cases/env/cluster/ddl/create_tables.result | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/cases/env/cluster/ddl/create_tables.result b/integration_tests/cases/env/cluster/ddl/create_tables.result index 0d24f5043f..51cc171193 100644 --- a/integration_tests/cases/env/cluster/ddl/create_tables.result +++ b/integration_tests/cases/env/cluster/ddl/create_tables.result @@ -316,4 +316,4 @@ affected_rows: 0 DROP TABLE IF EXISTS `05_create_tables_t12`; -affected_rows: 0 \ No newline at end of file +affected_rows: 0 From b5265faddf596d4748a2844539926beaa84c345b Mon Sep 17 00:00:00 2001 From: CooooolFrog Date: Thu, 7 Dec 2023 16:47:47 +0800 Subject: [PATCH 06/38] fix: fix test create table result (#1357) ## Rationale ## Detailed Changes ## Test Plan --- integration_tests/cases/env/cluster/ddl/create_tables.result | 1 + 1 file changed, 1 insertion(+) diff --git a/integration_tests/cases/env/cluster/ddl/create_tables.result b/integration_tests/cases/env/cluster/ddl/create_tables.result index 51cc171193..1abd07c69c 100644 --- a/integration_tests/cases/env/cluster/ddl/create_tables.result +++ b/integration_tests/cases/env/cluster/ddl/create_tables.result @@ -317,3 +317,4 @@ affected_rows: 0 DROP TABLE IF EXISTS `05_create_tables_t12`; affected_rows: 0 + From fc76a795226913a1a649f53430145e6f8d7f748e Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Thu, 7 Dec 2023 19:15:06 +0800 Subject: [PATCH 07/38] chore: disable frequently failed tests (#1352) ## Rationale This issue happens quite a lot in CI, and developers can do nothing besides retrying, which is very annoying. Since this test failure is caused by a race condition and there is no easy way to fix it, I suggest we disable it for now. ## Detailed Changes Increase wait time to 10s.
## Test Plan CI --- analytic_engine/src/tests/read_write_test.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analytic_engine/src/tests/read_write_test.rs b/analytic_engine/src/tests/read_write_test.rs index bc453cbb56..4b39399b96 100644 --- a/analytic_engine/src/tests/read_write_test.rs +++ b/analytic_engine/src/tests/read_write_test.rs @@ -667,7 +667,8 @@ fn test_write_buffer_size_overflow( .await; // TODO(lee) a better way to wait table flushing finishes. - thread::sleep(time::Duration::from_millis(1500)); + // https://github.com/CeresDB/horaedb/issues/1241 + thread::sleep(time::Duration::from_millis(10000)); let stats = table.stats(); assert_eq!(old_stats.num_read + 5, stats.num_read); From 281111afa362d00658f9aca27ca5895c7cd22299 Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Mon, 11 Dec 2023 14:03:36 +0800 Subject: [PATCH 08/38] chore: ignore flush failure when flush (#1362) ## Rationale The flush failure before table close may lead to shard close failure. However, such failure is tolerable because the unflushed can still be recovered during the following table open. ## Detailed Changes Ignore the flush failure before closing table. ## Test Plan Pass all the tests in the ci. --- analytic_engine/src/instance/close.rs | 45 +++++++++++++++++---------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/analytic_engine/src/instance/close.rs b/analytic_engine/src/instance/close.rs index 9cc801590c..5719b18015 100644 --- a/analytic_engine/src/instance/close.rs +++ b/analytic_engine/src/instance/close.rs @@ -25,6 +25,7 @@ use crate::{ }, manifest::{ManifestRef, SnapshotRequest}, space::SpaceRef, + table::data::TableDataRef, }; pub(crate) struct Closer { @@ -47,13 +48,37 @@ impl Closer { } }; + // Do flush before close for the fast recovery during the following opening. + // And it should not stop closing if flush fails. + if let Err(e) = self.flush(&table_data).await { + warn!( + "Ignore the failure to flush data before close, table:{}, table_id:{}, err:{e}", + table_data.name, table_data.id + ); + } + + // Table has been closed so remove it from the space. + let removed_table = self.space.remove_table(&request.table_name); + assert!(removed_table.is_some()); + + // Table is already moved out of space, we should close it to stop background + // jobs. + table_data.set_closed(); + + info!( + "table:{}-{} has been removed from the space_id:{}", + table_data.name, table_data.id, self.space.id + ); + Ok(()) + } + + async fn flush(&self, table_data: &TableDataRef) -> Result<()> { // Flush table. let opts = TableFlushOptions::default(); let mut serial_exec = table_data.serial_exec.lock().await; let flush_scheduler = serial_exec.flush_scheduler(); - self.flusher - .do_flush(flush_scheduler, &table_data, opts) + .do_flush(flush_scheduler, table_data, opts) .await .context(FlushTable { space_id: self.space.id, @@ -74,20 +99,6 @@ impl Closer { .context(DoManifestSnapshot { space_id: self.space.id, table: &table_data.name, - })?; - - // Table has been closed so remove it from the space. - let removed_table = self.space.remove_table(&request.table_name); - assert!(removed_table.is_some()); - - // Table is already moved out of space, we should close it to stop background - // jobs. 
- table_data.set_closed(); - - info!( - "table:{}-{} has been removed from the space_id:{}", - table_data.name, table_data.id, self.space.id - ); - Ok(()) + }) } } From 65130f590b03fd43a10be2d66d88aaa44bb5007d Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Mon, 11 Dec 2023 18:58:12 +0800 Subject: [PATCH 09/38] feat: support metrics for number of bytes fetched from object storage (#1363) ## Rationale The metrics about the fetched data from the object storage helps estimate the load of query on one specific table, and the table's load distribution can help build a cluster topology with better load balance. ## Detailed Changes Collect metrics for bytes fetched from object storage. ## Test Plan Check the added metrics manually. --- analytic_engine/src/instance/read.rs | 18 +---- analytic_engine/src/sst/metrics.rs | 29 ++++++- .../src/sst/parquet/async_reader.rs | 33 ++++++-- analytic_engine/src/table/metrics.rs | 16 ++-- components/parquet_ext/src/meta_data.rs | 4 +- components/parquet_ext/src/reader.rs | 80 +++++++++++++------ 6 files changed, 125 insertions(+), 55 deletions(-) diff --git a/analytic_engine/src/instance/read.rs b/analytic_engine/src/instance/read.rs index 09f261fd18..f769ec689d 100644 --- a/analytic_engine/src/instance/read.rs +++ b/analytic_engine/src/instance/read.rs @@ -106,16 +106,10 @@ impl Instance { let now = current_time_millis() as i64; let query_time_range = (end_time as f64 - start_time as f64) / 1000.0; - table_data - .metrics - .maybe_table_level_metrics() - .query_time_range - .observe(query_time_range); - + let table_metrics = table_data.metrics.maybe_table_level_metrics(); + table_metrics.query_time_range.observe(query_time_range); let since_start = (now as f64 - start_time as f64) / 1000.0; - table_data - .metrics - .maybe_table_level_metrics() + table_metrics .duration_since_query_query_start_time .observe(since_start); @@ -132,11 +126,7 @@ impl Instance { let sst_read_options = create_sst_read_option( ScanType::Query, self.scan_options.clone(), - table_data - .metrics - .maybe_table_level_metrics() - .sst_metrics - .clone(), + table_metrics.sst_metrics.clone(), table_options.num_rows_per_row_group, request.projected_schema.clone(), request.predicate.clone(), diff --git a/analytic_engine/src/sst/metrics.rs b/analytic_engine/src/sst/metrics.rs index 3ae4e1f9fc..5200181d75 100644 --- a/analytic_engine/src/sst/metrics.rs +++ b/analytic_engine/src/sst/metrics.rs @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::atomic::{AtomicU64, Ordering}; + use lazy_static::lazy_static; use prometheus::{ - exponential_buckets, register_counter, register_histogram, register_int_counter_vec, Counter, - Histogram, IntCounter, IntCounterVec, + exponential_buckets, register_counter, register_histogram, register_histogram_vec, + register_int_counter_vec, Counter, Histogram, HistogramVec, IntCounter, IntCounterVec, }; lazy_static! { @@ -48,12 +50,21 @@ lazy_static! 
{ "The counter for row group after prune", &["table"] ).unwrap(); + + static ref FETCHED_SST_BYTES_HISTOGRAM: HistogramVec = register_histogram_vec!( + "fetched_sst_bytes", + "Histogram for sst get range length", + &["table"], + exponential_buckets(100.0, 2.0, 5).unwrap() + ).unwrap(); } #[derive(Debug)] pub struct MaybeTableLevelMetrics { pub row_group_before_prune_counter: IntCounter, pub row_group_after_prune_counter: IntCounter, + pub fetched_sst_bytes_hist: Histogram, + pub fetched_sst_bytes: AtomicU64, } impl MaybeTableLevelMetrics { @@ -63,6 +74,20 @@ impl MaybeTableLevelMetrics { .with_label_values(&[table]), row_group_after_prune_counter: ROW_GROUP_AFTER_PRUNE_COUNTER .with_label_values(&[table]), + fetched_sst_bytes_hist: FETCHED_SST_BYTES_HISTOGRAM.with_label_values(&[table]), + fetched_sst_bytes: AtomicU64::new(0), } } + + #[inline] + pub fn observe_fetched_sst_bytes(&self) { + self.fetched_sst_bytes_hist + .observe(self.fetched_sst_bytes.load(Ordering::Relaxed) as f64) + } +} + +impl Drop for MaybeTableLevelMetrics { + fn drop(&mut self) { + self.observe_fetched_sst_bytes(); + } } diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index d4d378adef..8e34c125d5 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -17,7 +17,7 @@ use std::{ ops::Range, pin::Pin, - sync::Arc, + sync::{atomic::Ordering, Arc}, task::{Context, Poll}, time::{Duration, Instant}, }; @@ -43,7 +43,10 @@ use parquet::{ arrow::{arrow_reader::RowSelection, ParquetRecordBatchStreamBuilder, ProjectionMask}, file::metadata::RowGroupMetaData, }; -use parquet_ext::{meta_data::ChunkReader, reader::ObjectStoreReader}; +use parquet_ext::{ + meta_data::ChunkReader, + reader::{MetricsObserver, ObjectStoreReader}, +}; use runtime::{AbortOnDropMany, JoinHandle, Runtime}; use snafu::ResultExt; use table_engine::predicate::PredicateRef; @@ -235,7 +238,6 @@ impl<'a> Reader<'a> { } // TODO: remove it and use the suggested api. 
- #[allow(deprecated)] async fn fetch_record_batch_streams( &mut self, suggested_parallelism: usize, @@ -307,11 +309,15 @@ impl<'a> Reader<'a> { ); let mut streams = Vec::with_capacity(target_row_group_chunks.len()); + let metrics_collector = ObjectStoreMetricsObserver { + table_level_sst_metrics: self.table_level_sst_metrics.clone(), + }; for chunk in target_row_group_chunks { - let object_store_reader = ObjectStoreReader::new( + let object_store_reader = ObjectStoreReader::with_metrics( self.store.clone(), self.path.clone(), parquet_metadata.clone(), + metrics_collector.clone(), ); let mut builder = ParquetRecordBatchStreamBuilder::new(object_store_reader) .await @@ -323,7 +329,7 @@ impl<'a> Reader<'a> { debug!( "Build row selection for file path:{}, result:{row_selection:?}, page indexes:{}", self.path, - parquet_metadata.page_indexes().is_some() + parquet_metadata.column_index().is_some() ); if let Some(selection) = row_selection { builder = builder.with_row_selection(selection); @@ -755,6 +761,23 @@ impl<'a> SstReader for ThreadedReader<'a> { } } +#[derive(Clone)] +struct ObjectStoreMetricsObserver { + table_level_sst_metrics: Arc, +} + +impl MetricsObserver for ObjectStoreMetricsObserver { + fn elapsed(&self, path: &Path, elapsed: Duration) { + debug!("ObjectStoreReader dropped, path:{path}, elapsed:{elapsed:?}",); + } + + fn num_bytes_fetched(&self, _: &Path, num_bytes: usize) { + self.table_level_sst_metrics + .fetched_sst_bytes + .fetch_add(num_bytes as u64, Ordering::Relaxed); + } +} + #[cfg(test)] mod tests { use std::{ diff --git a/analytic_engine/src/table/metrics.rs b/analytic_engine/src/table/metrics.rs index ea3b0c526c..c6b85917d0 100644 --- a/analytic_engine/src/table/metrics.rs +++ b/analytic_engine/src/table/metrics.rs @@ -165,12 +165,11 @@ impl From<&AtomicTableStats> for TableStats { /// Now the registered labels won't remove from the metrics vec to avoid panic /// on concurrent removal. pub struct Metrics { - // Stats of a single table. + /// The table name used for metric label + maybe_table_name: String, + /// Stats of a single table. stats: Arc, - // Maybe table level sst metrics - maybe_table_level_metrics: Arc, - compaction_input_sst_size_histogram: Histogram, compaction_output_sst_size_histogram: Histogram, compaction_input_sst_row_num_histogram: Histogram, @@ -193,8 +192,8 @@ pub struct Metrics { impl Default for Metrics { fn default() -> Self { Self { + maybe_table_name: DEFAULT_METRICS_KEY.to_string(), stats: Arc::new(AtomicTableStats::default()), - maybe_table_level_metrics: Arc::new(MaybeTableLevelMetrics::new(DEFAULT_METRICS_KEY)), compaction_input_sst_size_histogram: TABLE_COMPACTION_SST_SIZE_HISTOGRAM .with_label_values(&["input"]), compaction_output_sst_size_histogram: TABLE_COMPACTION_SST_SIZE_HISTOGRAM @@ -290,16 +289,15 @@ impl<'a> MetricsContext<'a> { impl Metrics { pub fn new(mut metric_ctx: MetricsContext) -> Self { Self { - maybe_table_level_metrics: Arc::new(MaybeTableLevelMetrics::new( - metric_ctx.maybe_table_name(), - )), + maybe_table_name: metric_ctx.maybe_table_name().to_string(), ..Default::default() } } + /// Generate a table-level metric observer. 
#[inline] pub fn maybe_table_level_metrics(&self) -> Arc { - self.maybe_table_level_metrics.clone() + Arc::new(MaybeTableLevelMetrics::new(&self.maybe_table_name)) } #[inline] diff --git a/components/parquet_ext/src/meta_data.rs b/components/parquet_ext/src/meta_data.rs index fa2dd6b01d..edae1e01e5 100644 --- a/components/parquet_ext/src/meta_data.rs +++ b/components/parquet_ext/src/meta_data.rs @@ -23,7 +23,7 @@ use parquet::{ file::{footer, metadata::ParquetMetaData}, }; -use crate::reader::ObjectStoreReader; +use crate::reader::{NoopMetricsObserver, ObjectStoreReader}; #[async_trait] pub trait ChunkReader: Sync + Send { @@ -86,7 +86,7 @@ pub async fn fetch_parquet_metadata( /// TODO: Currently there is no method to build page indexes for meta data in /// `parquet`, maybe we can write a issue in `arrow-rs` . pub async fn meta_with_page_indexes( - object_store_reader: ObjectStoreReader, + object_store_reader: ObjectStoreReader, ) -> Result> { let read_options = ArrowReaderOptions::new().with_page_index(true); let builder = diff --git a/components/parquet_ext/src/reader.rs b/components/parquet_ext/src/reader.rs index 389779d792..57d58671f0 100644 --- a/components/parquet_ext/src/reader.rs +++ b/components/parquet_ext/src/reader.rs @@ -12,60 +12,94 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::{ops::Range, sync::Arc, time::Instant}; +use std::{ + ops::Range, + sync::Arc, + time::{Duration, Instant}, +}; use bytes::Bytes; use futures::{ future::{BoxFuture, FutureExt}, TryFutureExt, }; -use logger::debug; use object_store::{ObjectStoreRef, Path}; use parquet::{arrow::async_reader::AsyncFileReader, file::metadata::ParquetMetaData}; -/// Implemention AsyncFileReader based on `ObjectStore` -/// -/// TODO: Perhaps we should avoid importing `object_store` in `parquet_ext` to -/// keep the crate `parquet_ext` more pure. +/// The observer for metrics of [ObjectStoreReader]. +pub trait MetricsObserver: Send { + fn elapsed(&self, path: &Path, elapsed: Duration); + fn num_bytes_fetched(&self, path: &Path, num_bytes: usize); +} + +#[derive(Debug, Clone)] +pub struct NoopMetricsObserver; + +impl MetricsObserver for NoopMetricsObserver { + fn elapsed(&self, _: &Path, _: Duration) {} + + fn num_bytes_fetched(&self, _: &Path, _: usize) {} +} + +/// The implementation based on `ObjectStore` for [`AsyncFileReader`]. 
#[derive(Clone)] -pub struct ObjectStoreReader { +pub struct ObjectStoreReader { storage: ObjectStoreRef, path: Path, meta_data: Arc, begin: Instant, + metrics: T, } -impl ObjectStoreReader { +impl ObjectStoreReader { pub fn new(storage: ObjectStoreRef, path: Path, meta_data: Arc) -> Self { + Self::with_metrics(storage, path, meta_data, NoopMetricsObserver) + } +} + +impl ObjectStoreReader { + pub fn with_metrics( + storage: ObjectStoreRef, + path: Path, + meta_data: Arc, + metrics: T, + ) -> Self { Self { storage, path, meta_data, begin: Instant::now(), + metrics, } } } -impl Drop for ObjectStoreReader { +impl Drop for ObjectStoreReader { fn drop(&mut self) { - debug!( - "ObjectStoreReader dropped, path:{}, elapsed:{:?}", - &self.path, - self.begin.elapsed() - ); + self.metrics.elapsed(&self.path, self.begin.elapsed()) } } -impl AsyncFileReader for ObjectStoreReader { +impl AsyncFileReader for ObjectStoreReader { fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { - self.storage - .get_range(&self.path, range) - .map_err(|e| { - parquet::errors::ParquetError::General(format!( - "Failed to fetch range from object store, err:{e}" - )) - }) - .boxed() + async move { + let get_res = self + .storage + .get_range(&self.path, range) + .map_err(|e| { + parquet::errors::ParquetError::General(format!( + "Failed to fetch range from object store, err:{e}" + )) + }) + .await; + + if let Ok(bytes) = &get_res { + self.metrics.num_bytes_fetched(&self.path, bytes.len()); + } + + get_res + } + .boxed() } fn get_byte_ranges( From 18a59b61cfc76348a168dab9f2b2bb903e590a2f Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Tue, 12 Dec 2023 11:09:13 +0800 Subject: [PATCH 10/38] fix: collect metrics for `get_ranges` (#1364) ## Rationale Metrics about fetched sst bytes with `get_ranges` is not collected. ## Detailed Changes Collect the missing metrics. ## Test Plan Query and check the metrics after executing a query. --- analytic_engine/src/sst/metrics.rs | 22 +++++++++++-------- .../src/sst/parquet/async_reader.rs | 2 +- components/parquet_ext/src/reader.rs | 12 ++++++++-- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/analytic_engine/src/sst/metrics.rs b/analytic_engine/src/sst/metrics.rs index 5200181d75..9d2c51b790 100644 --- a/analytic_engine/src/sst/metrics.rs +++ b/analytic_engine/src/sst/metrics.rs @@ -55,7 +55,8 @@ lazy_static! { "fetched_sst_bytes", "Histogram for sst get range length", &["table"], - exponential_buckets(100.0, 2.0, 5).unwrap() + // The buckets: [1MB, 2MB, 4MB, 8MB, ... , 8GB] + exponential_buckets(1024.0 * 1024.0, 2.0, 13).unwrap() ).unwrap(); } @@ -63,8 +64,8 @@ lazy_static! 
{ pub struct MaybeTableLevelMetrics { pub row_group_before_prune_counter: IntCounter, pub row_group_after_prune_counter: IntCounter, - pub fetched_sst_bytes_hist: Histogram, - pub fetched_sst_bytes: AtomicU64, + pub num_fetched_sst_bytes_hist: Histogram, + pub num_fetched_sst_bytes: AtomicU64, } impl MaybeTableLevelMetrics { @@ -74,20 +75,23 @@ impl MaybeTableLevelMetrics { .with_label_values(&[table]), row_group_after_prune_counter: ROW_GROUP_AFTER_PRUNE_COUNTER .with_label_values(&[table]), - fetched_sst_bytes_hist: FETCHED_SST_BYTES_HISTOGRAM.with_label_values(&[table]), - fetched_sst_bytes: AtomicU64::new(0), + num_fetched_sst_bytes_hist: FETCHED_SST_BYTES_HISTOGRAM.with_label_values(&[table]), + num_fetched_sst_bytes: AtomicU64::new(0), } } #[inline] - pub fn observe_fetched_sst_bytes(&self) { - self.fetched_sst_bytes_hist - .observe(self.fetched_sst_bytes.load(Ordering::Relaxed) as f64) + pub fn maybe_observe_num_fetched_sst_bytes(&self) { + let num_fetched_sst_bytes = self.num_fetched_sst_bytes.load(Ordering::Relaxed); + if num_fetched_sst_bytes != 0 { + self.num_fetched_sst_bytes_hist + .observe(num_fetched_sst_bytes as f64); + } } } impl Drop for MaybeTableLevelMetrics { fn drop(&mut self) { - self.observe_fetched_sst_bytes(); + self.maybe_observe_num_fetched_sst_bytes(); } } diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index 8e34c125d5..f58c1da852 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -773,7 +773,7 @@ impl MetricsObserver for ObjectStoreMetricsObserver { fn num_bytes_fetched(&self, _: &Path, num_bytes: usize) { self.table_level_sst_metrics - .fetched_sst_bytes + .num_fetched_sst_bytes .fetch_add(num_bytes as u64, Ordering::Relaxed); } } diff --git a/components/parquet_ext/src/reader.rs b/components/parquet_ext/src/reader.rs index 57d58671f0..0d70d726a3 100644 --- a/components/parquet_ext/src/reader.rs +++ b/components/parquet_ext/src/reader.rs @@ -107,14 +107,22 @@ impl AsyncFileReader for ObjectStoreReader { ranges: Vec>, ) -> BoxFuture<'_, parquet::errors::Result>> { async move { - self.storage + let get_res = self + .storage .get_ranges(&self.path, &ranges) .map_err(|e| { parquet::errors::ParquetError::General(format!( "Failed to fetch ranges from object store, err:{e}" )) }) - .await + .await; + + if let Ok(bytes) = &get_res { + let num_bytes: usize = bytes.iter().map(|v| v.len()).sum(); + self.metrics.num_bytes_fetched(&self.path, num_bytes); + } + + get_res } .boxed() } From 1d8593fcfc412e92bf64f013d5727265a4da4bce Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Fri, 15 Dec 2023 11:01:45 +0800 Subject: [PATCH 11/38] chore: disable timeout for http api (#1367) ## Rationale ## Detailed Changes ## Test Plan Pass CI --- integration_tests/cases/env/local/ddl/query-plan.result | 8 ++++---- server/src/http.rs | 5 ++++- table_engine/src/provider.rs | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/integration_tests/cases/env/local/ddl/query-plan.result b/integration_tests/cases/env/local/ddl/query-plan.result index 9fe35c86a2..ec2258d64d 100644 --- a/integration_tests/cases/env/local/ddl/query-plan.result +++ b/integration_tests/cases/env/local/ddl/query-plan.result @@ -27,7 +27,7 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348001000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nscan_table:\n 
do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=1\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1:\n\n\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }=0]\n"), +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=1\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1:\n=0]\n"), -- This query should not include memtable @@ -36,7 +36,7 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348002000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nscan_table:\n do_merge_sort=true\n iter_num=0\n\n\nPredicate { exprs:[t > TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348002001), exclusive_end: Timestamp(9223372036854775807) } }=0]\n"), +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348002001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=0\n=0]\n"), -- SQLNESS ARG pre_cmd=flush @@ -47,7 +47,7 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348001000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=1\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_sst_1:\n meta_data_cache_hit=false\n parallelism=1\n project_record_batch=xxs\n read_meta_data_duration=xxs\n row_mem=320\n row_num=3\n prune_row_groups:\n pruned_by_custom_filter=0\n pruned_by_min_max=0\n row_groups_after_prune=1\n total_row_groups=1\n use_custom_filter=false\n\n\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }=0]\n"), +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=1\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_sst_1:\n meta_data_cache_hit=false\n parallelism=1\n project_record_batch=xxs\n read_meta_data_duration=xxs\n row_mem=320\n row_num=3\n 
prune_row_groups:\n pruned_by_custom_filter=0\n pruned_by_min_max=0\n row_groups_after_prune=1\n total_row_groups=1\n use_custom_filter=false\n=0]\n"), -- This query should not include SST @@ -55,7 +55,7 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348002000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nscan_table:\n do_merge_sort=true\n iter_num=0\n\n\nPredicate { exprs:[t > TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348002001), exclusive_end: Timestamp(9223372036854775807) } }=0]\n"), +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348002001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=0\n=0]\n"), DROP TABLE `03_dml_select_real_time_range`; diff --git a/server/src/http.rs b/server/src/http.rs index 04d2134ed8..068f21fa44 100644 --- a/server/src/http.rs +++ b/server/src/http.rs @@ -310,7 +310,10 @@ impl Service { .and(self.with_proxy()) .and(self.with_read_runtime()) .and_then( - |req, ctx, proxy: Arc, runtime: RuntimeRef| async move { + |req, mut ctx: RequestContext, proxy: Arc, runtime: RuntimeRef| async move { + // We don't timeout http api since it's mainly used for debugging. + ctx.timeout = None; + let result = runtime .spawn(async move { proxy diff --git a/table_engine/src/provider.rs b/table_engine/src/provider.rs index 75cd58b9bf..7f9e974708 100644 --- a/table_engine/src/provider.rs +++ b/table_engine/src/provider.rs @@ -413,7 +413,7 @@ impl ExecutionPlan for ScanTable { let pushdown_filters = &self.request.predicate; metric_set.push(Arc::new(Metric::new( MetricValue::Count { - name: format!("\n{metrics_desc}\n\n{pushdown_filters:?}").into(), + name: format!("\n{pushdown_filters:?}\n{metrics_desc}").into(), count: Count::new(), }, None, From b5bfb2c1f09515559fa957e65c4d76014ec0ef48 Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Fri, 15 Dec 2023 16:24:36 +0800 Subject: [PATCH 12/38] fix: ignore collecting fetched bytes stats when sst file is read only once (#1369) ## Rationale The stats about the number of bytes fetched from object store should not include the low-frequency reading, e.g. compaction because such stats are used to show the query load distribution across the tables. ## Detailed Changes Ignore collecting the fetched bytes stats in the low-frequency reading. ## Test Plan The ci's tests should pass. 
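Before the diff itself, here is a minimal, self-contained sketch of the gating idea described above: the observer only holds the table-level metrics when the read is frequent, so read-once (compaction) traffic never reaches the counter. The names used here (`ReadFrequency`, `TableLevelMetrics`, `FetchObserver`, `num_fetched_sst_bytes`) are simplified stand-ins for illustration, not the engine's real definitions:

```rust
use std::sync::{
    atomic::{AtomicU64, Ordering},
    Arc,
};

/// Simplified read-frequency hint (assumed name, mirrors the idea in the patch).
#[derive(Clone, Copy)]
enum ReadFrequency {
    Once,     // e.g. a compaction pass reads the file a single time
    Frequent, // the normal query path
}

/// Simplified stand-in for the table-level sst metrics.
#[derive(Default)]
struct TableLevelMetrics {
    num_fetched_sst_bytes: AtomicU64,
}

/// Observer that records fetched bytes only for frequent (query) reads.
struct FetchObserver {
    table_level_sst_metrics: Option<Arc<TableLevelMetrics>>,
}

impl FetchObserver {
    fn new(frequency: ReadFrequency, metrics: Arc<TableLevelMetrics>) -> Self {
        // Read-once access is excluded so the stats keep reflecting
        // per-table query load only.
        let table_level_sst_metrics =
            matches!(frequency, ReadFrequency::Frequent).then_some(metrics);
        Self {
            table_level_sst_metrics,
        }
    }

    fn num_bytes_fetched(&self, num_bytes: usize) {
        if let Some(metrics) = &self.table_level_sst_metrics {
            metrics
                .num_fetched_sst_bytes
                .fetch_add(num_bytes as u64, Ordering::Relaxed);
        }
    }
}

fn main() {
    let metrics = Arc::new(TableLevelMetrics::default());

    // Read-once access (e.g. compaction): nothing is recorded.
    FetchObserver::new(ReadFrequency::Once, metrics.clone()).num_bytes_fetched(4096);
    // Frequent access (queries): the fetched bytes are counted.
    FetchObserver::new(ReadFrequency::Frequent, metrics.clone()).num_bytes_fetched(4096);

    assert_eq!(metrics.num_fetched_sst_bytes.load(Ordering::Relaxed), 4096);
}
```

In the actual change below, the same `Option`-based gating is applied to the `MetricsObserver` that is handed to `ObjectStoreReader`, so only frequent reads update `num_fetched_sst_bytes`.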
--- .../src/sst/parquet/async_reader.rs | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index f58c1da852..6ed297a027 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -309,8 +309,12 @@ impl<'a> Reader<'a> { ); let mut streams = Vec::with_capacity(target_row_group_chunks.len()); - let metrics_collector = ObjectStoreMetricsObserver { - table_level_sst_metrics: self.table_level_sst_metrics.clone(), + let metrics_collector = { + let metrics_for_object_store = matches!(self.frequency, ReadFrequency::Frequent) + .then(|| self.table_level_sst_metrics.clone()); + ObjectStoreMetricsObserver { + table_level_sst_metrics: metrics_for_object_store, + } }; for chunk in target_row_group_chunks { let object_store_reader = ObjectStoreReader::with_metrics( @@ -763,18 +767,20 @@ impl<'a> SstReader for ThreadedReader<'a> { #[derive(Clone)] struct ObjectStoreMetricsObserver { - table_level_sst_metrics: Arc, + table_level_sst_metrics: Option>, } impl MetricsObserver for ObjectStoreMetricsObserver { fn elapsed(&self, path: &Path, elapsed: Duration) { - debug!("ObjectStoreReader dropped, path:{path}, elapsed:{elapsed:?}",); + debug!("ObjectStoreReader dropped, path:{path}, elapsed:{elapsed:?}"); } fn num_bytes_fetched(&self, _: &Path, num_bytes: usize) { - self.table_level_sst_metrics - .num_fetched_sst_bytes - .fetch_add(num_bytes as u64, Ordering::Relaxed); + if let Some(metrics) = &self.table_level_sst_metrics { + metrics + .num_fetched_sst_bytes + .fetch_add(num_bytes as u64, Ordering::Relaxed); + } } } From f41ad897b870e169d4297f82a52118c83b3a1aca Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Fri, 15 Dec 2023 16:32:39 +0800 Subject: [PATCH 13/38] chore: disable block for http api (#1368) ## Rationale HTTP API is mainly used for debugging, and should not be blocked. ## Detailed Changes ## Test Plan Pass CI --- proxy/src/grpc/sql_query.rs | 2 ++ proxy/src/http/sql.rs | 1 + proxy/src/lib.rs | 16 ---------------- proxy/src/read.rs | 22 ++++++++++++++++++++-- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/proxy/src/grpc/sql_query.rs b/proxy/src/grpc/sql_query.rs index 785b0a3483..d753d49a43 100644 --- a/proxy/src/grpc/sql_query.rs +++ b/proxy/src/grpc/sql_query.rs @@ -104,6 +104,7 @@ impl Proxy { schema, &req.sql, self.sub_table_access_perm.enable_others, + true, ) .await? 
} @@ -170,6 +171,7 @@ impl Proxy { schema, &req.sql, self.sub_table_access_perm.enable_others, + true, ) .await?; diff --git a/proxy/src/http/sql.rs b/proxy/src/http/sql.rs index 14ce7f291f..48197f4829 100644 --- a/proxy/src/http/sql.rs +++ b/proxy/src/http/sql.rs @@ -55,6 +55,7 @@ impl Proxy { schema, &req.query, self.sub_table_access_perm.enable_http, + false, ) .await; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a4155fe0ef..74d268138b 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -523,14 +523,6 @@ impl Proxy { plan: Plan, deadline: Option, ) -> Result { - self.instance - .limiter - .try_limit(&plan) - .box_err() - .context(Internal { - msg: "Request is blocked", - })?; - let interpreter = self.build_interpreter(request_id, catalog, schema, plan, deadline, false)?; Self::interpreter_execute_plan(interpreter, deadline).await @@ -544,14 +536,6 @@ impl Proxy { plan: Plan, deadline: Option, ) -> Result { - self.instance - .limiter - .try_limit(&plan) - .box_err() - .context(Internal { - msg: "Request is blocked", - })?; - let interpreter = self.build_interpreter(request_id, catalog, schema, plan, deadline, true)?; Self::interpreter_execute_plan(interpreter, deadline).await diff --git a/proxy/src/read.rs b/proxy/src/read.rs index ba9b33c7d6..9d93221cae 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -57,6 +57,7 @@ impl Proxy { schema: &str, sql: &str, enable_partition_table_access: bool, + enable_block_query: bool, // true for grpc, false for http ) -> Result { if let Some(resp) = self .maybe_forward_sql_query(ctx.clone(), schema, sql) @@ -69,7 +70,13 @@ impl Proxy { }; let output = self - .fetch_sql_query_output(ctx, schema, sql, enable_partition_table_access) + .fetch_sql_query_output( + ctx, + schema, + sql, + enable_partition_table_access, + enable_block_query, + ) .await?; Ok(SqlResponse::Local(output)) @@ -128,7 +135,7 @@ impl Proxy { }; let result = self - .fetch_sql_query_output(ctx, schema, sql, enable_partition_table_access) + .fetch_sql_query_output(ctx, schema, sql, enable_partition_table_access, true) .await; guard.cancel(); @@ -160,6 +167,7 @@ impl Proxy { schema: &str, sql: &str, enable_partition_table_access: bool, + enable_block_query: bool, ) -> Result { let request_id = &ctx.request_id; let slow_threshold_secs = self @@ -225,6 +233,16 @@ impl Proxy { msg: "Failed to create plan", })?; + if enable_block_query { + self.instance + .limiter + .try_limit(&plan) + .box_err() + .context(Internal { + msg: "Request is blocked", + })?; + } + let mut plan_maybe_expired = false; if let Some(table_name) = &table_name { match self.is_plan_expired(&plan, catalog, schema, table_name) { From 785eed720234b52ee9ff8270f489486d1840e909 Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Mon, 18 Dec 2023 10:34:08 +0800 Subject: [PATCH 14/38] feat: avoid building dictionary for massive unique column values (#1365) ## Rationale Close #1105 ## Detailed Changes - Reduce the parameters in the sst write path - Avoid building dictionary for massive unique column values ## Test Plan - New unit test for the changeset - Observe the metrics for disable/enable dictionary encoding --- Cargo.lock | 1 + analytic_engine/Cargo.toml | 1 + analytic_engine/src/instance/engine.rs | 8 +- analytic_engine/src/instance/open.rs | 4 +- .../src/instance/serial_executor.rs | 3 +- analytic_engine/src/sst/factory.rs | 14 +- analytic_engine/src/sst/meta_data/cache.rs | 2 +- .../src/sst/meta_data/metadata_reader.rs | 5 +- .../src/sst/parquet/async_reader.rs | 5 +- 
analytic_engine/src/sst/parquet/encoding.rs | 54 +-- analytic_engine/src/sst/parquet/meta_data.rs | 10 +- .../src/sst/parquet/row_group_pruner.rs | 3 +- analytic_engine/src/sst/parquet/writer.rs | 325 +++++++++++++----- components/codec/src/columnar/timestamp.rs | 6 +- components/id_allocator/src/lib.rs | 2 +- server/src/grpc/remote_engine_service/mod.rs | 2 +- src/wal/src/rocksdb_impl/manager.rs | 2 +- table_engine/src/partition/rule/random.rs | 7 +- 18 files changed, 311 insertions(+), 143 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 95b429a73a..5b41170ad6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,6 +104,7 @@ dependencies = [ "future_ext", "futures 0.3.28", "generic_error", + "hash_ext", "hex", "hyperloglog", "id_allocator", diff --git a/analytic_engine/Cargo.toml b/analytic_engine/Cargo.toml index 661b19b1e9..0cb13f3d30 100644 --- a/analytic_engine/Cargo.toml +++ b/analytic_engine/Cargo.toml @@ -50,6 +50,7 @@ datafusion = { workspace = true } future_ext = { workspace = true } futures = { workspace = true } generic_error = { workspace = true } +hash_ext = { workspace = true } hex = { workspace = true } hyperloglog = { workspace = true } id_allocator = { workspace = true } diff --git a/analytic_engine/src/instance/engine.rs b/analytic_engine/src/instance/engine.rs index 607000157f..9fd658fc11 100644 --- a/analytic_engine/src/instance/engine.rs +++ b/analytic_engine/src/instance/engine.rs @@ -26,10 +26,14 @@ use table_engine::{ }; use wal::manager::WalLocation; -use super::open::{TableContext, TablesOfShardContext}; use crate::{ engine::build_space_id, - instance::{close::Closer, drop::Dropper, open::OpenTablesOfShardResult, Instance}, + instance::{ + close::Closer, + drop::Dropper, + open::{OpenTablesOfShardResult, TableContext, TablesOfShardContext}, + Instance, + }, space::{MemSizeOptions, Space, SpaceAndTable, SpaceContext, SpaceId, SpaceRef}, }; diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs index 9363e5a096..446d363348 100644 --- a/analytic_engine/src/instance/open.rs +++ b/analytic_engine/src/instance/open.rs @@ -26,13 +26,13 @@ use snafu::ResultExt; use table_engine::{engine::TableDef, table::TableId}; use wal::manager::WalManagerRef; -use super::{engine::OpenTablesOfShard, flush_compaction::Flusher}; use crate::{ compaction::scheduler::SchedulerImpl, context::OpenContext, engine, instance::{ - engine::{OpenManifest, ReadMetaUpdate, Result}, + engine::{OpenManifest, OpenTablesOfShard, ReadMetaUpdate, Result}, + flush_compaction::Flusher, mem_collector::MemUsageCollector, wal_replayer::{ReplayMode, WalReplayer}, Instance, SpaceStore, diff --git a/analytic_engine/src/instance/serial_executor.rs b/analytic_engine/src/instance/serial_executor.rs index 56dcfc27f2..579f6892a7 100644 --- a/analytic_engine/src/instance/serial_executor.rs +++ b/analytic_engine/src/instance/serial_executor.rs @@ -30,9 +30,8 @@ use tokio::sync::{ watch::{self, Receiver, Sender}, }; -use super::flush_compaction::{BackgroundFlushFailed, TableFlushOptions}; use crate::{ - instance::flush_compaction::{Other, Result}, + instance::flush_compaction::{BackgroundFlushFailed, Other, Result, TableFlushOptions}, table::data::TableData, }; diff --git a/analytic_engine/src/sst/factory.rs b/analytic_engine/src/sst/factory.rs index d7faf6fe29..8a585d8639 100644 --- a/analytic_engine/src/sst/factory.rs +++ b/analytic_engine/src/sst/factory.rs @@ -32,7 +32,10 @@ use crate::{ header::HeaderParser, meta_data::cache::MetaCacheRef, metrics::MaybeTableLevelMetrics as 
SstMaybeTableLevelMetrics, - parquet::{writer::ParquetSstWriter, AsyncParquetReader, ThreadedReader}, + parquet::{ + writer::{ParquetSstWriter, WriteOptions}, + AsyncParquetReader, ThreadedReader, + }, reader::SstReader, writer::SstWriter, }, @@ -200,11 +203,16 @@ impl Factory for FactoryImpl { store_picker: &'a ObjectStorePickerRef, level: Level, ) -> Result> { + let write_options = WriteOptions { + num_rows_per_row_group: options.num_rows_per_row_group, + max_buffer_size: options.max_buffer_size, + compression: options.compression.into(), + sst_level: level, + }; Ok(Box::new(ParquetSstWriter::new( path, - level, + write_options, store_picker, - options, ))) } } diff --git a/analytic_engine/src/sst/meta_data/cache.rs b/analytic_engine/src/sst/meta_data/cache.rs index 5f0b34d993..fa30b8df2c 100644 --- a/analytic_engine/src/sst/meta_data/cache.rs +++ b/analytic_engine/src/sst/meta_data/cache.rs @@ -175,7 +175,7 @@ mod tests { use parquet::{arrow::ArrowWriter, file::footer}; use parquet_ext::ParquetMetaData; - use super::MetaData; + use super::*; use crate::{ sst::parquet::{ encoding::{self, META_PATH_KEY, META_VERSION_KEY}, diff --git a/analytic_engine/src/sst/meta_data/metadata_reader.rs b/analytic_engine/src/sst/meta_data/metadata_reader.rs index 8bc240ed1a..cfad19b115 100644 --- a/analytic_engine/src/sst/meta_data/metadata_reader.rs +++ b/analytic_engine/src/sst/meta_data/metadata_reader.rs @@ -20,11 +20,10 @@ use object_store::{ObjectStoreRef, Path}; use parquet::{data_type::AsBytes, file::metadata::KeyValue}; use snafu::{ensure, OptionExt, ResultExt}; -use super::UnknownMetaVersion; use crate::sst::{ meta_data::{ DecodeCustomMetaData, FetchAndDecodeSstMeta, FetchFromStore, KvMetaDataNotFound, - KvMetaPathEmpty, + KvMetaPathEmpty, UnknownMetaVersion, }, parquet::{ encoding::{self, decode_sst_meta_data_from_bytes, META_VERSION_CURRENT, META_VERSION_V1}, @@ -32,7 +31,7 @@ use crate::sst::{ }, }; -define_result!(super::Error); +define_result!(crate::sst::meta_data::Error); #[async_trait] pub trait CustomMetadataReader { diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index 6ed297a027..0eb65c63d8 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -57,7 +57,6 @@ use tokio::sync::{ }; use trace_metric::{MetricsCollector, TraceMetricWhenDrop}; -use super::meta_data::ColumnValueSet; use crate::{ prefetchable_stream::{NoopPrefetcher, PrefetchableStream}, sst::{ @@ -68,7 +67,9 @@ use crate::{ }, metrics::MaybeTableLevelMetrics, parquet::{ - encoding::ParquetDecoder, meta_data::ParquetFilter, row_group_pruner::RowGroupPruner, + encoding::ParquetDecoder, + meta_data::{ColumnValueSet, ParquetFilter}, + row_group_pruner::RowGroupPruner, }, reader::{error::*, Result, SstReader}, }, diff --git a/analytic_engine/src/sst/parquet/encoding.rs b/analytic_engine/src/sst/parquet/encoding.rs index 742d0ec370..9dc4a13277 100644 --- a/analytic_engine/src/sst/parquet/encoding.rs +++ b/analytic_engine/src/sst/parquet/encoding.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::convert::TryFrom; +use std::{collections::HashMap, convert::TryFrom}; use arrow::{compute, record_batch::RecordBatch as ArrowRecordBatch}; use async_trait::async_trait; @@ -26,6 +26,7 @@ use parquet::{ arrow::AsyncArrowWriter, basic::Compression, file::{metadata::KeyValue, properties::WriterProperties}, + schema::types::ColumnPath, }; use prost::{bytes, Message}; use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; @@ -237,25 +238,40 @@ struct ColumnarRecordEncoder { arrow_schema: ArrowSchemaRef, } +#[derive(Debug, Clone)] +pub struct ColumnEncoding { + pub enable_dict: bool, +} + +#[derive(Debug, Clone)] +pub struct EncodeOptions { + pub num_rows_per_row_group: usize, + pub max_buffer_size: usize, + pub compression: Compression, + pub column_encodings: HashMap, +} + impl ColumnarRecordEncoder { - fn try_new( - sink: W, - schema: &Schema, - num_rows_per_row_group: usize, - max_buffer_size: usize, - compression: Compression, - ) -> Result { + fn try_new(sink: W, schema: &Schema, options: &EncodeOptions) -> Result { let arrow_schema = schema.to_arrow_schema_ref(); - let write_props = WriterProperties::builder() - .set_max_row_group_size(num_rows_per_row_group) - .set_compression(compression) - .build(); + let write_props = { + let mut builder = WriterProperties::builder() + .set_max_row_group_size(options.num_rows_per_row_group) + .set_compression(options.compression); + + for (col_name, encoding) in &options.column_encodings { + let col_path = ColumnPath::new(vec![col_name.to_string()]); + builder = builder.set_column_dictionary_enabled(col_path, encoding.enable_dict); + } + + builder.build() + }; let arrow_writer = AsyncArrowWriter::try_new( sink, arrow_schema.clone(), - max_buffer_size, + options.max_buffer_size, Some(write_props), ) .box_err() @@ -326,18 +342,10 @@ impl ParquetEncoder { pub fn try_new( sink: W, schema: &Schema, - num_rows_per_row_group: usize, - max_buffer_size: usize, - compression: Compression, + options: &EncodeOptions, ) -> Result { Ok(ParquetEncoder { - record_encoder: Box::new(ColumnarRecordEncoder::try_new( - sink, - schema, - num_rows_per_row_group, - max_buffer_size, - compression, - )?), + record_encoder: Box::new(ColumnarRecordEncoder::try_new(sink, schema, options)?), }) } diff --git a/analytic_engine/src/sst/parquet/meta_data.rs b/analytic_engine/src/sst/parquet/meta_data.rs index 622c4e8909..d7f63467dd 100644 --- a/analytic_engine/src/sst/parquet/meta_data.rs +++ b/analytic_engine/src/sst/parquet/meta_data.rs @@ -353,14 +353,14 @@ pub struct ParquetMetaData { pub type ParquetMetaDataRef = Arc; -impl From for ParquetMetaData { - fn from(meta: MetaData) -> Self { +impl From<&MetaData> for ParquetMetaData { + fn from(meta: &MetaData) -> Self { Self { - min_key: meta.min_key, - max_key: meta.max_key, + min_key: meta.min_key.clone(), + max_key: meta.max_key.clone(), time_range: meta.time_range, max_sequence: meta.max_sequence, - schema: meta.schema, + schema: meta.schema.clone(), parquet_filter: None, column_values: None, } diff --git a/analytic_engine/src/sst/parquet/row_group_pruner.rs b/analytic_engine/src/sst/parquet/row_group_pruner.rs index 81aeaeff7d..dbfc48734a 100644 --- a/analytic_engine/src/sst/parquet/row_group_pruner.rs +++ b/analytic_engine/src/sst/parquet/row_group_pruner.rs @@ -36,9 +36,8 @@ use parquet_ext::prune::{ use snafu::ensure; use trace_metric::{MetricsCollector, TraceMetricWhenDrop}; -use super::meta_data::ColumnValueSet; use crate::sst::{ - parquet::meta_data::ParquetFilter, + parquet::meta_data::{ColumnValueSet, 
ParquetFilter}, reader::error::{OtherNoCause, Result}, }; diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index 24aa626e49..b2c7874547 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -14,7 +14,7 @@ //! Sst writer implementation based on parquet. -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use async_trait::async_trait; use common_types::{ @@ -29,14 +29,16 @@ use parquet::data_type::AsBytes; use snafu::{OptionExt, ResultExt}; use tokio::io::{AsyncWrite, AsyncWriteExt}; -use super::meta_data::{ColumnValueSet, RowGroupFilter}; use crate::{ sst::{ - factory::{ObjectStorePickerRef, SstWriteOptions}, + factory::ObjectStorePickerRef, file::Level, parquet::{ - encoding::{encode_sst_meta_data, ParquetEncoder}, - meta_data::{ParquetFilter, ParquetMetaData, RowGroupFilterBuilder}, + encoding::{encode_sst_meta_data, ColumnEncoding, EncodeOptions, ParquetEncoder}, + meta_data::{ + ColumnValueSet, ParquetFilter, ParquetMetaData, RowGroupFilter, + RowGroupFilterBuilder, + }, }, writer::{ self, BuildParquetFilter, EncodePbData, EncodeRecordBatch, ExpectTimestampColumn, Io, @@ -48,50 +50,47 @@ use crate::{ }; const KEEP_COLUMN_VALUE_THRESHOLD: usize = 20; +/// Only the row group which contains at least +/// `MIN_NUM_ROWS_DICT_ENCODING_SAMPLE` rows can be sampling to decide whether +/// to do dictionary encoding. +const MIN_NUM_ROWS_SAMPLE_DICT_ENCODING: usize = 1024; +/// If the number of unique value exceeds +/// `total_num_values * MAX_UNIQUE_VALUE_RATIO_DICT_ENCODING`, there is no need +/// to do dictionary encoding for such column. +const MAX_UNIQUE_VALUE_RATIO_DICT_ENCODING: f64 = 0.12; /// The implementation of sst based on parquet and object storage. #[derive(Debug)] pub struct ParquetSstWriter<'a> { /// The path where the data is persisted. path: &'a Path, - level: Level, /// The storage where the data is persist. store: &'a ObjectStoreRef, - /// Max row group size. - num_rows_per_row_group: usize, - max_buffer_size: usize, - compression: Compression, + options: WriteOptions, } impl<'a> ParquetSstWriter<'a> { pub fn new( path: &'a Path, - level: Level, + options: WriteOptions, store_picker: &'a ObjectStorePickerRef, - options: &SstWriteOptions, ) -> Self { let store = store_picker.default_store(); Self { path, - level, store, - num_rows_per_row_group: options.num_rows_per_row_group, - compression: options.compression.into(), - max_buffer_size: options.max_buffer_size, + options, } } } /// The writer will reorganize the record batches into row groups, and then /// encode them to parquet file. 
-struct RecordBatchGroupWriter { +struct RecordBatchGroupWriter<'a> { request_id: RequestId, input: RecordBatchStream, - meta_data: MetaData, - num_rows_per_row_group: usize, - max_buffer_size: usize, - compression: Compression, - level: Level, + meta_data: &'a MetaData, + options: &'a WriteOptions, // inner status input_exhausted: bool, @@ -102,21 +101,32 @@ struct RecordBatchGroupWriter { column_values: Option>>, } -impl RecordBatchGroupWriter { +#[derive(Debug, Clone)] +pub struct WriteOptions { + pub num_rows_per_row_group: usize, + pub max_buffer_size: usize, + pub compression: Compression, + pub sst_level: Level, +} + +impl WriteOptions { + #[inline] + pub fn need_custom_filter(&self) -> bool { + !self.sst_level.is_min() + } +} + +impl<'a> RecordBatchGroupWriter<'a> { fn new( request_id: RequestId, input: RecordBatchStream, - meta_data: MetaData, - num_rows_per_row_group: usize, - max_buffer_size: usize, - compression: Compression, - level: Level, + meta_data: &'a MetaData, + options: &'a WriteOptions, ) -> Self { - let column_values = if level.is_min() { - // There are not many rows in min level, so we don't record values for them. - None - } else { - let column_values = meta_data + // No need to build complex index for the min-level sst so there is no need to + // collect the column values. + let column_values = options.need_custom_filter().then(|| { + meta_data .schema .columns() .iter() @@ -128,19 +138,14 @@ impl RecordBatchGroupWriter { None } }) - .collect(); - - Some(column_values) - }; + .collect() + }); Self { request_id, input, meta_data, - num_rows_per_row_group, - max_buffer_size, - compression, - level, + options, input_exhausted: false, real_time_range: None, column_values, @@ -158,7 +163,7 @@ impl RecordBatchGroupWriter { ) -> Result> { let mut curr_row_group = vec![]; // Used to record the number of remaining rows to fill `curr_row_group`. - let mut remaining = self.num_rows_per_row_group; + let mut remaining = self.options.num_rows_per_row_group; // Keep filling `curr_row_group` until `remaining` is zero. while remaining > 0 { @@ -209,6 +214,19 @@ impl RecordBatchGroupWriter { Ok(curr_row_group) } + fn build_column_encodings( + &self, + sample_row_groups: &[RecordBatchWithKey], + ) -> Result> { + let sampler = ColumnEncodingSampler { + sample_row_groups, + meta_data: self.meta_data, + min_num_sample_rows: MIN_NUM_ROWS_SAMPLE_DICT_ENCODING, + max_unique_value_ratio: MAX_UNIQUE_VALUE_RATIO_DICT_ENCODING, + }; + sampler.sample() + } + /// Build the parquet filter for the given `row_group`. 
fn build_row_group_filter( &self, @@ -230,10 +248,6 @@ impl RecordBatchGroupWriter { builder.build().box_err().context(BuildParquetFilter) } - fn need_custom_filter(&self) -> bool { - !self.level.is_min() - } - fn update_column_values( column_values: &mut [Option], record_batch: &RecordBatchWithKey, @@ -307,28 +321,24 @@ impl RecordBatchGroupWriter { let mut arrow_row_group = Vec::new(); let mut total_num_rows = 0; - let mut parquet_encoder = ParquetEncoder::try_new( - sink, - &self.meta_data.schema, - self.num_rows_per_row_group, - self.max_buffer_size, - self.compression, - ) - .box_err() - .context(EncodeRecordBatch)?; - let mut parquet_filter = if self.need_custom_filter() { - Some(ParquetFilter::default()) - } else { - None + let mut row_group = self.fetch_next_row_group(&mut prev_record_batch).await?; + let column_encodings = self.build_column_encodings(&row_group)?; + let encode_options = EncodeOptions { + num_rows_per_row_group: self.options.num_rows_per_row_group, + max_buffer_size: self.options.max_buffer_size, + compression: self.options.compression, + column_encodings, }; + let mut parquet_encoder = + ParquetEncoder::try_new(sink, &self.meta_data.schema, &encode_options) + .box_err() + .context(EncodeRecordBatch)?; + let mut parquet_filter = self + .options + .need_custom_filter() + .then(ParquetFilter::default); let timestamp_index = self.meta_data.schema.timestamp_index(); - - loop { - let row_group = self.fetch_next_row_group(&mut prev_record_batch).await?; - if row_group.is_empty() { - break; - } - + while !row_group.is_empty() { if let Some(filter) = &mut parquet_filter { filter.push_row_group_filter(self.build_row_group_filter(&row_group)?); } @@ -356,6 +366,8 @@ impl RecordBatchGroupWriter { // allocated memory. arrow_row_group = Vec::with_capacity(num_batches); total_num_rows += num_rows; + + row_group = self.fetch_next_row_group(&mut prev_record_batch).await?; } let parquet_meta_data = { @@ -461,18 +473,10 @@ impl<'a> SstWriter for ParquetSstWriter<'a> { ) -> writer::Result { debug!( "Build parquet file, request_id:{}, meta:{:?}, num_rows_per_row_group:{}", - request_id, meta, self.num_rows_per_row_group + request_id, meta, self.options.num_rows_per_row_group ); - let group_writer = RecordBatchGroupWriter::new( - request_id, - input, - meta.clone(), - self.num_rows_per_row_group, - self.max_buffer_size, - self.compression, - self.level, - ); + let group_writer = RecordBatchGroupWriter::new(request_id, input, meta, &self.options); let (aborter, sink) = ObjectStoreMultiUploadAborter::initialize_upload(self.store, self.path).await?; @@ -511,6 +515,65 @@ impl<'a> SstWriter for ParquetSstWriter<'a> { } } +/// A sampler to decide the column encoding options (whether to do dictionary +/// encoding) with a bunch of sample row groups. 
+struct ColumnEncodingSampler<'a> { + sample_row_groups: &'a [RecordBatchWithKey], + meta_data: &'a MetaData, + min_num_sample_rows: usize, + max_unique_value_ratio: f64, +} + +impl<'a> ColumnEncodingSampler<'a> { + fn sample(&self) -> Result> { + let num_total_rows: usize = self.sample_row_groups.iter().map(|v| v.num_rows()).sum(); + if num_total_rows < self.min_num_sample_rows { + return Ok(HashMap::new()); + } + + assert!(self.max_unique_value_ratio <= 1.0 && self.max_unique_value_ratio >= 0.0); + let max_unique_values = (num_total_rows as f64 * self.max_unique_value_ratio) as usize; + let mut column_hashes = HashSet::with_capacity(max_unique_values); + let mut column_encodings = HashMap::with_capacity(self.meta_data.schema.num_columns()); + for (col_idx, col_schema) in self.meta_data.schema.columns().iter().enumerate() { + // Only do dictionary encoding for string or bytes column. + let allowed_dict_type = matches!( + col_schema.data_type, + DatumKind::String | DatumKind::Varbinary + ); + if !allowed_dict_type { + column_encodings.insert( + col_schema.name.clone(), + ColumnEncoding { enable_dict: false }, + ); + continue; + } + + for row_group in self.sample_row_groups { + let col_block = &row_group.columns()[col_idx]; + for idx in 0..row_group.num_rows() { + if column_hashes.len() >= max_unique_values { + break; + } + let datum_view = col_block.datum_view(idx); + datum_view.do_with_bytes(|val| { + let hash = hash_ext::hash64(val); + column_hashes.insert(hash); + }) + } + } + + // The dictionary encoding make senses only if the number of unique values is + // small. + let enable_dict = column_hashes.len() < max_unique_values; + column_hashes.clear(); + column_encodings.insert(col_schema.name.clone(), ColumnEncoding { enable_dict }); + } + + Ok(column_encodings) + } +} + #[cfg(test)] mod tests { @@ -690,7 +753,7 @@ mod tests { sst_meta.time_range, TimeRange::new_unchecked(100.into(), 105.into()) ); - assert_eq!(&sst_meta_readback, &ParquetMetaData::from(sst_meta)); + assert_eq!(&sst_meta_readback, &ParquetMetaData::from(&sst_meta)); assert_eq!( expected_num_rows, reader @@ -799,20 +862,24 @@ mod tests { Poll::Ready(Some(Ok(batch))) })); + let write_options = WriteOptions { + num_rows_per_row_group, + max_buffer_size: 0, + compression: Compression::UNCOMPRESSED, + sst_level: Level::default(), + }; + let meta_data = MetaData { + min_key: Default::default(), + max_key: Default::default(), + time_range: Default::default(), + max_sequence: 1, + schema, + }; let mut group_writer = RecordBatchGroupWriter::new( RequestId::next_id(), record_batch_stream, - MetaData { - min_key: Default::default(), - max_key: Default::default(), - time_range: Default::default(), - max_sequence: 1, - schema, - }, - num_rows_per_row_group, - 0, - Compression::UNCOMPRESSED, - Level::default(), + &meta_data, + &write_options, ); let mut prev_record_batch = None; @@ -826,4 +893,86 @@ mod tests { assert_eq!(expect_num_row, actual_num_row); } } + + fn check_sample_column_encoding( + sampler: ColumnEncodingSampler<'_>, + expect_enable_dicts: Option>, + ) { + let column_encodings = sampler.sample().unwrap(); + if expect_enable_dicts.is_none() { + assert!(column_encodings.is_empty()); + return; + } + + let expect_enable_dicts = expect_enable_dicts.unwrap(); + for (col_idx, col_schema) in sampler.meta_data.schema.columns().iter().enumerate() { + let expect_enable_dict = expect_enable_dicts[col_idx]; + let column_encoding = column_encodings.get(&col_schema.name).unwrap(); + assert_eq!( + expect_enable_dict, 
column_encoding.enable_dict, + "column:{}", + col_schema.name + ); + } + } + + #[test] + fn test_column_encoding_option_sample() { + let schema = build_schema(); + let raw_rows = vec![ + (b"a", 100, 10.0, "v4", 1000, 1_000_000), + (b"a", 100, 10.0, "v4", 1000, 1_000_000), + (b"a", 100, 10.0, "v5", 1000, 1_000_000), + (b"a", 100, 10.0, "v5", 1000, 1_000_000), + (b"a", 100, 10.0, "v6", 1000, 1_000_000), + (b"a", 100, 10.0, "v6", 1000, 1_000_000), + (b"a", 100, 10.0, "v8", 1000, 1_000_000), + (b"a", 100, 10.0, "v8", 1000, 1_000_000), + (b"a", 100, 10.0, "v9", 1000, 1_000_000), + (b"a", 100, 10.0, "v9", 1000, 1_000_000), + ]; + let rows: Vec<_> = raw_rows + .into_iter() + .map(|v| build_row(v.0, v.1, v.2, v.3, v.4, v.5)) + .collect(); + let record_batch_with_key0 = build_record_batch_with_key(schema.clone(), rows.clone()); + let record_batch_with_key1 = build_record_batch_with_key(schema.clone(), rows); + let meta_data = MetaData { + min_key: Bytes::from_static(b""), + max_key: Bytes::from_static(b""), + time_range: TimeRange::new_unchecked(Timestamp::new(1), Timestamp::new(2)), + max_sequence: 200, + schema, + }; + let record_batches_with_key = vec![record_batch_with_key0, record_batch_with_key1]; + + // Normal case 1 + let sampler = ColumnEncodingSampler { + sample_row_groups: &record_batches_with_key, + meta_data: &meta_data, + min_num_sample_rows: 10, + max_unique_value_ratio: 0.6, + }; + let expect_enable_dicts = vec![true, false, false, true, false, false]; + check_sample_column_encoding(sampler, Some(expect_enable_dicts)); + + // Normal case 2 + let sampler = ColumnEncodingSampler { + sample_row_groups: &record_batches_with_key, + meta_data: &meta_data, + min_num_sample_rows: 10, + max_unique_value_ratio: 0.2, + }; + let expect_enable_dicts = vec![true, false, false, false, false, false]; + check_sample_column_encoding(sampler, Some(expect_enable_dicts)); + + // Normal case 3 + let sampler = ColumnEncodingSampler { + sample_row_groups: &record_batches_with_key, + meta_data: &meta_data, + min_num_sample_rows: 30, + max_unique_value_ratio: 0.2, + }; + check_sample_column_encoding(sampler, None); + } } diff --git a/components/codec/src/columnar/timestamp.rs b/components/codec/src/columnar/timestamp.rs index dda86e13ad..ed7da348e4 100644 --- a/components/codec/src/columnar/timestamp.rs +++ b/components/codec/src/columnar/timestamp.rs @@ -15,9 +15,11 @@ use common_types::time::Timestamp; use snafu::{ensure, OptionExt, ResultExt}; -use super::{DecodeContext, Overflow, Result, ValuesDecoder, ValuesDecoderImpl, Varint}; use crate::{ - columnar::{InvalidVersion, ValuesEncoder, ValuesEncoderImpl}, + columnar::{ + DecodeContext, InvalidVersion, Overflow, Result, ValuesDecoder, ValuesDecoderImpl, + ValuesEncoder, ValuesEncoderImpl, Varint, + }, consts::MAX_VARINT_BYTES, varint, }; diff --git a/components/id_allocator/src/lib.rs b/components/id_allocator/src/lib.rs index ec43f1cef1..a57c395e6f 100644 --- a/components/id_allocator/src/lib.rs +++ b/components/id_allocator/src/lib.rs @@ -86,7 +86,7 @@ impl IdAllocator { mod test { use tokio::runtime::Runtime; - use super::IdAllocator; + use super::*; #[test] fn test_alloc_id() { diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index b0abb62822..03bd7fb1f3 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -65,13 +65,13 @@ use tokio::sync::mpsc::{self, Sender}; use tokio_stream::{wrappers::ReceiverStream, Stream}; use tonic::{Request, 
Response, Status}; -use super::metrics::REMOTE_ENGINE_WRITE_BATCH_NUM_ROWS_HISTOGRAM; use crate::{ config::QueryDedupConfig, grpc::{ metrics::{ REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC, REMOTE_ENGINE_GRPC_HANDLER_DURATION_HISTOGRAM_VEC, + REMOTE_ENGINE_WRITE_BATCH_NUM_ROWS_HISTOGRAM, }, remote_engine_service::error::{ErrNoCause, ErrWithCause, Result, StatusCode}, }, diff --git a/src/wal/src/rocksdb_impl/manager.rs b/src/wal/src/rocksdb_impl/manager.rs index 378b1f6038..5d82a1a32c 100644 --- a/src/wal/src/rocksdb_impl/manager.rs +++ b/src/wal/src/rocksdb_impl/manager.rs @@ -38,7 +38,6 @@ use runtime::Runtime; use snafu::ResultExt; use tokio::sync::Mutex; -use super::config::RocksDBConfig; use crate::{ config::{Config, StorageConfig}, kv_encoder::{CommonLogEncoding, CommonLogKey, MaxSeqMetaEncoding, MaxSeqMetaValue, MetaKey}, @@ -48,6 +47,7 @@ use crate::{ ScanContext, ScanRequest, SyncLogIterator, WalLocation, WalManager, WalManagerRef, WalRuntimes, WalsOpener, WriteContext, MANIFEST_DIR_NAME, WAL_DIR_NAME, }, + rocksdb_impl::config::RocksDBConfig, }; /// Table unit in the Wal. diff --git a/table_engine/src/partition/rule/random.rs b/table_engine/src/partition/rule/random.rs index 7f70f33d23..3e94dd3404 100644 --- a/table_engine/src/partition/rule/random.rs +++ b/table_engine/src/partition/rule/random.rs @@ -18,7 +18,7 @@ use common_types::row::RowGroup; use itertools::Itertools; use crate::partition::{ - rule::{PartitionRule, PartitionedRows}, + rule::{filter::PartitionFilter, PartitionRule, PartitionedRows}, Result, }; @@ -44,10 +44,7 @@ impl PartitionRule for RandomRule { }) } - fn locate_partitions_for_read( - &self, - _filters: &[super::filter::PartitionFilter], - ) -> Result> { + fn locate_partitions_for_read(&self, _filters: &[PartitionFilter]) -> Result> { Ok((0..self.partition_num).collect_vec()) } } From c5c01af656fb2c577a34fc299e48b262af3dfc62 Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Mon, 18 Dec 2023 11:37:45 +0800 Subject: [PATCH 15/38] refactor: avoid duplicate codes (#1371) ## Rationale The codes about deciding whether to do metrics collection according to the read frequency are duplicate. ## Detailed Changes Remove the duplicate codes. ## Test Plan CI. --- .../src/sst/parquet/async_reader.rs | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index 0eb65c63d8..687949182f 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -100,7 +100,7 @@ pub struct Reader<'a> { metrics: Metrics, df_plan_metrics: ExecutionPlanMetricsSet, - table_level_sst_metrics: Arc, + table_level_sst_metrics: Option>, } #[derive(Default, Debug, Clone, TraceMetricWhenDrop)] @@ -130,6 +130,9 @@ impl<'a> Reader<'a> { ..Default::default() }; + let table_level_sst_metrics = matches!(options.frequency, ReadFrequency::Frequent) + .then(|| options.maybe_table_level_metrics.clone()); + Self { path, store, @@ -143,7 +146,7 @@ impl<'a> Reader<'a> { row_projector: None, metrics, df_plan_metrics, - table_level_sst_metrics: options.maybe_table_level_metrics.clone(), + table_level_sst_metrics, } } @@ -264,11 +267,11 @@ impl<'a> Reader<'a> { let num_row_group_after_prune = target_row_groups.len(); // Maybe it is a sub table of partitioned table, try to extract its parent // table. 
- if let ReadFrequency::Frequent = self.frequency { - self.table_level_sst_metrics + if let Some(metrics) = &self.table_level_sst_metrics { + metrics .row_group_before_prune_counter .inc_by(num_row_group_before_prune as u64); - self.table_level_sst_metrics + metrics .row_group_after_prune_counter .inc_by(num_row_group_after_prune as u64); } @@ -310,12 +313,8 @@ impl<'a> Reader<'a> { ); let mut streams = Vec::with_capacity(target_row_group_chunks.len()); - let metrics_collector = { - let metrics_for_object_store = matches!(self.frequency, ReadFrequency::Frequent) - .then(|| self.table_level_sst_metrics.clone()); - ObjectStoreMetricsObserver { - table_level_sst_metrics: metrics_for_object_store, - } + let metrics_collector = ObjectStoreMetricsObserver { + table_level_sst_metrics: self.table_level_sst_metrics.clone(), }; for chunk in target_row_group_chunks { let object_store_reader = ObjectStoreReader::with_metrics( From 9619810aa558ae79ba474144dfaef591c5d9ef55 Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Wed, 20 Dec 2023 16:05:40 +0800 Subject: [PATCH 16/38] feat: utilize the column cardinality for deciding whether to do dict (#1372) ## Rationale The column value set has been summarized into the meta data if the column is low distinct. With such information, the sampling for the columns can be skipped. ## Detailed Changes Skip sampling over the low-cardinality columns. ## Test Plan Updated the unit tests. --- analytic_engine/src/compaction/scheduler.rs | 1 + .../src/instance/flush_compaction.rs | 146 +++++++++++++++- analytic_engine/src/sst/factory.rs | 21 ++- analytic_engine/src/sst/meta_data/mod.rs | 11 +- analytic_engine/src/sst/parquet/encoding.rs | 2 +- analytic_engine/src/sst/parquet/writer.rs | 159 +++++++++++++----- benchmarks/src/sst_tools.rs | 1 + tools/src/bin/sst-convert.rs | 1 + 8 files changed, 281 insertions(+), 61 deletions(-) diff --git a/analytic_engine/src/compaction/scheduler.rs b/analytic_engine/src/compaction/scheduler.rs index 1496ef514c..72fe2f8bc8 100644 --- a/analytic_engine/src/compaction/scheduler.rs +++ b/analytic_engine/src/compaction/scheduler.rs @@ -505,6 +505,7 @@ impl ScheduleWorker { num_rows_per_row_group: table_data.table_options().num_rows_per_row_group, compression: table_data.table_options().compression, max_buffer_size: self.write_sst_max_buffer_size, + column_stats: Default::default(), }; let scan_options = self.scan_options.clone(); diff --git a/analytic_engine/src/instance/flush_compaction.rs b/analytic_engine/src/instance/flush_compaction.rs index 81ac53d7a8..cbede944ba 100644 --- a/analytic_engine/src/instance/flush_compaction.rs +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -14,7 +14,12 @@ // Flush and compaction logic of instance -use std::{cmp, collections::Bound, fmt, sync::Arc}; +use std::{ + cmp, + collections::{Bound, HashMap}, + fmt, + sync::Arc, +}; use common_types::{ projected_schema::ProjectedSchema, @@ -55,9 +60,9 @@ use crate::{ IterOptions, }, sst::{ - factory::{self, ScanOptions, SstWriteOptions}, + factory::{self, ColumnStats, ScanOptions, SstWriteOptions}, file::{FileMeta, Level}, - meta_data::SstMetaReader, + meta_data::{SstMetaData, SstMetaReader}, writer::{MetaData, RecordBatchStream}, }, table::{ @@ -541,6 +546,7 @@ impl FlushTask { num_rows_per_row_group: self.table_data.table_options().num_rows_per_row_group, compression: self.table_data.table_options().compression, max_buffer_size: self.write_sst_max_buffer_size, + column_stats: Default::default(), }; for time_range in &time_ranges { @@ -712,6 +718,7 @@ 
impl FlushTask { num_rows_per_row_group: self.table_data.table_options().num_rows_per_row_group, compression: self.table_data.table_options().compression, max_buffer_size: self.write_sst_max_buffer_size, + column_stats: Default::default(), }; let mut writer = self .space_store @@ -943,7 +950,7 @@ impl SpaceStore { row_iter::record_batch_with_key_iter_to_stream(merge_iter) }; - let sst_meta = { + let (sst_meta, column_stats) = { let meta_reader = SstMetaReader { space_id: table_data.space_id, table_id: table_data.id, @@ -956,7 +963,9 @@ impl SpaceStore { .await .context(ReadSstMeta)?; - MetaData::merge(sst_metas.into_iter().map(MetaData::from), schema) + let column_stats = collect_column_stats_from_meta_datas(&sst_metas); + let merged_meta = MetaData::merge(sst_metas.into_iter().map(MetaData::from), schema); + (merged_meta, column_stats) }; // Alloc file id for the merged sst. @@ -966,10 +975,17 @@ impl SpaceStore { .context(AllocFileId)?; let sst_file_path = table_data.set_sst_file_path(file_id); + let write_options = SstWriteOptions { + storage_format_hint: sst_write_options.storage_format_hint, + num_rows_per_row_group: sst_write_options.num_rows_per_row_group, + compression: sst_write_options.compression, + max_buffer_size: sst_write_options.max_buffer_size, + column_stats, + }; let mut sst_writer = self .sst_factory .create_writer( - sst_write_options, + &write_options, &sst_file_path, self.store_picker(), input.output_level, @@ -1062,6 +1078,42 @@ impl SpaceStore { } } +/// Collect the column stats from a batch of sst meta data. +fn collect_column_stats_from_meta_datas(metas: &[SstMetaData]) -> HashMap { + let mut low_cardinality_counts: HashMap = HashMap::new(); + for meta_data in metas { + let SstMetaData::Parquet(meta_data) = meta_data; + if let Some(column_values) = &meta_data.column_values { + for (col_idx, val_set) in column_values.iter().enumerate() { + let low_cardinality = val_set.is_some(); + if low_cardinality { + let col_name = meta_data.schema.column(col_idx).name.clone(); + low_cardinality_counts + .entry(col_name) + .and_modify(|v| *v += 1) + .or_insert(1); + } + } + } + } + + // Only the column whose cardinality is low in all the metas is a + // low-cardinality column. + // TODO: shall we merge all the distinct values of the column to check whether + // the cardinality is still thought to be low? 
+ let low_cardinality_cols = low_cardinality_counts + .into_iter() + .filter_map(|(col_name, cnt)| { + (cnt == metas.len()).then_some(( + col_name, + ColumnStats { + low_cardinality: true, + }, + )) + }); + HashMap::from_iter(low_cardinality_cols) +} + fn split_record_batch_with_time_ranges( record_batch: RecordBatchWithKey, time_ranges: &[TimeRange], @@ -1126,15 +1178,26 @@ fn build_mem_table_iter( #[cfg(test)] mod tests { + use std::sync::Arc; + + use bytes_ext::Bytes; use common_types::{ + schema::Schema, tests::{ - build_record_batch_with_key_by_rows, build_row, build_row_opt, + build_record_batch_with_key_by_rows, build_row, build_row_opt, build_schema, check_record_batch_with_key_with_rows, }, time::TimeRange, }; - use crate::instance::flush_compaction::split_record_batch_with_time_ranges; + use super::collect_column_stats_from_meta_datas; + use crate::{ + instance::flush_compaction::split_record_batch_with_time_ranges, + sst::{ + meta_data::SstMetaData, + parquet::meta_data::{ColumnValueSet, ParquetMetaData}, + }, + }; #[test] fn test_split_record_batch_with_time_ranges() { @@ -1187,4 +1250,71 @@ mod tests { check_record_batch_with_key_with_rows(&rets[1], rows1.len(), column_num, rows1); check_record_batch_with_key_with_rows(&rets[2], rows2.len(), column_num, rows2); } + + fn check_collect_column_stats( + schema: &Schema, + expected_low_cardinality_col_indexes: Vec, + meta_datas: Vec, + ) { + let column_stats = collect_column_stats_from_meta_datas(&meta_datas); + assert_eq!( + column_stats.len(), + expected_low_cardinality_col_indexes.len() + ); + + for col_idx in expected_low_cardinality_col_indexes { + let col_schema = schema.column(col_idx); + assert!(column_stats.contains_key(&col_schema.name)); + } + } + + #[test] + fn test_collect_column_stats_from_metadata() { + let schema = build_schema(); + let build_meta_data = |low_cardinality_col_indexes: Vec| { + let mut column_values = vec![None; 6]; + for idx in low_cardinality_col_indexes { + column_values[idx] = Some(ColumnValueSet::StringValue(Default::default())); + } + let parquet_meta_data = ParquetMetaData { + min_key: Bytes::new(), + max_key: Bytes::new(), + time_range: TimeRange::empty(), + max_sequence: 0, + schema: schema.clone(), + parquet_filter: None, + column_values: Some(column_values), + }; + SstMetaData::Parquet(Arc::new(parquet_meta_data)) + }; + + // Normal case 0 + let meta_datas = vec![ + build_meta_data(vec![0]), + build_meta_data(vec![0]), + build_meta_data(vec![0, 1]), + build_meta_data(vec![0, 2]), + build_meta_data(vec![0, 3]), + ]; + check_collect_column_stats(&schema, vec![0], meta_datas); + + // Normal case 1 + let meta_datas = vec![ + build_meta_data(vec![0]), + build_meta_data(vec![0]), + build_meta_data(vec![]), + build_meta_data(vec![1]), + build_meta_data(vec![3]), + ]; + check_collect_column_stats(&schema, vec![], meta_datas); + + // Normal case 2 + let meta_datas = vec![ + build_meta_data(vec![3, 5]), + build_meta_data(vec![0, 3, 5]), + build_meta_data(vec![0, 1, 2, 3, 5]), + build_meta_data(vec![1, 3, 5]), + ]; + check_collect_column_stats(&schema, vec![3, 5], meta_datas); + } } diff --git a/analytic_engine/src/sst/factory.rs b/analytic_engine/src/sst/factory.rs index 8a585d8639..8d507c6e34 100644 --- a/analytic_engine/src/sst/factory.rs +++ b/analytic_engine/src/sst/factory.rs @@ -14,7 +14,7 @@ //! Factory for different kinds sst writer and reader. 
-use std::{fmt::Debug, sync::Arc}; +use std::{collections::HashMap, fmt::Debug, sync::Arc}; use async_trait::async_trait; use common_types::projected_schema::ProjectedSchema; @@ -25,6 +25,7 @@ use snafu::{ResultExt, Snafu}; use table_engine::predicate::PredicateRef; use trace_metric::MetricsCollector; +use super::parquet::encoding::ColumnEncoding; use crate::{ sst::{ file::Level, @@ -146,6 +147,10 @@ pub struct SstReadOptions { pub runtime: Arc, } +#[derive(Clone, Debug)] +pub struct ColumnStats { + pub low_cardinality: bool, +} #[derive(Debug, Clone)] pub struct SstWriteOptions { @@ -153,6 +158,15 @@ pub struct SstWriteOptions { pub num_rows_per_row_group: usize, pub compression: Compression, pub max_buffer_size: usize, + pub column_stats: HashMap, +} + +impl From<&ColumnStats> for ColumnEncoding { + fn from(value: &ColumnStats) -> Self { + ColumnEncoding { + enable_dict: value.low_cardinality, + } + } } #[derive(Debug, Default)] @@ -203,11 +217,16 @@ impl Factory for FactoryImpl { store_picker: &'a ObjectStorePickerRef, level: Level, ) -> Result> { + let column_encodings = + HashMap::from_iter(options.column_stats.iter().map(|(col_name, col_stats)| { + (col_name.to_owned(), ColumnEncoding::from(col_stats)) + })); let write_options = WriteOptions { num_rows_per_row_group: options.num_rows_per_row_group, max_buffer_size: options.max_buffer_size, compression: options.compression.into(), sst_level: level, + column_encodings, }; Ok(Box::new(ParquetSstWriter::new( path, diff --git a/analytic_engine/src/sst/meta_data/mod.rs b/analytic_engine/src/sst/meta_data/mod.rs index 7e80afb4f5..e053ba4f7a 100644 --- a/analytic_engine/src/sst/meta_data/mod.rs +++ b/analytic_engine/src/sst/meta_data/mod.rs @@ -57,7 +57,7 @@ pub enum Error { #[snafu(display("Key value meta path in parquet is empty\nBacktrace\n:{}", backtrace))] KvMetaPathEmpty { backtrace: Backtrace }, - #[snafu(display("Unknown mata version, value:{}.\nBacktrace\n:{}", version, backtrace))] + #[snafu(display("Unknown meta version, value:{}.\nBacktrace\n:{}", version, backtrace))] UnknownMetaVersion { version: String, backtrace: Backtrace, @@ -66,9 +66,6 @@ pub enum Error { #[snafu(display("Metadata in proto struct is not found.\nBacktrace\n:{}", backtrace))] MetaDataNotFound { backtrace: Backtrace }, - #[snafu(display("Empty custom metadata in parquet.\nBacktrace\n:{}", backtrace))] - EmptyCustomMetaData { backtrace: Backtrace }, - #[snafu(display("Failed to decode custom metadata in parquet, err:{}", source))] DecodeCustomMetaData { source: encoding::Error }, @@ -81,12 +78,6 @@ pub enum Error { #[snafu(display("Failed to convert parquet meta data, err:{}", source))] ConvertParquetMetaData { source: parquet::meta_data::Error }, - #[snafu(display("Meet a object store error, err:{source}\nBacktrace:\n{backtrace}"))] - ObjectStoreError { - source: object_store::ObjectStoreError, - backtrace: Backtrace, - }, - #[snafu(display( "Failed to decode sst meta data, file_path:{file_path}, err:{source}.\nBacktrace:\n{backtrace:?}", ))] diff --git a/analytic_engine/src/sst/parquet/encoding.rs b/analytic_engine/src/sst/parquet/encoding.rs index 9dc4a13277..92f94b4cfe 100644 --- a/analytic_engine/src/sst/parquet/encoding.rs +++ b/analytic_engine/src/sst/parquet/encoding.rs @@ -238,7 +238,7 @@ struct ColumnarRecordEncoder { arrow_schema: ArrowSchemaRef, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct ColumnEncoding { pub enable_dict: bool, } diff --git a/analytic_engine/src/sst/parquet/writer.rs 
b/analytic_engine/src/sst/parquet/writer.rs index b2c7874547..ef233f7053 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -90,7 +90,7 @@ struct RecordBatchGroupWriter<'a> { request_id: RequestId, input: RecordBatchStream, meta_data: &'a MetaData, - options: &'a WriteOptions, + options: WriteOptions, // inner status input_exhausted: bool, @@ -101,12 +101,13 @@ struct RecordBatchGroupWriter<'a> { column_values: Option>>, } -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] pub struct WriteOptions { pub num_rows_per_row_group: usize, pub max_buffer_size: usize, pub compression: Compression, pub sst_level: Level, + pub column_encodings: HashMap, } impl WriteOptions { @@ -121,7 +122,7 @@ impl<'a> RecordBatchGroupWriter<'a> { request_id: RequestId, input: RecordBatchStream, meta_data: &'a MetaData, - options: &'a WriteOptions, + options: WriteOptions, ) -> Self { // No need to build complex index for the min-level sst so there is no need to // collect the column values. @@ -217,12 +218,14 @@ impl<'a> RecordBatchGroupWriter<'a> { fn build_column_encodings( &self, sample_row_groups: &[RecordBatchWithKey], - ) -> Result> { - let sampler = ColumnEncodingSampler { + column_encodings: &mut HashMap, + ) -> Result<()> { + let mut sampler = ColumnEncodingSampler { sample_row_groups, meta_data: self.meta_data, min_num_sample_rows: MIN_NUM_ROWS_SAMPLE_DICT_ENCODING, max_unique_value_ratio: MAX_UNIQUE_VALUE_RATIO_DICT_ENCODING, + column_encodings, }; sampler.sample() } @@ -321,8 +324,10 @@ impl<'a> RecordBatchGroupWriter<'a> { let mut arrow_row_group = Vec::new(); let mut total_num_rows = 0; + // Build the parquet encoder. let mut row_group = self.fetch_next_row_group(&mut prev_record_batch).await?; - let column_encodings = self.build_column_encodings(&row_group)?; + let mut column_encodings = std::mem::take(&mut self.options.column_encodings); + self.build_column_encodings(&row_group, &mut column_encodings)?; let encode_options = EncodeOptions { num_rows_per_row_group: self.options.num_rows_per_row_group, max_buffer_size: self.options.max_buffer_size, @@ -333,6 +338,7 @@ impl<'a> RecordBatchGroupWriter<'a> { ParquetEncoder::try_new(sink, &self.meta_data.schema, &encode_options) .box_err() .context(EncodeRecordBatch)?; + let mut parquet_filter = self .options .need_custom_filter() @@ -476,7 +482,14 @@ impl<'a> SstWriter for ParquetSstWriter<'a> { request_id, meta, self.options.num_rows_per_row_group ); - let group_writer = RecordBatchGroupWriter::new(request_id, input, meta, &self.options); + let write_options = WriteOptions { + num_rows_per_row_group: self.options.num_rows_per_row_group, + max_buffer_size: self.options.max_buffer_size, + compression: self.options.compression, + sst_level: self.options.sst_level, + column_encodings: std::mem::take(&mut self.options.column_encodings), + }; + let group_writer = RecordBatchGroupWriter::new(request_id, input, meta, write_options); let (aborter, sink) = ObjectStoreMultiUploadAborter::initialize_upload(self.store, self.path).await?; @@ -522,33 +535,34 @@ struct ColumnEncodingSampler<'a> { meta_data: &'a MetaData, min_num_sample_rows: usize, max_unique_value_ratio: f64, + column_encodings: &'a mut HashMap, } impl<'a> ColumnEncodingSampler<'a> { - fn sample(&self) -> Result> { + fn sample(&mut self) -> Result<()> { let num_total_rows: usize = self.sample_row_groups.iter().map(|v| v.num_rows()).sum(); - if num_total_rows < self.min_num_sample_rows { - return Ok(HashMap::new()); + let ignore_sampling = 
num_total_rows < self.min_num_sample_rows; + if ignore_sampling { + self.decide_column_encodings_by_data_type(); + return Ok(()); } assert!(self.max_unique_value_ratio <= 1.0 && self.max_unique_value_ratio >= 0.0); let max_unique_values = (num_total_rows as f64 * self.max_unique_value_ratio) as usize; let mut column_hashes = HashSet::with_capacity(max_unique_values); - let mut column_encodings = HashMap::with_capacity(self.meta_data.schema.num_columns()); for (col_idx, col_schema) in self.meta_data.schema.columns().iter().enumerate() { - // Only do dictionary encoding for string or bytes column. - let allowed_dict_type = matches!( - col_schema.data_type, - DatumKind::String | DatumKind::Varbinary - ); - if !allowed_dict_type { - column_encodings.insert( + if !Self::is_dictionary_type(col_schema.data_type) { + self.column_encodings.insert( col_schema.name.clone(), ColumnEncoding { enable_dict: false }, ); continue; } + if self.column_encodings.contains_key(&col_schema.name) { + continue; + } + for row_group in self.sample_row_groups { let col_block = &row_group.columns()[col_idx]; for idx in 0..row_group.num_rows() { @@ -567,10 +581,28 @@ impl<'a> ColumnEncodingSampler<'a> { // small. let enable_dict = column_hashes.len() < max_unique_values; column_hashes.clear(); - column_encodings.insert(col_schema.name.clone(), ColumnEncoding { enable_dict }); + self.column_encodings + .insert(col_schema.name.clone(), ColumnEncoding { enable_dict }); + } + + Ok(()) + } + + fn decide_column_encodings_by_data_type(&mut self) { + for col_schema in self.meta_data.schema.columns().iter() { + if !Self::is_dictionary_type(col_schema.data_type) { + self.column_encodings.insert( + col_schema.name.clone(), + ColumnEncoding { enable_dict: false }, + ); + } } + } - Ok(column_encodings) + #[inline] + fn is_dictionary_type(data_type: DatumKind) -> bool { + // Only do dictionary encoding for string or bytes column. 
+ matches!(data_type, DatumKind::String | DatumKind::Varbinary) } } @@ -630,6 +662,7 @@ mod tests { num_rows_per_row_group, compression: table_options::Compression::Uncompressed, max_buffer_size: 0, + column_stats: Default::default(), }; let dir = tempdir().unwrap(); @@ -867,6 +900,7 @@ mod tests { max_buffer_size: 0, compression: Compression::UNCOMPRESSED, sst_level: Level::default(), + column_encodings: Default::default(), }; let meta_data = MetaData { min_key: Default::default(), @@ -879,7 +913,7 @@ mod tests { RequestId::next_id(), record_batch_stream, &meta_data, - &write_options, + write_options, ); let mut prev_record_batch = None; @@ -895,21 +929,16 @@ mod tests { } fn check_sample_column_encoding( - sampler: ColumnEncodingSampler<'_>, - expect_enable_dicts: Option>, + mut sampler: ColumnEncodingSampler<'_>, + expect_enable_dicts: Vec>, ) { - let column_encodings = sampler.sample().unwrap(); - if expect_enable_dicts.is_none() { - assert!(column_encodings.is_empty()); - return; - } - - let expect_enable_dicts = expect_enable_dicts.unwrap(); + sampler.sample().unwrap(); for (col_idx, col_schema) in sampler.meta_data.schema.columns().iter().enumerate() { - let expect_enable_dict = expect_enable_dicts[col_idx]; - let column_encoding = column_encodings.get(&col_schema.name).unwrap(); + let expect_enable_dict = + expect_enable_dicts[col_idx].map(|v| ColumnEncoding { enable_dict: v }); + let column_encoding = sampler.column_encodings.get(&col_schema.name).cloned(); assert_eq!( - expect_enable_dict, column_encoding.enable_dict, + expect_enable_dict, column_encoding, "column:{}", col_schema.name ); @@ -946,33 +975,81 @@ mod tests { }; let record_batches_with_key = vec![record_batch_with_key0, record_batch_with_key1]; - // Normal case 1 + let mut column_encodings = HashMap::new(); let sampler = ColumnEncodingSampler { sample_row_groups: &record_batches_with_key, meta_data: &meta_data, min_num_sample_rows: 10, max_unique_value_ratio: 0.6, + column_encodings: &mut column_encodings, }; - let expect_enable_dicts = vec![true, false, false, true, false, false]; - check_sample_column_encoding(sampler, Some(expect_enable_dicts)); + let expect_enable_dicts = vec![ + Some(true), + Some(false), + Some(false), + Some(true), + Some(false), + Some(false), + ]; + check_sample_column_encoding(sampler, expect_enable_dicts); - // Normal case 2 + column_encodings.clear(); let sampler = ColumnEncodingSampler { sample_row_groups: &record_batches_with_key, meta_data: &meta_data, min_num_sample_rows: 10, max_unique_value_ratio: 0.2, + column_encodings: &mut column_encodings, }; - let expect_enable_dicts = vec![true, false, false, false, false, false]; - check_sample_column_encoding(sampler, Some(expect_enable_dicts)); + let expect_enable_dicts = vec![ + Some(true), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + ]; + check_sample_column_encoding(sampler, expect_enable_dicts); - // Normal case 3 + column_encodings.clear(); let sampler = ColumnEncodingSampler { sample_row_groups: &record_batches_with_key, meta_data: &meta_data, min_num_sample_rows: 30, max_unique_value_ratio: 0.2, + column_encodings: &mut column_encodings, }; - check_sample_column_encoding(sampler, None); + let expect_enable_dicts = vec![ + None, + Some(false), + Some(false), + None, + Some(false), + Some(false), + ]; + check_sample_column_encoding(sampler, expect_enable_dicts); + + column_encodings.clear(); + // `field1` is double type, it will still be changed to false even if it is set + // as true. 
+ // `field2` is string type, it will be kept as the pre-set. + column_encodings.insert("field1".to_string(), ColumnEncoding { enable_dict: true }); + column_encodings.insert("field2".to_string(), ColumnEncoding { enable_dict: true }); + let sampler = ColumnEncodingSampler { + sample_row_groups: &record_batches_with_key, + meta_data: &meta_data, + min_num_sample_rows: 10, + max_unique_value_ratio: 0.2, + column_encodings: &mut column_encodings, + }; + let expect_enable_dicts = vec![ + Some(true), + Some(false), + Some(false), + Some(true), + Some(false), + Some(false), + ]; + check_sample_column_encoding(sampler, expect_enable_dicts); } } diff --git a/benchmarks/src/sst_tools.rs b/benchmarks/src/sst_tools.rs index b9d41cbe83..62653d3c30 100644 --- a/benchmarks/src/sst_tools.rs +++ b/benchmarks/src/sst_tools.rs @@ -66,6 +66,7 @@ async fn create_sst_from_stream(config: SstConfig, record_batch_stream: RecordBa num_rows_per_row_group: config.num_rows_per_row_group, compression: config.compression, max_buffer_size: 1024 * 1024 * 10, + column_stats: Default::default(), }; info!( diff --git a/tools/src/bin/sst-convert.rs b/tools/src/bin/sst-convert.rs index 7944a62ab1..0021a1425b 100644 --- a/tools/src/bin/sst-convert.rs +++ b/tools/src/bin/sst-convert.rs @@ -122,6 +122,7 @@ async fn run(args: Args, runtime: Arc) -> Result<()> { compression: Compression::parse_from(&args.compression) .with_context(|| format!("invalid compression:{}", args.compression))?, max_buffer_size: 10 * 1024 * 1024, + column_stats: Default::default(), }; let output = Path::from(args.output); let mut writer = factory From 3f5d8f450168815671c28858b4ec74378ce4f633 Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Thu, 21 Dec 2023 17:42:35 +0800 Subject: [PATCH 17/38] fix: no write stall (#1388) ## Rationale #1003 tries to avoid frequent flush requests which may generate massive small ssts, but the write stall is also removed in the normal write path. ## Detailed Changes Introduce the `min_flush_interval` to avoid frequent flush requests and recover the write stall mechanism. ## Test Plan Add unit tests for the frequent flush check. 
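As a quick illustration of the check this patch introduces: a flush request for a
small memtable is skipped when the previous flush finished less than
`min_flush_interval` ago, while a memtable that is actually large enough to need
flushing is still flushed immediately. The sketch below is a minimal,
self-contained approximation of that logic; the struct mirrors the
`FrequentFlushChecker` added here, but the `SystemTime`-based clock is only a
stand-in for the engine's `time_ext::current_time_millis`.

```
// Minimal sketch of the frequent-flush check (assumes wall-clock millis;
// the real code uses the engine's time_ext helpers instead of SystemTime).
use std::time::{SystemTime, UNIX_EPOCH};

fn now_ms() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|d| d.as_millis() as u64)
        .unwrap_or(0)
}

struct FrequentFlushChecker {
    /// Minimum allowed interval between two flushes, in milliseconds.
    min_flush_interval_ms: u64,
    /// When the last flush happened, in milliseconds since the epoch.
    last_flush_time_ms: u64,
}

impl FrequentFlushChecker {
    /// A flush is considered too frequent if the minimum interval has not
    /// elapsed since the last flush.
    fn is_frequent_flush(&self) -> bool {
        self.last_flush_time_ms + self.min_flush_interval_ms > now_ms()
    }
}

fn main() {
    // A flush that just happened plus a 60s minimum interval means the next
    // flush request for a small memtable would be ignored.
    let checker = FrequentFlushChecker {
        min_flush_interval_ms: 60_000,
        last_flush_time_ms: now_ms(),
    };
    assert!(checker.is_frequent_flush());

    // A flush that happened long ago is not considered frequent.
    let checker = FrequentFlushChecker {
        min_flush_interval_ms: 60_000,
        last_flush_time_ms: now_ms().saturating_sub(120_000),
    };
    assert!(!checker.is_frequent_flush());
}
```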
--- analytic_engine/src/compaction/scheduler.rs | 4 ++ .../src/instance/flush_compaction.rs | 64 ++++++++++++++++++- analytic_engine/src/instance/mod.rs | 14 ++++ analytic_engine/src/instance/open.rs | 2 + .../src/instance/serial_executor.rs | 6 +- analytic_engine/src/instance/wal_replayer.rs | 3 +- analytic_engine/src/instance/write.rs | 5 +- analytic_engine/src/lib.rs | 3 + analytic_engine/src/table/data.rs | 5 +- 9 files changed, 93 insertions(+), 13 deletions(-) diff --git a/analytic_engine/src/compaction/scheduler.rs b/analytic_engine/src/compaction/scheduler.rs index 72fe2f8bc8..d242b44385 100644 --- a/analytic_engine/src/compaction/scheduler.rs +++ b/analytic_engine/src/compaction/scheduler.rs @@ -306,6 +306,7 @@ impl SchedulerImpl { runtime: Arc, config: SchedulerConfig, write_sst_max_buffer_size: usize, + min_flush_interval_ms: u64, scan_options: ScanOptions, ) -> Self { let (tx, rx) = mpsc::channel(config.schedule_channel_len); @@ -321,6 +322,7 @@ impl SchedulerImpl { max_ongoing_tasks: config.max_ongoing_tasks, max_unflushed_duration: config.max_unflushed_duration.0, write_sst_max_buffer_size, + min_flush_interval_ms, scan_options, limit: Arc::new(OngoingTaskLimit { ongoing_tasks: AtomicUsize::new(0), @@ -399,6 +401,7 @@ struct ScheduleWorker { picker_manager: PickerManager, max_ongoing_tasks: usize, write_sst_max_buffer_size: usize, + min_flush_interval_ms: u64, scan_options: ScanOptions, limit: Arc, running: Arc, @@ -665,6 +668,7 @@ impl ScheduleWorker { space_store: self.space_store.clone(), runtime: self.runtime.clone(), write_sst_max_buffer_size: self.write_sst_max_buffer_size, + min_flush_interval_ms: Some(self.min_flush_interval_ms), }; for table_data in &tables_buf { diff --git a/analytic_engine/src/instance/flush_compaction.rs b/analytic_engine/src/instance/flush_compaction.rs index cbede944ba..7094bb35ab 100644 --- a/analytic_engine/src/instance/flush_compaction.rs +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -213,6 +213,9 @@ pub struct Flusher { pub runtime: RuntimeRef, pub write_sst_max_buffer_size: usize, + /// If the interval is set, it will generate a [`FlushTask`] with min flush + /// interval check. + pub min_flush_interval_ms: Option, } struct FlushTask { @@ -220,6 +223,22 @@ struct FlushTask { table_data: TableDataRef, runtime: RuntimeRef, write_sst_max_buffer_size: usize, + // If the interval is set, it will be used to check whether flush is too frequent. + min_flush_interval_ms: Option, +} + +/// The checker to determine whether a flush is frequent. +struct FrequentFlushChecker { + min_flush_interval_ms: u64, + last_flush_time_ms: u64, +} + +impl FrequentFlushChecker { + #[inline] + fn is_frequent_flush(&self) -> bool { + let now = time_ext::current_time_millis(); + self.last_flush_time_ms + self.min_flush_interval_ms > now + } } impl Flusher { @@ -268,6 +287,7 @@ impl Flusher { space_store: self.space_store.clone(), runtime: self.runtime.clone(), write_sst_max_buffer_size: self.write_sst_max_buffer_size, + min_flush_interval_ms: self.min_flush_interval_ms, }; let flush_job = async move { flush_task.run().await }; @@ -281,6 +301,16 @@ impl FlushTask { /// Each table can only have one running flush task at the same time, which /// should be ensured by the caller. 
async fn run(&self) -> Result<()> { + let large_enough = self.table_data.should_flush_table(false); + if !large_enough && self.is_frequent_flush() { + debug!( + "Ignore flush task for too frequent flush of small memtable, table:{}", + self.table_data.name + ); + + return Ok(()); + } + let instant = Instant::now(); let flush_req = self.preprocess_flush(&self.table_data).await?; @@ -320,6 +350,18 @@ impl FlushTask { Ok(()) } + fn is_frequent_flush(&self) -> bool { + if let Some(min_flush_interval_ms) = self.min_flush_interval_ms { + let checker = FrequentFlushChecker { + min_flush_interval_ms, + last_flush_time_ms: self.table_data.last_flush_time(), + }; + checker.is_frequent_flush() + } else { + false + } + } + async fn preprocess_flush(&self, table_data: &TableDataRef) -> Result { let current_version = table_data.current_version(); let mut last_sequence = table_data.last_sequence(); @@ -1190,7 +1232,7 @@ mod tests { time::TimeRange, }; - use super::collect_column_stats_from_meta_datas; + use super::{collect_column_stats_from_meta_datas, FrequentFlushChecker}; use crate::{ instance::flush_compaction::split_record_batch_with_time_ranges, sst::{ @@ -1317,4 +1359,24 @@ mod tests { ]; check_collect_column_stats(&schema, vec![3, 5], meta_datas); } + + #[test] + fn test_frequent_flush() { + let now = time_ext::current_time_millis(); + let cases = vec![ + (now - 1000, 100, false), + (now - 1000, 2000, true), + (now - 10000, 200, false), + (now - 2000, 2000, false), + (now + 2000, 1000, true), + ]; + for (last_flush_time_ms, min_flush_interval_ms, expect) in cases { + let checker = FrequentFlushChecker { + min_flush_interval_ms, + last_flush_time_ms, + }; + + assert_eq!(expect, checker.is_frequent_flush()); + } + } } diff --git a/analytic_engine/src/instance/mod.rs b/analytic_engine/src/instance/mod.rs index 4b70a3f0c8..031f867ef5 100644 --- a/analytic_engine/src/instance/mod.rs +++ b/analytic_engine/src/instance/mod.rs @@ -176,6 +176,8 @@ pub struct Instance { pub(crate) replay_batch_size: usize, /// Write sst max buffer size pub(crate) write_sst_max_buffer_size: usize, + /// The min interval between flushes + pub(crate) min_flush_interval: ReadableDuration, /// Max retry limit to flush memtables pub(crate) max_retry_flush_limit: usize, /// Max bytes per write batch @@ -304,6 +306,18 @@ impl Instance { // Do flush in write runtime runtime: self.runtimes.write_runtime.clone(), write_sst_max_buffer_size: self.write_sst_max_buffer_size, + min_flush_interval_ms: None, + } + } + + #[inline] + fn make_flusher_with_min_interval(&self) -> Flusher { + Flusher { + space_store: self.space_store.clone(), + // Do flush in write runtime + runtime: self.runtimes.write_runtime.clone(), + write_sst_max_buffer_size: self.write_sst_max_buffer_size, + min_flush_interval_ms: Some(self.min_flush_interval.as_millis()), } } diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs index 446d363348..6fcf29d052 100644 --- a/analytic_engine/src/instance/open.rs +++ b/analytic_engine/src/instance/open.rs @@ -110,6 +110,7 @@ impl Instance { compaction_runtime, scheduler_config, ctx.config.write_sst_max_buffer_size.as_byte() as usize, + ctx.config.min_flush_interval.as_millis(), scan_options_for_compaction, )); @@ -137,6 +138,7 @@ impl Instance { space_write_buffer_size: ctx.config.space_write_buffer_size, replay_batch_size: ctx.config.replay_batch_size, write_sst_max_buffer_size: ctx.config.write_sst_max_buffer_size.as_byte() as usize, + min_flush_interval: ctx.config.min_flush_interval, 
max_retry_flush_limit: ctx.config.max_retry_flush_limit, mem_usage_sampling_interval: ctx.config.mem_usage_sampling_interval, max_bytes_per_write_batch: ctx diff --git a/analytic_engine/src/instance/serial_executor.rs b/analytic_engine/src/instance/serial_executor.rs index 579f6892a7..b5187048ec 100644 --- a/analytic_engine/src/instance/serial_executor.rs +++ b/analytic_engine/src/instance/serial_executor.rs @@ -166,11 +166,7 @@ impl TableFlushScheduler { *flush_state = FlushState::Flushing; break; } - FlushState::Flushing => { - if !block_on_write_thread { - return Ok(()); - } - } + FlushState::Flushing => {} FlushState::Failed { err_msg } => { if self .schedule_sync diff --git a/analytic_engine/src/instance/wal_replayer.rs b/analytic_engine/src/instance/wal_replayer.rs index 41ea3fb8b5..082b66d5b7 100644 --- a/analytic_engine/src/instance/wal_replayer.rs +++ b/analytic_engine/src/instance/wal_replayer.rs @@ -542,7 +542,8 @@ async fn replay_table_log_entries( } // Flush the table if necessary. - if table_data.should_flush_table(serial_exec) { + let in_flush = serial_exec.flush_scheduler().is_in_flush(); + if table_data.should_flush_table(in_flush) { let opts = TableFlushOptions { res_sender: None, max_retry_flush_limit, diff --git a/analytic_engine/src/instance/write.rs b/analytic_engine/src/instance/write.rs index ed738f864e..e49ccaacb3 100644 --- a/analytic_engine/src/instance/write.rs +++ b/analytic_engine/src/instance/write.rs @@ -620,7 +620,8 @@ impl<'a> Writer<'a> { } } - if self.table_data.should_flush_table(self.serial_exec) { + let in_flush = self.serial_exec.flush_scheduler().is_in_flush(); + if self.table_data.should_flush_table(in_flush) { let table_data = self.table_data.clone(); let _timer = table_data.metrics.start_table_write_flush_wait_timer(); self.handle_memtable_flush(&table_data).await?; @@ -673,7 +674,7 @@ impl<'a> Writer<'a> { res_sender: None, max_retry_flush_limit: self.instance.max_retry_flush_limit(), }; - let flusher = self.instance.make_flusher(); + let flusher = self.instance.make_flusher_with_min_interval(); if table_data.id == self.table_data.id { let flush_scheduler = self.serial_exec.flush_scheduler(); // Set `block_on_write_thread` to false and let flush do in background. diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs index 64ba8942cf..4e951c3445 100644 --- a/analytic_engine/src/lib.rs +++ b/analytic_engine/src/lib.rs @@ -97,6 +97,8 @@ pub struct Config { pub write_sst_max_buffer_size: ReadableSize, /// Max retry limit After flush failed pub max_retry_flush_limit: usize, + /// The min interval between two consecutive flushes + pub min_flush_interval: ReadableDuration, /// Max bytes per write batch. /// /// If this is set, the atomicity of write request will be broken. 
@@ -185,6 +187,7 @@ impl Default for Config { scan_max_record_batches_in_flight: 1024, write_sst_max_buffer_size: ReadableSize::mb(10), max_retry_flush_limit: 0, + min_flush_interval: ReadableDuration::minutes(1), max_bytes_per_write_batch: None, mem_usage_sampling_interval: ReadableDuration::secs(0), wal_encode: WalEncodeConfig::default(), diff --git a/analytic_engine/src/table/data.rs b/analytic_engine/src/table/data.rs index 29286042a2..2c011a9c1b 100644 --- a/analytic_engine/src/table/data.rs +++ b/analytic_engine/src/table/data.rs @@ -584,9 +584,7 @@ impl TableData { } /// Returns true if the memory usage of this table reaches flush threshold - /// - /// REQUIRE: Do in write worker - pub fn should_flush_table(&self, serial_exec: &mut TableOpSerialExecutor) -> bool { + pub fn should_flush_table(&self, in_flush: bool) -> bool { // Fallback to usize::MAX if Failed to convert arena_block_size into // usize (overflow) let max_write_buffer_size = self @@ -602,7 +600,6 @@ impl TableData { let mutable_usage = self.current_version.mutable_memory_usage(); let total_usage = self.current_version.total_memory_usage(); - let in_flush = serial_exec.flush_scheduler().is_in_flush(); // Inspired by https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94 if mutable_usage > mutable_limit && !in_flush { info!( From da4e7eae598a378dba906f04f03f90c7886f6bbc Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Fri, 22 Dec 2023 12:44:25 +0800 Subject: [PATCH 18/38] fix: missing and verbose logs (#1398) ## Rationale Some logs about query are verbose and some key logs about opening shard are missing. ## Detailed Changes Remove verbose logs and add missing key logs. ## Test Plan CI. --- analytic_engine/src/instance/open.rs | 8 +++++++- analytic_engine/src/manifest/details.rs | 4 ++-- query_engine/src/datafusion_impl/executor.rs | 10 +++++----- query_engine/src/datafusion_impl/physical_plan.rs | 4 ++-- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs index 6fcf29d052..8c840bf8ef 100644 --- a/analytic_engine/src/instance/open.rs +++ b/analytic_engine/src/instance/open.rs @@ -327,7 +327,13 @@ impl ShardOpener { space, }) } - Ok(None) => *state = TableOpenStage::Success(None), + Ok(None) => { + error!( + "ShardOpener trie to open a dropped table, table:{:?}, shard_id:{}", + ctx.table_def, self.shard_id + ); + *state = TableOpenStage::Success(None); + } Err(e) => *state = TableOpenStage::Failed(e), } } diff --git a/analytic_engine/src/manifest/details.rs b/analytic_engine/src/manifest/details.rs index 9a23cd8871..92810fff4a 100644 --- a/analytic_engine/src/manifest/details.rs +++ b/analytic_engine/src/manifest/details.rs @@ -540,7 +540,7 @@ impl Manifest for ManifestImpl { snapshot_store, }; let meta_snapshot_opt = recover.recover().await?.and_then(|v| v.data); - + let meta_snapshot_exists = meta_snapshot_opt.is_some(); // Apply it to table. 
         if let Some(snapshot) = meta_snapshot_opt {
             let meta_edit = MetaEdit::Snapshot(snapshot);
@@ -552,7 +552,7 @@ impl Manifest for ManifestImpl {
             self.table_meta_set.apply_edit_to_table(request)?;
         }
 
-        info!("Manifest recover finish, request:{load_req:?}");
+        info!("Manifest recover finish, request:{load_req:?}, meta_snapshot_exist:{meta_snapshot_exists}");
 
         Ok(())
     }
diff --git a/query_engine/src/datafusion_impl/executor.rs b/query_engine/src/datafusion_impl/executor.rs
index 83208d1fd5..0412c1d144 100644
--- a/query_engine/src/datafusion_impl/executor.rs
+++ b/query_engine/src/datafusion_impl/executor.rs
@@ -16,7 +16,7 @@ use std::{sync::Arc, time::Instant};
 
 use async_trait::async_trait;
 use generic_error::BoxError;
-use logger::info;
+use logger::debug;
 use snafu::ResultExt;
 use table_engine::stream::SendableRecordBatchStream;
 use time_ext::InstantExt;
@@ -70,8 +70,8 @@ impl Executor for DatafusionExecutorImpl {
         ctx: &Context,
         physical_plan: PhysicalPlanPtr,
     ) -> Result {
-        info!(
-            "DatafusionExecutorImpl begin to execute plan, request_id:{}, physical_plan: {:?}",
+        debug!(
+            "DatafusionExecutorImpl begin to execute plan, request_id:{}, physical_plan:{:?}",
             ctx.request_id, physical_plan
         );
 
@@ -87,8 +87,8 @@ impl Executor for DatafusionExecutorImpl {
                 msg: Some("failed to execute physical plan".to_string()),
             })?;
 
-        info!(
-            "DatafusionExecutorImpl finish to execute plan, request_id:{}, cost:{}ms, plan_and_metrics: {}",
+        debug!(
+            "DatafusionExecutorImpl finish to execute plan, request_id:{}, cost:{}ms, plan_and_metrics:{}",
             ctx.request_id,
             begin_instant.saturating_elapsed().as_millis(),
             physical_plan.metrics_to_string()
diff --git a/query_engine/src/datafusion_impl/physical_plan.rs b/query_engine/src/datafusion_impl/physical_plan.rs
index 583db57413..60ba23f5f6 100644
--- a/query_engine/src/datafusion_impl/physical_plan.rs
+++ b/query_engine/src/datafusion_impl/physical_plan.rs
@@ -26,7 +26,7 @@ use datafusion::physical_plan::{
     ExecutionPlan,
 };
 use generic_error::BoxError;
-use logger::info;
+use logger::debug;
 use snafu::{OptionExt, ResultExt};
 use table_engine::stream::{FromDfStream, SendableRecordBatchStream};
 
@@ -109,7 +109,7 @@ impl PhysicalPlan for DataFusionPhysicalPlanAdapter {
             Arc::new(CoalescePartitionsExec::new(executable))
         };
 
-        info!(
+        debug!(
             "DatafusionExecutorImpl get the executable plan, request_id:{}, physical_plan:{}",
             df_task_ctx.ctx.request_id,
             displayable(executable.as_ref()).indent(true)

From 4abc76499210dd1108610fd8e97b8a7c65c8bcee Mon Sep 17 00:00:00 2001
From: kamille <34352236+Rachelint@users.noreply.github.com>
Date: Fri, 22 Dec 2023 19:11:42 +0800
Subject: [PATCH 19/38] feat: avoid pulling unnecessary columns when querying append mode table (#1307)

## Rationale
Closes #1302
The pulled arrow record batches are guaranteed to include the primary key
columns, but for queries on append-mode tables those primary key columns are
never used. This PR refactors the whole record-batch pulling path, both for
readability and so that primary key columns are no longer pulled when they
are unused.

## Detailed Changes
+ Refactor `RowProjector` to `RecordFetchingContext`, which holds just the
  needed information, and pass it to `ScanRequest` & `SstReadOptions` instead
  of the too-heavy `ProjectedSchema`.
+ Refactor `RecordBatchWithKey` to `FetchingRecordBatch`, which holds the
  primary key indexes on demand.

## Test Plan
Covered by existing tests and newly added tests.
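To make the shape of this refactor concrete, the sketch below shows the
"primary key columns on demand" idea: the fetch path pulls the key columns
(and remembers where they sit in the fetched batch) only when the reader needs
them for dedup/merge, while an append-mode scan fetches nothing beyond the
user's projection. The `FetchedBatch` struct and `columns_to_fetch` helper are
illustrative assumptions for this sketch, not the `FetchedRecordBatch` /
`RowProjectorBuilder` API introduced by the patch.

```
// Sketch only: simplified stand-ins for the fetched-batch / projection types.

/// Columns fetched for a query, plus the positions of the primary key columns
/// inside `column_names` when a consumer actually needs them.
struct FetchedBatch {
    column_names: Vec<String>,
    /// `None` for append-mode scans: key columns are simply not fetched.
    primary_key_indexes: Option<Vec<usize>>,
}

/// Decide which columns to fetch: the user's projection, extended with the
/// primary key columns only when deduplication or merging requires them.
fn columns_to_fetch(
    projection: &[&str],
    primary_keys: &[&str],
    need_dedup: bool,
) -> (Vec<String>, Option<Vec<usize>>) {
    let mut cols: Vec<String> = projection.iter().map(|s| s.to_string()).collect();
    if !need_dedup {
        // Append-mode table: nothing forces the key columns into the fetch.
        return (cols, None);
    }
    let mut key_indexes = Vec::with_capacity(primary_keys.len());
    for key in primary_keys {
        match cols.iter().position(|c| c.as_str() == *key) {
            Some(idx) => key_indexes.push(idx),
            None => {
                cols.push(key.to_string());
                key_indexes.push(cols.len() - 1);
            }
        }
    }
    (cols, Some(key_indexes))
}

fn main() {
    // Append-mode query: only the projected column is fetched, no key indexes.
    let (cols, keys) = columns_to_fetch(&["value"], &["tsid", "timestamp"], false);
    assert_eq!(cols, vec!["value".to_string()]);
    assert!(keys.is_none());

    // Dedup/merge query: key columns are pulled in and their positions recorded.
    let (cols, keys) = columns_to_fetch(&["value"], &["tsid", "timestamp"], true);
    assert_eq!(cols.len(), 3);
    assert_eq!(keys, Some(vec![1, 2]));
    let _batch = FetchedBatch { column_names: cols, primary_key_indexes: keys };
}
```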
--- .../src/instance/flush_compaction.rs | 60 ++- analytic_engine/src/instance/mod.rs | 55 ++- analytic_engine/src/instance/read.rs | 40 +- .../src/instance/reorder_memtable.rs | 16 +- analytic_engine/src/lib.rs | 6 +- analytic_engine/src/memtable/columnar/iter.rs | 46 ++- analytic_engine/src/memtable/mod.rs | 8 +- analytic_engine/src/memtable/reversed_iter.rs | 10 +- analytic_engine/src/memtable/skiplist/iter.rs | 32 +- analytic_engine/src/memtable/skiplist/mod.rs | 24 +- analytic_engine/src/row_iter/chain.rs | 58 ++- analytic_engine/src/row_iter/dedup.rs | 40 +- analytic_engine/src/row_iter/merge.rs | 89 +++-- analytic_engine/src/row_iter/mod.rs | 8 +- .../src/row_iter/record_batch_stream.rs | 96 ++--- analytic_engine/src/row_iter/tests.rs | 37 +- analytic_engine/src/sst/factory.rs | 4 +- .../src/sst/parquet/async_reader.rs | 60 ++- analytic_engine/src/sst/parquet/writer.rs | 49 ++- analytic_engine/src/sst/reader.rs | 6 +- analytic_engine/src/sst/writer.rs | 7 +- benchmarks/src/merge_memtable_bench.rs | 44 +-- benchmarks/src/merge_sst_bench.rs | 50 +-- benchmarks/src/scan_memtable_bench.rs | 8 +- benchmarks/src/sst_bench.rs | 55 ++- benchmarks/src/sst_tools.rs | 41 ++- benchmarks/src/util.rs | 9 +- catalog/src/schema.rs | 10 +- common_types/src/projected_schema.rs | 341 +++++++++++------- common_types/src/record_batch.rs | 254 +++++++------ common_types/src/row/contiguous.rs | 25 +- common_types/src/row/mod.rs | 24 +- common_types/src/schema.rs | 13 +- common_types/src/tests.rs | 15 +- components/object_store/src/disk_cache.rs | 2 +- .../cases/env/local/ddl/query-plan.result | 59 ++- .../cases/env/local/ddl/query-plan.sql | 37 ++ partition_table_engine/src/scan_builder.rs | 2 +- src/wal/src/message_queue_impl/region.rs | 2 +- system_catalog/src/tables.rs | 26 +- table_engine/src/provider.rs | 35 +- tools/src/bin/sst-convert.rs | 12 +- 42 files changed, 1128 insertions(+), 687 deletions(-) diff --git a/analytic_engine/src/instance/flush_compaction.rs b/analytic_engine/src/instance/flush_compaction.rs index 7094bb35ab..880eb10ca9 100644 --- a/analytic_engine/src/instance/flush_compaction.rs +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -22,8 +22,8 @@ use std::{ }; use common_types::{ - projected_schema::ProjectedSchema, - record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + projected_schema::{ProjectedSchema, RowProjectorBuilder}, + record_batch::{FetchedRecordBatch, FetchedRecordBatchBuilder}, request_id::RequestId, row::RowViewOnBatch, time::TimeRange, @@ -46,8 +46,8 @@ use wal::manager::WalLocation; use crate::{ compaction::{CompactionInputFiles, CompactionTask, ExpiredFiles}, instance::{ - self, create_sst_read_option, reorder_memtable::Reorder, - serial_executor::TableFlushScheduler, ScanType, SpaceStore, SpaceStoreRef, + self, reorder_memtable::Reorder, serial_executor::TableFlushScheduler, ScanType, + SpaceStore, SpaceStoreRef, SstReadOptionsBuilder, }, manifest::meta_edit::{ AlterOptionsMeta, AlterSchemaMeta, MetaEdit, MetaEditRequest, MetaUpdate, VersionEditMeta, @@ -593,7 +593,7 @@ impl FlushTask { for time_range in &time_ranges { let (batch_record_sender, batch_record_receiver) = - channel::>(DEFAULT_CHANNEL_SIZE); + channel::>(DEFAULT_CHANNEL_SIZE); let file_id = self .table_data .alloc_file_id(&self.space_store.manifest) @@ -933,20 +933,26 @@ impl SpaceStore { let table_options = table_data.table_options(); let projected_schema = ProjectedSchema::no_projection(schema.clone()); let predicate = Arc::new(Predicate::empty()); - let sst_read_options = 
create_sst_read_option( + let maybe_table_level_metrics = table_data + .metrics + .maybe_table_level_metrics() + .sst_metrics + .clone(); + let sst_read_options_builder = SstReadOptionsBuilder::new( ScanType::Compaction, scan_options, - table_data - .metrics - .maybe_table_level_metrics() - .sst_metrics - .clone(), + maybe_table_level_metrics, table_options.num_rows_per_row_group, - projected_schema.clone(), predicate, self.meta_cache.clone(), runtime, ); + let fetched_schema = projected_schema.to_record_schema_with_key(); + let primary_key_indexes = fetched_schema.primary_key_idx().to_vec(); + let fetched_schema = fetched_schema.into_record_schema(); + let table_schema = projected_schema.table_schema().clone(); + let row_projector_builder = + RowProjectorBuilder::new(fetched_schema, table_schema, Some(primary_key_indexes)); let iter_options = IterOptions { batch_size: table_options.num_rows_per_row_group, @@ -966,8 +972,8 @@ impl SpaceStore { sequence, projected_schema, predicate: Arc::new(Predicate::empty()), + sst_read_options_builder: sst_read_options_builder.clone(), sst_factory: &self.sst_factory, - sst_read_options: sst_read_options.clone(), store_picker: self.store_picker(), merge_iter_options: iter_options.clone(), need_dedup: table_options.need_dedup(), @@ -992,6 +998,8 @@ impl SpaceStore { row_iter::record_batch_with_key_iter_to_stream(merge_iter) }; + // TODO: eliminate the duplicated building of `SstReadOptions`. + let sst_read_options = sst_read_options_builder.build(row_projector_builder); let (sst_meta, column_stats) = { let meta_reader = SstMetaReader { space_id: table_data.space_id, @@ -1157,12 +1165,17 @@ fn collect_column_stats_from_meta_datas(metas: &[SstMetaData]) -> HashMap Result> { - let mut builders: Vec = (0..time_ranges.len()) - .map(|_| RecordBatchWithKeyBuilder::new(record_batch.schema_with_key().clone())) +) -> Result> { + let fetched_schema = record_batch.schema(); + let primary_key_indexes = record_batch.primary_key_indexes(); + let mut builders: Vec = (0..time_ranges.len()) + .map(|_| { + let primary_key_indexes = primary_key_indexes.map(|idxs| idxs.to_vec()); + FetchedRecordBatchBuilder::new(fetched_schema.clone(), primary_key_indexes) + }) .collect(); for row_idx in 0..record_batch.num_rows() { @@ -1203,11 +1216,18 @@ fn build_mem_table_iter( table_data: &TableDataRef, ) -> Result { let scan_ctx = ScanContext::default(); + let projected_schema = ProjectedSchema::no_projection(table_data.schema()); + let fetched_schema = projected_schema.to_record_schema_with_key(); + let primary_key_indexes = fetched_schema.primary_key_idx().to_vec(); + let fetched_schema = fetched_schema.into_record_schema(); + let table_schema = projected_schema.table_schema().clone(); + let row_projector_builder = + RowProjectorBuilder::new(fetched_schema, table_schema, Some(primary_key_indexes)); let scan_req = ScanRequest { start_user_key: Bound::Unbounded, end_user_key: Bound::Unbounded, sequence: common_types::MAX_SEQUENCE_NUMBER, - projected_schema: ProjectedSchema::no_projection(table_data.schema()), + row_projector_builder, need_dedup: table_data.dedup(), reverse: false, metrics_collector: None, @@ -1226,7 +1246,7 @@ mod tests { use common_types::{ schema::Schema, tests::{ - build_record_batch_with_key_by_rows, build_row, build_row_opt, build_schema, + build_fetched_record_batch_by_rows, build_row, build_row_opt, build_schema, check_record_batch_with_key_with_rows, }, time::TimeRange, @@ -1275,7 +1295,7 @@ mod tests { .into_iter() .flatten() .collect(); - let 
record_batch_with_key = build_record_batch_with_key_by_rows(rows); + let record_batch_with_key = build_fetched_record_batch_by_rows(rows); let column_num = record_batch_with_key.num_columns(); let time_ranges = vec![ TimeRange::new_unchecked_for_test(0, 100), diff --git a/analytic_engine/src/instance/mod.rs b/analytic_engine/src/instance/mod.rs index 031f867ef5..ab8df1ef9b 100644 --- a/analytic_engine/src/instance/mod.rs +++ b/analytic_engine/src/instance/mod.rs @@ -33,7 +33,7 @@ pub(crate) mod write; use std::sync::Arc; -use common_types::{projected_schema::ProjectedSchema, table::TableId}; +use common_types::{projected_schema::RowProjectorBuilder, table::TableId}; use generic_error::{BoxError, GenericError}; use logger::{error, info}; use macros::define_result; @@ -327,32 +327,55 @@ impl Instance { } } -// TODO: make it a builder -#[allow(clippy::too_many_arguments)] -fn create_sst_read_option( +#[derive(Debug, Clone)] +pub struct SstReadOptionsBuilder { scan_type: ScanType, scan_options: ScanOptions, maybe_table_level_metrics: Arc, num_rows_per_row_group: usize, - projected_schema: ProjectedSchema, predicate: PredicateRef, meta_cache: Option, runtime: Arc, -) -> SstReadOptions { - SstReadOptions { - maybe_table_level_metrics, - num_rows_per_row_group, - frequency: scan_type.into(), - projected_schema, - predicate, - meta_cache, - scan_options, - runtime, +} + +impl SstReadOptionsBuilder { + pub fn new( + scan_type: ScanType, + scan_options: ScanOptions, + maybe_table_level_metrics: Arc, + num_rows_per_row_group: usize, + predicate: PredicateRef, + meta_cache: Option, + runtime: Arc, + ) -> Self { + Self { + scan_type, + scan_options, + maybe_table_level_metrics, + num_rows_per_row_group, + predicate, + meta_cache, + runtime, + } + } + + pub fn build(self, row_projector_builder: RowProjectorBuilder) -> SstReadOptions { + SstReadOptions { + maybe_table_level_metrics: self.maybe_table_level_metrics, + num_rows_per_row_group: self.num_rows_per_row_group, + frequency: self.scan_type.into(), + row_projector_builder, + predicate: self.predicate, + meta_cache: self.meta_cache, + scan_options: self.scan_options, + runtime: self.runtime, + } } } /// Scan type which mapped to the low level `ReadFrequency` in sst reader. 
-enum ScanType { +#[derive(Debug, Clone, Copy)] +pub enum ScanType { Query, Compaction, } diff --git a/analytic_engine/src/instance/read.rs b/analytic_engine/src/instance/read.rs index f769ec689d..9624f4cfbb 100644 --- a/analytic_engine/src/instance/read.rs +++ b/analytic_engine/src/instance/read.rs @@ -23,7 +23,7 @@ use std::{ use async_stream::try_stream; use common_types::{ projected_schema::ProjectedSchema, - record_batch::{RecordBatch, RecordBatchWithKey}, + record_batch::{FetchedRecordBatch, RecordBatch}, schema::RecordSchema, time::TimeRange, }; @@ -42,15 +42,14 @@ use time_ext::current_time_millis; use trace_metric::Metric; use crate::{ - instance::{create_sst_read_option, Instance, ScanType}, + instance::{Instance, ScanType, SstReadOptionsBuilder}, row_iter::{ chain, chain::{ChainConfig, ChainIterator}, dedup::DedupIterator, merge::{MergeBuilder, MergeConfig, MergeIterator}, - IterOptions, RecordBatchWithKeyIterator, + FetchedRecordBatchIterator, IterOptions, }, - sst::factory::SstReadOptions, table::{ data::TableData, version::{ReadView, TableVersion}, @@ -123,12 +122,11 @@ impl Instance { None, )); - let sst_read_options = create_sst_read_option( + let sst_read_options_builder = SstReadOptionsBuilder::new( ScanType::Query, self.scan_options.clone(), table_metrics.sst_metrics.clone(), table_options.num_rows_per_row_group, - request.projected_schema.clone(), request.predicate.clone(), self.meta_cache.clone(), self.read_runtime().clone(), @@ -136,12 +134,22 @@ impl Instance { if need_merge_sort { let merge_iters = self - .build_merge_iters(table_data, &request, &table_options, sst_read_options) + .build_merge_iters( + table_data, + &request, + &table_options, + sst_read_options_builder, + ) .await?; self.build_partitioned_streams(&request, merge_iters) } else { let chain_iters = self - .build_chain_iters(table_data, &request, &table_options, sst_read_options) + .build_chain_iters( + table_data, + &request, + &table_options, + sst_read_options_builder, + ) .await?; self.build_partitioned_streams(&request, chain_iters) } @@ -150,7 +158,7 @@ impl Instance { fn build_partitioned_streams( &self, request: &ReadRequest, - partitioned_iters: Vec, + partitioned_iters: Vec, ) -> Result { let read_parallelism = request.opts.read_parallelism; @@ -179,7 +187,7 @@ impl Instance { table_data: &TableData, request: &ReadRequest, table_options: &TableOptions, - sst_read_options: SstReadOptions, + sst_read_options_builder: SstReadOptionsBuilder, ) -> Result>> { // Current visible sequence let sequence = table_data.last_sequence(); @@ -203,7 +211,7 @@ impl Instance { projected_schema: request.projected_schema.clone(), predicate: request.predicate.clone(), sst_factory: &self.space_store.sst_factory, - sst_read_options: sst_read_options.clone(), + sst_read_options_builder: sst_read_options_builder.clone(), store_picker: self.space_store.store_picker(), merge_iter_options: iter_options.clone(), need_dedup: table_options.need_dedup(), @@ -239,7 +247,7 @@ impl Instance { table_data: &TableData, request: &ReadRequest, table_options: &TableOptions, - sst_read_options: SstReadOptions, + sst_read_options_builder: SstReadOptionsBuilder, ) -> Result> { let projected_schema = request.projected_schema.clone(); @@ -261,7 +269,7 @@ impl Instance { table_id: table_data.id, projected_schema: projected_schema.clone(), predicate: request.predicate.clone(), - sst_read_options: sst_read_options.clone(), + sst_read_options_builder: sst_read_options_builder.clone(), sst_factory: &self.space_store.sst_factory, store_picker: 
self.space_store.store_picker(), }; @@ -347,7 +355,7 @@ struct StreamStateOnMultiIters { projected_schema: ProjectedSchema, } -impl StreamStateOnMultiIters { +impl StreamStateOnMultiIters { fn is_exhausted(&self) -> bool { self.curr_iter_idx >= self.iters.len() } @@ -362,7 +370,7 @@ impl StreamStateOnMultiIters { async fn fetch_next_batch( &mut self, - ) -> Option> { + ) -> Option> { loop { if self.is_exhausted() { return None; @@ -379,7 +387,7 @@ impl StreamStateOnMultiIters { } fn iters_to_stream( - iters: Vec, + iters: Vec, projected_schema: ProjectedSchema, ) -> SendableRecordBatchStream { let mut state = StreamStateOnMultiIters { diff --git a/analytic_engine/src/instance/reorder_memtable.rs b/analytic_engine/src/instance/reorder_memtable.rs index be1db287dc..5a7a03de42 100644 --- a/analytic_engine/src/instance/reorder_memtable.rs +++ b/analytic_engine/src/instance/reorder_memtable.rs @@ -26,7 +26,7 @@ pub use arrow::{ }; use async_trait::async_trait; use common_types::{ - record_batch::{RecordBatchData, RecordBatchWithKey}, + record_batch::{FetchedRecordBatch, RecordBatchData}, schema::Schema, }; use datafusion::{ @@ -70,8 +70,8 @@ pub enum Error { define_result!(Error); pub type DfResult = std::result::Result; -type SendableRecordBatchWithkeyStream = - Pin> + Send>>; +type SendableFetchingRecordBatchStream = + Pin> + Send>>; impl From for Error { fn from(df_err: DataFusionError) -> Self { @@ -253,7 +253,7 @@ impl Reorder { // TODO: In theory we can construct a physical plan directly, here we choose // logical because it has a convenient builder API for use. - pub async fn into_stream(self) -> Result { + pub async fn into_stream(self) -> Result { // 1. Init datafusion context let runtime = Arc::new(RuntimeEnv::default()); let state = SessionState::with_config_rt(SessionConfig::new(), runtime); @@ -275,12 +275,16 @@ impl Reorder { // 3. 
Execute plan and transform stream let stream = execute_stream(physical_plan, ctx.task_ctx())?; - let schema_with_key = self.schema.to_record_schema_with_key(); + let record_schema = self.schema.to_record_schema(); let stream = stream.map(move |batch| { let batch = batch.context(FetchRecordBatch)?; let data = RecordBatchData::try_from(batch).context(ConvertRecordBatchData)?; - Ok(RecordBatchWithKey::new(schema_with_key.clone(), data)) + Ok(FetchedRecordBatch::new_from_parts( + record_schema.clone(), + None, + data, + )) }); Ok(Box::pin(stream)) diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs index 4e951c3445..e7d7f81027 100644 --- a/analytic_engine/src/lib.rs +++ b/analytic_engine/src/lib.rs @@ -43,7 +43,11 @@ use size_ext::ReadableSize; use time_ext::ReadableDuration; use wal::config::Config as WalConfig; -pub use crate::{compaction::scheduler::SchedulerConfig, table_options::TableOptions}; +pub use crate::{ + compaction::scheduler::SchedulerConfig, + instance::{ScanType, SstReadOptionsBuilder}, + table_options::TableOptions, +}; /// Config of analytic engine #[derive(Debug, Clone, Deserialize, Serialize)] diff --git a/analytic_engine/src/memtable/columnar/iter.rs b/analytic_engine/src/memtable/columnar/iter.rs index e10739f240..57ea4e6ebd 100644 --- a/analytic_engine/src/memtable/columnar/iter.rs +++ b/analytic_engine/src/memtable/columnar/iter.rs @@ -27,8 +27,8 @@ use common_types::{ column::Column, column_schema::ColumnId, datum::Datum, - projected_schema::{ProjectedSchema, RowProjector}, - record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + projected_schema::RowProjector, + record_batch::{FetchedRecordBatch, FetchedRecordBatchBuilder}, row::Row, schema::Schema, SequenceNumber, @@ -66,8 +66,7 @@ pub struct ColumnarIterImpl + Clone + Sync + Send> /// Schema of this memtable, used to decode row memtable_schema: Schema, /// Projection of schema to read - projected_schema: ProjectedSchema, - projector: RowProjector, + row_projector: RowProjector, // Options related: batch_size: usize, @@ -101,17 +100,16 @@ impl + Clone + Sync + Send> ColumnarIterImpl { last_sequence: SequenceNumber, skiplist: Skiplist, ) -> Result { - let projector = request - .projected_schema - .try_project_with_key(&schema) + let row_projector = request + .row_projector_builder + .build(&schema) .context(ProjectSchema)?; let mut columnar_iter = Self { memtable, row_num, current_idx: 0, memtable_schema: schema, - projected_schema: request.projected_schema, - projector, + row_projector, batch_size: ctx.batch_size, deadline: ctx.deadline, start_user_key: request.start_user_key, @@ -190,7 +188,7 @@ impl + Clone + Sync + Send> ColumnarIterImpl { } /// Fetch next record batch - fn fetch_next_record_batch(&mut self) -> Result> { + fn fetch_next_record_batch(&mut self) -> Result> { debug_assert_eq!(State::Initialized, self.state); assert!(self.batch_size > 0); let rows = if !self.need_dedup { @@ -207,8 +205,14 @@ impl + Clone + Sync + Send> ColumnarIterImpl { } } - let mut builder = RecordBatchWithKeyBuilder::with_capacity( - self.projected_schema.to_record_schema_with_key(), + let fetched_schema = self.row_projector.fetched_schema().clone(); + let primary_key_indexes = self + .row_projector + .primary_key_indexes() + .map(|idxs| idxs.to_vec()); + let mut builder = FetchedRecordBatchBuilder::with_capacity( + fetched_schema, + primary_key_indexes, self.batch_size, ); for row in rows.into_iter() { @@ -308,7 +312,12 @@ impl + Clone + Sync + Send> ColumnarIterImpl { 
Row::from_datums(vec![Datum::Null; self.memtable_schema.num_columns()]); self.batch_size ]; - for (col_idx, column_schema_idx) in self.projector.source_projection().iter().enumerate() { + for (col_idx, column_schema_idx) in self + .row_projector + .fetched_source_column_indexes() + .iter() + .enumerate() + { if let Some(column_schema_idx) = column_schema_idx { let column_schema = self.memtable_schema.column(*column_schema_idx); if let Some(column) = memtable.get(&column_schema.id) { @@ -328,11 +337,16 @@ impl + Clone + Sync + Send> ColumnarIterImpl { let mut num_rows = 0; let memtable = self.memtable.read().unwrap(); - let record_schema = self.projected_schema.to_record_schema(); + let record_schema = self.row_projector.fetched_schema(); let mut rows = vec![Row::from_datums(vec![Datum::Null; record_schema.num_columns()]); self.batch_size]; - for (col_idx, column_schema_idx) in self.projector.source_projection().iter().enumerate() { + for (col_idx, column_schema_idx) in self + .row_projector + .fetched_source_column_indexes() + .iter() + .enumerate() + { if let Some(column_schema_idx) = column_schema_idx { let column_schema = self.memtable_schema.column(*column_schema_idx); if let Some(column) = memtable.get(&column_schema.id) { @@ -378,7 +392,7 @@ impl + Clone + Sync + Send> ColumnarIterImpl { } impl Iterator for ColumnarIterImpl { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { if self.state != State::Initialized { diff --git a/analytic_engine/src/memtable/mod.rs b/analytic_engine/src/memtable/mod.rs index 28d0eeb8b3..ed3b20d348 100644 --- a/analytic_engine/src/memtable/mod.rs +++ b/analytic_engine/src/memtable/mod.rs @@ -24,8 +24,8 @@ use std::{ops::Bound, sync::Arc, time::Instant}; use bytes_ext::{ByteVec, Bytes}; use common_types::{ - projected_schema::ProjectedSchema, - record_batch::RecordBatchWithKey, + projected_schema::RowProjectorBuilder, + record_batch::FetchedRecordBatch, row::Row, schema::{IndexInWriterSchema, Schema}, time::TimeRange, @@ -203,7 +203,7 @@ pub struct ScanRequest { /// visible. pub sequence: SequenceNumber, /// Schema and projection to read. - pub projected_schema: ProjectedSchema, + pub row_projector_builder: RowProjectorBuilder, pub need_dedup: bool, pub reverse: bool, /// Collector for scan metrics. @@ -291,4 +291,4 @@ pub struct Metrics { pub type MemTableRef = Arc; /// A pointer to columnar iterator -pub type ColumnarIterPtr = Box> + Send + Sync>; +pub type ColumnarIterPtr = Box> + Send + Sync>; diff --git a/analytic_engine/src/memtable/reversed_iter.rs b/analytic_engine/src/memtable/reversed_iter.rs index 475eb704f3..5a9d5d75d3 100644 --- a/analytic_engine/src/memtable/reversed_iter.rs +++ b/analytic_engine/src/memtable/reversed_iter.rs @@ -14,7 +14,7 @@ use std::iter::Rev; -use common_types::record_batch::RecordBatchWithKey; +use common_types::record_batch::FetchedRecordBatch; use generic_error::BoxError; use snafu::ResultExt; @@ -26,13 +26,13 @@ use crate::memtable::{IterReverse, Result}; // reverse order naturally. 
pub struct ReversedColumnarIterator { iter: I, - reversed_iter: Option>>>, + reversed_iter: Option>>>, num_record_batch: usize, } impl ReversedColumnarIterator where - I: Iterator>, + I: Iterator>, { pub fn new(iter: I, num_rows: usize, batch_size: usize) -> Self { Self { @@ -57,9 +57,9 @@ where impl Iterator for ReversedColumnarIterator where - I: Iterator>, + I: Iterator>, { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { self.init_if_necessary(); diff --git a/analytic_engine/src/memtable/skiplist/iter.rs b/analytic_engine/src/memtable/skiplist/iter.rs index b101746096..60dd18ca0b 100644 --- a/analytic_engine/src/memtable/skiplist/iter.rs +++ b/analytic_engine/src/memtable/skiplist/iter.rs @@ -20,8 +20,8 @@ use arena::{Arena, BasicStats}; use bytes_ext::{Bytes, BytesMut}; use codec::row; use common_types::{ - projected_schema::{ProjectedSchema, RowProjector}, - record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + projected_schema::RowProjector, + record_batch::{FetchedRecordBatch, FetchedRecordBatchBuilder}, row::contiguous::{ContiguousRowReader, ProjectedContiguousRow}, schema::Schema, SequenceNumber, @@ -57,8 +57,7 @@ pub struct ColumnarIterImpl + Clone + Sync + Send> /// Schema of this memtable, used to decode row memtable_schema: Schema, /// Projection of schema to read - projected_schema: ProjectedSchema, - projector: RowProjector, + row_projector: RowProjector, // Options related: batch_size: usize, @@ -86,17 +85,16 @@ impl + Clone + Sync + Send> ColumnarIterImpl { request: ScanRequest, ) -> Result { // Create projection for the memtable schema - let projector = request - .projected_schema - .try_project_with_key(&memtable.schema) + let row_projector = request + .row_projector_builder + .build(&memtable.schema) .context(ProjectSchema)?; let iter = memtable.skiplist.iter(); let mut columnar_iter = Self { iter, memtable_schema: memtable.schema.clone(), - projected_schema: request.projected_schema, - projector, + row_projector, batch_size: ctx.batch_size, deadline: ctx.deadline, start_user_key: request.start_user_key, @@ -148,12 +146,18 @@ impl + Clone + Sync + Send> ColumnarIterImpl { } /// Fetch next record batch - fn fetch_next_record_batch(&mut self) -> Result> { + fn fetch_next_record_batch(&mut self) -> Result> { debug_assert_eq!(State::Initialized, self.state); assert!(self.batch_size > 0); - let mut builder = RecordBatchWithKeyBuilder::with_capacity( - self.projected_schema.to_record_schema_with_key(), + let record_schema = self.row_projector.fetched_schema().clone(); + let primary_key_indexes = self + .row_projector + .primary_key_indexes() + .map(|idxs| idxs.to_vec()); + let mut builder = FetchedRecordBatchBuilder::with_capacity( + record_schema, + primary_key_indexes, self.batch_size, ); let mut num_rows = 0; @@ -161,7 +165,7 @@ impl + Clone + Sync + Send> ColumnarIterImpl { if let Some(row) = self.fetch_next_row()? 
{ let row_reader = ContiguousRowReader::try_new(&row, &self.memtable_schema) .context(DecodeContinuousRow)?; - let projected_row = ProjectedContiguousRow::new(row_reader, &self.projector); + let projected_row = ProjectedContiguousRow::new(row_reader, &self.row_projector); trace!("Column iterator fetch next row, row:{:?}", projected_row); @@ -293,7 +297,7 @@ impl + Clone + Sync + Send> ColumnarIterImpl { } impl + Clone + Sync + Send> Iterator for ColumnarIterImpl { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { if self.state != State::Initialized { diff --git a/analytic_engine/src/memtable/skiplist/mod.rs b/analytic_engine/src/memtable/skiplist/mod.rs index 6298903904..a71a82a612 100644 --- a/analytic_engine/src/memtable/skiplist/mod.rs +++ b/analytic_engine/src/memtable/skiplist/mod.rs @@ -274,8 +274,8 @@ mod tests { use codec::memcomparable::MemComparable; use common_types::{ datum::Datum, - projected_schema::ProjectedSchema, - record_batch::RecordBatchWithKey, + projected_schema::{ProjectedSchema, RowProjectorBuilder}, + record_batch::FetchedRecordBatch, row::Row, schema::IndexInWriterSchema, tests::{build_row, build_schema}, @@ -294,7 +294,10 @@ mod tests { ) { let projection: Vec = (0..schema.num_columns()).collect(); let projected_schema = ProjectedSchema::new(schema, Some(projection)).unwrap(); - + let fetched_schema = projected_schema.to_record_schema(); + let table_schema = projected_schema.table_schema(); + let row_projector_builder = + RowProjectorBuilder::new(fetched_schema, table_schema.clone(), None); let testcases = vec![ ( // limited by sequence @@ -302,7 +305,7 @@ mod tests { start_user_key: Bound::Unbounded, end_user_key: Bound::Unbounded, sequence: 2, - projected_schema: projected_schema.clone(), + row_projector_builder: row_projector_builder.clone(), need_dedup: true, reverse: false, metrics_collector: None, @@ -322,7 +325,7 @@ mod tests { start_user_key: Bound::Included(build_scan_key("a", 1)), end_user_key: Bound::Excluded(build_scan_key("e", 5)), sequence: 2, - projected_schema: projected_schema.clone(), + row_projector_builder: row_projector_builder.clone(), need_dedup: true, reverse: false, metrics_collector: None, @@ -341,7 +344,7 @@ mod tests { start_user_key: Bound::Included(build_scan_key("a", 1)), end_user_key: Bound::Excluded(build_scan_key("e", 5)), sequence: 1, - projected_schema, + row_projector_builder, need_dedup: true, reverse: false, metrics_collector: None, @@ -367,13 +370,16 @@ mod tests { ) { let projection: Vec = (0..2).collect(); let projected_schema = ProjectedSchema::new(schema, Some(projection)).unwrap(); - + let fetched_schema = projected_schema.to_record_schema(); + let table_schema = projected_schema.table_schema(); + let row_projector_builder = + RowProjectorBuilder::new(fetched_schema, table_schema.clone(), None); let testcases = vec![( ScanRequest { start_user_key: Bound::Included(build_scan_key("a", 1)), end_user_key: Bound::Excluded(build_scan_key("e", 5)), sequence: 2, - projected_schema, + row_projector_builder, need_dedup: true, reverse: false, metrics_collector: None, @@ -457,7 +463,7 @@ mod tests { test_memtable_scan_for_projection(schema, memtable); } - fn check_iterator>>( + fn check_iterator>>( iter: T, expected_rows: Vec, ) { diff --git a/analytic_engine/src/row_iter/chain.rs b/analytic_engine/src/row_iter/chain.rs index 71df6e2f9c..3f8bff6bb9 100644 --- a/analytic_engine/src/row_iter/chain.rs +++ b/analytic_engine/src/row_iter/chain.rs @@ -19,7 +19,9 @@ use std::{ use async_trait::async_trait; use 
common_types::{ - projected_schema::ProjectedSchema, record_batch::RecordBatchWithKey, request_id::RequestId, + projected_schema::{ProjectedSchema, RowProjectorBuilder}, + record_batch::FetchedRecordBatch, + request_id::RequestId, schema::RecordSchemaWithKey, }; use generic_error::GenericError; @@ -30,13 +32,16 @@ use table_engine::{predicate::PredicateRef, table::TableId}; use trace_metric::{MetricsCollector, TraceMetricWhenDrop}; use crate::{ + instance::SstReadOptionsBuilder, row_iter::{ - record_batch_stream, record_batch_stream::BoxedPrefetchableRecordBatchStream, - RecordBatchWithKeyIterator, + record_batch_stream::{ + self, BoxedPrefetchableRecordBatchStream, MemtableStreamContext, SstStreamContext, + }, + FetchedRecordBatchIterator, }, space::SpaceId, sst::{ - factory::{FactoryRef as SstFactoryRef, ObjectStorePickerRef, SstReadOptions}, + factory::{FactoryRef as SstFactoryRef, ObjectStorePickerRef}, file::FileHandle, }, table::version::{MemTableVec, SamplingMemTable}, @@ -74,7 +79,7 @@ pub struct ChainConfig<'a> { pub predicate: PredicateRef, pub num_streams_to_prefetch: usize, - pub sst_read_options: SstReadOptions, + pub sst_read_options_builder: SstReadOptionsBuilder, /// Sst factory pub sst_factory: &'a SstFactoryRef, /// Store picker for persisting sst. @@ -119,6 +124,29 @@ impl<'a> Builder<'a> { impl<'a> Builder<'a> { pub async fn build(self) -> Result { + let fetched_schema = self.config.projected_schema.to_record_schema(); + let table_schema = self.config.projected_schema.table_schema(); + let row_projector_builder = + RowProjectorBuilder::new(fetched_schema.clone(), table_schema.clone(), None); + let sst_read_options = self + .config + .sst_read_options_builder + .build(row_projector_builder.clone()); + + let memtable_stream_ctx = MemtableStreamContext { + row_projector_builder, + fetched_schema: fetched_schema.clone(), + predicate: self.config.predicate, + need_dedup: false, + reverse: false, + deadline: self.config.deadline, + }; + + let sst_stream_ctx = SstStreamContext { + sst_read_options, + fetched_schema, + }; + let total_sst_streams: usize = self.ssts.iter().map(|v| v.len()).sum(); let mut total_streams = self.memtables.len() + total_sst_streams; if self.sampling_mem.is_some() { @@ -128,12 +156,8 @@ impl<'a> Builder<'a> { if let Some(v) = &self.sampling_mem { let stream = record_batch_stream::filtered_stream_from_memtable( - self.config.projected_schema.clone(), - false, &v.mem, - false, - self.config.predicate.as_ref(), - self.config.deadline, + &memtable_stream_ctx, self.config.metrics_collector.clone(), ) .context(BuildStreamFromMemtable)?; @@ -142,14 +166,10 @@ impl<'a> Builder<'a> { for memtable in &self.memtables { let stream = record_batch_stream::filtered_stream_from_memtable( - self.config.projected_schema.clone(), - false, // chain iterator only handle the case reading in no order so just read in asc // order by default. 
&memtable.mem, - false, - self.config.predicate.as_ref(), - self.config.deadline, + &memtable_stream_ctx, self.config.metrics_collector.clone(), ) .context(BuildStreamFromMemtable)?; @@ -163,8 +183,8 @@ impl<'a> Builder<'a> { self.config.table_id, sst, self.config.sst_factory, - &self.config.sst_read_options, self.config.store_picker, + &sst_stream_ctx, self.config.metrics_collector.clone(), ) .await @@ -307,7 +327,7 @@ impl ChainIterator { } } - async fn next_batch_internal(&mut self) -> Result> { + async fn next_batch_internal(&mut self) -> Result> { self.init_if_necessary(); self.maybe_prefetch().await; @@ -357,14 +377,14 @@ impl Drop for ChainIterator { } #[async_trait] -impl RecordBatchWithKeyIterator for ChainIterator { +impl FetchedRecordBatchIterator for ChainIterator { type Error = Error; fn schema(&self) -> &RecordSchemaWithKey { &self.schema } - async fn next_batch(&mut self) -> Result> { + async fn next_batch(&mut self) -> Result> { let timer = Instant::now(); let res = self.next_batch_internal().await; self.metrics.scan_duration += timer.elapsed(); diff --git a/analytic_engine/src/row_iter/dedup.rs b/analytic_engine/src/row_iter/dedup.rs index cdcffeb5a7..a35d1489f2 100644 --- a/analytic_engine/src/row_iter/dedup.rs +++ b/analytic_engine/src/row_iter/dedup.rs @@ -16,7 +16,7 @@ use std::cmp::Ordering; use async_trait::async_trait; use common_types::{ - record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + record_batch::{FetchedRecordBatch, FetchedRecordBatchBuilder}, request_id::RequestId, row::{Row, RowViewOnBatch, RowWithMeta}, schema::RecordSchemaWithKey, @@ -26,7 +26,7 @@ use logger::{info, trace}; use macros::define_result; use snafu::{ResultExt, Snafu}; -use crate::row_iter::{IterOptions, RecordBatchWithKeyIterator}; +use crate::row_iter::{FetchedRecordBatchIterator, IterOptions}; #[derive(Debug, Snafu)] pub enum Error { @@ -54,7 +54,7 @@ define_result!(Error); pub struct DedupIterator { request_id: RequestId, schema: RecordSchemaWithKey, - record_batch_builder: RecordBatchWithKeyBuilder, + record_batch_builder: FetchedRecordBatchBuilder, iter: I, /// Previous row returned. prev_row: Option, @@ -67,15 +67,19 @@ pub struct DedupIterator { total_selected_rows: usize, } -impl DedupIterator { +impl DedupIterator { pub fn new(request_id: RequestId, iter: I, iter_options: IterOptions) -> Self { - let schema = iter.schema(); - - let record_batch_builder = - RecordBatchWithKeyBuilder::with_capacity(schema.clone(), iter_options.batch_size); + let schema_with_key = iter.schema(); + let primary_key_indexes = schema_with_key.primary_key_idx().to_vec(); + let fetched_schema = schema_with_key.to_record_schema(); + let record_batch_builder = FetchedRecordBatchBuilder::with_capacity( + fetched_schema, + Some(primary_key_indexes), + iter_options.batch_size, + ); Self { request_id, - schema: schema.clone(), + schema: schema_with_key.clone(), record_batch_builder, iter, prev_row: None, @@ -85,7 +89,7 @@ impl DedupIterator { } } - fn dedup_batch(&mut self, record_batch: RecordBatchWithKey) -> Result { + fn dedup_batch(&mut self, record_batch: FetchedRecordBatch) -> Result { self.selected_rows.clear(); // Ignore all rows by default. self.selected_rows.resize(record_batch.num_rows(), false); @@ -141,9 +145,9 @@ impl DedupIterator { /// Filter batch by `selected_rows`. 
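Note on the `DedupIterator` hunk above: every iterator in this patch that still needs key information derives it the same way, by splitting a `RecordSchemaWithKey` into a plain `RecordSchema` plus the list of primary-key indexes before creating a `FetchedRecordBatchBuilder`. A minimal sketch of that pattern, assuming only the `with_capacity` signature shown in the hunk (the helper name is illustrative, not part of the patch):

```rust
use common_types::{record_batch::FetchedRecordBatchBuilder, schema::RecordSchemaWithKey};

/// Build a batch builder that still remembers which columns form the primary
/// key. Per the `RowProjector` docs later in this patch, `None` is used
/// instead when the table is in append mode and carries no key information.
fn dedup_batch_builder(
    schema_with_key: &RecordSchemaWithKey,
    batch_size: usize,
) -> FetchedRecordBatchBuilder {
    // Key columns are tracked as indexes into the fetched schema.
    let primary_key_indexes = schema_with_key.primary_key_idx().to_vec();
    // Drop the key wrapper; FetchedRecordBatch carries the indexes instead.
    let fetched_schema = schema_with_key.to_record_schema();
    FetchedRecordBatchBuilder::with_capacity(fetched_schema, Some(primary_key_indexes), batch_size)
}
```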
fn filter_batch( &mut self, - record_batch: RecordBatchWithKey, + record_batch: FetchedRecordBatch, selected_num: usize, - ) -> Result { + ) -> Result { self.total_selected_rows += selected_num; self.total_duplications += record_batch.num_rows() - selected_num; @@ -169,14 +173,14 @@ impl DedupIterator { } #[async_trait] -impl RecordBatchWithKeyIterator for DedupIterator { +impl FetchedRecordBatchIterator for DedupIterator { type Error = Error; fn schema(&self) -> &RecordSchemaWithKey { &self.schema } - async fn next_batch(&mut self) -> Result> { + async fn next_batch(&mut self) -> Result> { match self .iter .next_batch() @@ -210,7 +214,9 @@ mod tests { use common_types::tests::{build_row, build_schema}; use super::*; - use crate::row_iter::tests::{build_record_batch_with_key, check_iterator, VectorIterator}; + use crate::row_iter::tests::{ + build_fetched_record_batch_with_key, check_iterator, VectorIterator, + }; #[tokio::test] async fn test_dedup_iterator() { @@ -219,7 +225,7 @@ mod tests { let iter = VectorIterator::new( schema.to_record_schema_with_key(), vec![ - build_record_batch_with_key( + build_fetched_record_batch_with_key( schema.clone(), vec![ build_row(b"a", 1, 10.0, "v1", 1000, 1_000_000), @@ -227,7 +233,7 @@ mod tests { build_row(b"a", 2, 10.0, "v2", 2000, 2_000_000), ], ), - build_record_batch_with_key( + build_fetched_record_batch_with_key( schema, vec![ build_row(b"a", 2, 10.0, "v", 2000, 2_000_000), diff --git a/analytic_engine/src/row_iter/merge.rs b/analytic_engine/src/row_iter/merge.rs index db39f78d2f..e9029060cc 100644 --- a/analytic_engine/src/row_iter/merge.rs +++ b/analytic_engine/src/row_iter/merge.rs @@ -23,8 +23,8 @@ use std::{ use async_trait::async_trait; use common_types::{ - projected_schema::ProjectedSchema, - record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + projected_schema::{ProjectedSchema, RowProjectorBuilder}, + record_batch::{FetchedRecordBatch, FetchedRecordBatchBuilder}, request_id::RequestId, row::RowViewOnBatch, schema::RecordSchemaWithKey, @@ -39,14 +39,17 @@ use table_engine::{predicate::PredicateRef, table::TableId}; use trace_metric::{MetricsCollector, TraceMetricWhenDrop}; use crate::{ + instance::SstReadOptionsBuilder, row_iter::{ - record_batch_stream, - record_batch_stream::{BoxedPrefetchableRecordBatchStream, SequencedRecordBatch}, - IterOptions, RecordBatchWithKeyIterator, + record_batch_stream::{ + self, BoxedPrefetchableRecordBatchStream, MemtableStreamContext, SequencedRecordBatch, + SstStreamContext, + }, + FetchedRecordBatchIterator, IterOptions, }, space::SpaceId, sst::{ - factory::{FactoryRef as SstFactoryRef, ObjectStorePickerRef, SstReadOptions}, + factory::{FactoryRef as SstFactoryRef, ObjectStorePickerRef}, file::{FileHandle, Level, SST_LEVEL_NUM}, }, table::version::{MemTableVec, SamplingMemTable}, @@ -108,7 +111,7 @@ pub struct MergeConfig<'a> { /// The predicate of the query. pub predicate: PredicateRef, - pub sst_read_options: SstReadOptions, + pub sst_read_options_builder: SstReadOptionsBuilder, /// Sst factory pub sst_factory: &'a SstFactoryRef, /// Store picker for persisting sst. @@ -129,8 +132,10 @@ pub struct MergeBuilder<'a> { /// Sampling memtable to read. sampling_mem: Option, + /// MemTables to read. memtables: MemTableVec, + /// Ssts to read of each level. 
ssts: Vec>, } @@ -170,6 +175,34 @@ impl<'a> MergeBuilder<'a> { } pub async fn build(self) -> Result { + let fetched_schema = self.config.projected_schema.to_record_schema_with_key(); + let primary_key_indexes = fetched_schema.primary_key_idx().to_vec(); + let fetched_schema = fetched_schema.into_record_schema(); + let table_schema = self.config.projected_schema.table_schema(); + let row_projector_builder = RowProjectorBuilder::new( + fetched_schema.clone(), + table_schema.clone(), + Some(primary_key_indexes), + ); + let sst_read_options = self + .config + .sst_read_options_builder + .build(row_projector_builder.clone()); + + let memtable_stream_ctx = MemtableStreamContext { + row_projector_builder, + fetched_schema: fetched_schema.clone(), + predicate: self.config.predicate, + need_dedup: self.config.need_dedup, + reverse: self.config.reverse, + deadline: self.config.deadline, + }; + + let sst_stream_ctx = SstStreamContext { + sst_read_options, + fetched_schema, + }; + let sst_streams_num: usize = self .ssts .iter() @@ -192,12 +225,8 @@ impl<'a> MergeBuilder<'a> { if let Some(v) = &self.sampling_mem { let stream = record_batch_stream::filtered_stream_from_memtable( - self.config.projected_schema.clone(), - self.config.need_dedup, &v.mem, - self.config.reverse, - self.config.predicate.as_ref(), - self.config.deadline, + &memtable_stream_ctx, self.config.metrics_collector.clone(), ) .context(BuildStreamFromMemtable)?; @@ -206,12 +235,8 @@ impl<'a> MergeBuilder<'a> { for memtable in &self.memtables { let stream = record_batch_stream::filtered_stream_from_memtable( - self.config.projected_schema.clone(), - self.config.need_dedup, &memtable.mem, - self.config.reverse, - self.config.predicate.as_ref(), - self.config.deadline, + &memtable_stream_ctx, self.config.metrics_collector.clone(), ) .context(BuildStreamFromMemtable)?; @@ -226,8 +251,8 @@ impl<'a> MergeBuilder<'a> { self.config.table_id, f, self.config.sst_factory, - &self.config.sst_read_options, self.config.store_picker, + &sst_stream_ctx, self.config.metrics_collector.clone(), ) .await @@ -324,7 +349,7 @@ impl BufferedStreamState { /// Returns number of rows added. fn append_rows_to( &mut self, - builder: &mut RecordBatchWithKeyBuilder, + builder: &mut FetchedRecordBatchBuilder, len: usize, ) -> Result { let added = builder @@ -336,7 +361,7 @@ impl BufferedStreamState { /// Take record batch slice with at most `len` rows from cursor and advance /// the cursor. - fn take_record_batch_slice(&mut self, len: usize) -> RecordBatchWithKey { + fn take_record_batch_slice(&mut self, len: usize) -> FetchedRecordBatch { let len_to_fetch = cmp::min( self.buffered_record_batch.record_batch.num_rows() - self.cursor, len, @@ -403,14 +428,14 @@ impl BufferedStream { /// REQUIRE: the buffer is not exhausted. fn append_rows_to( &mut self, - builder: &mut RecordBatchWithKeyBuilder, + builder: &mut FetchedRecordBatchBuilder, len: usize, ) -> Result { self.state.as_mut().unwrap().append_rows_to(builder, len) } /// REQUIRE: the buffer is not exhausted. - fn take_record_batch_slice(&mut self, len: usize) -> RecordBatchWithKey { + fn take_record_batch_slice(&mut self, len: usize) -> FetchedRecordBatch { self.state.as_mut().unwrap().take_record_batch_slice(len) } @@ -634,7 +659,7 @@ pub struct MergeIterator { request_id: RequestId, inited: bool, schema: RecordSchemaWithKey, - record_batch_builder: RecordBatchWithKeyBuilder, + record_batch_builder: FetchedRecordBatchBuilder, origin_streams: Vec, /// ssts are kept here to avoid them from being purged. 
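Taken together, the chain and merge builders above now follow one projection flow: derive a `RowProjectorBuilder` from the `ProjectedSchema`, hand a clone of it to `SstReadOptionsBuilder::build`, and bundle everything else into `MemtableStreamContext` / `SstStreamContext`. A condensed sketch of that flow, using only the signatures visible in these hunks; the helper name is illustrative, the module paths follow the in-crate imports (they may not all be public outside `analytic_engine`), and `Option<Instant>` for the deadline is assumed:

```rust
use std::time::Instant;

use analytic_engine::{
    row_iter::record_batch_stream::{MemtableStreamContext, SstStreamContext},
    SstReadOptionsBuilder,
};
use common_types::projected_schema::{ProjectedSchema, RowProjectorBuilder};
use table_engine::predicate::PredicateRef;

/// One projector builder drives both the memtable streams and the sst reads,
/// so every stream fetches the same columns in the same order.
fn build_stream_contexts(
    projected_schema: &ProjectedSchema,
    sst_read_options_builder: SstReadOptionsBuilder,
    predicate: PredicateRef,
    need_dedup: bool,
    reverse: bool,
    deadline: Option<Instant>,
) -> (MemtableStreamContext, SstStreamContext) {
    // The merge path keeps the primary-key indexes for dedup; the chain
    // (no-dedup) path passes `None` here instead.
    let schema_with_key = projected_schema.to_record_schema_with_key();
    let primary_key_indexes = schema_with_key.primary_key_idx().to_vec();
    let fetched_schema = schema_with_key.into_record_schema();
    let table_schema = projected_schema.table_schema();

    let row_projector_builder = RowProjectorBuilder::new(
        fetched_schema.clone(),
        table_schema.clone(),
        Some(primary_key_indexes),
    );
    // Sst reads resolve the projector lazily against each file's own schema.
    let sst_read_options = sst_read_options_builder.build(row_projector_builder.clone());

    let memtable_stream_ctx = MemtableStreamContext {
        row_projector_builder,
        fetched_schema: fetched_schema.clone(),
        predicate,
        need_dedup,
        reverse,
        deadline,
    };
    let sst_stream_ctx = SstStreamContext {
        sst_read_options,
        fetched_schema,
    };

    (memtable_stream_ctx, sst_stream_ctx)
}
```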
#[allow(dead_code)] @@ -661,8 +686,14 @@ impl MergeIterator { metrics: Metrics, ) -> Self { let heap_cap = streams.len(); - let record_batch_builder = - RecordBatchWithKeyBuilder::with_capacity(schema.clone(), iter_options.batch_size); + let primary_key_indexes = schema.primary_key_idx().to_vec(); + let fetched_schema = schema.to_record_schema(); + let record_batch_builder = FetchedRecordBatchBuilder::with_capacity( + fetched_schema, + Some(primary_key_indexes), + iter_options.batch_size, + ); + Self { table_id, request_id, @@ -790,7 +821,7 @@ impl MergeIterator { async fn fetch_rows_from_one_stream( &mut self, num_rows_to_fetch: usize, - ) -> Result> { + ) -> Result> { assert_eq!(self.hot.len(), 1); self.metrics.times_fetch_rows_from_one += 1; @@ -834,7 +865,7 @@ impl MergeIterator { /// Fetch the next batch from the streams. /// /// `init_if_necessary` should be finished before this method. - async fn fetch_next_batch(&mut self) -> Result> { + async fn fetch_next_batch(&mut self) -> Result> { self.init_if_necessary().await?; self.record_batch_builder.clear(); @@ -869,14 +900,14 @@ impl MergeIterator { } #[async_trait] -impl RecordBatchWithKeyIterator for MergeIterator { +impl FetchedRecordBatchIterator for MergeIterator { type Error = Error; fn schema(&self) -> &RecordSchemaWithKey { &self.schema } - async fn next_batch(&mut self) -> Result> { + async fn next_batch(&mut self) -> Result> { let record_batch = self.fetch_next_batch().await?; trace!("MergeIterator send next record batch:{:?}", record_batch); diff --git a/analytic_engine/src/row_iter/mod.rs b/analytic_engine/src/row_iter/mod.rs index a2e28dc24b..f3c5ac4d35 100644 --- a/analytic_engine/src/row_iter/mod.rs +++ b/analytic_engine/src/row_iter/mod.rs @@ -16,7 +16,7 @@ use async_stream::try_stream; use async_trait::async_trait; -use common_types::{record_batch::RecordBatchWithKey, schema::RecordSchemaWithKey}; +use common_types::{record_batch::FetchedRecordBatch, schema::RecordSchemaWithKey}; use generic_error::BoxError; use crate::sst::writer::RecordBatchStream; @@ -38,15 +38,15 @@ pub struct IterOptions { /// The `schema()` should be the same as the RecordBatch from `read()`. /// The reader is exhausted if the `read()` returns the `Ok(None)`. #[async_trait] -pub trait RecordBatchWithKeyIterator: Send { +pub trait FetchedRecordBatchIterator: Send { type Error: std::error::Error + Send + Sync + 'static; fn schema(&self) -> &RecordSchemaWithKey; - async fn next_batch(&mut self) -> std::result::Result, Self::Error>; + async fn next_batch(&mut self) -> std::result::Result, Self::Error>; } -pub fn record_batch_with_key_iter_to_stream( +pub fn record_batch_with_key_iter_to_stream( mut iter: I, ) -> RecordBatchStream { let record_batch_stream = try_stream! 
{ diff --git a/analytic_engine/src/row_iter/record_batch_stream.rs b/analytic_engine/src/row_iter/record_batch_stream.rs index 0c0fe35ae5..dd0f4d132e 100644 --- a/analytic_engine/src/row_iter/record_batch_stream.rs +++ b/analytic_engine/src/row_iter/record_batch_stream.rs @@ -23,7 +23,8 @@ use arrow::{ datatypes::{DataType as ArrowDataType, SchemaRef as ArrowSchemaRef}, }; use common_types::{ - projected_schema::ProjectedSchema, record_batch::RecordBatchWithKey, SequenceNumber, + projected_schema::RowProjectorBuilder, record_batch::FetchedRecordBatch, schema::RecordSchema, + SequenceNumber, }; use datafusion::{ common::ToDFSchema, @@ -34,9 +35,13 @@ use datafusion::{ }; use futures::stream::{self, StreamExt}; use generic_error::{BoxError, GenericResult}; +use itertools::Itertools; use macros::define_result; use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; -use table_engine::{predicate::Predicate, table::TableId}; +use table_engine::{ + predicate::{Predicate, PredicateRef}, + table::TableId, +}; use trace_metric::MetricsCollector; use crate::{ @@ -125,11 +130,11 @@ pub enum Error { define_result!(Error); -// TODO(yingwen): Can we move sequence to RecordBatchWithKey and remove this +// TODO(yingwen): Can we move sequence to FetchedRecordBatch and remove this // struct? But what is the sequence after merge? #[derive(Debug)] pub struct SequencedRecordBatch { - pub record_batch: RecordBatchWithKey, + pub record_batch: FetchedRecordBatch, pub sequence: SequenceNumber, } @@ -212,56 +217,44 @@ pub fn filter_stream( /// Build filtered (by `predicate`) [SequencedRecordBatchStream] from a /// memtable. pub fn filtered_stream_from_memtable( - projected_schema: ProjectedSchema, - need_dedup: bool, memtable: &MemTableRef, - reverse: bool, - predicate: &Predicate, - deadline: Option, + ctx: &MemtableStreamContext, metrics_collector: Option, ) -> Result { - stream_from_memtable( - projected_schema.clone(), - need_dedup, - memtable, - reverse, - deadline, - metrics_collector, - ) - .and_then(|origin_stream| { + stream_from_memtable(memtable, ctx, metrics_collector).and_then(|origin_stream| { filter_stream( origin_stream, - projected_schema - .as_record_schema_with_key() - .to_arrow_schema_ref(), - predicate, + ctx.fetched_schema.to_arrow_schema_ref(), + &ctx.predicate, ) }) } /// Build [SequencedRecordBatchStream] from a memtable. 
pub fn stream_from_memtable( - projected_schema: ProjectedSchema, - need_dedup: bool, memtable: &MemTableRef, - reverse: bool, - deadline: Option, + ctx: &MemtableStreamContext, metrics_collector: Option, ) -> Result { let scan_ctx = ScanContext { - deadline, + deadline: ctx.deadline, ..Default::default() }; let max_seq = memtable.last_sequence(); - let scan_memtable_desc = format!("scan_memtable_{max_seq}"); + let fetched_cols = ctx + .fetched_schema + .columns() + .iter() + .format_with(",", |col, f| f(&format_args!("{}", col.name))); + let scan_memtable_desc = format!("scan_memtable_{max_seq}, fetched_columns:[{fetched_cols}]",); let metrics_collector = metrics_collector.map(|v| v.span(scan_memtable_desc)); let scan_req = ScanRequest { start_user_key: Bound::Unbounded, end_user_key: Bound::Unbounded, sequence: max_seq, - projected_schema, - need_dedup, - reverse, + row_projector_builder: ctx.row_projector_builder.clone(), + need_dedup: ctx.need_dedup, + reverse: ctx.reverse, metrics_collector, }; @@ -277,6 +270,15 @@ pub fn stream_from_memtable( Ok(Box::new(NoopPrefetcher(Box::new(stream)))) } +pub struct MemtableStreamContext { + pub row_projector_builder: RowProjectorBuilder, + pub fetched_schema: RecordSchema, + pub predicate: PredicateRef, + pub need_dedup: bool, + pub reverse: bool, + pub deadline: Option, +} + /// Build the filtered by `sst_read_options.predicate` /// [SequencedRecordBatchStream] from a sst. pub async fn filtered_stream_from_sst_file( @@ -284,8 +286,8 @@ pub async fn filtered_stream_from_sst_file( table_id: TableId, sst_file: &FileHandle, sst_factory: &SstFactoryRef, - sst_read_options: &SstReadOptions, store_picker: &ObjectStorePickerRef, + ctx: &SstStreamContext, metrics_collector: Option, ) -> Result { stream_from_sst_file( @@ -293,19 +295,16 @@ pub async fn filtered_stream_from_sst_file( table_id, sst_file, sst_factory, - sst_read_options, store_picker, + ctx, metrics_collector, ) .await .and_then(|origin_stream| { filter_stream( origin_stream, - sst_read_options - .projected_schema - .as_record_schema_with_key() - .to_arrow_schema_ref(), - sst_read_options.predicate.as_ref(), + ctx.fetched_schema.to_arrow_schema_ref(), + &ctx.sst_read_options.predicate, ) }) } @@ -316,8 +315,8 @@ pub async fn stream_from_sst_file( table_id: TableId, sst_file: &FileHandle, sst_factory: &SstFactoryRef, - sst_read_options: &SstReadOptions, store_picker: &ObjectStorePickerRef, + ctx: &SstStreamContext, metrics_collector: Option, ) -> Result { sst_file.read_meter().mark(); @@ -327,12 +326,20 @@ pub async fn stream_from_sst_file( file_size: Some(sst_file.size() as usize), file_format: Some(sst_file.storage_format()), }; - let scan_sst_desc = format!("scan_sst_{}", sst_file.id()); + let fetched_cols = ctx + .fetched_schema + .columns() + .iter() + .format_with(",", |col, f| f(&format_args!("{}", col.name))); + let scan_sst_desc = format!( + "scan_sst_{}, fetched_columns:[{fetched_cols}]", + sst_file.id() + ); let metrics_collector = metrics_collector.map(|v| v.span(scan_sst_desc)); let mut sst_reader = sst_factory .create_reader( &path, - sst_read_options, + &ctx.sst_read_options, read_hint, store_picker, metrics_collector, @@ -353,6 +360,11 @@ pub async fn stream_from_sst_file( Ok(Box::new(stream)) } +pub struct SstStreamContext { + pub sst_read_options: SstReadOptions, + pub fetched_schema: RecordSchema, +} + #[cfg(test)] pub mod tests { use common_types::{row::Row, schema::Schema}; @@ -369,7 +381,7 @@ pub mod tests { .into_iter() .map(|(seq, rows)| { let batch = 
SequencedRecordBatch { - record_batch: row_iter::tests::build_record_batch_with_key( + record_batch: row_iter::tests::build_fetched_record_batch_with_key( schema.clone(), rows, ), diff --git a/analytic_engine/src/row_iter/tests.rs b/analytic_engine/src/row_iter/tests.rs index 3484980c29..0db3c8bd91 100644 --- a/analytic_engine/src/row_iter/tests.rs +++ b/analytic_engine/src/row_iter/tests.rs @@ -14,8 +14,8 @@ use async_trait::async_trait; use common_types::{ - projected_schema::ProjectedSchema, - record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + projected_schema::{ProjectedSchema, RowProjector}, + record_batch::{FetchedRecordBatch, FetchedRecordBatchBuilder}, row::{ contiguous::{ContiguousRowReader, ContiguousRowWriter, ProjectedContiguousRow}, Row, @@ -25,7 +25,7 @@ use common_types::{ use macros::define_result; use snafu::Snafu; -use crate::row_iter::RecordBatchWithKeyIterator; +use crate::row_iter::FetchedRecordBatchIterator; #[derive(Debug, Snafu)] pub enum Error {} @@ -34,12 +34,12 @@ define_result!(Error); pub struct VectorIterator { schema: RecordSchemaWithKey, - items: Vec>, + items: Vec>, idx: usize, } impl VectorIterator { - pub fn new(schema: RecordSchemaWithKey, items: Vec) -> Self { + pub fn new(schema: RecordSchemaWithKey, items: Vec) -> Self { Self { schema, items: items.into_iter().map(Some).collect(), @@ -49,14 +49,14 @@ impl VectorIterator { } #[async_trait] -impl RecordBatchWithKeyIterator for VectorIterator { +impl FetchedRecordBatchIterator for VectorIterator { type Error = Error; fn schema(&self) -> &RecordSchemaWithKey { &self.schema } - async fn next_batch(&mut self) -> Result> { + async fn next_batch(&mut self) -> Result> { if self.idx == self.items.len() { return Ok(None); } @@ -68,13 +68,26 @@ impl RecordBatchWithKeyIterator for VectorIterator { } } -pub fn build_record_batch_with_key(schema: Schema, rows: Vec) -> RecordBatchWithKey { +pub fn build_fetched_record_batch_with_key(schema: Schema, rows: Vec) -> FetchedRecordBatch { assert!(schema.num_columns() > 1); let projection: Vec = (0..schema.num_columns()).collect(); let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); - let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + let fetched_schema = projected_schema.to_record_schema_with_key(); + let primary_key_indexes = fetched_schema.primary_key_idx().to_vec(); + let fetched_schema = fetched_schema.to_record_schema(); + let table_schema = projected_schema.table_schema(); + let row_projector = RowProjector::new( + &fetched_schema, + Some(primary_key_indexes), + table_schema, + table_schema, + ) + .unwrap(); + let primary_key_indexes = row_projector + .primary_key_indexes() + .map(|idxs| idxs.to_vec()); let mut builder = - RecordBatchWithKeyBuilder::with_capacity(projected_schema.to_record_schema_with_key(), 2); + FetchedRecordBatchBuilder::with_capacity(fetched_schema, primary_key_indexes, 2); let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); let mut buf = Vec::new(); @@ -84,7 +97,7 @@ pub fn build_record_batch_with_key(schema: Schema, rows: Vec) -> RecordBatc writer.write_row(&row).unwrap(); let source_row = ContiguousRowReader::try_new(&buf, &schema).unwrap(); - let projected_row = ProjectedContiguousRow::new(source_row, &row_projected_schema); + let projected_row = ProjectedContiguousRow::new(source_row, &row_projector); builder .append_projected_contiguous_row(&projected_row) .unwrap(); @@ -92,7 +105,7 @@ pub fn build_record_batch_with_key(schema: 
Schema, rows: Vec) -> RecordBatc builder.build().unwrap() } -pub async fn check_iterator(iter: &mut T, expected_rows: Vec) { +pub async fn check_iterator(iter: &mut T, expected_rows: Vec) { let mut visited_rows = 0; while let Some(batch) = iter.next_batch().await.unwrap() { for row_idx in 0..batch.num_rows() { diff --git a/analytic_engine/src/sst/factory.rs b/analytic_engine/src/sst/factory.rs index 8d507c6e34..9f0c00313d 100644 --- a/analytic_engine/src/sst/factory.rs +++ b/analytic_engine/src/sst/factory.rs @@ -17,7 +17,7 @@ use std::{collections::HashMap, fmt::Debug, sync::Arc}; use async_trait::async_trait; -use common_types::projected_schema::ProjectedSchema; +use common_types::projected_schema::RowProjectorBuilder; use macros::define_result; use object_store::{ObjectStoreRef, Path}; use runtime::Runtime; @@ -140,7 +140,7 @@ pub struct SstReadOptions { pub frequency: ReadFrequency, pub num_rows_per_row_group: usize, - pub projected_schema: ProjectedSchema, + pub row_projector_builder: RowProjectorBuilder, pub predicate: PredicateRef, pub meta_cache: Option, pub scan_options: ScanOptions, diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index 687949182f..be98479619 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -26,8 +26,8 @@ use arrow::{datatypes::SchemaRef, record_batch::RecordBatch as ArrowRecordBatch} use async_trait::async_trait; use bytes_ext::Bytes; use common_types::{ - projected_schema::{ProjectedSchema, RowProjector}, - record_batch::{ArrowRecordBatchProjector, RecordBatchWithKey}, + projected_schema::{RowProjector, RowProjectorBuilder}, + record_batch::FetchedRecordBatch, }; use datafusion::{ common::ToDFSchema, @@ -77,7 +77,7 @@ use crate::{ const PRUNE_ROW_GROUPS_METRICS_COLLECTOR_NAME: &str = "prune_row_groups"; type SendableRecordBatchStream = Pin> + Send>>; -type RecordBatchWithKeyStream = Box> + Send + Unpin>; +type FetchedRecordBatchStream = Box> + Send + Unpin>; pub struct Reader<'a> { /// The path where the data is persisted. @@ -87,13 +87,14 @@ pub struct Reader<'a> { /// The hint for the sst file size. file_size_hint: Option, num_rows_per_row_group: usize, - projected_schema: ProjectedSchema, meta_cache: Option, predicate: PredicateRef, /// Current frequency decides the cache policy. 
frequency: ReadFrequency, /// Init those fields in `init_if_necessary` meta_data: Option, + + row_projector_builder: RowProjectorBuilder, row_projector: Option, /// Options for `read_parallelly` @@ -138,11 +139,11 @@ impl<'a> Reader<'a> { store, file_size_hint, num_rows_per_row_group: options.num_rows_per_row_group, - projected_schema: options.projected_schema.clone(), meta_cache: options.meta_cache.clone(), predicate: options.predicate.clone(), frequency: options.frequency, meta_data: None, + row_projector_builder: options.row_projector_builder.clone(), row_projector: None, metrics, df_plan_metrics, @@ -153,7 +154,7 @@ impl<'a> Reader<'a> { async fn maybe_read_parallelly( &mut self, read_parallelism: usize, - ) -> Result> { + ) -> Result> { assert!(read_parallelism > 0); self.init_if_necessary().await?; @@ -162,11 +163,7 @@ impl<'a> Reader<'a> { return Ok(Vec::new()); } - let row_projector = { - let row_projector = self.row_projector.take().unwrap(); - ArrowRecordBatchProjector::from(row_projector) - }; - + let row_projector = self.row_projector.take().unwrap(); let streams: Vec<_> = streams .into_iter() .map(|stream| { @@ -366,12 +363,14 @@ impl<'a> Reader<'a> { }; let row_projector = self - .projected_schema - .try_project_with_key(&meta_data.custom().schema) + .row_projector_builder + .build(&meta_data.custom().schema) .box_err() .context(Projection)?; + self.meta_data = Some(meta_data); self.row_projector = Some(row_projector); + Ok(()) } @@ -493,7 +492,7 @@ pub(crate) struct ProjectorMetrics { struct RecordBatchProjector { stream: SendableRecordBatchStream, - row_projector: ArrowRecordBatchProjector, + row_projector: RowProjector, metrics: ProjectorMetrics, start_time: Instant, @@ -502,7 +501,7 @@ struct RecordBatchProjector { impl RecordBatchProjector { fn new( stream: SendableRecordBatchStream, - row_projector: ArrowRecordBatchProjector, + row_projector: RowProjector, metrics_collector: Option, ) -> Self { let metrics = ProjectorMetrics { @@ -520,7 +519,7 @@ impl RecordBatchProjector { } impl Stream for RecordBatchProjector { - type Item = Result; + type Item = Result; fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let projector = self.get_mut(); @@ -541,11 +540,10 @@ impl Stream for RecordBatchProjector { } projector.metrics.row_num += record_batch.num_rows(); - let projected_batch = projector - .row_projector - .project_to_record_batch_with_key(record_batch) - .box_err() - .context(DecodeRecordBatch {}); + let projected_batch = + FetchedRecordBatch::try_new(&projector.row_projector, record_batch) + .box_err() + .context(DecodeRecordBatch {}); Poll::Ready(Some(projected_batch)) } @@ -576,7 +574,7 @@ impl<'a> SstReader for Reader<'a> { async fn read( &mut self, - ) -> Result>>> { + ) -> Result>>> { let mut streams = self.maybe_read_parallelly(1).await?; assert_eq!(streams.len(), 1); let stream = streams.pop().expect("impossible to fetch no stream"); @@ -587,7 +585,7 @@ impl<'a> SstReader for Reader<'a> { struct RecordBatchReceiver { bg_prefetch_tx: Option>, - rx_group: Vec>>, + rx_group: Vec>>, cur_rx_idx: usize, #[allow(dead_code)] drop_helper: AbortOnDropMany<()>, @@ -595,13 +593,13 @@ struct RecordBatchReceiver { #[async_trait] impl PrefetchableStream for RecordBatchReceiver { - type Item = Result; + type Item = Result; async fn start_prefetch(&mut self) { // Start the prefetch work in background when first poll is called. 
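A note on the `Reader` changes just above: the projector is no longer resolved once per query but once per sst file, because `build` is given the schema recorded in that file's metadata. Files written under an older table schema therefore still project correctly, with missing nullable columns filled by null as the `RowProjector` docs later in this patch describe. A rough sketch of that resolution step, with the metadata access reduced to a plain `Schema` argument (the function name is illustrative):

```rust
use arrow::record_batch::RecordBatch as ArrowRecordBatch;
use common_types::{
    projected_schema::RowProjectorBuilder,
    record_batch::FetchedRecordBatch,
    schema::Schema,
};
use generic_error::{BoxError, GenericResult};

/// Project one decoded arrow batch from an sst whose on-disk schema may be
/// older than the current table schema.
fn project_sst_batch(
    builder: &RowProjectorBuilder,
    sst_schema: &Schema,           // schema stored in this sst's meta data
    arrow_batch: ArrowRecordBatch, // one batch decoded from that sst
) -> GenericResult<FetchedRecordBatch> {
    // Fails only if a non-nullable fetched column is absent from the sst.
    let row_projector = builder.build(sst_schema).box_err()?;
    FetchedRecordBatch::try_new(&row_projector, arrow_batch).box_err()
}
```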
if let Some(tx) = self.bg_prefetch_tx.take() { if tx.send(()).is_err() { - error!("The receiver for start prefetching has been closed"); + error!("The receiver for start prefetched has been closed"); } } } @@ -612,7 +610,7 @@ impl PrefetchableStream for RecordBatchReceiver { } impl Stream for RecordBatchReceiver { - type Item = Result; + type Item = Result; fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { if self.rx_group.is_empty() { @@ -622,7 +620,7 @@ impl Stream for RecordBatchReceiver { // Start the prefetch work in background when first poll is called. if let Some(tx) = self.bg_prefetch_tx.take() { if tx.send(()).is_err() { - error!("The receiver for start prefetching has been closed"); + error!("The receiver for start prefetched has been closed"); } } @@ -692,8 +690,8 @@ impl<'a> ThreadedReader<'a> { fn read_record_batches_from_sub_reader( &mut self, - mut reader: Box> + Send + Unpin>, - tx: Sender>, + mut reader: Box> + Send + Unpin>, + tx: Sender>, mut rx: watch::Receiver<()>, ) -> JoinHandle<()> { self.runtime.spawn(async move { @@ -720,7 +718,7 @@ impl<'a> SstReader for ThreadedReader<'a> { async fn read( &mut self, - ) -> Result>>> { + ) -> Result>>> { // Get underlying sst readers and channels. let sub_readers = self .inner @@ -744,7 +742,7 @@ impl<'a> SstReader for ThreadedReader<'a> { let channel_cap_per_sub_reader = self.channel_cap / sub_readers.len(); let channel_cap_per_sub_reader = channel_cap_per_sub_reader.max(1); let (tx_group, rx_group): (Vec<_>, Vec<_>) = (0..read_parallelism) - .map(|_| mpsc::channel::>(channel_cap_per_sub_reader)) + .map(|_| mpsc::channel::>(channel_cap_per_sub_reader)) .unzip(); let (bg_prefetch_tx, bg_prefetch_rx) = watch::channel(()); diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index ef233f7053..e84adea7f8 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -18,7 +18,7 @@ use std::collections::{HashMap, HashSet}; use async_trait::async_trait; use common_types::{ - datum::DatumKind, record_batch::RecordBatchWithKey, request_id::RequestId, time::TimeRange, + datum::DatumKind, record_batch::FetchedRecordBatch, request_id::RequestId, time::TimeRange, }; use datafusion::parquet::basic::Compression; use futures::StreamExt; @@ -41,8 +41,9 @@ use crate::{ }, }, writer::{ - self, BuildParquetFilter, EncodePbData, EncodeRecordBatch, ExpectTimestampColumn, Io, - MetaData, PollRecordBatch, RecordBatchStream, Result, SstInfo, SstWriter, Storage, + self, BuildParquetFilter, BuildParquetFilterNoCause, EncodePbData, EncodeRecordBatch, + ExpectTimestampColumn, Io, MetaData, PollRecordBatch, RecordBatchStream, Result, + SstInfo, SstWriter, Storage, }, }, table::sst_util, @@ -160,8 +161,8 @@ impl<'a> RecordBatchGroupWriter<'a> { /// the left rows. async fn fetch_next_row_group( &mut self, - prev_record_batch: &mut Option, - ) -> Result> { + prev_record_batch: &mut Option, + ) -> Result> { let mut curr_row_group = vec![]; // Used to record the number of remaining rows to fill `curr_row_group`. let mut remaining = self.options.num_rows_per_row_group; @@ -217,7 +218,7 @@ impl<'a> RecordBatchGroupWriter<'a> { fn build_column_encodings( &self, - sample_row_groups: &[RecordBatchWithKey], + sample_row_groups: &[FetchedRecordBatch], column_encodings: &mut HashMap, ) -> Result<()> { let mut sampler = ColumnEncodingSampler { @@ -233,9 +234,15 @@ impl<'a> RecordBatchGroupWriter<'a> { /// Build the parquet filter for the given `row_group`. 
fn build_row_group_filter( &self, - row_group_batch: &[RecordBatchWithKey], + row_group_batch: &[FetchedRecordBatch], ) -> Result { - let mut builder = RowGroupFilterBuilder::new(row_group_batch[0].schema_with_key()); + let schema_with_key = + row_group_batch[0] + .schema_with_key() + .with_context(|| BuildParquetFilterNoCause { + msg: "primary key indexes not exist", + })?; + let mut builder = RowGroupFilterBuilder::new(&schema_with_key); for partial_batch in row_group_batch { for (col_idx, column) in partial_batch.columns().iter().enumerate() { @@ -253,7 +260,7 @@ impl<'a> RecordBatchGroupWriter<'a> { fn update_column_values( column_values: &mut [Option], - record_batch: &RecordBatchWithKey, + record_batch: &FetchedRecordBatch, ) { for (col_idx, col_values) in column_values.iter_mut().enumerate() { let mut too_many_values = false; @@ -320,7 +327,7 @@ impl<'a> RecordBatchGroupWriter<'a> { sink: W, meta_path: &Path, ) -> Result<(usize, ParquetMetaData)> { - let mut prev_record_batch: Option = None; + let mut prev_record_batch: Option = None; let mut arrow_row_group = Vec::new(); let mut total_num_rows = 0; @@ -531,7 +538,7 @@ impl<'a> SstWriter for ParquetSstWriter<'a> { /// A sampler to decide the column encoding options (whether to do dictionary /// encoding) with a bunch of sample row groups. struct ColumnEncodingSampler<'a> { - sample_row_groups: &'a [RecordBatchWithKey], + sample_row_groups: &'a [FetchedRecordBatch], meta_data: &'a MetaData, min_num_sample_rows: usize, max_unique_value_ratio: f64, @@ -613,7 +620,7 @@ mod tests { use bytes_ext::Bytes; use common_types::{ - projected_schema::ProjectedSchema, + projected_schema::{ProjectedSchema, RowProjectorBuilder}, tests::{build_row, build_row_for_dictionary, build_schema, build_schema_with_dictionary}, time::{TimeRange, Timestamp}, }; @@ -625,7 +632,7 @@ mod tests { use super::*; use crate::{ - row_iter::tests::build_record_batch_with_key, + row_iter::tests::build_fetched_record_batch_with_key, sst::{ factory::{ Factory, FactoryImpl, ReadFrequency, ScanOptions, SstReadOptions, SstWriteOptions, @@ -722,7 +729,7 @@ mod tests { "tagv2", ), ]; - let batch = build_record_batch_with_key(schema.clone(), rows); + let batch = build_fetched_record_batch_with_key(schema.clone(), rows); Poll::Ready(Some(Ok(batch))) })); @@ -748,15 +755,20 @@ mod tests { let scan_options = ScanOptions::default(); // read sst back to test + let row_projector_builder = RowProjectorBuilder::new( + reader_projected_schema.to_record_schema(), + reader_projected_schema.table_schema().clone(), + None, + ); let sst_read_options = SstReadOptions { maybe_table_level_metrics: Arc::new(MaybeTableLevelMetrics::new("test")), frequency: ReadFrequency::Frequent, num_rows_per_row_group: 5, - projected_schema: reader_projected_schema, predicate: Arc::new(Predicate::empty()), meta_cache: None, scan_options, runtime: runtime.clone(), + row_projector_builder, }; let mut reader: Box = { @@ -889,7 +901,7 @@ mod tests { .map(|_| build_row(b"a", 100, 10.0, "v4", 1000, 1_000_000)) .collect::>(); - let batch = build_record_batch_with_key(schema_clone.clone(), rows); + let batch = build_fetched_record_batch_with_key(schema_clone.clone(), rows); poll_cnt += 1; Poll::Ready(Some(Ok(batch))) @@ -964,8 +976,9 @@ mod tests { .into_iter() .map(|v| build_row(v.0, v.1, v.2, v.3, v.4, v.5)) .collect(); - let record_batch_with_key0 = build_record_batch_with_key(schema.clone(), rows.clone()); - let record_batch_with_key1 = build_record_batch_with_key(schema.clone(), rows); + let record_batch_with_key0 
= + build_fetched_record_batch_with_key(schema.clone(), rows.clone()); + let record_batch_with_key1 = build_fetched_record_batch_with_key(schema.clone(), rows); let meta_data = MetaData { min_key: Bytes::from_static(b""), max_key: Bytes::from_static(b""), diff --git a/analytic_engine/src/sst/reader.rs b/analytic_engine/src/sst/reader.rs index fbb36364bc..66cebc047c 100644 --- a/analytic_engine/src/sst/reader.rs +++ b/analytic_engine/src/sst/reader.rs @@ -15,7 +15,7 @@ //! Sst reader trait definition. use async_trait::async_trait; -use common_types::record_batch::RecordBatchWithKey; +use common_types::record_batch::FetchedRecordBatch; use crate::{prefetchable_stream::PrefetchableStream, sst::meta_data::SstMetaData}; @@ -105,7 +105,7 @@ pub trait SstReader { async fn read( &mut self, - ) -> Result>>>; + ) -> Result>>>; } #[cfg(test)] @@ -117,7 +117,7 @@ pub mod tests { pub async fn check_stream(stream: &mut S, expected_rows: Vec) where - S: PrefetchableStream> + Unpin, + S: PrefetchableStream> + Unpin, { let mut visited_rows = 0; while let Some(batch) = stream.fetch_next().await { diff --git a/analytic_engine/src/sst/writer.rs b/analytic_engine/src/sst/writer.rs index 355ef1827e..773715cdb9 100644 --- a/analytic_engine/src/sst/writer.rs +++ b/analytic_engine/src/sst/writer.rs @@ -19,7 +19,7 @@ use std::cmp; use async_trait::async_trait; use bytes_ext::Bytes; use common_types::{ - record_batch::RecordBatchWithKey, request_id::RequestId, schema::Schema, time::TimeRange, + record_batch::FetchedRecordBatch, request_id::RequestId, schema::Schema, time::TimeRange, SequenceNumber, }; use futures::Stream; @@ -82,6 +82,9 @@ pub mod error { #[snafu(display("Failed to build parquet filter, err:{}", source))] BuildParquetFilter { source: GenericError }, + #[snafu(display("Failed to build parquet filter msg:{msg}.\nBacktrace:\n{backtrace}"))] + BuildParquetFilterNoCause { msg: String, backtrace: Backtrace }, + #[snafu(display("Failed to poll record batch, err:{}", source))] PollRecordBatch { source: GenericError }, @@ -97,7 +100,7 @@ pub mod error { pub use error::*; -pub type RecordBatchStreamItem = std::result::Result; +pub type RecordBatchStreamItem = std::result::Result; // TODO(yingwen): SstReader also has a RecordBatchStream, can we use same type? 
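The `SstReader::read` trait above now yields a `PrefetchableStream` of `FetchedRecordBatch` results, and the `check_stream` helper in the same hunk already shows the consumption loop. A slightly expanded sketch that also kicks off the background prefetch first; the method names come from these hunks, while the helper name is illustrative and the assumption that `PrefetchableStream` is reachable from outside the engine crate may not hold:

```rust
use analytic_engine::prefetchable_stream::PrefetchableStream;
use common_types::record_batch::FetchedRecordBatch;

/// Drain a reader stream, counting rows; the first error aborts the scan.
async fn count_rows<S, E>(stream: &mut S) -> Result<usize, E>
where
    S: PrefetchableStream<Item = Result<FetchedRecordBatch, E>> + Unpin,
{
    // Ask the background sub-readers to start filling their channels early.
    stream.start_prefetch().await;

    let mut rows = 0;
    while let Some(batch) = stream.fetch_next().await {
        rows += batch?.num_rows();
    }
    Ok(rows)
}
```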
pub type RecordBatchStream = Box + Send + Unpin>; diff --git a/benchmarks/src/merge_memtable_bench.rs b/benchmarks/src/merge_memtable_bench.rs index 1ca655e50b..35765a0a96 100644 --- a/benchmarks/src/merge_memtable_bench.rs +++ b/benchmarks/src/merge_memtable_bench.rs @@ -24,14 +24,11 @@ use analytic_engine::{ row_iter::{ dedup::DedupIterator, merge::{MergeBuilder, MergeConfig}, - IterOptions, RecordBatchWithKeyIterator, + FetchedRecordBatchIterator, IterOptions, }, space::SpaceId, sst::{ - factory::{ - FactoryImpl, FactoryRef as SstFactoryRef, ObjectStorePickerRef, ReadFrequency, - ScanOptions, SstReadOptions, - }, + factory::{FactoryImpl, FactoryRef as SstFactoryRef, ObjectStorePickerRef, ScanOptions}, meta_data::cache::MetaCacheRef, metrics::MaybeTableLevelMetrics as SstMaybeTableLevelMetrics, }, @@ -39,6 +36,7 @@ use analytic_engine::{ sst_util, version::{MemTableState, MemTableVec}, }, + ScanType, SstReadOptionsBuilder, }; use arena::NoopCollector; use common_types::{ @@ -61,7 +59,8 @@ pub struct MergeMemTableBench { space_id: SpaceId, table_id: TableId, dedup: bool, - sst_read_options: SstReadOptions, + sst_read_options_builder: SstReadOptionsBuilder, + num_rows_per_row_group: usize, } impl MergeMemTableBench { @@ -113,7 +112,8 @@ impl MergeMemTableBench { id: *id, }); } - let sst_read_options = mock_sst_read_options(projected_schema.clone(), runtime.clone()); + let sst_read_options_builder = + mock_sst_read_options_builder(projected_schema.clone(), runtime.clone()); MergeMemTableBench { store, @@ -125,7 +125,8 @@ impl MergeMemTableBench { space_id, table_id, dedup: true, - sst_read_options, + sst_read_options_builder, + num_rows_per_row_group: 500, } } @@ -149,7 +150,7 @@ impl MergeMemTableBench { let projected_schema = self.projected_schema.clone(); let sst_factory: SstFactoryRef = Arc::new(FactoryImpl); let iter_options = IterOptions { - batch_size: self.sst_read_options.num_rows_per_row_group, + batch_size: self.num_rows_per_row_group, }; let request_id = RequestId::next_id(); @@ -164,7 +165,7 @@ impl MergeMemTableBench { projected_schema, predicate: Arc::new(Predicate::empty()), sst_factory: &sst_factory, - sst_read_options: self.sst_read_options.clone(), + sst_read_options_builder: self.sst_read_options_builder.clone(), store_picker: &store_picker, merge_iter_options: iter_options.clone(), need_dedup: true, @@ -206,23 +207,24 @@ impl MergeMemTableBench { } } -fn mock_sst_read_options( - projected_schema: ProjectedSchema, +fn mock_sst_read_options_builder( + _projected_schema: ProjectedSchema, runtime: Arc, -) -> SstReadOptions { +) -> SstReadOptionsBuilder { let scan_options = ScanOptions { background_read_parallelism: 1, max_record_batches_in_flight: 1024, num_streams_to_prefetch: 0, }; - SstReadOptions { - maybe_table_level_metrics: Arc::new(SstMaybeTableLevelMetrics::new("bench")), - frequency: ReadFrequency::Frequent, - num_rows_per_row_group: 500, - projected_schema, - predicate: Arc::new(Predicate::empty()), - meta_cache: None, + let maybe_table_level_metrics = Arc::new(SstMaybeTableLevelMetrics::new("bench")); + + SstReadOptionsBuilder::new( + ScanType::Query, scan_options, + maybe_table_level_metrics, + 500, + Arc::new(Predicate::empty()), + None, runtime, - } + ) } diff --git a/benchmarks/src/merge_sst_bench.rs b/benchmarks/src/merge_sst_bench.rs index 434f452b70..c8b07a21b2 100644 --- a/benchmarks/src/merge_sst_bench.rs +++ b/benchmarks/src/merge_sst_bench.rs @@ -22,19 +22,17 @@ use analytic_engine::{ chain::ChainConfig, dedup::DedupIterator, merge::{MergeBuilder, 
MergeConfig}, - IterOptions, RecordBatchWithKeyIterator, + FetchedRecordBatchIterator, IterOptions, }, space::SpaceId, sst::{ - factory::{ - FactoryImpl, FactoryRef as SstFactoryRef, ObjectStorePickerRef, ReadFrequency, - ScanOptions, SstReadOptions, - }, + factory::{FactoryImpl, FactoryRef as SstFactoryRef, ObjectStorePickerRef, ScanOptions}, file::{FileHandle, FilePurgeQueue, Level, Request}, meta_data::cache::MetaCacheRef, metrics::MaybeTableLevelMetrics as SstMaybeTableLevelMetrics, }, table::sst_util, + ScanType, SstReadOptionsBuilder, }; use common_types::{projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema}; use logger::info; @@ -49,7 +47,9 @@ pub struct MergeSstBench { store: ObjectStoreRef, max_projections: usize, schema: Schema, - sst_read_options: SstReadOptions, + projected_schema: Option, + sst_read_options_builder: SstReadOptionsBuilder, + num_rows_per_row_group: usize, runtime: Arc, space_id: SpaceId, table_id: TableId, @@ -73,22 +73,24 @@ impl MergeSstBench { let schema = runtime.block_on(util::schema_from_sst(&store, &sst_path, &meta_cache)); let predicate = config.predicate.into_predicate(); - let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let _projected_schema = ProjectedSchema::no_projection(schema.clone()); let scan_options = ScanOptions { background_read_parallelism: 1, max_record_batches_in_flight: 1024, num_streams_to_prefetch: 0, }; - let sst_read_options = SstReadOptions { - maybe_table_level_metrics: Arc::new(SstMaybeTableLevelMetrics::new("bench")), - frequency: ReadFrequency::Frequent, - num_rows_per_row_group: config.num_rows_per_row_group, - projected_schema, - predicate, - meta_cache: meta_cache.clone(), + + let maybe_table_level_metrics = Arc::new(SstMaybeTableLevelMetrics::new("bench")); + let scan_type = ScanType::Query; + let sst_read_options_builder = SstReadOptionsBuilder::new( + scan_type, scan_options, - runtime: runtime.clone(), - }; + maybe_table_level_metrics, + config.num_rows_per_row_group, + predicate, + meta_cache.clone(), + runtime.clone(), + ); let max_projections = cmp::min(config.max_projections, schema.num_columns()); let (tx, rx) = mpsc::unbounded_channel(); @@ -107,7 +109,9 @@ impl MergeSstBench { store, max_projections, schema, - sst_read_options, + sst_read_options_builder, + num_rows_per_row_group: config.num_rows_per_row_group, + projected_schema: None, runtime, space_id, table_id, @@ -126,7 +130,7 @@ impl MergeSstBench { let projected_schema = util::projected_schema_by_number(&self.schema, i, self.max_projections); - self.sst_read_options.projected_schema = projected_schema; + self.projected_schema = Some(projected_schema); self.dedup = dedup; } @@ -134,10 +138,10 @@ impl MergeSstBench { let space_id = self.space_id; let table_id = self.table_id; let sequence = u64::MAX; - let projected_schema = self.sst_read_options.projected_schema.clone(); + let projected_schema = self.projected_schema.clone().unwrap(); let sst_factory: SstFactoryRef = Arc::new(FactoryImpl); let iter_options = IterOptions { - batch_size: self.sst_read_options.num_rows_per_row_group, + batch_size: self.num_rows_per_row_group, }; let request_id = RequestId::next_id(); @@ -152,7 +156,7 @@ impl MergeSstBench { projected_schema, predicate: Arc::new(Predicate::empty()), sst_factory: &sst_factory, - sst_read_options: self.sst_read_options.clone(), + sst_read_options_builder: self.sst_read_options_builder.clone(), store_picker: &store_picker, merge_iter_options: iter_options.clone(), need_dedup: true, @@ -190,7 +194,7 @@ 
impl MergeSstBench { fn run_no_dedup_bench(&self) { let space_id = self.space_id; let table_id = self.table_id; - let projected_schema = self.sst_read_options.projected_schema.clone(); + let projected_schema = self.projected_schema.clone().unwrap(); let sst_factory: SstFactoryRef = Arc::new(FactoryImpl); let request_id = RequestId::next_id(); @@ -204,7 +208,7 @@ impl MergeSstBench { projected_schema, predicate: Arc::new(Predicate::empty()), sst_factory: &sst_factory, - sst_read_options: self.sst_read_options.clone(), + sst_read_options_builder: self.sst_read_options_builder.clone(), store_picker: &store_picker, num_streams_to_prefetch: 0, }) diff --git a/benchmarks/src/scan_memtable_bench.rs b/benchmarks/src/scan_memtable_bench.rs index a738a9c100..72e09a054c 100644 --- a/benchmarks/src/scan_memtable_bench.rs +++ b/benchmarks/src/scan_memtable_bench.rs @@ -25,7 +25,7 @@ use analytic_engine::{ sst::meta_data::cache::MetaCacheRef, }; use arena::NoopCollector; -use common_types::projected_schema::ProjectedSchema; +use common_types::projected_schema::{ProjectedSchema, RowProjectorBuilder}; use logger::info; use object_store::{LocalFileSystem, Path}; @@ -91,14 +91,18 @@ impl ScanMemTableBench { pub fn run_bench(&self) { let scan_ctx = ScanContext::default(); + let fetched_schema = self.projected_schema.to_record_schema(); + let table_schema = self.projected_schema.table_schema(); + let row_projector_builder = + RowProjectorBuilder::new(fetched_schema, table_schema.clone(), None); let scan_req = ScanRequest { start_user_key: Bound::Unbounded, end_user_key: Bound::Unbounded, sequence: common_types::MAX_SEQUENCE_NUMBER, - projected_schema: self.projected_schema.clone(), need_dedup: true, reverse: false, metrics_collector: None, + row_projector_builder, }; let iter = self.memtable.scan(scan_ctx, scan_req).unwrap(); diff --git a/benchmarks/src/sst_bench.rs b/benchmarks/src/sst_bench.rs index 38273bb758..3e9ed3d8da 100644 --- a/benchmarks/src/sst_bench.rs +++ b/benchmarks/src/sst_bench.rs @@ -16,15 +16,18 @@ use std::{cmp, sync::Arc, time::Instant}; -use analytic_engine::sst::{ - factory::{ - Factory, FactoryImpl, ObjectStorePickerRef, ReadFrequency, ScanOptions, SstReadHint, - SstReadOptions, +use analytic_engine::{ + sst::{ + factory::{Factory, FactoryImpl, ObjectStorePickerRef, ScanOptions, SstReadHint}, + meta_data::cache::{MetaCache, MetaCacheRef}, + metrics::MaybeTableLevelMetrics as SstMaybeTableLevelMetrics, }, - meta_data::cache::{MetaCache, MetaCacheRef}, - metrics::MaybeTableLevelMetrics as SstMaybeTableLevelMetrics, + ScanType, SstReadOptionsBuilder, +}; +use common_types::{ + projected_schema::{ProjectedSchema, RowProjectorBuilder}, + schema::Schema, }; -use common_types::{projected_schema::ProjectedSchema, schema::Schema}; use logger::info; use object_store::{LocalFileSystem, ObjectStoreRef, Path}; use runtime::Runtime; @@ -36,7 +39,8 @@ pub struct SstBench { pub sst_file_name: String, max_projections: usize, schema: Schema, - sst_read_options: SstReadOptions, + projected_schema: Option, + sst_read_options_builder: SstReadOptionsBuilder, runtime: Arc, } @@ -57,16 +61,16 @@ impl SstBench { max_record_batches_in_flight: 1024, num_streams_to_prefetch: 0, }; - let sst_read_options = SstReadOptions { - maybe_table_level_metrics: Arc::new(SstMaybeTableLevelMetrics::new("bench")), - frequency: ReadFrequency::Frequent, - num_rows_per_row_group: config.num_rows_per_row_group, - projected_schema, + let maybe_table_level_metrics = Arc::new(SstMaybeTableLevelMetrics::new("bench")); + let 
sst_read_options_builder = SstReadOptionsBuilder::new( + ScanType::Query, + scan_options, + maybe_table_level_metrics, + config.num_rows_per_row_group, predicate, meta_cache, - scan_options, - runtime: runtime.clone(), - }; + runtime.clone(), + ); let max_projections = cmp::min(config.max_projections, schema.num_columns()); SstBench { @@ -74,7 +78,8 @@ impl SstBench { sst_file_name: config.sst_file_name, max_projections, schema, - sst_read_options, + projected_schema: Some(projected_schema), + sst_read_options_builder: sst_read_options_builder.clone(), runtime, } } @@ -88,7 +93,7 @@ impl SstBench { let projected_schema = util::projected_schema_by_number(&self.schema, i, self.max_projections); - self.sst_read_options.projected_schema = projected_schema; + self.projected_schema = Some(projected_schema); } pub fn run_bench(&self) { @@ -97,11 +102,23 @@ impl SstBench { let sst_factory = FactoryImpl; let store_picker: ObjectStorePickerRef = Arc::new(self.store.clone()); + let fetched_schema = self.projected_schema.as_ref().unwrap().to_record_schema(); + let table_schema = self + .projected_schema + .as_ref() + .unwrap() + .table_schema() + .clone(); + let row_projector_builder = RowProjectorBuilder::new(fetched_schema, table_schema, None); + let sst_read_options = self + .sst_read_options_builder + .clone() + .build(row_projector_builder); self.runtime.block_on(async { let mut sst_reader = sst_factory .create_reader( &sst_path, - &self.sst_read_options, + &sst_read_options, SstReadHint::default(), &store_picker, None, diff --git a/benchmarks/src/sst_tools.rs b/benchmarks/src/sst_tools.rs index 62653d3c30..12a090e0ba 100644 --- a/benchmarks/src/sst_tools.rs +++ b/benchmarks/src/sst_tools.rs @@ -38,8 +38,12 @@ use analytic_engine::{ }, table::sst_util, table_options::{Compression, StorageFormatHint}, + ScanType, SstReadOptionsBuilder, +}; +use common_types::{ + projected_schema::{ProjectedSchema, RowProjectorBuilder}, + request_id::RequestId, }; -use common_types::{projected_schema::ProjectedSchema, request_id::RequestId}; use generic_error::BoxError; use logger::info; use object_store::{LocalFileSystem, ObjectStoreRef, Path}; @@ -121,15 +125,19 @@ pub async fn rebuild_sst(config: RebuildSstConfig, runtime: Arc) { max_record_batches_in_flight: 1024, num_streams_to_prefetch: 2, }; + + let fetched_schema = projected_schema.to_record_schema(); + let table_schema = projected_schema.table_schema().clone(); + let row_projector_builder = RowProjectorBuilder::new(fetched_schema, table_schema, None); let sst_read_options = SstReadOptions { maybe_table_level_metrics: Arc::new(SstMaybeTableLevelMetrics::new("bench")), frequency: ReadFrequency::Once, num_rows_per_row_group: config.num_rows_per_row_group, - projected_schema, predicate: config.predicate.into_predicate(), meta_cache: None, scan_options, runtime, + row_projector_builder, }; let record_batch_stream = @@ -224,6 +232,7 @@ pub async fn merge_sst(config: MergeSstConfig, runtime: Arc) { let iter_options = IterOptions { batch_size: config.num_rows_per_row_group, }; + let scan_options = ScanOptions { background_read_parallelism: 1, max_record_batches_in_flight: 1024, @@ -234,16 +243,23 @@ pub async fn merge_sst(config: MergeSstConfig, runtime: Arc) { let sst_factory: SstFactoryRef = Arc::new(FactoryImpl); let store_picker: ObjectStorePickerRef = Arc::new(store); let projected_schema = ProjectedSchema::no_projection(schema.clone()); - let sst_read_options = SstReadOptions { - maybe_table_level_metrics: Arc::new(SstMaybeTableLevelMetrics::new("bench")), - 
frequency: ReadFrequency::Once, - num_rows_per_row_group: config.num_rows_per_row_group, - projected_schema: projected_schema.clone(), - predicate: config.predicate.into_predicate(), - meta_cache: None, + let maybe_table_level_metrics = Arc::new(SstMaybeTableLevelMetrics::new("bench")); + let sst_read_options_builder = SstReadOptionsBuilder::new( + ScanType::Query, scan_options, - runtime: runtime.clone(), - }; + maybe_table_level_metrics, + config.num_rows_per_row_group, + config.predicate.into_predicate(), + None, + runtime.clone(), + ); + let fetched_schema = projected_schema.to_record_schema_with_key(); + let primary_key_indexes = fetched_schema.primary_key_idx().to_vec(); + let fetched_schema = fetched_schema.into_record_schema(); + let table_schema = projected_schema.table_schema().clone(); + let row_projector_builder = + RowProjectorBuilder::new(fetched_schema, table_schema, Some(primary_key_indexes)); + let iter = { let space_id = config.space_id; let table_id = config.table_id; @@ -260,11 +276,11 @@ pub async fn merge_sst(config: MergeSstConfig, runtime: Arc) { projected_schema, predicate: Arc::new(Predicate::empty()), sst_factory: &sst_factory, - sst_read_options: sst_read_options.clone(), store_picker: &store_picker, merge_iter_options: iter_options.clone(), need_dedup: true, reverse: false, + sst_read_options_builder: sst_read_options_builder.clone(), }); builder .mut_ssts_of_level(Level::MIN) @@ -280,6 +296,7 @@ pub async fn merge_sst(config: MergeSstConfig, runtime: Arc) { row_iter::record_batch_with_key_iter_to_stream(iter) }; + let sst_read_options = sst_read_options_builder.build(row_projector_builder); let sst_meta = { let meta_reader = SstMetaReader { space_id, diff --git a/benchmarks/src/util.rs b/benchmarks/src/util.rs index d00c00ef8b..3c52b26011 100644 --- a/benchmarks/src/util.rs +++ b/benchmarks/src/util.rs @@ -35,7 +35,7 @@ use analytic_engine::{ }; use bytes_ext::{BufMut, SafeBufMut}; use common_types::{ - projected_schema::ProjectedSchema, + projected_schema::{ProjectedSchema, RowProjectorBuilder}, schema::{IndexInWriterSchema, Schema}, }; use macros::define_result; @@ -123,15 +123,20 @@ pub async fn load_sst_to_memtable( max_record_batches_in_flight: 1024, num_streams_to_prefetch: 0, }; + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + + let fetched_schema = projected_schema.to_record_schema(); + let table_schema = projected_schema.table_schema().clone(); + let row_projector_builder = RowProjectorBuilder::new(fetched_schema, table_schema, None); let sst_read_options = SstReadOptions { maybe_table_level_metrics: Arc::new(SstMaybeTableLevelMetrics::new("bench")), frequency: ReadFrequency::Frequent, num_rows_per_row_group: 8192, - projected_schema: ProjectedSchema::no_projection(schema.clone()), predicate: Arc::new(Predicate::empty()), meta_cache: None, scan_options, runtime, + row_projector_builder, }; let sst_factory = FactoryImpl; let store_picker: ObjectStorePickerRef = Arc::new(store.clone()); diff --git a/catalog/src/schema.rs b/catalog/src/schema.rs index 01d27d5447..51fb7f82d2 100644 --- a/catalog/src/schema.rs +++ b/catalog/src/schema.rs @@ -26,6 +26,8 @@ use table_engine::{ table::{SchemaId, TableId, TableRef}, }; +// FIXME: `CreateExistTable` can lead to `segmentation fault` if including +// backtrace. 
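Stepping back to the benchmark and `sst_tools` hunks above: `SstReadOptions` is no longer assembled in one shot. The scan-wide knobs go into `SstReadOptionsBuilder::new`, and the projection is bound later through `build(row_projector_builder)` once the fetched schema is known. A sketch of that two-phase construction using the argument order shown in those hunks; the function name and the concrete option values are placeholders:

```rust
use std::sync::Arc;

use analytic_engine::{
    sst::{
        factory::{ScanOptions, SstReadOptions},
        meta_data::cache::MetaCacheRef,
        metrics::MaybeTableLevelMetrics,
    },
    ScanType, SstReadOptionsBuilder,
};
use common_types::projected_schema::RowProjectorBuilder;
use runtime::Runtime;
use table_engine::predicate::Predicate;

fn read_options_for_query(
    runtime: Arc<Runtime>,
    meta_cache: Option<MetaCacheRef>,
    num_rows_per_row_group: usize,
    row_projector_builder: RowProjectorBuilder,
) -> SstReadOptions {
    let scan_options = ScanOptions {
        background_read_parallelism: 1,
        max_record_batches_in_flight: 1024,
        num_streams_to_prefetch: 0,
    };

    // Phase 1: everything that does not depend on the projection.
    let builder = SstReadOptionsBuilder::new(
        ScanType::Query,
        scan_options,
        Arc::new(MaybeTableLevelMetrics::new("example")),
        num_rows_per_row_group,
        Arc::new(Predicate::empty()),
        meta_cache,
        runtime,
    );

    // Phase 2: bind the projection once the fetched schema is decided.
    builder.build(row_projector_builder)
}
```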
#[derive(Debug, Snafu)] #[snafu(visibility(pub))] pub enum Error { @@ -118,12 +120,8 @@ pub enum Error { #[snafu(display("Failed to close table, source:{}", source))] CloseTableWithCause { source: GenericError }, - #[snafu(display( - "Failed to create table, table already exists, table:{}.\nBacktrace:\n{}", - table, - backtrace - ))] - CreateExistTable { table: String, backtrace: Backtrace }, + #[snafu(display("Failed to create table, table already exists, table:{table}."))] + CreateExistTable { table: String }, #[snafu(display( "Failed to create table, cannot persist meta, table:{}, err:{}", diff --git a/common_types/src/projected_schema.rs b/common_types/src/projected_schema.rs index 77962765d9..d0f780d8b6 100644 --- a/common_types/src/projected_schema.rs +++ b/common_types/src/projected_schema.rs @@ -62,19 +62,155 @@ pub type Result = std::result::Result; #[derive(Debug, Clone)] pub struct RowProjector { - schema_with_key: RecordSchemaWithKey, + /// The schema for data fetched + /// It is derived from table schema and some columns may not exist in data + /// source. + target_record_schema: RecordSchema, + + /// Primary key indexes in `fetched_schema`. + /// It will be `None` if update mode of table is `append`, + /// and will be `Some` if the mode is `overwrite`. + primary_key_indexes: Option>, + + /// Schema in data source + /// It is possible to be different with the table + /// schema caused by table schema altering. source_schema: Schema, - /// The Vec stores the column index in source, and `None` means this column - /// is not in source but required by reader, and need to filled by null. - /// The length of Vec is the same as the number of columns reader intended - /// to read. - source_projection: Vec>, + + /// The Vec stores the column index in data source, and `None` means this + /// column is not in source but required by reader, and need to filled + /// by null. The length of Vec is the same as the number of columns + /// reader intended to read. + source_projection_indexes: Vec>, + + /// Used to reorder columns in arrow record batch fetched from sst to the + /// needed projection order. + /// Actually, It stores the record column indexes in + /// projected order similar as `source_projection_indexes`. + /// + /// Why we need it? + /// Because in current rust parquet impl, we can just define which columns + /// we wanted to fetch without their order. + /// + /// For example: + /// wanted columns in order: 2,1,3 + /// actual fetched columns: 1,2,3 + /// + /// However, projection is not only wanted columns but with wanted order, so + /// we need this remapping to reorder the fetched record. + /// + /// For example: + /// source columns in sst: 0,1,2,3,4 + /// target projection columns: 2,1,3 + /// + /// the actual columns in fetched record: 1,2,3 + /// relative columns indexes in fetched record: 0,1,2 + /// + /// finally, the remapping to the relative indexes: 1,0,2 + target_record_projection_remapping: Vec>, } impl RowProjector { + pub fn new( + fetched_schema: &RecordSchema, + primary_key_indexes: Option>, + table_schema: &Schema, + source_schema: &Schema, + ) -> Result { + // Get `fetched_source_column_indexes`. 
+ let mut fetched_source_column_indexes = Vec::with_capacity(fetched_schema.num_columns()); + let mut projected_source_indexes = Vec::with_capacity(fetched_schema.num_columns()); + for column_schema in fetched_schema.columns() { + Self::try_project_column( + column_schema, + table_schema, + source_schema, + &mut fetched_source_column_indexes, + &mut projected_source_indexes, + )?; + } + + // Get `fetched_projected_source_column_indexes` from + // `fetched_source_column_indexes`. + projected_source_indexes.sort_unstable(); + let fetched_projected_source_column_indexes = fetched_source_column_indexes + .iter() + .map(|source_idx_opt| { + source_idx_opt.map(|src_idx| { + // Safe to unwrap, index exists in `fetched_source_column_indexes` is ensured + // to exist in `projected_source_indexes`. + projected_source_indexes + .iter() + .position(|proj_idx| src_idx == *proj_idx) + .unwrap() + }) + }) + .collect(); + + Ok(RowProjector { + target_record_schema: fetched_schema.clone(), + primary_key_indexes, + source_schema: source_schema.clone(), + source_projection_indexes: fetched_source_column_indexes, + target_record_projection_remapping: fetched_projected_source_column_indexes, + }) + } + + fn try_project_column( + column: &ColumnSchema, + table_schema: &Schema, + source_schema: &Schema, + fetched_source_column_indexes: &mut Vec>, + projected_source_indexes: &mut Vec, + ) -> Result<()> { + match source_schema.index_of(&column.name) { + Some(source_idx) => { + // Column is in source + if table_schema.version() == source_schema.version() { + // Same version, just use that column in source + fetched_source_column_indexes.push(Some(source_idx)); + projected_source_indexes.push(source_idx); + } else { + // Different version, need to check column schema + let source_column = source_schema.column(source_idx); + // TODO(yingwen): Data type is not checked here because we do not support alter + // data type now. + match column + .compatible_for_read(source_column) + .context(IncompatReadColumn)? + { + ReadOp::Exact => { + fetched_source_column_indexes.push(Some(source_idx)); + projected_source_indexes.push(source_idx); + } + ReadOp::FillNull => { + fetched_source_column_indexes.push(None); + } + } + } + } + None => { + // Column is not in source + ensure!(column.is_nullable, MissingReadColumn { name: &column.name }); + // Column is nullable, fill this column by null + fetched_source_column_indexes.push(None); + } + } + + Ok(()) + } + + pub fn source_schema(&self) -> &Schema { + &self.source_schema + } + + pub fn fetched_schema(&self) -> &RecordSchema { + &self.target_record_schema + } + /// The projected indexes of existed columns in the source schema. pub fn existed_source_projection(&self) -> Vec { - self.source_projection + self.source_projection_indexes .iter() .filter_map(|index| *index) .collect() @@ -82,12 +218,18 @@ impl RowProjector { /// The projected indexes of all columns(existed and not exist) in the /// source schema. - pub fn source_projection(&self) -> &[Option] { - &self.source_projection + pub fn fetched_source_column_indexes(&self) -> &[Option] { + &self.source_projection_indexes } - pub fn schema_with_key(&self) -> &RecordSchemaWithKey { - &self.schema_with_key + /// The projected indexes of all columns(existed and not exist) in the + /// projected source schema. 
+ pub fn fetched_projected_source_column_indexes(&self) -> &[Option] { + &self.target_record_projection_remapping + } + + pub fn primary_key_indexes(&self) -> Option<&[usize]> { + self.primary_key_indexes.as_deref() } /// Project the row. @@ -96,9 +238,9 @@ impl RowProjector { pub fn project_row(&self, row: &Row, mut datums_buffer: Vec) -> Row { assert_eq!(self.source_schema.num_columns(), row.num_columns()); - datums_buffer.reserve(self.schema_with_key.num_columns()); + datums_buffer.reserve(self.target_record_schema.num_columns()); - for p in &self.source_projection { + for p in &self.source_projection_indexes { let datum = match p { Some(index_in_source) => row[*index_in_source].clone(), None => Datum::Null, @@ -119,13 +261,43 @@ impl RowProjector { } } +#[derive(Debug, Clone)] +pub struct RowProjectorBuilder { + fetched_schema: RecordSchema, + table_schema: Schema, + primary_key_indexes: Option>, +} + +impl RowProjectorBuilder { + pub fn new( + fetched_schema: RecordSchema, + table_schema: Schema, + primary_key_indexes: Option>, + ) -> Self { + Self { + fetched_schema, + table_schema, + primary_key_indexes, + } + } + + pub fn build(&self, source_schema: &Schema) -> Result { + RowProjector::new( + &self.fetched_schema, + self.primary_key_indexes.clone(), + &self.table_schema, + source_schema, + ) + } +} + #[derive(Clone)] pub struct ProjectedSchema(Arc); impl fmt::Debug for ProjectedSchema { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("ProjectedSchema") - .field("original_schema", &self.0.original_schema) + .field("original_schema", &self.0.table_schema) .field("projection", &self.0.projection) .finish() } @@ -137,8 +309,8 @@ impl ProjectedSchema { Self(Arc::new(inner)) } - pub fn new(schema: Schema, projection: Option>) -> Result { - let inner = ProjectedSchemaInner::new(schema, projection)?; + pub fn new(table_schema: Schema, projection: Option>) -> Result { + let inner = ProjectedSchemaInner::new(table_schema, projection)?; Ok(Self(Arc::new(inner))) } @@ -150,42 +322,33 @@ impl ProjectedSchema { self.0.projection() } - /// Returns the [RowProjector] to project the rows with source schema to - /// rows with [RecordSchemaWithKey]. - /// - /// REQUIRE: The key columns are the same as this schema. - #[inline] - pub fn try_project_with_key(&self, source_schema: &Schema) -> Result { - self.0.try_project_with_key(source_schema) - } - // Returns the record schema after projection with key. pub fn to_record_schema_with_key(&self) -> RecordSchemaWithKey { - self.0.schema_with_key.clone() + self.0.record_schema_with_key.clone() } pub fn as_record_schema_with_key(&self) -> &RecordSchemaWithKey { - &self.0.schema_with_key + &self.0.record_schema_with_key } // Returns the record schema after projection. pub fn to_record_schema(&self) -> RecordSchema { - self.0.record_schema.clone() + self.0.target_record_schema.clone() } /// Returns the arrow schema after projection. 
pub fn to_projected_arrow_schema(&self) -> ArrowSchemaRef { - self.0.record_schema.to_arrow_schema_ref() + self.0.target_record_schema.to_arrow_schema_ref() } - pub fn original_schema(&self) -> &Schema { - &self.0.original_schema + pub fn table_schema(&self) -> &Schema { + &self.0.table_schema } } impl From for ceresdbproto::schema::ProjectedSchema { fn from(request: ProjectedSchema) -> Self { - let table_schema_pb = (&request.0.original_schema).into(); + let table_schema_pb = (&request.0.table_schema).into(); let projection_pb = request.0.projection.as_ref().map(|project| { let project = project .iter() @@ -223,55 +386,56 @@ impl TryFrom for ProjectedSchema { /// Schema with projection informations struct ProjectedSchemaInner { - /// The schema before projection that the reader intended to read, may - /// differ from current schema of the table. - original_schema: Schema, + /// The table schema used to generate plan, possible to differ from + /// schema in ssts/memtable. + table_schema: Schema, /// Index of the projected columns in `self.schema`, `None` if /// all columns are needed. projection: Option>, - /// The record schema from `self.schema` with key columns after projection. - schema_with_key: RecordSchemaWithKey, - /// The record schema from `self.schema` after projection. - record_schema: RecordSchema, + /// The fetched record schema from `self.schema` with key columns after + /// projection. + record_schema_with_key: RecordSchemaWithKey, + /// The fetched record schema from `self.schema` after projection. + target_record_schema: RecordSchema, } impl ProjectedSchemaInner { - fn no_projection(schema: Schema) -> Self { - let schema_with_key = schema.to_record_schema_with_key(); - let record_schema = schema.to_record_schema(); + fn no_projection(table_schema: Schema) -> Self { + let record_schema_with_key = table_schema.to_record_schema_with_key(); + let target_record_schema = table_schema.to_record_schema(); Self { - original_schema: schema, + table_schema, projection: None, - schema_with_key, - record_schema, + record_schema_with_key, + target_record_schema, } } - fn new(schema: Schema, projection: Option>) -> Result { + fn new(table_schema: Schema, projection: Option>) -> Result { if let Some(p) = &projection { // Projection is provided, validate the projection is valid. This is necessary // to avoid panic when creating RecordSchema and // RecordSchemaWithKey. if let Some(max_idx) = p.iter().max() { ensure!( - *max_idx < schema.num_columns(), + *max_idx < table_schema.num_columns(), InvalidProjectionIndex { index: *max_idx } ); } - let schema_with_key = schema.project_record_schema_with_key(p); - let record_schema = schema.project_record_schema(p); + let record_schema_with_key = table_schema.project_record_schema_with_key(p); + let target_record_schema = table_schema.project_record_schema(p); Ok(Self { - original_schema: schema, + table_schema, projection, - schema_with_key, - record_schema, + record_schema_with_key, + target_record_schema, }) } else { - Ok(Self::no_projection(schema)) + Ok(Self::no_projection(table_schema)) } } @@ -283,75 +447,6 @@ impl ProjectedSchemaInner { fn projection(&self) -> Option> { self.projection.clone() } - - // TODO(yingwen): We can fill missing not null column with default value instead - // of returning error. - fn try_project_with_key(&self, source_schema: &Schema) -> Result { - // When do primary key sample, this will assert will fail. - // TODO: maybe we can add a flag to only skip this assert when sampling. 
- // - // debug_assert_eq!( - // self.schema_with_key.key_columns(), - // source_schema.key_columns() - // ); - // We consider the two schema is equal if they have same version. - // if self.original_schema.version() == source_schema.version() { - // debug_assert_eq!(self.original_schema, *source_schema); - // } - - let mut source_projection = Vec::with_capacity(self.schema_with_key.num_columns()); - // For each column in `schema_with_key` - for column_schema in self.schema_with_key.columns() { - self.try_project_column(column_schema, source_schema, &mut source_projection)?; - } - - Ok(RowProjector { - schema_with_key: self.schema_with_key.clone(), - source_schema: source_schema.clone(), - source_projection, - }) - } - - fn try_project_column( - &self, - column: &ColumnSchema, - source_schema: &Schema, - source_projection: &mut Vec>, - ) -> Result<()> { - match source_schema.index_of(&column.name) { - Some(source_idx) => { - // Column is in source - if self.original_schema.version() == source_schema.version() { - // Same version, just use that column in source - source_projection.push(Some(source_idx)); - } else { - // Different version, need to check column schema - let source_column = source_schema.column(source_idx); - // TODO(yingwen): Data type is not checked here because we do not support alter - // data type now. - match column - .compatible_for_read(source_column) - .context(IncompatReadColumn)? - { - ReadOp::Exact => { - source_projection.push(Some(source_idx)); - } - ReadOp::FillNull => { - source_projection.push(None); - } - } - } - } - None => { - // Column is not in source - ensure!(column.is_nullable, MissingReadColumn { name: &column.name }); - // Column is nullable, fill this column by null - source_projection.push(None); - } - } - - Ok(()) - } } #[cfg(test)] @@ -365,7 +460,7 @@ mod tests { let projection: Vec = (0..schema.num_columns() - 1).collect(); let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); assert_eq!( - projected_schema.0.schema_with_key.num_columns(), + projected_schema.0.record_schema_with_key.num_columns(), schema.num_columns() - 1 ); assert!(!projected_schema.is_all_projection()); diff --git a/common_types/src/record_batch.rs b/common_types/src/record_batch.rs index af9cca487d..1b7d610d8e 100644 --- a/common_types/src/record_batch.rs +++ b/common_types/src/record_batch.rs @@ -362,15 +362,70 @@ fn cast_arrow_record_batch(source: ArrowRecordBatch) -> Result } #[derive(Debug)] -pub struct RecordBatchWithKey { - schema_with_key: RecordSchemaWithKey, +pub struct FetchedRecordBatch { + schema: RecordSchema, + // TODO: remove it later, `FetchedRecordBatch` is unnecessary to know anything about primary + // keys. 
+ primary_key_indexes: Option>, data: RecordBatchData, } -impl RecordBatchWithKey { - pub fn new(schema_with_key: RecordSchemaWithKey, data: RecordBatchData) -> Self { +impl FetchedRecordBatch { + pub fn try_new(ctx: &RowProjector, arrow_record_batch: ArrowRecordBatch) -> Result { + let column_indexes = ctx.fetched_projected_source_column_indexes(); + let schema = ctx.fetched_schema().clone(); + let mut column_blocks = Vec::with_capacity(schema.num_columns()); + + let num_rows = arrow_record_batch.num_rows(); + let num_columns = arrow_record_batch.num_columns(); + for (col_idx_opt, col_schema) in column_indexes.iter().zip(schema.columns()) { + match col_idx_opt { + Some(col_idx) => { + ensure!( + *col_idx < num_columns, + OutOfIndexProjection { + source_projection: column_indexes, + arrow_schema: arrow_record_batch.schema() + } + ); + + let array = arrow_record_batch.column(*col_idx); + let column_block = + ColumnBlock::try_from_arrow_array_ref(&col_schema.data_type, array) + .context(CreateColumnBlock)?; + + column_blocks.push(column_block); + } + None => { + // Need to push row with specific type. + let null_block = ColumnBlock::new_null_with_type( + &col_schema.data_type, + num_rows, + col_schema.is_dictionary, + ) + .context(CreateColumnBlock)?; + column_blocks.push(null_block); + } + } + } + + let data = RecordBatchData::new(schema.to_arrow_schema_ref(), column_blocks)?; + + Ok(FetchedRecordBatch { + schema, + primary_key_indexes: ctx.primary_key_indexes().map(|idxs| idxs.to_vec()), + data, + }) + } + + pub fn new_from_parts( + schema: RecordSchema, + primary_key_indexes: Option>, + data: RecordBatchData, + ) -> Self { Self { - schema_with_key, + schema, + primary_key_indexes, data, } } @@ -398,27 +453,22 @@ impl RecordBatchWithKey { Row::from_datums(datums) } - /// Project the [RecordBatchWithKey] into a [RecordBatch] according to + /// Project the [FetchedRecordBatch] into a [RecordBatch] according to /// [ProjectedSchema]. - /// - /// REQUIRE: The schema_with_key of the [RecordBatchWithKey] is the same as - /// the schema_with_key of [ProjectedSchema]. + // TODO: how do we ensure `ProjectedSchema` passed here is same as the source + // `ProjectedSchema` of `RecordSchema` here? pub fn try_project(mut self, projected_schema: &ProjectedSchema) -> Result { - debug_assert_eq!( - &self.schema_with_key, - projected_schema.as_record_schema_with_key() - ); - // Get the schema after projection. let record_schema = projected_schema.to_record_schema(); let mut column_blocks = Vec::with_capacity(record_schema.num_columns()); for column_schema in record_schema.columns() { - let column_index = self.schema_with_key.index_of(&column_schema.name).context( - ColumnNotInSchemaWithKey { - name: &column_schema.name, - }, - )?; + let column_index = + self.schema + .index_of(&column_schema.name) + .context(ColumnNotInSchemaWithKey { + name: &column_schema.name, + })?; // Take the column block out. 
let column_block = self.data.take_column_block(column_index); @@ -435,7 +485,7 @@ impl RecordBatchWithKey { pub fn into_record_batch(self) -> RecordBatch { RecordBatch { - schema: self.schema_with_key.into_record_schema(), + schema: self.schema, data: self.data, } } @@ -448,9 +498,20 @@ impl RecordBatchWithKey { self.data.arrow_record_batch } + pub fn schema_with_key(&self) -> Option { + self.primary_key_indexes + .clone() + .map(|idxs| RecordSchemaWithKey::new(self.schema.clone(), idxs)) + } + + #[inline] + pub fn schema(&self) -> &RecordSchema { + &self.schema + } + #[inline] - pub fn schema_with_key(&self) -> &RecordSchemaWithKey { - &self.schema_with_key + pub fn primary_key_indexes(&self) -> Option<&[usize]> { + self.primary_key_indexes.as_deref() } #[inline] @@ -485,7 +546,8 @@ impl RecordBatchWithKey { #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { Self { - schema_with_key: self.schema_with_key.clone(), + schema: self.schema.clone(), + primary_key_indexes: self.primary_key_indexes.clone(), data: self.data.slice(offset, length), } } @@ -506,14 +568,15 @@ impl RecordBatchWithKey { } } -pub struct RecordBatchWithKeyBuilder { - schema_with_key: RecordSchemaWithKey, +pub struct FetchedRecordBatchBuilder { + fetched_schema: RecordSchema, + primary_key_indexes: Option>, builders: Vec, } -impl RecordBatchWithKeyBuilder { - pub fn new(schema_with_key: RecordSchemaWithKey) -> Self { - let builders = schema_with_key +impl FetchedRecordBatchBuilder { + pub fn new(fetched_schema: RecordSchema, primary_key_indexes: Option>) -> Self { + let builders = fetched_schema .columns() .iter() .map(|column_schema| { @@ -525,13 +588,18 @@ impl RecordBatchWithKeyBuilder { }) .collect(); Self { - schema_with_key, + fetched_schema, + primary_key_indexes, builders, } } - pub fn with_capacity(schema_with_key: RecordSchemaWithKey, capacity: usize) -> Self { - let builders = schema_with_key + pub fn with_capacity( + record_schema: RecordSchema, + primary_key_indexes: Option>, + capacity: usize, + ) -> Self { + let builders = record_schema .columns() .iter() .map(|column_schema| { @@ -543,7 +611,8 @@ impl RecordBatchWithKeyBuilder { }) .collect(); Self { - schema_with_key, + fetched_schema: record_schema, + primary_key_indexes, builders, } } @@ -598,7 +667,7 @@ impl RecordBatchWithKeyBuilder { /// - The `record_batch` and the builder must have the same schema. pub fn append_batch_range( &mut self, - record_batch: &RecordBatchWithKey, + record_batch: &FetchedRecordBatch, start: usize, len: usize, ) -> Result { @@ -638,115 +707,41 @@ impl RecordBatchWithKeyBuilder { } } - /// Build [RecordBatchWithKey] and reset the builder. - pub fn build(&mut self) -> Result { + /// Build [FetchedRecordBatch] and reset the builder. 
+ pub fn build(&mut self) -> Result { let column_blocks: Vec<_> = self .builders .iter_mut() .map(|builder| builder.build()) .collect(); - let arrow_schema = self.schema_with_key.to_arrow_schema_ref(); + let arrow_schema = self.fetched_schema.to_arrow_schema_ref(); - Ok(RecordBatchWithKey { - schema_with_key: self.schema_with_key.clone(), + Ok(FetchedRecordBatch { + schema: self.fetched_schema.clone(), + primary_key_indexes: self.primary_key_indexes.clone(), data: RecordBatchData::new(arrow_schema, column_blocks)?, }) } } -#[derive(Debug, Clone)] -pub struct ArrowRecordBatchProjector { - row_projector: RowProjector, -} - -impl From for ArrowRecordBatchProjector { - fn from(row_projector: RowProjector) -> Self { - Self { row_projector } - } -} - -impl ArrowRecordBatchProjector { - /// Project the [arrow::RecordBatch] to [RecordBatchWithKey] and these - /// things are to be done: - /// - Insert the null column if the projected column does not appear in the - /// source schema. - /// - Convert the [arrow::RecordBatch] to [RecordBatchWithKey]. - /// - /// REQUIRE: Schema of the `arrow_record_batch` is the same as the - /// projection of existing column in the source schema. - pub fn project_to_record_batch_with_key( - &self, - arrow_record_batch: ArrowRecordBatch, - ) -> Result { - let schema_with_key = self.row_projector.schema_with_key().clone(); - let source_projection = self.row_projector.source_projection(); - let mut column_blocks = Vec::with_capacity(schema_with_key.num_columns()); - - let num_rows = arrow_record_batch.num_rows(); - // ensure next_arrow_column_idx < num_columns - let mut next_arrow_column_idx = 0; - let num_columns = arrow_record_batch.num_columns(); - - for (source_idx, column_schema) in source_projection.iter().zip(schema_with_key.columns()) { - match source_idx { - Some(_) => { - ensure!( - next_arrow_column_idx < num_columns, - OutOfIndexProjection { - source_projection, - arrow_schema: arrow_record_batch.schema() - } - ); - - let array = arrow_record_batch.column(next_arrow_column_idx); - next_arrow_column_idx += 1; - - let column_block = - ColumnBlock::try_from_arrow_array_ref(&column_schema.data_type, array) - .context(CreateColumnBlock)?; - - column_blocks.push(column_block); - } - None => { - // Need to push row with specific type. 
- let null_block = ColumnBlock::new_null_with_type( - &column_schema.data_type, - num_rows, - column_schema.is_dictionary, - ) - .context(CreateColumnBlock)?; - column_blocks.push(null_block); - } - } - } - - let data = RecordBatchData::new(schema_with_key.to_arrow_schema_ref(), column_blocks)?; - - Ok(RecordBatchWithKey { - schema_with_key, - data, - }) - } -} - #[cfg(test)] mod tests { use crate::{ - record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + record_batch::{FetchedRecordBatch, FetchedRecordBatchBuilder}, row::RowViewOnBatch, tests::{ - build_projected_schema, build_record_batch_with_key_by_rows, build_rows, + build_fetched_record_batch_by_rows, build_projected_schema, build_rows, check_record_batch_with_key_with_rows, }, }; - fn build_record_batch_with_key() -> RecordBatchWithKey { + fn build_fetched_record_batch() -> FetchedRecordBatch { let rows = build_rows(); - build_record_batch_with_key_by_rows(rows) + build_fetched_record_batch_by_rows(rows) } fn check_record_batch_with_key( - record_batch_with_key: RecordBatchWithKey, + record_batch_with_key: FetchedRecordBatch, row_num: usize, column_num: usize, ) -> bool { @@ -756,7 +751,7 @@ mod tests { #[test] fn test_append_projected_contiguous_row() { - let record_batch_with_key = build_record_batch_with_key(); + let record_batch_with_key = build_fetched_record_batch(); assert_eq!(record_batch_with_key.num_rows(), 5); assert_eq!(record_batch_with_key.num_columns(), 5); @@ -766,15 +761,11 @@ mod tests { #[test] fn test_append_row_view() { let projected_schema = build_projected_schema(); - - let record_batch_with_key = build_record_batch_with_key(); - - let mut builder = RecordBatchWithKeyBuilder::with_capacity( - projected_schema.to_record_schema_with_key(), - 2, - ); + let fetched_record_batch = build_fetched_record_batch(); + let mut builder = + FetchedRecordBatchBuilder::with_capacity(projected_schema.to_record_schema(), None, 2); let view = RowViewOnBatch { - record_batch: &record_batch_with_key, + record_batch: &fetched_record_batch, row_idx: 1, }; builder.append_row_view(&view).unwrap(); @@ -788,13 +779,10 @@ mod tests { #[test] fn test_append_batch_range() { let projected_schema = build_projected_schema(); + let record_batch_with_key = build_fetched_record_batch(); - let record_batch_with_key = build_record_batch_with_key(); - - let mut builder = RecordBatchWithKeyBuilder::with_capacity( - projected_schema.to_record_schema_with_key(), - 2, - ); + let mut builder = + FetchedRecordBatchBuilder::with_capacity(projected_schema.to_record_schema(), None, 2); builder .append_batch_range(&record_batch_with_key, 0, 2) .unwrap(); diff --git a/common_types/src/row/contiguous.rs b/common_types/src/row/contiguous.rs index db055e66ea..d16960959b 100644 --- a/common_types/src/row/contiguous.rs +++ b/common_types/src/row/contiguous.rs @@ -248,27 +248,24 @@ fn datum_view_at<'a>( /// schema of source row. 
pub struct ProjectedContiguousRow<'a, T> { source_row: T, - projector: &'a RowProjector, + ctx: &'a RowProjector, } impl<'a, T: ContiguousRow> ProjectedContiguousRow<'a, T> { - pub fn new(source_row: T, projector: &'a RowProjector) -> Self { - Self { - source_row, - projector, - } + pub fn new(source_row: T, ctx: &'a RowProjector) -> Self { + Self { source_row, ctx } } pub fn num_datum_views(&self) -> usize { - self.projector.source_projection().len() + self.ctx.fetched_source_column_indexes().len() } pub fn datum_view_at(&self, index: usize) -> DatumView { - let p = self.projector.source_projection()[index]; + let p = self.ctx.fetched_source_column_indexes()[index]; match p { Some(index_in_source) => { - let datum_kind = self.projector.datum_kind(index_in_source); + let datum_kind = self.ctx.datum_kind(index_in_source); self.source_row.datum_view_at(index_in_source, datum_kind) } None => DatumView::Null, @@ -801,7 +798,13 @@ mod tests { let projection: Vec = (0..schema.num_columns() - 1).collect(); let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection.clone())).unwrap(); - let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + let ctx = RowProjector::new( + &projected_schema.to_record_schema(), + None, + projected_schema.table_schema(), + &schema, + ) + .unwrap(); let rows = build_rows(); let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); @@ -812,7 +815,7 @@ mod tests { writer.write_row(&row).unwrap(); let source_row = ContiguousRowReader::try_new(&buf, &schema).unwrap(); - let projected_row = ProjectedContiguousRow::new(source_row, &row_projected_schema); + let projected_row = ProjectedContiguousRow::new(source_row, &ctx); let range = projection.clone(); for i in range { diff --git a/common_types/src/row/mod.rs b/common_types/src/row/mod.rs index 4fb8139283..652611a892 100644 --- a/common_types/src/row/mod.rs +++ b/common_types/src/row/mod.rs @@ -24,7 +24,7 @@ use snafu::{ensure, Backtrace, OptionExt, Snafu}; use crate::{ column_schema::{ColumnId, ColumnSchema}, datum::{Datum, DatumKind, DatumView}, - record_batch::RecordBatchWithKey, + record_batch::FetchedRecordBatch, schema::{RecordSchemaWithKey, Schema}, time::Timestamp, }; @@ -560,13 +560,13 @@ pub trait RowView { fn column_by_idx(&self, column_idx: usize) -> Datum; } -// TODO(yingwen): Add a method to get row view on RecordBatchWithKey. -/// A row view on the [RecordBatchWithKey]. +// TODO(yingwen): Add a method to get row view on FetchedRecordBatch. +/// A row view on the [FetchedRecordBatch]. /// /// `row_idx < record_batch.num_rows()` is ensured. 
#[derive(Debug)] pub struct RowViewOnBatch<'a> { - pub record_batch: &'a RecordBatchWithKey, + pub record_batch: &'a FetchedRecordBatch, pub row_idx: usize, } @@ -583,18 +583,18 @@ impl<'a> RowViewOnBatch<'a> { pub struct RowViewOnBatchColumnIter<'a> { next_column_idx: usize, row_idx: usize, - record_batch: &'a RecordBatchWithKey, + record_batch: &'a FetchedRecordBatch, } impl<'a> RowView for RowViewOnBatch<'a> { fn try_get_column_by_name(&self, column_name: &str) -> Result> { - let column_idx = self - .record_batch - .schema_with_key() - .index_of(column_name) - .context(ColumnNameNotFound { - column: column_name, - })?; + let column_idx = + self.record_batch + .schema() + .index_of(column_name) + .context(ColumnNameNotFound { + column: column_name, + })?; Ok(Some(self.column_by_idx(column_idx))) } diff --git a/common_types/src/schema.rs b/common_types/src/schema.rs index 2ceaa46576..5abdeabb95 100644 --- a/common_types/src/schema.rs +++ b/common_types/src/schema.rs @@ -539,6 +539,13 @@ pub struct RecordSchemaWithKey { } impl RecordSchemaWithKey { + pub fn new(record_schema: RecordSchema, primary_key_indexes: Vec) -> Self { + Self { + record_schema, + primary_key_indexes, + } + } + pub fn num_columns(&self) -> usize { self.record_schema.num_columns() } @@ -578,7 +585,11 @@ impl RecordSchemaWithKey { .collect::>() } - pub(crate) fn into_record_schema(self) -> RecordSchema { + pub fn to_record_schema(&self) -> RecordSchema { + self.record_schema.clone() + } + + pub fn into_record_schema(self) -> RecordSchema { self.record_schema } diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs index c3abca0060..4d5d8e1f54 100644 --- a/common_types/src/tests.rs +++ b/common_types/src/tests.rs @@ -18,8 +18,8 @@ use sqlparser::ast::{BinaryOperator, Expr, Value}; use crate::{ column_schema, datum::{Datum, DatumKind}, - projected_schema::ProjectedSchema, - record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + projected_schema::{ProjectedSchema, RowProjector}, + record_batch::{FetchedRecordBatch, FetchedRecordBatchBuilder}, row::{ contiguous::{ContiguousRowReader, ContiguousRowWriter, ProjectedContiguousRow}, Row, @@ -357,15 +357,16 @@ pub fn build_rows() -> Vec { ] } -pub fn build_record_batch_with_key_by_rows(rows: Vec) -> RecordBatchWithKey { +pub fn build_fetched_record_batch_by_rows(rows: Vec) -> FetchedRecordBatch { let schema = build_schema(); assert!(schema.num_columns() > 1); let projection: Vec = (0..schema.num_columns() - 1).collect(); let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); - let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + let row_projector = + RowProjector::new(&projected_schema.to_record_schema(), None, &schema, &schema).unwrap(); let mut builder = - RecordBatchWithKeyBuilder::with_capacity(projected_schema.to_record_schema_with_key(), 2); + FetchedRecordBatchBuilder::with_capacity(row_projector.fetched_schema().clone(), None, 2); let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); let mut buf = Vec::new(); @@ -375,7 +376,7 @@ pub fn build_record_batch_with_key_by_rows(rows: Vec) -> RecordBatchWithKey writer.write_row(&row).unwrap(); let source_row = ContiguousRowReader::try_new(&buf, &schema).unwrap(); - let projected_row = ProjectedContiguousRow::new(source_row, &row_projected_schema); + let projected_row = ProjectedContiguousRow::new(source_row, &row_projector); builder .append_projected_contiguous_row(&projected_row) .unwrap(); @@ -384,7 +385,7 @@ pub 
fn build_record_batch_with_key_by_rows(rows: Vec) -> RecordBatchWithKey } pub fn check_record_batch_with_key_with_rows( - record_batch_with_key: &RecordBatchWithKey, + record_batch_with_key: &FetchedRecordBatch, row_num: usize, column_num: usize, rows: Vec, diff --git a/components/object_store/src/disk_cache.rs b/components/object_store/src/disk_cache.rs index 5b37431c20..53d537ffa6 100644 --- a/components/object_store/src/disk_cache.rs +++ b/components/object_store/src/disk_cache.rs @@ -825,7 +825,7 @@ impl ObjectStore for DiskCacheStore { } async fn get(&self, location: &Path) -> Result { - // In sst module, we only use get_range, fetching a whole file is not used, and + // In sst module, we only use get_range, fetched a whole file is not used, and // it is not good for disk cache. self.underlying_store.get(location).await } diff --git a/integration_tests/cases/env/local/ddl/query-plan.result b/integration_tests/cases/env/local/ddl/query-plan.result index ec2258d64d..26dcf9098e 100644 --- a/integration_tests/cases/env/local/ddl/query-plan.result +++ b/integration_tests/cases/env/local/ddl/query-plan.result @@ -2,6 +2,10 @@ DROP TABLE IF EXISTS `03_dml_select_real_time_range`; affected_rows: 0 +DROP TABLE IF EXISTS `03_append_mode_table`; + +affected_rows: 0 + CREATE TABLE `03_dml_select_real_time_range` ( name string TAG, value double NOT NULL, @@ -27,7 +31,7 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348001000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=1\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1:\n=0]\n"), +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=1\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1, fetched_columns:[tsid,t]:\n=0]\n"), -- This query should not include memtable @@ -47,7 +51,7 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348001000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=1\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_sst_1:\n meta_data_cache_hit=false\n parallelism=1\n project_record_batch=xxs\n read_meta_data_duration=xxs\n row_mem=320\n row_num=3\n prune_row_groups:\n pruned_by_custom_filter=0\n 
pruned_by_min_max=0\n row_groups_after_prune=1\n total_row_groups=1\n use_custom_filter=false\n=0]\n"), +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=1\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_sst_1, fetched_columns:[tsid,t]:\n meta_data_cache_hit=false\n parallelism=1\n project_record_batch=xxs\n read_meta_data_duration=xxs\n row_mem=320\n row_num=3\n prune_row_groups:\n pruned_by_custom_filter=0\n pruned_by_min_max=0\n row_groups_after_prune=1\n total_row_groups=1\n use_custom_filter=false\n=0]\n"), -- This query should not include SST @@ -58,7 +62,58 @@ plan_type,plan, String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348002001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=0\n=0]\n"), +-- Table with an 'append' update mode +CREATE TABLE `03_append_mode_table` ( + name string TAG, + value double NOT NULL, + t timestamp NOT NULL, + timestamp KEY (t)) ENGINE = Analytic WITH ( + enable_ttl = 'false', + segment_duration = '2h', + update_mode = 'append' +); + +affected_rows: 0 + +INSERT INTO `03_append_mode_table` (t, name, value) + VALUES + (1695348000000, "ceresdb", 100), + (1695348001000, "ceresdb", 200), + (1695348002000, "ceresdb", 300); + +affected_rows: 3 + +-- Should just fetch projected columns from memtable +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +-- SQLNESS REPLACE since_create=\d+.?\d*(ยต|m|n) since_create=xx +-- SQLNESS REPLACE since_init=\d+.?\d*(ยต|m|n) since_init=xx +-- SQLNESS REPLACE elapsed_compute=\d+.?\d*(ยต|m|n) elapsed_compute=xx +explain analyze select t from `03_append_mode_table` +where t >= 1695348001000 and name = 'ceresdb'; + +plan_type,plan, +String("Plan with Metrics"),String("ProjectionExec: expr=[t@0 as t], metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=03_append_mode_table, parallelism=8, metrics=[\nPredicate { exprs:[t >= TimestampMillisecond(1695348001000, None), name = Utf8(\"ceresdb\")], time_range:TimeRange { inclusive_start: Timestamp(1695348001000), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=false\n chain_iter_0:\n num_memtables=1\n num_ssts=0\n scan_duration=xxs\n since_create=xxs\n since_init=xxs\n total_batch_fetched=1\n total_rows_fetched=2\n scan_memtable_1, fetched_columns:[t,name]:\n=0]\n"), + + +-- Should just fetch projected columns from SST +-- SQLNESS ARG pre_cmd=flush +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +-- SQLNESS REPLACE since_create=\d+.?\d*(ยต|m|n) since_create=xx +-- SQLNESS REPLACE since_init=\d+.?\d*(ยต|m|n) since_init=xx +-- SQLNESS REPLACE elapsed_compute=\d+.?\d*(ยต|m|n) elapsed_compute=xx +-- SQLNESS REPLACE project_record_batch=\d+.?\d*(ยต|m|n) project_record_batch=xx +explain analyze select t from `03_append_mode_table` +where t >= 1695348001000 and name = 'ceresdb'; + +plan_type,plan, +String("Plan with Metrics"),String("ProjectionExec: expr=[t@0 as t], 
metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=03_append_mode_table, parallelism=8, metrics=[\nPredicate { exprs:[t >= TimestampMillisecond(1695348001000, None), name = Utf8(\"ceresdb\")], time_range:TimeRange { inclusive_start: Timestamp(1695348001000), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=false\n chain_iter_0:\n num_memtables=0\n num_ssts=1\n scan_duration=xxs\n since_create=xxs\n since_init=xxs\n total_batch_fetched=1\n total_rows_fetched=2\n scan_sst_1, fetched_columns:[t,name]:\n meta_data_cache_hit=false\n parallelism=1\n project_record_batch=xxs\n read_meta_data_duration=xxs\n row_mem=408\n row_num=3\n prune_row_groups:\n pruned_by_custom_filter=0\n pruned_by_min_max=0\n row_groups_after_prune=1\n total_row_groups=1\n use_custom_filter=false\n=0]\n"), + + DROP TABLE `03_dml_select_real_time_range`; affected_rows: 0 +DROP TABLE `03_append_mode_table`; + +affected_rows: 0 + diff --git a/integration_tests/cases/env/local/ddl/query-plan.sql b/integration_tests/cases/env/local/ddl/query-plan.sql index 00fb19e05c..a0baff5b81 100644 --- a/integration_tests/cases/env/local/ddl/query-plan.sql +++ b/integration_tests/cases/env/local/ddl/query-plan.sql @@ -1,4 +1,5 @@ DROP TABLE IF EXISTS `03_dml_select_real_time_range`; +DROP TABLE IF EXISTS `03_append_mode_table`; CREATE TABLE `03_dml_select_real_time_range` ( name string TAG, @@ -36,4 +37,40 @@ where t > 1695348001000; explain analyze select t from `03_dml_select_real_time_range` where t > 1695348002000; +-- Table with an 'append' update mode +CREATE TABLE `03_append_mode_table` ( + name string TAG, + value double NOT NULL, + t timestamp NOT NULL, + timestamp KEY (t)) ENGINE = Analytic WITH ( + enable_ttl = 'false', + segment_duration = '2h', + update_mode = 'append' +); + +INSERT INTO `03_append_mode_table` (t, name, value) + VALUES + (1695348000000, "ceresdb", 100), + (1695348001000, "ceresdb", 200), + (1695348002000, "ceresdb", 300); + +-- Should just fetch projected columns from memtable +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +-- SQLNESS REPLACE since_create=\d+.?\d*(ยต|m|n) since_create=xx +-- SQLNESS REPLACE since_init=\d+.?\d*(ยต|m|n) since_init=xx +-- SQLNESS REPLACE elapsed_compute=\d+.?\d*(ยต|m|n) elapsed_compute=xx +explain analyze select t from `03_append_mode_table` +where t >= 1695348001000 and name = 'ceresdb'; + +-- Should just fetch projected columns from SST +-- SQLNESS ARG pre_cmd=flush +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +-- SQLNESS REPLACE since_create=\d+.?\d*(ยต|m|n) since_create=xx +-- SQLNESS REPLACE since_init=\d+.?\d*(ยต|m|n) since_init=xx +-- SQLNESS REPLACE elapsed_compute=\d+.?\d*(ยต|m|n) elapsed_compute=xx +-- SQLNESS REPLACE project_record_batch=\d+.?\d*(ยต|m|n) project_record_batch=xx +explain analyze select t from `03_append_mode_table` +where t >= 1695348001000 and name = 'ceresdb'; + DROP TABLE `03_dml_select_real_time_range`; +DROP TABLE `03_append_mode_table`; diff --git a/partition_table_engine/src/scan_builder.rs b/partition_table_engine/src/scan_builder.rs index 1291508fd2..247bcae98b 100644 --- a/partition_table_engine/src/scan_builder.rs +++ b/partition_table_engine/src/scan_builder.rs @@ -79,7 +79,7 @@ impl PartitionedTableScanBuilder { impl TableScanBuilder for PartitionedTableScanBuilder { async fn build(&self, request: ReadRequest) -> Result> { // Build partition rule. 
- let table_schema_snapshot = request.projected_schema.original_schema(); + let table_schema_snapshot = request.projected_schema.table_schema(); let df_partition_rule = DfPartitionRuleAdapter::new(self.partition_info.clone(), table_schema_snapshot) .map_err(|e| { diff --git a/src/wal/src/message_queue_impl/region.rs b/src/wal/src/message_queue_impl/region.rs index 1ebfd176a7..292d0469c9 100644 --- a/src/wal/src/message_queue_impl/region.rs +++ b/src/wal/src/message_queue_impl/region.rs @@ -810,7 +810,7 @@ pub struct MessageQueueLogIterator { /// Polling's end point /// - /// While fetching in slave node, it will be set to `None`, and + /// While fetched in slave node, it will be set to `None`, and /// reading will not stop. /// Otherwise, it will be set to high watermark. terminate_offset: Option, diff --git a/system_catalog/src/tables.rs b/system_catalog/src/tables.rs index 7593f7d754..dc1113f784 100644 --- a/system_catalog/src/tables.rs +++ b/system_catalog/src/tables.rs @@ -21,7 +21,8 @@ use catalog::{manager::ManagerRef, schema::SchemaRef, CatalogRef}; use common_types::{ column_schema, datum::{Datum, DatumKind}, - record_batch::RecordBatchWithKeyBuilder, + projected_schema::RowProjector, + record_batch::FetchedRecordBatchBuilder, row::Row, schema, schema::Schema, @@ -153,13 +154,22 @@ impl SystemTable for Tables { .all_catalogs() .box_err() .context(table_engine::table::Scan { table: self.name() })?; - let projected_record_schema = request.projected_schema.to_record_schema_with_key(); - let mut builder = RecordBatchWithKeyBuilder::new(projected_record_schema); + let fetched_schema = request.projected_schema.to_record_schema_with_key(); + let primary_key_indexes = fetched_schema.primary_key_idx().to_vec(); + let fetched_schema = fetched_schema.to_record_schema(); + let mut builder = FetchedRecordBatchBuilder::new( + fetched_schema.clone(), + Some(primary_key_indexes.clone()), + ); - let projector = request - .projected_schema - .try_project_with_key(&self.schema) - .expect("Should succeed to try_project_key of sys_tables"); + let table_schema = request.projected_schema.table_schema(); + let row_projector = RowProjector::new( + &fetched_schema, + Some(primary_key_indexes), + table_schema, + &self.schema, + ) + .expect("Should succeed to try_project_key of sys_tables"); for catalog in &catalogs { for schema in &catalog .all_schemas() @@ -172,7 +182,7 @@ impl SystemTable for Tables { .context(table_engine::table::Scan { table: self.name() })? 
{ let row = self.from_table(catalog.clone(), schema.clone(), table.clone()); - let projected_row = projector.project_row(&row, Vec::new()); + let projected_row = row_projector.project_row(&row, Vec::new()); builder .append_row(projected_row) .box_err() diff --git a/table_engine/src/provider.rs b/table_engine/src/provider.rs index 7f9e974708..6b0c38a770 100644 --- a/table_engine/src/provider.rs +++ b/table_engine/src/provider.rs @@ -141,10 +141,11 @@ impl TableScanBuilder for NormalTableScanBuilder { #[derive(Debug)] pub struct TableProviderAdapter { table: TableRef, + /// The schema of the table when this adapter is created, used as schema /// snapshot for read to avoid the reader sees different schema during /// query - read_schema: Schema, + current_table_schema: Schema, /// Table scan builder builder: B, @@ -153,11 +154,11 @@ pub struct TableProviderAdapter { impl TableProviderAdapter { pub fn new(table: TableRef, builder: B) -> Self { // Take a snapshot of the schema - let read_schema = table.schema(); + let current_table_schema = table.schema(); Self { table, - read_schema, + current_table_schema, builder, } } @@ -193,12 +194,14 @@ impl TableProviderAdapter { ); let predicate = self.check_and_build_predicate_from_filters(filters); - let projected_schema = ProjectedSchema::new(self.read_schema.clone(), projection.cloned()) - .map_err(|e| { - DataFusionError::Internal(format!( - "Invalid projection, plan:{self:?}, projection:{projection:?}, err:{e:?}" - )) - })?; + let projected_schema = + ProjectedSchema::new(self.current_table_schema.clone(), projection.cloned()).map_err( + |e| { + DataFusionError::Internal(format!( + "Invalid projection, plan:{self:?}, projection:{projection:?}, err:{e:?}" + )) + }, + )?; let opts = ReadOptions { deadline, @@ -224,7 +227,9 @@ impl TableProviderAdapter { .filter_map(|filter| { let filter_cols = visitor::find_columns_by_expr(filter); - let support_pushdown = self.table.support_pushdown(&self.read_schema, &filter_cols); + let support_pushdown = self + .table + .support_pushdown(&self.current_table_schema, &filter_cols); if support_pushdown { Some(filter.clone()) } else { @@ -235,7 +240,7 @@ impl TableProviderAdapter { PredicateBuilder::default() .add_pushdown_exprs(&pushdown_filters) - .extract_time_range(&self.read_schema, filters) + .extract_time_range(&self.current_table_schema, filters) .build() } @@ -245,7 +250,9 @@ impl TableProviderAdapter { .map(|filter| { let filter_cols = visitor::find_columns_by_expr(filter); - let support_pushdown = self.table.support_pushdown(&self.read_schema, &filter_cols); + let support_pushdown = self + .table + .support_pushdown(&self.current_table_schema, &filter_cols); if support_pushdown { TableProviderFilterPushDown::Exact } else { @@ -264,7 +271,7 @@ impl TableProvider for TableProviderAdapter { fn schema(&self) -> SchemaRef { // We use the `read_schema` as the schema of this `TableProvider` - self.read_schema.clone().into_arrow_schema_ref() + self.current_table_schema.clone().into_arrow_schema_ref() } async fn scan( @@ -297,7 +304,7 @@ impl TableSource for TableProviderAdapter { /// Get a reference to the schema for this table fn schema(&self) -> SchemaRef { - self.read_schema.clone().into_arrow_schema_ref() + self.current_table_schema.clone().into_arrow_schema_ref() } /// Get the type of this table for metadata/catalog purposes. 
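The call sites touched in this patch (benchmarks, tools, system_catalog, and the sst-convert tool below) all repeat the same pattern: derive the fetched schema from a `ProjectedSchema`, wrap it in a `RowProjectorBuilder`, and only build the concrete `RowProjector` once the source schema (memtable/SST) is known. A minimal sketch of that pattern, assuming a full read with no projection and no primary-key indexes (i.e. the append-mode case), using the `common_types` APIs introduced in this patch:

```
use common_types::{
    projected_schema::{ProjectedSchema, RowProjectorBuilder},
    schema::Schema,
};

fn make_row_projector_builder(table_schema: Schema) -> RowProjectorBuilder {
    // Full-table read: no projection over the table schema.
    let projected_schema = ProjectedSchema::no_projection(table_schema);
    let fetched_schema = projected_schema.to_record_schema();
    let table_schema = projected_schema.table_schema().clone();
    // `None`: no primary-key indexes, as for tables in `append` update mode.
    RowProjectorBuilder::new(fetched_schema, table_schema, None)
}
```

The actual `RowProjector` is then built per data source via `builder.build(&source_schema)`, which resolves column indexes against the possibly older schema of that memtable or SST.
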
diff --git a/tools/src/bin/sst-convert.rs b/tools/src/bin/sst-convert.rs index 0021a1425b..57c8f8f5fa 100644 --- a/tools/src/bin/sst-convert.rs +++ b/tools/src/bin/sst-convert.rs @@ -30,7 +30,10 @@ use analytic_engine::{ }; use anyhow::{Context, Result}; use clap::Parser; -use common_types::{projected_schema::ProjectedSchema, request_id::RequestId}; +use common_types::{ + projected_schema::{ProjectedSchema, RowProjectorBuilder}, + request_id::RequestId, +}; use generic_error::BoxError; use object_store::{LocalFileSystem, Path}; use runtime::Runtime; @@ -92,15 +95,20 @@ async fn run(args: Args, runtime: Arc) -> Result<()> { let sst_meta = sst_util::meta_from_sst(&store, &input_path).await; let factory = FactoryImpl; let scan_options = ScanOptions::default(); + let projected_schema = ProjectedSchema::no_projection(sst_meta.schema.clone()); + + let fetched_schema = projected_schema.to_record_schema(); + let table_schema = projected_schema.table_schema().clone(); + let row_projector_builder = RowProjectorBuilder::new(fetched_schema, table_schema, None); let reader_opts = SstReadOptions { maybe_table_level_metrics: Arc::new(SstMaybeTableLevelMetrics::new("tool")), frequency: ReadFrequency::Once, num_rows_per_row_group: 8192, - projected_schema: ProjectedSchema::no_projection(sst_meta.schema.clone()), predicate: Arc::new(Predicate::empty()), meta_cache: None, scan_options, runtime, + row_projector_builder, }; let store_picker: ObjectStorePickerRef = Arc::new(store); let mut reader = factory From bb4db609811939130067f0ab3ba9310950156b3f Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Mon, 25 Dec 2023 11:16:00 +0800 Subject: [PATCH 20/38] chore: refactor for better readability (#1400) ## Rationale Reduce two `match`to only one. ## Detailed Changes ## Test Plan CI --- analytic_engine/src/instance/open.rs | 51 +++++++++++----------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs index 8c840bf8ef..0f9ad18ac7 100644 --- a/analytic_engine/src/instance/open.rs +++ b/analytic_engine/src/instance/open.rs @@ -294,48 +294,38 @@ impl ShardOpener { /// Recover table meta data from manifest based on shard. async fn recover_table_metas(&mut self) -> Result<()> { - info!( - "ShardOpener recover table metas begin, shard_id:{}", - self.shard_id - ); + let shard_id = self.shard_id; + let table_num = self.stages.len(); + info!("ShardOpener recover table metas begin, shard_id:{shard_id}, table_num:{table_num}"); for (table_id, state) in self.stages.iter_mut() { match state { // Only do the meta recovery work in `RecoverTableMeta` state. 
- TableOpenStage::RecoverTableMeta(ctx) => { - let result = match Self::recover_single_table_meta( + TableOpenStage::RecoverTableMeta(RecoverTableMetaContext { table_def, space }) => { + match Self::recover_single_table_meta( self.manifest.as_ref(), - self.shard_id, - &ctx.table_def, + shard_id, + table_def, ) .await + .map(|_| space.find_table_by_id(*table_id)) { - Ok(()) => { - let table_data = ctx.space.find_table_by_id(*table_id); - Ok(table_data.map(|data| (data, ctx.space.clone()))) - } - Err(e) => { - error!("ShardOpener recover single table meta failed, table:{:?}, shard_id:{}, err:{e}", ctx.table_def, self.shard_id); - Err(e) - } - }; - - match result { - Ok(Some((table_data, space))) => { + Ok(Some(table_data)) => { *state = TableOpenStage::RecoverTableData(RecoverTableDataContext { table_data, - space, - }) + space: space.clone(), + }); } Ok(None) => { - error!( - "ShardOpener trie to open a dropped table, table:{:?}, shard_id:{}", - ctx.table_def, self.shard_id - ); + error!("ShardOpener tried to open a dropped table, table:{table_def:?}, shard_id:{shard_id}"); + // TODO: is this an error? *state = TableOpenStage::Success(None); } - Err(e) => *state = TableOpenStage::Failed(e), - } + Err(e) => { + error!("ShardOpener recover single table meta failed, table:{table_def:?}, shard_id:{shard_id}, err:{e}"); + *state = TableOpenStage::Failed(e) + } + }; } // Table was found to be opened in init stage. TableOpenStage::Success(_) => {} @@ -348,10 +338,7 @@ impl ShardOpener { } } - info!( - "ShardOpener recover table metas finish, shard_id:{}", - self.shard_id - ); + info!("ShardOpener recover table metas finish, shard_id:{shard_id}, table_num:{table_num}",); Ok(()) } From 3359a9a1c12a9e000e353401b3dc4438038c1240 Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Wed, 27 Dec 2023 12:25:19 +0800 Subject: [PATCH 21/38] fix: skip wal encoding when data wal is disabled (#1401) ## Rationale When data wal is disable, data is still encoded, which waste cpu usage. ## Detailed Changes Skip encode when data wal is disabled. ## Test Plan CI. 
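
For reference, a minimal self-contained sketch of the fast path (stand-in types only; the real change is in `analytic_engine/src/instance/write.rs` and `analytic_engine/src/table/data.rs`):

```
use std::sync::atomic::{AtomicU64, Ordering};

// Stand-in for the table state; mirrors the `next_sequence` helper added in
// analytic_engine/src/table/data.rs.
struct TableData {
    last_sequence: AtomicU64,
}

impl TableData {
    fn next_sequence(&self) -> u64 {
        self.last_sequence.fetch_add(1, Ordering::Relaxed) + 1
    }
}

// When the data WAL is disabled, the write path skips payload encoding
// entirely and only advances the in-memory sequence; otherwise the rows are
// encoded and written to the WAL, which assigns the sequence.
fn assign_sequence(disable_wal: bool, table: &TableData, rows: &[Vec<u8>]) -> u64 {
    if disable_wal {
        table.next_sequence()
    } else {
        let encoded: Vec<&[u8]> = rows.iter().map(|r| r.as_slice()).collect();
        write_to_wal(&encoded)
    }
}

// Placeholder for the actual WAL write; returns the WAL-assigned sequence.
fn write_to_wal(_encoded: &[&[u8]]) -> u64 {
    0
}
```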
--- analytic_engine/src/instance/mod.rs | 1 + analytic_engine/src/instance/open.rs | 1 + analytic_engine/src/instance/write.rs | 27 ++++++++++++++++++--------- analytic_engine/src/lib.rs | 1 + analytic_engine/src/table/data.rs | 5 +++++ 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/analytic_engine/src/instance/mod.rs b/analytic_engine/src/instance/mod.rs index ab8df1ef9b..6eb257b83e 100644 --- a/analytic_engine/src/instance/mod.rs +++ b/analytic_engine/src/instance/mod.rs @@ -189,6 +189,7 @@ pub struct Instance { pub(crate) iter_options: Option, pub(crate) recover_mode: RecoverMode, pub(crate) wal_encode: WalEncodeConfig, + pub(crate) disable_wal: bool, } impl Instance { diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs index 0f9ad18ac7..31702dc95c 100644 --- a/analytic_engine/src/instance/open.rs +++ b/analytic_engine/src/instance/open.rs @@ -149,6 +149,7 @@ impl Instance { scan_options, recover_mode: ctx.config.recover_mode, wal_encode: ctx.config.wal_encode, + disable_wal: ctx.config.wal.disable_data, }); Ok(instance) diff --git a/analytic_engine/src/instance/write.rs b/analytic_engine/src/instance/write.rs index e49ccaacb3..2e5d60d07c 100644 --- a/analytic_engine/src/instance/write.rs +++ b/analytic_engine/src/instance/write.rs @@ -422,19 +422,28 @@ impl<'a> Writer<'a> { self.preprocess_write(&mut encode_ctx).await?; - let encoded_payload = { - let _timer = self.table_data.metrics.start_table_write_encode_timer(); - let schema = self.table_data.schema(); - encode_ctx.encode(&self.instance.wal_encode, &schema)? - }; + let table_data = self.table_data.clone(); + let seq = if self.instance.disable_wal { + // When wal is disabled, just update the last_seq one by one. + table_data.next_sequence() + } else { + let encoded_payload = { + let _timer = self.table_data.metrics.start_table_write_encode_timer(); + let schema = self.table_data.schema(); + encode_ctx.encode(&self.instance.wal_encode, &schema)? + }; - let seq = match encoded_payload { - EncodedPayload::Rows(encoded_rows) => self.write_to_wal_in_rows(encoded_rows).await?, - EncodedPayload::Cols(encoded_cols) => self.write_to_wal_in_cols(encoded_cols).await?, + match encoded_payload { + EncodedPayload::Rows(encoded_rows) => { + self.write_to_wal_in_rows(encoded_rows).await? + } + EncodedPayload::Cols(encoded_cols) => { + self.write_to_wal_in_cols(encoded_cols).await? + } + } }; // Write the row group to the memtable and update the state in the mem. - let table_data = self.table_data.clone(); let EncodeContext { row_group, index_in_writer, diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs index e7d7f81027..68f796837e 100644 --- a/analytic_engine/src/lib.rs +++ b/analytic_engine/src/lib.rs @@ -110,6 +110,7 @@ pub struct Config { /// The interval for sampling the memory usage pub mem_usage_sampling_interval: ReadableDuration, /// The config for log in the wal. + // TODO: move this to WalConfig. 
pub wal_encode: WalEncodeConfig, /// Wal storage config diff --git a/analytic_engine/src/table/data.rs b/analytic_engine/src/table/data.rs index 2c011a9c1b..998c5c3b14 100644 --- a/analytic_engine/src/table/data.rs +++ b/analytic_engine/src/table/data.rs @@ -446,6 +446,11 @@ impl TableData { self.last_sequence.store(seq, Ordering::Release); } + #[inline] + pub fn next_sequence(&self) -> SequenceNumber { + self.last_sequence.fetch_add(1, Ordering::Relaxed) + 1 + } + /// Get last flush time #[inline] pub fn last_flush_time(&self) -> u64 { From 9974944a7f29c2356b2d10096a0ea86ebd8a2324 Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Wed, 27 Dec 2023 17:50:44 +0800 Subject: [PATCH 22/38] chore: add error log for remote server (#1407) ## Rationale Currently there are no error log for remote server, this make it's hard to debug. ## Detailed Changes ## Test Plan No need. --- server/src/grpc/remote_engine_service/mod.rs | 39 +++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index 03bd7fb1f3..3d2963a49b 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -281,7 +281,10 @@ impl RemoteEngineServiceImpl { let (tx, rx) = mpsc::channel(STREAM_QUERY_CHANNEL_LEN); let handle = self.runtimes.read_runtime.spawn(async move { let read_request = request.into_inner(); - handle_stream_read(ctx, read_request).await + handle_stream_read(ctx, read_request).await.map_err(|e| { + error!("Handle stream read failed, err:{e}"); + e + }) }); let streams = handle.await.box_err().context(ErrWithCause { code: StatusCode::Internal, @@ -493,7 +496,10 @@ impl RemoteEngineServiceImpl { let ctx = self.handler_ctx(); let handle = self.runtimes.write_runtime.spawn(async move { let request = request.into_inner(); - handle_write(ctx, request).await + handle_write(ctx, request).await.map_err(|e| { + error!("Handle write failed, err:{e}"); + e + }) }); let res = handle.await.box_err().context(ErrWithCause { @@ -526,7 +532,10 @@ impl RemoteEngineServiceImpl { let ctx = self.handler_ctx(); let handle = self.runtimes.read_runtime.spawn(async move { let request = request.into_inner(); - handle_get_table_info(ctx, request).await + handle_get_table_info(ctx, request).await.map_err(|e| { + error!("Handle get table info failed, err:{e}"); + e + }) }); let res = handle.await.box_err().context(ErrWithCause { @@ -720,7 +729,10 @@ impl RemoteEngineServiceImpl { let ctx = self.handler_ctx(); let handle = self.runtimes.read_runtime.spawn(async move { let request = request.into_inner(); - handle_alter_table_schema(ctx, request).await + handle_alter_table_schema(ctx, request).await.map_err(|e| { + error!("Handle alter table schema failed, err:{e}"); + e + }) }); let res = handle.await.box_err().context(ErrWithCause { @@ -853,11 +865,20 @@ impl RemoteEngineService for RemoteEngineServiceImpl { } let record_stream_result = match self.query_dedup.clone() { - Some(query_dedup) => { - self.dedup_execute_physical_plan_internal(query_dedup, request) - .await - } - None => self.execute_physical_plan_internal(request).await, + Some(query_dedup) => self + .dedup_execute_physical_plan_internal(query_dedup, request) + .await + .map_err(|e| { + error!("Dedup execute physical plan failed, err:{e}"); + e + }), + None => self + .execute_physical_plan_internal(request) + .await + .map_err(|e| { + error!("Execute physical plan failed, err:{e}"); + e + }), }; 
record_stream_to_response_stream!(record_stream_result, ExecutePhysicalPlanStream) From b3fd4591d7dfe982b40ad2a4985916411647bed2 Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Wed, 27 Dec 2023 17:51:08 +0800 Subject: [PATCH 23/38] fix: disable percentile for distributed tables (#1406) ## Rationale See #1405 ## Detailed Changes Disable percentile functions ## Test Plan CI --- .../src/dist_sql_query/physical_plan.rs | 13 +++++++++++++ .../cases/env/cluster/ddl/partition_table.result | 12 ++++++++++++ .../cases/env/cluster/ddl/partition_table.sql | 8 ++++++++ 3 files changed, 33 insertions(+) diff --git a/df_engine_extensions/src/dist_sql_query/physical_plan.rs b/df_engine_extensions/src/dist_sql_query/physical_plan.rs index e64ae96c41..9825227c7f 100644 --- a/df_engine_extensions/src/dist_sql_query/physical_plan.rs +++ b/df_engine_extensions/src/dist_sql_query/physical_plan.rs @@ -33,6 +33,7 @@ use datafusion::{ coalesce_batches::CoalesceBatchesExec, coalesce_partitions::CoalescePartitionsExec, displayable, + expressions::{ApproxPercentileCont, ApproxPercentileContWithWeight}, filter::FilterExec, metrics::{Count, MetricValue, MetricsSet}, projection::ProjectionExec, @@ -619,8 +620,20 @@ pub enum PushDownEvent { } impl PushDownEvent { + // Those aggregate functions can't be pushed down. + // https://github.com/apache/incubator-horaedb/issues/1405 + fn blacklist_expr(expr: &dyn Any) -> bool { + expr.is::() || expr.is::() + } + pub fn new(plan: Arc) -> Self { if let Some(aggr) = plan.as_any().downcast_ref::() { + for aggr_expr in aggr.aggr_expr() { + if Self::blacklist_expr(aggr_expr.as_any()) { + return Self::Unable; + } + } + if *aggr.mode() == AggregateMode::Partial { Self::Terminated(plan) } else { diff --git a/integration_tests/cases/env/cluster/ddl/partition_table.result b/integration_tests/cases/env/cluster/ddl/partition_table.result index 623be78659..81502d1cf6 100644 --- a/integration_tests/cases/env/cluster/ddl/partition_table.result +++ b/integration_tests/cases/env/cluster/ddl/partition_table.result @@ -169,6 +169,18 @@ tsid,t,name,id,value, UInt64(12677620772014847982),Timestamp(1651737067000),String("ceresdb5"),Int32(0),Double(105.0), +SELECT + time_bucket (t, "PT1M") AS ts, + approx_percentile_cont (value, 0.9) AS value +FROM + random_partition_table_t +GROUP BY + time_bucket (t, "PT1M"); + +ts,value, +Timestamp(1651737060000),Double(109.4), + + DROP TABLE IF EXISTS `random_partition_table_t`; affected_rows: 0 diff --git a/integration_tests/cases/env/cluster/ddl/partition_table.sql b/integration_tests/cases/env/cluster/ddl/partition_table.sql index 76fc398650..59a1dd2a7c 100644 --- a/integration_tests/cases/env/cluster/ddl/partition_table.sql +++ b/integration_tests/cases/env/cluster/ddl/partition_table.sql @@ -82,6 +82,14 @@ SELECT * from random_partition_table_t where name = "ceresdb0"; SELECT * from random_partition_table_t where name = "ceresdb5"; +SELECT + time_bucket (t, "PT1M") AS ts, + approx_percentile_cont (value, 0.9) AS value +FROM + random_partition_table_t +GROUP BY + time_bucket (t, "PT1M"); + DROP TABLE IF EXISTS `random_partition_table_t`; SHOW CREATE TABLE random_partition_table_t; From b02bac2ba7475a76157487b102e36c85b0c2b5f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=B2=8D=E9=87=91=E6=97=A5?= Date: Thu, 28 Dec 2023 09:59:41 +0800 Subject: [PATCH 24/38] feat: dist sql analyze (#1260) ## Rationale Currently, the analyze sql can not obtain detailed metrics of partitioned table. 
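
As a concrete illustration (the query is taken verbatim from the integration test added later in this patch), running `EXPLAIN ANALYZE` against a partitioned table could only show the metrics collected locally by the parent `ResolvedPartitionedScan`; the metrics of the sub-table scans executed on remote nodes were missing:

```
EXPLAIN ANALYZE SELECT * from partition_table_t where name = "ceresdb0";
```

With this change, the remote metrics are returned on the same response stream after the record batches and appended to the `metrics=[...]` output, as shown in the updated `partition_table.result`.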
## Detailed Changes Return metrics of partitioned table to remote client and then collect metrics in client. ## Test Plan - Existing tests - add new integration tests for explain analyze --- Cargo.lock | 36 ++-- Cargo.toml | 2 +- .../src/dist_sql_query/mod.rs | 21 +- .../src/dist_sql_query/physical_plan.rs | 47 ++++- .../src/dist_sql_query/resolver.rs | 11 +- .../src/dist_sql_query/test_util.rs | 6 +- .../env/cluster/ddl/partition_table.result | 17 ++ .../cases/env/cluster/ddl/partition_table.sql | 9 + query_engine/src/datafusion_impl/executor.rs | 4 +- .../src/datafusion_impl/physical_planner.rs | 6 +- .../src/datafusion_impl/task_context.rs | 10 +- query_engine/src/executor.rs | 4 +- query_engine/src/physical_planner.rs | 4 +- remote_engine_client/src/client.rs | 117 ++++++----- server/src/grpc/remote_engine_service/mod.rs | 189 ++++++++++++------ table_engine/src/partition/rule/key.rs | 8 +- table_engine/src/provider.rs | 2 +- table_engine/src/remote/model.rs | 6 + 18 files changed, 329 insertions(+), 170 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b41170ad6..e32493fe4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,7 +96,7 @@ dependencies = [ "atomic_enum", "base64 0.13.1", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "codec", "common_types", "datafusion", @@ -1345,7 +1345,7 @@ dependencies = [ [[package]] name = "ceresdbproto" version = "1.0.23" -source = "git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05#2c60e0591b6066957c80e7d6ae97cf53ccd591e1" +source = "git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4#d849fa44e29ea04c7d99c082a38efb8ce5200d5e" dependencies = [ "prost", "protoc-bin-vendored", @@ -1528,7 +1528,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "common_types", "etcd-client", "future_ext", @@ -1606,7 +1606,7 @@ dependencies = [ "arrow 43.0.0", "arrow_ext", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "chrono", "datafusion", "hash_ext", @@ -2362,7 +2362,7 @@ dependencies = [ "async-recursion", "async-trait", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "common_types", "datafusion", "datafusion-proto", @@ -3916,7 +3916,7 @@ name = "meta_client" version = "1.2.6-alpha" dependencies = [ "async-trait", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "common_types", "futures 0.3.28", "generic_error", @@ -4441,7 +4441,7 @@ version = "1.2.6-alpha" dependencies = [ "async-trait", "bytes", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "chrono", "clru", "crc", @@ -5318,7 +5318,7 @@ dependencies = [ "async-trait", "bytes", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 
(git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "clru", "cluster", "common_types", @@ -5445,7 +5445,7 @@ dependencies = [ "arrow 43.0.0", "async-trait", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "cluster", "codec", "common_types", @@ -5756,7 +5756,7 @@ version = "1.2.6-alpha" dependencies = [ "arrow_ext", "async-trait", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "common_types", "futures 0.3.28", "generic_error", @@ -5885,7 +5885,7 @@ name = "router" version = "1.2.6-alpha" dependencies = [ "async-trait", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "cluster", "common_types", "generic_error", @@ -6260,7 +6260,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "clru", "cluster", "common_types", @@ -6786,7 +6786,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "codec", "common_types", "futures 0.3.28", @@ -6808,7 +6808,7 @@ dependencies = [ "arrow_ext", "async-trait", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "common_types", "datafusion", "datafusion-proto", @@ -7011,7 +7011,7 @@ dependencies = [ name = "time_ext" version = "1.2.6-alpha" dependencies = [ - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "chrono", "common_types", "macros", @@ -7523,8 +7523,8 @@ version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ - "cfg-if 1.0.0", - "rand 0.8.5", + "cfg-if 0.1.10", + "rand 0.3.23", "static_assertions", ] @@ -7663,7 +7663,7 @@ version = "1.2.6-alpha" dependencies = [ "async-trait", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=2c60e05)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", "chrono", "codec", "common_types", diff --git a/Cargo.toml b/Cargo.toml index 50875c3cff..a01a722260 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,7 +94,7 @@ bytes = "1" bytes_ext = { path = "components/bytes_ext" } catalog = { path = "catalog" } catalog_impls = { path = "catalog_impls" } -ceresdbproto = { git = "https://github.com/CeresDB/horaedbproto.git", rev = "2c60e05" } +ceresdbproto = { git = "https://github.com/CeresDB/horaedbproto.git", rev = "d849fa4" } codec = { path = "components/codec" } chrono = "0.4" clap = "3.0" diff --git a/df_engine_extensions/src/dist_sql_query/mod.rs b/df_engine_extensions/src/dist_sql_query/mod.rs index 80db7be05e..4bbf6b36ef 100644 --- a/df_engine_extensions/src/dist_sql_query/mod.rs +++ 
b/df_engine_extensions/src/dist_sql_query/mod.rs @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::{fmt, sync::Arc}; +use std::{ + fmt, + sync::{Arc, Mutex}, +}; use async_trait::async_trait; use common_types::projected_schema::ProjectedSchema; @@ -35,8 +38,8 @@ pub mod test_util; pub trait RemotePhysicalPlanExecutor: fmt::Debug + Send + Sync + 'static { fn execute( &self, + task_context: RemoteTaskContext, table: TableIdentifier, - task_context: &TaskContext, plan: Arc, ) -> DfResult>>; } @@ -58,6 +61,20 @@ pub trait ExecutableScanBuilder: fmt::Debug + Send + Sync + 'static { type ExecutableScanBuilderRef = Box; +pub struct RemoteTaskContext { + pub task_ctx: Arc, + pub remote_metrics: Arc>>, +} + +impl RemoteTaskContext { + pub fn new(task_ctx: Arc, remote_metrics: Arc>>) -> Self { + Self { + task_ctx, + remote_metrics, + } + } +} + #[derive(Clone)] pub struct TableScanContext { pub batch_size: usize, diff --git a/df_engine_extensions/src/dist_sql_query/physical_plan.rs b/df_engine_extensions/src/dist_sql_query/physical_plan.rs index 9825227c7f..0dbaf415ac 100644 --- a/df_engine_extensions/src/dist_sql_query/physical_plan.rs +++ b/df_engine_extensions/src/dist_sql_query/physical_plan.rs @@ -18,7 +18,7 @@ use std::{ any::Any, fmt, pin::Pin, - sync::Arc, + sync::{Arc, Mutex}, task::{Context, Poll}, time::{Duration, Instant}, }; @@ -46,7 +46,7 @@ use futures::{future::BoxFuture, FutureExt, Stream, StreamExt}; use table_engine::{remote::model::TableIdentifier, table::ReadRequest}; use trace_metric::{collector::FormatCollectorVisitor, MetricsCollector, TraceMetricWhenDrop}; -use crate::dist_sql_query::{RemotePhysicalPlanExecutor, TableScanContext}; +use crate::dist_sql_query::{RemotePhysicalPlanExecutor, RemoteTaskContext, TableScanContext}; /// Placeholder of partitioned table's scan plan /// It is inexecutable actually and just for carrying the necessary information @@ -148,6 +148,7 @@ pub(crate) struct ResolvedPartitionedScan { pub remote_exec_ctx: Arc, pub pushdown_continue: bool, pub metrics_collector: MetricsCollector, + pub is_analyze: bool, } impl ResolvedPartitionedScan { @@ -155,24 +156,27 @@ impl ResolvedPartitionedScan { remote_executor: Arc, sub_table_plan_ctxs: Vec, metrics_collector: MetricsCollector, + is_analyze: bool, ) -> Self { let remote_exec_ctx = Arc::new(RemoteExecContext { executor: remote_executor, plan_ctxs: sub_table_plan_ctxs, }); - Self::new_with_details(remote_exec_ctx, true, metrics_collector) + Self::new_with_details(remote_exec_ctx, true, metrics_collector, is_analyze) } pub fn new_with_details( remote_exec_ctx: Arc, pushdown_continue: bool, metrics_collector: MetricsCollector, + is_analyze: bool, ) -> Self { Self { remote_exec_ctx, pushdown_continue, metrics_collector, + is_analyze, } } @@ -181,6 +185,7 @@ impl ResolvedPartitionedScan { remote_exec_ctx: self.remote_exec_ctx.clone(), pushdown_continue: false, metrics_collector: self.metrics_collector.clone(), + is_analyze: self.is_analyze, }) } @@ -216,6 +221,7 @@ impl ResolvedPartitionedScan { table: plan_ctx.table.clone(), plan: extended_plan, metrics_collector: plan_ctx.metrics_collector.clone(), + remote_metrics: plan_ctx.remote_metrics.clone(), }) }) .collect::>>()?; @@ -228,6 +234,7 @@ impl ResolvedPartitionedScan { remote_exec_ctx, can_push_down_more, self.metrics_collector.clone(), + self.is_analyze, ); Ok(Arc::new(plan)) @@ -257,6 +264,7 @@ pub(crate) struct SubTablePlanContext { table: TableIdentifier, plan: Arc, 
metrics_collector: MetricsCollector, + remote_metrics: Arc>>, } impl SubTablePlanContext { @@ -269,6 +277,7 @@ impl SubTablePlanContext { table, plan, metrics_collector, + remote_metrics: Arc::new(Mutex::new(None)), } } } @@ -296,6 +305,12 @@ impl ExecutionPlan for ResolvedPartitionedScan { } fn children(&self) -> Vec> { + // If this is a analyze plan, we should not collect metrics of children + // which have been send to remote, So we just return empty children. + if self.is_analyze { + return vec![]; + } + self.remote_exec_ctx .plan_ctxs .iter() @@ -328,13 +343,17 @@ impl ExecutionPlan for ResolvedPartitionedScan { table: sub_table, plan, metrics_collector, + remote_metrics, } = &self.remote_exec_ctx.plan_ctxs[partition]; + let remote_task_ctx = RemoteTaskContext::new(context, remote_metrics.clone()); + // Send plan for remote execution. - let stream_future = - self.remote_exec_ctx - .executor - .execute(sub_table.clone(), &context, plan.clone())?; + let stream_future = self.remote_exec_ctx.executor.execute( + remote_task_ctx, + sub_table.clone(), + plan.clone(), + )?; let record_stream = PartitionedScanStream::new(stream_future, plan.schema(), metrics_collector.clone()); @@ -350,7 +369,18 @@ impl ExecutionPlan for ResolvedPartitionedScan { let mut format_visitor = FormatCollectorVisitor::default(); self.metrics_collector.visit(&mut format_visitor); - let metrics_desc = format_visitor.into_string(); + let mut metrics_desc = format_visitor.into_string(); + + // collect metrics from remote + for sub_table_ctx in &self.remote_exec_ctx.plan_ctxs { + if let Some(remote_metrics) = sub_table_ctx.remote_metrics.lock().unwrap().take() { + metrics_desc.push_str(&format!( + "\n{}:\n{}", + sub_table_ctx.table.table, remote_metrics + )); + } + } + metric_set.push(Arc::new(Metric::new( MetricValue::Count { name: format!("\n{metrics_desc}").into(), @@ -358,7 +388,6 @@ impl ExecutionPlan for ResolvedPartitionedScan { }, None, ))); - Some(metric_set) } } diff --git a/df_engine_extensions/src/dist_sql_query/resolver.rs b/df_engine_extensions/src/dist_sql_query/resolver.rs index 951ba88d4b..c48bfe5351 100644 --- a/df_engine_extensions/src/dist_sql_query/resolver.rs +++ b/df_engine_extensions/src/dist_sql_query/resolver.rs @@ -18,7 +18,7 @@ use async_recursion::async_recursion; use catalog::manager::ManagerRef as CatalogManagerRef; use datafusion::{ error::{DataFusionError, Result as DfResult}, - physical_plan::ExecutionPlan, + physical_plan::{analyze::AnalyzeExec, ExecutionPlan}, }; use table_engine::{remote::model::TableIdentifier, table::TableRef}; @@ -99,7 +99,10 @@ impl Resolver { &self, plan: Arc, ) -> DfResult> { - let resolved_plan = self.resolve_partitioned_scan_internal(plan)?; + // Check if this plan is `AnalyzeExec`, if it is, we should collect metrics. + let is_analyze = plan.as_any().is::(); + + let resolved_plan = self.resolve_partitioned_scan_internal(plan, is_analyze)?; PUSH_DOWN_PLAN_COUNTER .with_label_values(&["remote_scan"]) .inc(); @@ -117,6 +120,7 @@ impl Resolver { pub fn resolve_partitioned_scan_internal( &self, plan: Arc, + is_analyze: bool, ) -> DfResult> { // Leave node, let's resolve it and return. if let Some(unresolved) = plan.as_any().downcast_ref::() { @@ -139,6 +143,7 @@ impl Resolver { self.remote_executor.clone(), remote_plans, metrics_collector, + is_analyze, ))); } @@ -151,7 +156,7 @@ impl Resolver { // Resolve children if exist. 
let mut new_children = Vec::with_capacity(children.len()); for child in children { - let child = self.resolve_partitioned_scan_internal(child)?; + let child = self.resolve_partitioned_scan_internal(child, is_analyze)?; new_children.push(child); } diff --git a/df_engine_extensions/src/dist_sql_query/test_util.rs b/df_engine_extensions/src/dist_sql_query/test_util.rs index 77584a9fca..d70f9ec258 100644 --- a/df_engine_extensions/src/dist_sql_query/test_util.rs +++ b/df_engine_extensions/src/dist_sql_query/test_util.rs @@ -29,7 +29,7 @@ use common_types::{ }; use datafusion::{ error::{DataFusionError, Result as DfResult}, - execution::{FunctionRegistry, TaskContext}, + execution::FunctionRegistry, logical_expr::{expr_fn, Literal, Operator}, physical_plan::{ aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}, @@ -56,7 +56,7 @@ use trace_metric::MetricsCollector; use crate::dist_sql_query::{ physical_plan::{PartitionedScanStream, UnresolvedPartitionedScan, UnresolvedSubTableScan}, resolver::Resolver, - ExecutableScanBuilder, RemotePhysicalPlanExecutor, TableScanContext, + ExecutableScanBuilder, RemotePhysicalPlanExecutor, RemoteTaskContext, TableScanContext, }; // Test context @@ -504,8 +504,8 @@ struct MockRemotePhysicalPlanExecutor; impl RemotePhysicalPlanExecutor for MockRemotePhysicalPlanExecutor { fn execute( &self, + _task_context: RemoteTaskContext, _table: TableIdentifier, - _task_context: &TaskContext, _plan: Arc, ) -> DfResult>> { unimplemented!() diff --git a/integration_tests/cases/env/cluster/ddl/partition_table.result b/integration_tests/cases/env/cluster/ddl/partition_table.result index 81502d1cf6..18e023c006 100644 --- a/integration_tests/cases/env/cluster/ddl/partition_table.result +++ b/integration_tests/cases/env/cluster/ddl/partition_table.result @@ -78,6 +78,23 @@ UInt64(9923681778193615344),Timestamp(1651737067000),String("ceresdb8"),Int32(0) UInt64(4860320137932382618),Timestamp(1651737067000),String("ceresdb9"),Int32(0),Double(109.0), +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +-- SQLNESS REPLACE compute=\d+.?\d*(ยต|m|n) compute=xx +EXPLAIN ANALYZE SELECT * from partition_table_t where name = "ceresdb0"; + +plan_type,plan, +String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:1, metrics=[\npartition_table_t:\n __partition_table_t_1:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_1:\nCoalescePartitionsExec, metrics=[output_rows=1, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_1, parallelism=8, metrics=[\nPredicate { exprs:[name = Utf8(\"ceresdb0\")], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1:\n=0]\n=0]\n"), + + +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +-- SQLNESS REPLACE compute=\d+.?\d*(ยต|m|n) compute=xx +-- SQLNESS REPLACE __partition_table_t_\d __partition_table_t_x +EXPLAIN ANALYZE SELECT * from partition_table_t where name in ("ceresdb0", "ceresdb1", "ceresdb2", "ceresdb3", "ceresdb4"); + +plan_type,plan, +String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:3, metrics=[\npartition_table_t:\n __partition_table_t_x:\n poll_duration=xxs\n 
total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=1, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=2\n scan_memtable_1:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=2\n scan_memtable_1:\n=0]\n=0]\n"), + + ALTER TABLE partition_table_t ADD COLUMN (b string); affected_rows: 0 diff --git a/integration_tests/cases/env/cluster/ddl/partition_table.sql b/integration_tests/cases/env/cluster/ddl/partition_table.sql index 59a1dd2a7c..46be8e1b69 100644 --- a/integration_tests/cases/env/cluster/ddl/partition_table.sql +++ b/integration_tests/cases/env/cluster/ddl/partition_table.sql @@ -35,6 +35,15 @@ SELECT * from partition_table_t where name in ("ceresdb0", "ceresdb1", "ceresdb2 SELECT * from partition_table_t where name in ("ceresdb5", "ceresdb6", "ceresdb7","ceresdb8", "ceresdb9", "ceresdb10") order by name; +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +-- SQLNESS REPLACE compute=\d+.?\d*(ยต|m|n) compute=xx +EXPLAIN ANALYZE SELECT * from partition_table_t where name = "ceresdb0"; + +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +-- SQLNESS REPLACE compute=\d+.?\d*(ยต|m|n) compute=xx +-- SQLNESS REPLACE __partition_table_t_\d __partition_table_t_x +EXPLAIN ANALYZE SELECT * from partition_table_t where name in ("ceresdb0", "ceresdb1", "ceresdb2", "ceresdb3", "ceresdb4"); + ALTER TABLE partition_table_t ADD COLUMN (b string); INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, "ceresdb0", 100); diff --git 
a/query_engine/src/datafusion_impl/executor.rs b/query_engine/src/datafusion_impl/executor.rs index 0412c1d144..eb75c0daa3 100644 --- a/query_engine/src/datafusion_impl/executor.rs +++ b/query_engine/src/datafusion_impl/executor.rs @@ -29,7 +29,7 @@ use crate::{ }, error::*, executor::Executor, - physical_planner::{PhysicalPlanPtr, TaskExecContext}, + physical_planner::{PhysicalPlanRef, TaskExecContext}, }; #[derive(Debug, Clone)] @@ -68,7 +68,7 @@ impl Executor for DatafusionExecutorImpl { async fn execute( &self, ctx: &Context, - physical_plan: PhysicalPlanPtr, + physical_plan: PhysicalPlanRef, ) -> Result { debug!( "DatafusionExecutorImpl begin to execute plan, request_id:{}, physical_plan:{:?}", diff --git a/query_engine/src/datafusion_impl/physical_planner.rs b/query_engine/src/datafusion_impl/physical_planner.rs index 50b302e502..5828733909 100644 --- a/query_engine/src/datafusion_impl/physical_planner.rs +++ b/query_engine/src/datafusion_impl/physical_planner.rs @@ -26,7 +26,7 @@ use crate::{ DfContextBuilder, }, error::*, - physical_planner::{PhysicalPlanPtr, PhysicalPlanner}, + physical_planner::{PhysicalPlanRef, PhysicalPlanner}, }; /// Physical planner based on datafusion @@ -58,7 +58,7 @@ impl DatafusionPhysicalPlannerImpl { #[async_trait] impl PhysicalPlanner for DatafusionPhysicalPlannerImpl { // TODO: we should modify `QueryPlan` to support create remote plan here. - async fn plan(&self, ctx: &Context, logical_plan: QueryPlan) -> Result { + async fn plan(&self, ctx: &Context, logical_plan: QueryPlan) -> Result { // Register catalogs to datafusion execution context. let catalogs = CatalogProviderAdapter::new_adapters(logical_plan.tables.clone()); // TODO: maybe we should not build `SessionContext` in each physical plan's @@ -88,6 +88,6 @@ impl PhysicalPlanner for DatafusionPhysicalPlannerImpl { }; let physical_plan = DataFusionPhysicalPlanAdapter::new(typed_plan); - Ok(Box::new(physical_plan)) + Ok(Arc::new(physical_plan)) } } diff --git a/query_engine/src/datafusion_impl/task_context.rs b/query_engine/src/datafusion_impl/task_context.rs index 5946f3d22b..f19c5dde35 100644 --- a/query_engine/src/datafusion_impl/task_context.rs +++ b/query_engine/src/datafusion_impl/task_context.rs @@ -33,14 +33,14 @@ use datafusion_proto::{ }; use df_engine_extensions::dist_sql_query::{ resolver::Resolver, ExecutableScanBuilder, RemotePhysicalPlanExecutor, - RemotePhysicalPlanExecutorRef, TableScanContext, + RemotePhysicalPlanExecutorRef, RemoteTaskContext, TableScanContext, }; use futures::future::BoxFuture; use generic_error::BoxError; use prost::Message; use snafu::ResultExt; use table_engine::{ - provider::{CeresdbOptions, ScanTable}, + provider::{CeresdbOptions, ScanTable, SCAN_TABLE_METRICS_COLLECTOR_NAME}, remote::{ model::{ ExecContext, ExecutePlanRequest, PhysicalPlan, RemoteExecuteRequest, TableIdentifier, @@ -172,12 +172,13 @@ struct RemotePhysicalPlanExecutorImpl { impl RemotePhysicalPlanExecutor for RemotePhysicalPlanExecutorImpl { fn execute( &self, + task_context: RemoteTaskContext, table: TableIdentifier, - task_context: &TaskContext, plan: Arc, ) -> DfResult>> { // Get the custom context to rebuild execution context. let ceresdb_options = task_context + .task_ctx .session_config() .options() .extensions @@ -223,6 +224,7 @@ impl RemotePhysicalPlanExecutor for RemotePhysicalPlanExecutorImpl { let request = ExecutePlanRequest { plan_schema, remote_request, + remote_metrics: task_context.remote_metrics, }; // Remote execute. 
@@ -288,7 +290,7 @@ impl ExecutableScanBuilder for ExecutableScanBuilderImpl { opts: read_opts, projected_schema: ctx.projected_schema, predicate: ctx.predicate, - metrics_collector: MetricsCollector::default(), + metrics_collector: MetricsCollector::new(SCAN_TABLE_METRICS_COLLECTOR_NAME.to_string()), }; let mut scan = ScanTable::new(table, read_request); diff --git a/query_engine/src/executor.rs b/query_engine/src/executor.rs index bf32a24b03..92b0a71492 100644 --- a/query_engine/src/executor.rs +++ b/query_engine/src/executor.rs @@ -19,7 +19,7 @@ use std::{fmt, sync::Arc}; use async_trait::async_trait; use table_engine::stream::SendableRecordBatchStream; -use crate::{context::Context, error::*, physical_planner::PhysicalPlanPtr}; +use crate::{context::Context, error::*, physical_planner::PhysicalPlanRef}; /// Query executor /// @@ -33,7 +33,7 @@ pub trait Executor: fmt::Debug + Send + Sync + 'static { async fn execute( &self, ctx: &Context, - physical_plan: PhysicalPlanPtr, + physical_plan: PhysicalPlanRef, ) -> Result; } diff --git a/query_engine/src/physical_planner.rs b/query_engine/src/physical_planner.rs index c32975e190..9233362f62 100644 --- a/query_engine/src/physical_planner.rs +++ b/query_engine/src/physical_planner.rs @@ -29,7 +29,7 @@ use crate::{context::Context, datafusion_impl::task_context::DatafusionTaskExecC #[async_trait] pub trait PhysicalPlanner: fmt::Debug + Send + Sync + 'static { /// Create a physical plan from a logical plan - async fn plan(&self, ctx: &Context, logical_plan: QueryPlan) -> Result; + async fn plan(&self, ctx: &Context, logical_plan: QueryPlan) -> Result; } pub type PhysicalPlannerRef = Arc; @@ -45,7 +45,7 @@ pub trait PhysicalPlan: std::fmt::Debug + Sync + Send + 'static { fn metrics_to_string(&self) -> String; } -pub type PhysicalPlanPtr = Box; +pub type PhysicalPlanRef = Arc; /// Task context, just a wrapper of datafusion task context now #[derive(Default)] diff --git a/remote_engine_client/src/client.rs b/remote_engine_client/src/client.rs index 80e47ad6ef..456de91413 100644 --- a/remote_engine_client/src/client.rs +++ b/remote_engine_client/src/client.rs @@ -17,7 +17,7 @@ use std::{ collections::HashMap, pin::Pin, - sync::Arc, + sync::{Arc, Mutex}, task::{Context, Poll}, }; @@ -26,8 +26,12 @@ use arrow_ext::{ ipc::{CompressOptions, CompressionMethod}, }; use ceresdbproto::{ - remote_engine::{self, read_response::Output::Arrow, remote_engine_service_client::*}, - storage::arrow_payload, + remote_engine::{ + self, + read_response::Output::{Arrow, Metric}, + remote_engine_service_client::*, + }, + storage::{arrow_payload, ArrowPayload}, }; use common_types::{record_batch::RecordBatch, schema::RecordSchema}; use futures::{Stream, StreamExt}; @@ -115,8 +119,12 @@ impl Client { // When success to get the stream, table has been found in remote, not need to // evict cache entry. let response = response.into_inner(); - let remote_read_record_batch_stream = - ClientReadRecordBatchStream::new(table_ident, response, record_schema); + let remote_read_record_batch_stream = ClientReadRecordBatchStream::new( + table_ident, + response, + record_schema, + Default::default(), + ); Ok(remote_read_record_batch_stream) } @@ -481,8 +489,12 @@ impl Client { // When success to get the stream, table has been found in remote, not need to // evict cache entry. 
let response = response.into_inner(); - let remote_execute_plan_stream = - ClientReadRecordBatchStream::new(table_ident, response, plan_schema); + let remote_execute_plan_stream = ClientReadRecordBatchStream::new( + table_ident, + response, + plan_schema, + request.remote_metrics, + ); Ok(remote_execute_plan_stream) } @@ -500,6 +512,7 @@ pub struct ClientReadRecordBatchStream { pub table_ident: TableIdentifier, pub response_stream: Streaming, pub record_schema: RecordSchema, + pub remote_metrics: Arc>>, } impl ClientReadRecordBatchStream { @@ -507,11 +520,13 @@ impl ClientReadRecordBatchStream { table_ident: TableIdentifier, response_stream: Streaming, record_schema: RecordSchema, + remote_metrics: Arc>>, ) -> Self { Self { table_ident, response_stream, record_schema, + remote_metrics, } } } @@ -534,52 +549,14 @@ impl Stream for ClientReadRecordBatchStream { match response.output { None => Poll::Ready(None), - Some(v) => { - let record_batch = match v { - Arrow(mut v) => { - if v.record_batches.len() != 1 { - return Poll::Ready(Some( - InvalidRecordBatchNumber { - batch_num: v.record_batches.len(), - } - .fail(), - )); - } - - let compression = match v.compression() { - arrow_payload::Compression::None => CompressionMethod::None, - arrow_payload::Compression::Zstd => CompressionMethod::Zstd, - }; - - ipc::decode_record_batches( - v.record_batches.swap_remove(0), - compression, - ) - .map_err(|e| Box::new(e) as _) - .context(Convert { - msg: "decode read record batch", - }) - .and_then( - |mut record_batch_vec| { - ensure!( - record_batch_vec.len() == 1, - InvalidRecordBatchNumber { - batch_num: record_batch_vec.len() - } - ); - record_batch_vec - .swap_remove(0) - .try_into() - .map_err(|e| Box::new(e) as _) - .context(Convert { - msg: "convert read record batch", - }) - }, - ) - } - }; - Poll::Ready(Some(record_batch)) - } + Some(v) => match v { + Arrow(v) => Poll::Ready(Some(convert_arrow_payload(v))), + Metric(v) => { + let mut remote_metrics = this.remote_metrics.lock().unwrap(); + *remote_metrics = Some(v.metric); + Poll::Ready(None) + } + }, } } @@ -594,3 +571,37 @@ impl Stream for ClientReadRecordBatchStream { } } } + +fn convert_arrow_payload(mut v: ArrowPayload) -> Result { + if v.record_batches.len() != 1 { + return InvalidRecordBatchNumber { + batch_num: v.record_batches.len(), + } + .fail(); + } + let compression = match v.compression() { + arrow_payload::Compression::None => CompressionMethod::None, + arrow_payload::Compression::Zstd => CompressionMethod::Zstd, + }; + + ipc::decode_record_batches(v.record_batches.swap_remove(0), compression) + .map_err(|e| Box::new(e) as _) + .context(Convert { + msg: "decode read record batch", + }) + .and_then(|mut record_batch_vec| { + ensure!( + record_batch_vec.len() == 1, + InvalidRecordBatchNumber { + batch_num: record_batch_vec.len() + } + ); + record_batch_vec + .swap_remove(0) + .try_into() + .map_err(|e| Box::new(e) as _) + .context(Convert { + msg: "convert read record batch", + }) + }) +} diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index 3d2963a49b..1135930014 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -27,11 +27,13 @@ use async_trait::async_trait; use catalog::{manager::ManagerRef, schema::SchemaRef}; use ceresdbproto::{ remote_engine::{ - execute_plan_request, read_response::Output::Arrow, - remote_engine_service_server::RemoteEngineService, row_group, AlterTableOptionsRequest, - 
AlterTableOptionsResponse, AlterTableSchemaRequest, AlterTableSchemaResponse, ExecContext, - ExecutePlanRequest, GetTableInfoRequest, GetTableInfoResponse, ReadRequest, ReadResponse, - WriteBatchRequest, WriteRequest, WriteResponse, + execute_plan_request, + read_response::Output::{Arrow, Metric}, + remote_engine_service_server::RemoteEngineService, + row_group, AlterTableOptionsRequest, AlterTableOptionsResponse, AlterTableSchemaRequest, + AlterTableSchemaResponse, ExecContext, ExecutePlanRequest, GetTableInfoRequest, + GetTableInfoResponse, MetricPayload, ReadRequest, ReadResponse, WriteBatchRequest, + WriteRequest, WriteResponse, }, storage::{arrow_payload, ArrowPayload}, }; @@ -50,6 +52,7 @@ use proxy::{ use query_engine::{ context::Context as QueryContext, datafusion_impl::physical_plan::{DataFusionPhysicalPlanAdapter, TypedPlan}, + physical_planner::PhysicalPlanRef, QueryEngineRef, QueryEngineType, }; use snafu::{OptionExt, ResultExt}; @@ -82,6 +85,12 @@ pub mod error; const STREAM_QUERY_CHANNEL_LEN: usize = 200; const DEFAULT_COMPRESS_MIN_LENGTH: usize = 80 * 1024; +#[derive(Debug, Clone)] +pub enum RecordBatchWithMetric { + RecordBatch(RecordBatch), + Metric(String), +} + #[derive(Clone, Debug, PartialEq, Eq, Hash)] pub struct StreamReadReqKey { table: String, @@ -192,46 +201,94 @@ impl Drop for StreamWithMetric { } } +struct RemoteExecStream { + inner: BoxStream<'static, Result>, + physical_plan: Option, +} + +impl RemoteExecStream { + fn new( + inner: BoxStream<'static, Result>, + physical_plan: Option, + ) -> Self { + Self { + inner, + physical_plan, + } + } +} + +impl Stream for RemoteExecStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + match this.inner.poll_next_unpin(cx) { + Poll::Ready(Some(res)) => { + Poll::Ready(Some(res.map(RecordBatchWithMetric::RecordBatch))) + } + Poll::Ready(None) => match &this.physical_plan { + Some(physical_plan) => { + let metrics = physical_plan.metrics_to_string(); + this.physical_plan = None; + Poll::Ready(Some(Ok(RecordBatchWithMetric::Metric(metrics)))) + } + None => Poll::Ready(None), + }, + Poll::Pending => Poll::Pending, + } + } +} + macro_rules! 
record_stream_to_response_stream { ($record_stream_result:ident, $StreamType:ident) => { match $record_stream_result { Ok(stream) => { let new_stream: Self::$StreamType = Box::pin(stream.map(|res| match res { - Ok(record_batch) => { - let resp = match ipc::encode_record_batch( - &record_batch.into_arrow_record_batch(), - CompressOptions { - compress_min_length: DEFAULT_COMPRESS_MIN_LENGTH, - method: CompressionMethod::Zstd, - }, - ) - .box_err() - .context(ErrWithCause { - code: StatusCode::Internal, - msg: "encode record batch failed", - }) { - Err(e) => ReadResponse { - header: Some(error::build_err_header(e)), - ..Default::default() - }, - Ok(CompressOutput { payload, method }) => { - let compression = match method { - CompressionMethod::None => arrow_payload::Compression::None, - CompressionMethod::Zstd => arrow_payload::Compression::Zstd, - }; - - ReadResponse { - header: Some(error::build_ok_header()), - output: Some(Arrow(ArrowPayload { - record_batches: vec![payload], - compression: compression as i32, - })), + Ok(res) => match res { + RecordBatchWithMetric::Metric(metric) => { + let resp = ReadResponse { + header: Some(error::build_ok_header()), + output: Some(Metric(MetricPayload { metric })), + }; + Ok(resp) + } + RecordBatchWithMetric::RecordBatch(record_batch) => { + let resp = match ipc::encode_record_batch( + &record_batch.into_arrow_record_batch(), + CompressOptions { + compress_min_length: DEFAULT_COMPRESS_MIN_LENGTH, + method: CompressionMethod::Zstd, + }, + ) + .box_err() + .context(ErrWithCause { + code: StatusCode::Internal, + msg: "encode record batch failed", + }) { + Err(e) => ReadResponse { + header: Some(error::build_err_header(e)), + ..Default::default() + }, + Ok(CompressOutput { payload, method }) => { + let compression = match method { + CompressionMethod::None => arrow_payload::Compression::None, + CompressionMethod::Zstd => arrow_payload::Compression::Zstd, + }; + + ReadResponse { + header: Some(error::build_ok_header()), + output: Some(Arrow(ArrowPayload { + record_batches: vec![payload], + compression: compression as i32, + })), + } } - } - }; + }; - Ok(resp) - } + Ok(resp) + } + }, Err(e) => { let resp = ReadResponse { header: Some(error::build_err_header(e)), @@ -240,7 +297,6 @@ macro_rules! 
record_stream_to_response_stream { Ok(resp) } })); - Ok(Response::new(new_stream)) } Err(e) => { @@ -274,7 +330,7 @@ impl RemoteEngineServiceImpl { async fn stream_read_internal( &self, request: Request, - ) -> Result> { + ) -> Result { let metric = StreamReadMetricCollector(Instant::now()); let ctx = self.handler_ctx(); @@ -316,17 +372,15 @@ impl RemoteEngineServiceImpl { }); } - Ok(StreamWithMetric::new( - Box::pin(ReceiverStream::new(rx)), - metric, - )) + let stream = StreamWithMetric::new(Box::pin(ReceiverStream::new(rx)), metric); + Ok(RemoteExecStream::new(Box::pin(stream), None)) } async fn dedup_stream_read_internal( &self, query_dedup: QueryDedup, request: Request, - ) -> Result> { + ) -> Result { let metric = StreamReadMetricCollector(Instant::now()); let request = request.into_inner(); @@ -371,10 +425,8 @@ impl RemoteEngineServiceImpl { } } - Ok(StreamWithMetric::new( - Box::pin(ReceiverStream::new(rx)), - metric, - )) + let stream = StreamWithMetric::new(Box::pin(ReceiverStream::new(rx)), metric); + Ok(RemoteExecStream::new(Box::pin(stream), None)) } async fn read_and_send_dedupped_resps( @@ -614,7 +666,7 @@ impl RemoteEngineServiceImpl { async fn execute_physical_plan_internal( &self, request: Request, - ) -> Result> { + ) -> Result { let request = request.into_inner(); let query_engine = self.instance.query_engine.clone(); let (ctx, encoded_plan) = extract_plan_from_req(request)?; @@ -636,10 +688,15 @@ impl RemoteEngineServiceImpl { ctx.timeout_ms, ); + let physical_plan = Arc::new(DataFusionPhysicalPlanAdapter::new(TypedPlan::Remote( + encoded_plan, + ))); + let physical_plan_clone = physical_plan.clone(); + let stream = self .runtimes .read_runtime - .spawn(async move { handle_execute_plan(query_ctx, encoded_plan, query_engine).await }) + .spawn(async move { handle_execute_plan(query_ctx, physical_plan, query_engine).await }) .await .box_err() .with_context(|| ErrWithCause { @@ -653,14 +710,18 @@ impl RemoteEngineServiceImpl { }) }); - Ok(StreamWithMetric::new(Box::pin(stream), metric)) + let stream = StreamWithMetric::new(Box::pin(stream), metric); + Ok(RemoteExecStream::new( + Box::pin(stream), + Some(physical_plan_clone), + )) } async fn dedup_execute_physical_plan_internal( &self, query_dedup: QueryDedup, request: Request, - ) -> Result> { + ) -> Result { let request = request.into_inner(); let query_engine = self.instance.query_engine.clone(); let (ctx, encoded_plan) = extract_plan_from_req(request)?; @@ -685,6 +746,12 @@ impl RemoteEngineServiceImpl { encoded_plan: encoded_plan.clone(), }; + let physical_plan = Arc::new(DataFusionPhysicalPlanAdapter::new(TypedPlan::Remote( + encoded_plan, + ))); + + let physical_plan_clone = physical_plan.clone(); + let QueryDedup { config, physical_plan_notifiers, @@ -696,7 +763,7 @@ impl RemoteEngineServiceImpl { // The first request, need to handle it, and then notify the other requests. 
RequestResult::First => { let query = async move { - handle_execute_plan(query_ctx, encoded_plan, query_engine) + handle_execute_plan(query_ctx, physical_plan, query_engine) .await .map(PartitionedStreams::one_stream) }; @@ -715,9 +782,10 @@ impl RemoteEngineServiceImpl { } } - Ok(StreamWithMetric::new( - Box::pin(ReceiverStream::new(rx)), - metric, + let stream = StreamWithMetric::new(Box::pin(ReceiverStream::new(rx)), metric); + Ok(RemoteExecStream::new( + Box::pin(stream), + Some(physical_plan_clone), )) } @@ -1140,14 +1208,9 @@ fn create_query_ctx( async fn handle_execute_plan( ctx: QueryContext, - encoded_plan: Vec, + physical_plan: PhysicalPlanRef, query_engine: QueryEngineRef, ) -> Result { - // TODO: Build remote plan in physical planner. - let physical_plan = Box::new(DataFusionPhysicalPlanAdapter::new(TypedPlan::Remote( - encoded_plan, - ))); - // Execute plan. let executor = query_engine.executor(); executor diff --git a/table_engine/src/partition/rule/key.rs b/table_engine/src/partition/rule/key.rs index d33fd88ba4..b8d9e87c0f 100644 --- a/table_engine/src/partition/rule/key.rs +++ b/table_engine/src/partition/rule/key.rs @@ -14,7 +14,7 @@ //! Key partition rule -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap}; use common_types::{ datum::{Datum, DatumView}, @@ -136,8 +136,8 @@ impl KeyRule { &self, group: &[usize], filters: &[PartitionFilter], - ) -> Result> { - let mut partitions = HashSet::new(); + ) -> Result> { + let mut partitions = BTreeSet::new(); let expanded_group = expand_partition_keys_group(group, filters)?; for partition_keys in expanded_group { let partition = compute_partition(partition_keys.into_iter(), self.partition_num); @@ -219,7 +219,7 @@ impl PartitionRule for KeyRule { target_partitions = target_partitions .intersection(&partitions) .copied() - .collect::>(); + .collect::>(); } Ok(target_partitions.into_iter().collect()) diff --git a/table_engine/src/provider.rs b/table_engine/src/provider.rs index 6b0c38a770..fb3cb41d55 100644 --- a/table_engine/src/provider.rs +++ b/table_engine/src/provider.rs @@ -47,7 +47,7 @@ use crate::{ table::{ReadOptions, ReadRequest, TableRef}, }; -const SCAN_TABLE_METRICS_COLLECTOR_NAME: &str = "scan_table"; +pub const SCAN_TABLE_METRICS_COLLECTOR_NAME: &str = "scan_table"; #[derive(Clone, Debug)] pub struct CeresdbOptions { diff --git a/table_engine/src/remote/model.rs b/table_engine/src/remote/model.rs index 2fc6a297eb..bd99670375 100644 --- a/table_engine/src/remote/model.rs +++ b/table_engine/src/remote/model.rs @@ -16,6 +16,7 @@ use std::{ collections::HashMap, + sync::{Arc, Mutex}, time::{Duration, Instant}, }; @@ -409,6 +410,9 @@ pub struct ExecutePlanRequest { /// Remote plan execution request pub remote_request: RemoteExecuteRequest, + + /// Collect metrics of remote plan + pub remote_metrics: Arc>>, } impl ExecutePlanRequest { @@ -417,6 +421,7 @@ impl ExecutePlanRequest { plan_schema: RecordSchema, context: ExecContext, physical_plan: PhysicalPlan, + remote_metrics: Arc>>, ) -> Self { let remote_request = RemoteExecuteRequest { table, @@ -427,6 +432,7 @@ impl ExecutePlanRequest { Self { plan_schema, remote_request, + remote_metrics, } } } From ce4044b8c2f02551ccd0c46a6d03cdf1e3032dd8 Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Thu, 28 Dec 2023 15:35:46 +0800 Subject: [PATCH 25/38] feat: impl priority runtime for read (#1303) ## Rationale Close #1299 ## Detailed Changes - Add PriorityRuntime component, and use in read API - In normal query, its plan will be executed in higher 
runtime by default, when executor decide query is expensive, then it will spawn `stream.poll` in another lower runtime. - In distributed query, a priority field is added in remote query request, so it can decide which runtime to run on. ## Test Plan Newly added UT --- Cargo.lock | 29 ++- Cargo.toml | 8 +- analytic_engine/src/instance/mod.rs | 4 +- analytic_engine/src/instance/read.rs | 6 +- analytic_engine/src/table/mod.rs | 2 + analytic_engine/src/tests/table.rs | 1 + analytic_engine/src/tests/util.rs | 5 +- components/logger/Cargo.toml | 1 + components/logger/src/lib.rs | 10 +- components/runtime/src/lib.rs | 4 +- components/runtime/src/priority_runtime.rs | 98 ++++++++ df_engine_extensions/Cargo.toml | 1 + .../src/dist_sql_query/mod.rs | 18 +- .../src/dist_sql_query/physical_plan.rs | 3 + .../src/dist_sql_query/resolver.rs | 9 +- .../src/dist_sql_query/test_util.rs | 5 + docs/example-cluster-1.toml | 2 +- .../cases/common/dml/issue-1087.result | 4 +- .../cases/common/dml/issue-341.result | 8 +- .../cases/common/dml/issue-59.result | 2 +- .../cases/common/explain/explain.result | 2 +- .../cases/common/optimizer/optimizer.result | 2 +- .../env/cluster/ddl/partition_table.result | 4 +- .../cases/env/local/ddl/query-plan.result | 21 +- .../cases/env/local/ddl/query-plan.sql | 5 + interpreters/Cargo.toml | 3 + interpreters/src/context.rs | 20 +- interpreters/src/factory.rs | 14 +- interpreters/src/lib.rs | 4 +- .../mod.rs => interpreters/src/metrics.rs | 14 +- interpreters/src/select.rs | 108 ++++++--- interpreters/src/tests.rs | 25 ++- proxy/src/instance.rs | 2 + proxy/src/lib.rs | 81 +------ proxy/src/read.rs | 37 +--- query_engine/Cargo.toml | 1 + query_engine/src/config.rs | 3 + query_engine/src/context.rs | 2 + query_engine/src/datafusion_impl/mod.rs | 75 +------ .../src/datafusion_impl/physical_planner.rs | 39 ++-- .../src/datafusion_impl/task_context.rs | 6 + query_frontend/Cargo.toml | 3 + query_frontend/src/frontend.rs | 75 ++++--- query_frontend/src/influxql/planner.rs | 13 +- query_frontend/src/lib.rs | 1 + query_frontend/src/logical_optimizer/mod.rs | 49 ++++ .../src}/logical_optimizer/type_conversion.rs | 5 +- query_frontend/src/plan.rs | 209 +++++++++++++++++- query_frontend/src/planner.rs | 15 +- query_frontend/src/promql/convert.rs | 3 +- query_frontend/src/promql/remote.rs | 16 +- .../src/grpc/remote_engine_service/metrics.rs | 25 +++ server/src/grpc/remote_engine_service/mod.rs | 87 ++++++-- server/src/http.rs | 6 +- server/src/server.rs | 3 + src/ceresdb/src/config.rs | 5 +- src/ceresdb/src/setup.rs | 22 +- system_catalog/src/sys_catalog_table.rs | 1 + table_engine/src/engine.rs | 4 +- table_engine/src/provider.rs | 48 +++- table_engine/src/remote/model.rs | 11 +- table_engine/src/table.rs | 5 + 62 files changed, 909 insertions(+), 385 deletions(-) create mode 100644 components/runtime/src/priority_runtime.rs rename query_engine/src/datafusion_impl/logical_optimizer/mod.rs => interpreters/src/metrics.rs (66%) create mode 100644 query_frontend/src/logical_optimizer/mod.rs rename {query_engine/src/datafusion_impl => query_frontend/src}/logical_optimizer/type_conversion.rs (98%) create mode 100644 server/src/grpc/remote_engine_service/metrics.rs diff --git a/Cargo.lock b/Cargo.lock index e32493fe4d..271041656a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -648,7 +648,7 @@ dependencies = [ [[package]] name = "arrow_util" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" 
+source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "ahash 0.8.3", "arrow 43.0.0", @@ -2271,7 +2271,7 @@ dependencies = [ [[package]] name = "datafusion_util" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" +source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "async-trait", "datafusion", @@ -2372,6 +2372,7 @@ dependencies = [ "lazy_static", "prometheus 0.12.0", "prost", + "runtime", "snafu 0.6.10", "table_engine", "tokio", @@ -2860,7 +2861,7 @@ checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2" [[package]] name = "generated_types" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" +source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "pbjson", "pbjson-build", @@ -3298,7 +3299,7 @@ dependencies = [ [[package]] name = "influxdb_influxql_parser" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" +source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "chrono", "chrono-tz", @@ -3352,12 +3353,15 @@ dependencies = [ "futures 0.3.28", "generic_error", "hash_ext", + "lazy_static", "logger", "macros", "meta_client", + "prometheus 0.12.0", "query_engine", "query_frontend", "regex", + "runtime", "snafu 0.6.10", "table_engine", "test_util", @@ -3388,7 +3392,7 @@ dependencies = [ [[package]] name = "iox_query" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" +source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "arrow 43.0.0", "arrow_util", @@ -3412,7 +3416,7 @@ dependencies = [ [[package]] name = "iox_query_influxql" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" +source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "arrow 43.0.0", "chrono", @@ -3738,6 +3742,7 @@ version = "1.2.6-alpha" dependencies = [ "chrono", "log", + "runtime", "serde", "slog", "slog-async", @@ -4511,7 +4516,7 @@ dependencies = [ [[package]] name = "observability_deps" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" +source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "tracing", ] @@ -5430,6 +5435,7 @@ dependencies = [ "macros", "prost", "query_frontend", + "runtime", "serde", "snafu 0.6.10", "table_engine", @@ -5446,6 +5452,7 @@ dependencies = [ "async-trait", "catalog", "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "chrono", "cluster", "codec", "common_types", @@ -5455,6 +5462,7 @@ dependencies = [ "generic_error", "hash_ext", "influxdb_influxql_parser", + "iox_query", "iox_query_influxql", "itertools 0.10.5", 
"lazy_static", @@ -5465,6 +5473,7 @@ dependencies = [ "prom-remote-api", "regex", "regex-syntax 0.6.29", + "runtime", "schema", "snafu 0.6.10", "sqlparser", @@ -5475,7 +5484,7 @@ dependencies = [ [[package]] name = "query_functions" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" +source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "arrow 43.0.0", "chrono", @@ -6130,7 +6139,7 @@ dependencies = [ [[package]] name = "schema" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" +source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "arrow 43.0.0", "hashbrown 0.13.2", @@ -6901,7 +6910,7 @@ dependencies = [ [[package]] name = "test_helpers" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql.git?rev=acbd3ad7651f2deb74857155bea892f88926da57#acbd3ad7651f2deb74857155bea892f88926da57" +source = "git+https://github.com/CeresDB/influxql.git?rev=a905863#a9058633c03f018607dc1e4f6ca090b82d46a30c" dependencies = [ "dotenvy", "observability_deps", diff --git a/Cargo.toml b/Cargo.toml index a01a722260..c99bbaf56b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -115,10 +115,10 @@ hash_ext = { path = "components/hash_ext" } hex = "0.4.3" hyperloglog = { git = "https://github.com/jedisct1/rust-hyperloglog.git", rev = "425487ce910f26636fbde8c4d640b538431aad50" } id_allocator = { path = "components/id_allocator" } -influxql-logical-planner = { git = "https://github.com/CeresDB/influxql.git", rev = "acbd3ad7651f2deb74857155bea892f88926da57", package = "iox_query_influxql" } -influxql-parser = { git = "https://github.com/CeresDB/influxql.git", rev = "acbd3ad7651f2deb74857155bea892f88926da57", package = "influxdb_influxql_parser" } -influxql-query = { git = "https://github.com/CeresDB/influxql.git", rev = "acbd3ad7651f2deb74857155bea892f88926da57", package = "iox_query" } -influxql-schema = { git = "https://github.com/CeresDB/influxql.git", rev = "acbd3ad7651f2deb74857155bea892f88926da57", package = "schema" } +influxql-logical-planner = { git = "https://github.com/CeresDB/influxql.git", rev = "a905863", package = "iox_query_influxql" } +influxql-parser = { git = "https://github.com/CeresDB/influxql.git", rev = "a905863", package = "influxdb_influxql_parser" } +influxql-query = { git = "https://github.com/CeresDB/influxql.git", rev = "a905863", package = "iox_query" } +influxql-schema = { git = "https://github.com/CeresDB/influxql.git", rev = "a905863", package = "schema" } interpreters = { path = "interpreters" } itertools = "0.10.5" lz4_flex = { version = "0.11", default-features = false, features = ["frame"] } diff --git a/analytic_engine/src/instance/mod.rs b/analytic_engine/src/instance/mod.rs index 6eb257b83e..7525f64190 100644 --- a/analytic_engine/src/instance/mod.rs +++ b/analytic_engine/src/instance/mod.rs @@ -38,7 +38,7 @@ use generic_error::{BoxError, GenericError}; use logger::{error, info}; use macros::define_result; use mem_collector::MemUsageCollector; -use runtime::Runtime; +use runtime::{PriorityRuntime, Runtime}; use snafu::{ResultExt, Snafu}; use table_engine::{engine::EngineRuntimes, predicate::PredicateRef, table::FlushRequest}; use time_ext::ReadableDuration; @@ -291,7 +291,7 @@ impl Instance { } #[inline] - fn 
read_runtime(&self) -> &Arc { + fn read_runtime(&self) -> &PriorityRuntime { &self.runtimes.read_runtime } diff --git a/analytic_engine/src/instance/read.rs b/analytic_engine/src/instance/read.rs index 9624f4cfbb..341af2fcff 100644 --- a/analytic_engine/src/instance/read.rs +++ b/analytic_engine/src/instance/read.rs @@ -122,6 +122,10 @@ impl Instance { None, )); + let runtime = self + .read_runtime() + .choose_runtime(&request.priority) + .clone(); let sst_read_options_builder = SstReadOptionsBuilder::new( ScanType::Query, self.scan_options.clone(), @@ -129,7 +133,7 @@ impl Instance { table_options.num_rows_per_row_group, request.predicate.clone(), self.meta_cache.clone(), - self.read_runtime().clone(), + runtime, ); if need_merge_sort { diff --git a/analytic_engine/src/table/mod.rs b/analytic_engine/src/table/mod.rs index 90097101bc..f6bc9cf4e5 100644 --- a/analytic_engine/src/table/mod.rs +++ b/analytic_engine/src/table/mod.rs @@ -541,6 +541,8 @@ impl Table for TableImpl { projected_schema: request.projected_schema, predicate, metrics_collector: MetricsCollector::new(GET_METRICS_COLLECTOR_NAME.to_string()), + // TODO: pass priority from request + priority: Default::default(), }; let mut batch_stream = self .read(read_request) diff --git a/analytic_engine/src/tests/table.rs b/analytic_engine/src/tests/table.rs index 8524f9debe..1704a051d7 100644 --- a/analytic_engine/src/tests/table.rs +++ b/analytic_engine/src/tests/table.rs @@ -192,6 +192,7 @@ pub fn new_read_all_request_with_order(schema: Schema, opts: ReadOptions) -> Rea projected_schema: ProjectedSchema::no_projection(schema), predicate: Arc::new(Predicate::empty()), metrics_collector: MetricsCollector::default(), + priority: Default::default(), } } diff --git a/analytic_engine/src/tests/util.rs b/analytic_engine/src/tests/util.rs index 0d9613b7a8..c7acad40a7 100644 --- a/analytic_engine/src/tests/util.rs +++ b/analytic_engine/src/tests/util.rs @@ -26,6 +26,7 @@ use common_types::{ use futures::stream::StreamExt; use logger::info; use object_store::config::{LocalOptions, ObjectStoreOptions, StorageOptions}; +use runtime::PriorityRuntime; use size_ext::ReadableSize; use table_engine::{ engine::{ @@ -124,7 +125,7 @@ impl TestContext { .open_wals( &self.config.wal, WalRuntimes { - read_runtime: self.runtimes.read_runtime.clone(), + read_runtime: self.runtimes.read_runtime.high().clone(), write_runtime: self.runtimes.write_runtime.clone(), default_runtime: self.runtimes.default_runtime.clone(), }, @@ -528,7 +529,7 @@ impl Builder { _dir: dir, config, runtimes: Arc::new(EngineRuntimes { - read_runtime: runtime.clone(), + read_runtime: PriorityRuntime::new(runtime.clone(), runtime.clone()), write_runtime: runtime.clone(), meta_runtime: runtime.clone(), compact_runtime: runtime.clone(), diff --git a/components/logger/Cargo.toml b/components/logger/Cargo.toml index 77db8f5948..7ae97c1e19 100644 --- a/components/logger/Cargo.toml +++ b/components/logger/Cargo.toml @@ -35,6 +35,7 @@ workspace = true [dependencies] chrono = { workspace = true } log = "0.4" +runtime = { workspace = true } serde = { workspace = true } slog = { workspace = true } slog-async = "2.6" diff --git a/components/logger/src/lib.rs b/components/logger/src/lib.rs index 1dc78b0cb8..5d25e230d3 100644 --- a/components/logger/src/lib.rs +++ b/components/logger/src/lib.rs @@ -28,6 +28,7 @@ pub use log::{ debug as log_debug, error as log_error, info as log_info, max_level, trace as log_trace, warn as log_warn, SetLoggerError, }; +use runtime::Priority; use serde::{Deserialize, 
Serialize}; pub use slog::Level; use slog::{slog_o, Drain, Key, OwnedKVList, Record, KV}; @@ -471,6 +472,7 @@ pub struct SlowTimer<'a> { sql: &'a str, slow_threshold: Duration, start_time: Instant, + priority: Option, } impl<'a> Drop for SlowTimer<'a> { @@ -478,9 +480,10 @@ impl<'a> Drop for SlowTimer<'a> { let cost = self.elapsed(); if cost > self.slow_threshold { slow_query!( - "Normal query elapsed:{:?}, id:{}, query:{}", + "Normal query elapsed:{:?}, id:{}, priority:{:?}, query:{}", cost, self.request_id, + self.priority, self.sql, ); } @@ -494,6 +497,7 @@ impl<'a> SlowTimer<'a> { sql, slow_threshold: threshold, start_time: Instant::now(), + priority: None, } } @@ -504,6 +508,10 @@ impl<'a> SlowTimer<'a> { pub fn start_time(&self) -> Instant { self.start_time } + + pub fn priority(&mut self, priority: Priority) { + self.priority = Some(priority); + } } #[macro_export(local_inner_macros)] diff --git a/components/runtime/src/lib.rs b/components/runtime/src/lib.rs index 54f6e7be73..b6453819d5 100644 --- a/components/runtime/src/lib.rs +++ b/components/runtime/src/lib.rs @@ -30,8 +30,10 @@ use tokio::{ }; mod metrics; +mod priority_runtime; + +pub use priority_runtime::{Priority, PriorityRuntime}; -// TODO(yingwen): Use opaque error type #[derive(Debug, Snafu)] #[snafu(visibility(pub))] pub enum Error { diff --git a/components/runtime/src/priority_runtime.rs b/components/runtime/src/priority_runtime.rs new file mode 100644 index 0000000000..1f69bd8a1c --- /dev/null +++ b/components/runtime/src/priority_runtime.rs @@ -0,0 +1,98 @@ +// Copyright 2023 The HoraeDB Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::future::Future; + +use crate::{JoinHandle, RuntimeRef}; + +// TODO: maybe we could move this to common_types crate. +#[derive(Copy, Clone, Debug, Default)] +#[repr(u8)] +pub enum Priority { + #[default] + High = 0, + Low = 1, +} + +impl Priority { + pub fn as_u8(&self) -> u8 { + *self as u8 + } + + pub fn as_str(&self) -> &str { + match self { + Self::High => "high", + Self::Low => "low", + } + } +} + +impl TryFrom for Priority { + type Error = String; + + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(Priority::High), + 1 => Ok(Priority::Low), + _ => Err(format!("Unknown priority, value:{value}")), + } + } +} + +#[derive(Clone, Debug)] +pub struct PriorityRuntime { + low: RuntimeRef, + high: RuntimeRef, +} + +impl PriorityRuntime { + pub fn new(low: RuntimeRef, high: RuntimeRef) -> Self { + Self { low, high } + } + + pub fn low(&self) -> &RuntimeRef { + &self.low + } + + pub fn high(&self) -> &RuntimeRef { + &self.high + } + + pub fn choose_runtime(&self, priority: &Priority) -> &RuntimeRef { + match priority { + Priority::Low => &self.low, + Priority::High => &self.high, + } + } + + // By default we spawn the future to the higher priority runtime. 
+ pub fn spawn(&self, future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + self.high.spawn(future) + } + + pub fn spawn_with_priority(&self, future: F, priority: Priority) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + match priority { + Priority::Low => self.low.spawn(future), + Priority::High => self.high.spawn(future), + } + } +} diff --git a/df_engine_extensions/Cargo.toml b/df_engine_extensions/Cargo.toml index 54926369eb..4de30d40b9 100644 --- a/df_engine_extensions/Cargo.toml +++ b/df_engine_extensions/Cargo.toml @@ -41,6 +41,7 @@ generic_error = { workspace = true } lazy_static = { workspace = true } prometheus = { workspace = true } prost = { workspace = true } +runtime = { workspace = true } snafu = { workspace = true } table_engine = { workspace = true } trace_metric = { workspace = true } diff --git a/df_engine_extensions/src/dist_sql_query/mod.rs b/df_engine_extensions/src/dist_sql_query/mod.rs index 4bbf6b36ef..abfc0cca1d 100644 --- a/df_engine_extensions/src/dist_sql_query/mod.rs +++ b/df_engine_extensions/src/dist_sql_query/mod.rs @@ -26,6 +26,7 @@ use datafusion::{ }; use futures::future::BoxFuture; use generic_error::BoxError; +use runtime::Priority; use table_engine::{predicate::PredicateRef, remote::model::TableIdentifier, table::TableRef}; pub mod codec; @@ -56,6 +57,7 @@ pub trait ExecutableScanBuilder: fmt::Debug + Send + Sync + 'static { &self, table: TableRef, ctx: TableScanContext, + priority: Priority, ) -> DfResult>; } @@ -91,22 +93,6 @@ pub struct TableScanContext { pub predicate: PredicateRef, } -impl TableScanContext { - pub fn new( - batch_size: usize, - read_parallelism: usize, - projected_schema: ProjectedSchema, - predicate: PredicateRef, - ) -> Self { - Self { - batch_size, - read_parallelism, - projected_schema, - predicate, - } - } -} - impl TryFrom for ceresdbproto::remote_engine::TableScanContext { type Error = datafusion::error::DataFusionError; diff --git a/df_engine_extensions/src/dist_sql_query/physical_plan.rs b/df_engine_extensions/src/dist_sql_query/physical_plan.rs index 0dbaf415ac..87cd18bdcd 100644 --- a/df_engine_extensions/src/dist_sql_query/physical_plan.rs +++ b/df_engine_extensions/src/dist_sql_query/physical_plan.rs @@ -43,6 +43,7 @@ use datafusion::{ }, }; use futures::{future::BoxFuture, FutureExt, Stream, StreamExt}; +use runtime::Priority; use table_engine::{remote::model::TableIdentifier, table::ReadRequest}; use trace_metric::{collector::FormatCollectorVisitor, MetricsCollector, TraceMetricWhenDrop}; @@ -57,6 +58,7 @@ pub struct UnresolvedPartitionedScan { pub sub_tables: Vec, pub table_scan_ctx: TableScanContext, pub metrics_collector: MetricsCollector, + pub priority: Priority, } impl UnresolvedPartitionedScan { @@ -77,6 +79,7 @@ impl UnresolvedPartitionedScan { sub_tables, table_scan_ctx, metrics_collector, + priority: read_request.priority, } } } diff --git a/df_engine_extensions/src/dist_sql_query/resolver.rs b/df_engine_extensions/src/dist_sql_query/resolver.rs index c48bfe5351..8f1d4e1c7c 100644 --- a/df_engine_extensions/src/dist_sql_query/resolver.rs +++ b/df_engine_extensions/src/dist_sql_query/resolver.rs @@ -20,6 +20,7 @@ use datafusion::{ error::{DataFusionError, Result as DfResult}, physical_plan::{analyze::AnalyzeExec, ExecutionPlan}, }; +use runtime::Priority; use table_engine::{remote::model::TableIdentifier, table::TableRef}; use crate::{ @@ -45,6 +46,7 @@ pub struct Resolver { remote_executor: RemotePhysicalPlanExecutorRef, 
catalog_manager: CatalogManagerRef, scan_builder: ExecutableScanBuilderRef, + priority: Priority, } impl Resolver { @@ -52,11 +54,13 @@ impl Resolver { remote_executor: RemotePhysicalPlanExecutorRef, catalog_manager: CatalogManagerRef, scan_builder: ExecutableScanBuilderRef, + priority: Priority, ) -> Self { Self { remote_executor, catalog_manager, scan_builder, + priority, } } @@ -214,7 +218,10 @@ impl Resolver { }; if let Some((table, table_scan_ctx)) = build_scan_opt { - return self.scan_builder.build(table, table_scan_ctx).await; + return self + .scan_builder + .build(table, table_scan_ctx, self.priority) + .await; } let children = plan.children().clone(); diff --git a/df_engine_extensions/src/dist_sql_query/test_util.rs b/df_engine_extensions/src/dist_sql_query/test_util.rs index d70f9ec258..9006a52695 100644 --- a/df_engine_extensions/src/dist_sql_query/test_util.rs +++ b/df_engine_extensions/src/dist_sql_query/test_util.rs @@ -44,6 +44,7 @@ use datafusion::{ scalar::ScalarValue, }; use futures::{future::BoxFuture, Stream}; +use runtime::Priority; use table_engine::{ memory::MemoryTable, predicate::PredicateBuilder, @@ -204,6 +205,7 @@ impl TestContext { projected_schema, predicate, metrics_collector: MetricsCollector::default(), + priority: Default::default(), }; // Build the test catalog @@ -238,6 +240,7 @@ impl TestContext { Arc::new(MockRemotePhysicalPlanExecutor), self.catalog_manager.clone(), Box::new(MockScanBuilder), + Priority::High, ) } @@ -422,6 +425,7 @@ impl ExecutableScanBuilder for MockScanBuilder { &self, _table: TableRef, ctx: TableScanContext, + priority: Priority, ) -> datafusion::error::Result> { let request = ReadRequest { request_id: RequestId::from("test"), @@ -433,6 +437,7 @@ impl ExecutableScanBuilder for MockScanBuilder { projected_schema: ctx.projected_schema.clone(), predicate: ctx.predicate.clone(), metrics_collector: MetricsCollector::default(), + priority, }; Ok(Arc::new(MockScan { request })) diff --git a/docs/example-cluster-1.toml b/docs/example-cluster-1.toml index b08d3dc7e8..8c221dc877 100644 --- a/docs/example-cluster-1.toml +++ b/docs/example-cluster-1.toml @@ -16,7 +16,7 @@ addr = "127.0.0.1" [logger] -level = "info" +level = "debug" [server] bind_addr = "0.0.0.0" diff --git a/integration_tests/cases/common/dml/issue-1087.result b/integration_tests/cases/common/dml/issue-1087.result index 54265dae88..ad8a0cc774 100644 --- a/integration_tests/cases/common/dml/issue-1087.result +++ b/integration_tests/cases/common/dml/issue-1087.result @@ -76,7 +76,7 @@ String("logical_plan after push_down_limit"),String("SAME TEXT AS ABOVE"), String("logical_plan after influx_regex_to_datafusion_regex"),String("SAME TEXT AS ABOVE"), String("logical_plan after handle_gap_fill"),String("SAME TEXT AS ABOVE"), String("logical_plan"),String("TableScan: issue_1087 projection=[tsid, t, name, value]"), -String("initial_physical_plan"),String("ScanTable: table=issue_1087, parallelism=8\n"), +String("initial_physical_plan"),String("ScanTable: table=issue_1087, parallelism=8, priority=Low\n"), String("physical_plan after aggregate_statistics"),String("SAME TEXT AS ABOVE"), String("physical_plan after join_selection"),String("SAME TEXT AS ABOVE"), String("physical_plan after PipelineFixer"),String("SAME TEXT AS ABOVE"), @@ -86,7 +86,7 @@ String("physical_plan after CombinePartialFinalAggregate"),String("SAME TEXT AS String("physical_plan after EnforceSorting"),String("SAME TEXT AS ABOVE"), String("physical_plan after coalesce_batches"),String("SAME TEXT AS ABOVE"), 
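
The `PriorityRuntime` added in `components/runtime/src/priority_runtime.rs` above is constructed the same way in the test helpers later in this patch; a minimal, self-contained sketch of that wiring (sharing one runtime for both tiers is only a test shortcut, and the `run` driver is illustrative, not part of the patch):

```rust
use std::sync::Arc;

use runtime::{Builder, Priority, PriorityRuntime};

fn build_priority_runtime() -> PriorityRuntime {
    // Same shortcut as analytic_engine/src/tests/util.rs: reuse one runtime
    // for both tiers; a real deployment would pass two separate runtimes.
    let rt = Arc::new(Builder::default().build().unwrap());
    PriorityRuntime::new(rt.clone(), rt)
}

async fn run() {
    let runtime = build_priority_runtime();
    // Expensive work goes to the low-priority tier; plain `spawn` defaults
    // to the high-priority tier.
    let handle = runtime.spawn_with_priority(async { 1 + 1 }, Priority::Low);
    let _sum = handle.await; // the runtime crate's JoinHandle yields a Result
}
```
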
String("physical_plan after PipelineChecker"),String("SAME TEXT AS ABOVE"), -String("physical_plan"),String("ScanTable: table=issue_1087, parallelism=8\n"), +String("physical_plan"),String("ScanTable: table=issue_1087, parallelism=8, priority=Low\n"), DROP TABLE `issue_1087`; diff --git a/integration_tests/cases/common/dml/issue-341.result b/integration_tests/cases/common/dml/issue-341.result index f9405db366..902222590b 100644 --- a/integration_tests/cases/common/dml/issue-341.result +++ b/integration_tests/cases/common/dml/issue-341.result @@ -58,7 +58,7 @@ WHERE plan_type,plan, String("logical_plan"),String("TableScan: issue341_t1 projection=[timestamp, value], full_filters=[issue341_t1.value = Int32(3)]"), -String("physical_plan"),String("ScanTable: table=issue341_t1, parallelism=8\n"), +String("physical_plan"),String("ScanTable: table=issue341_t1, parallelism=8, priority=Low\n"), -- FilterExec node should not be in plan. @@ -72,7 +72,7 @@ WHERE plan_type,plan, String("logical_plan"),String("Projection: issue341_t1.timestamp, issue341_t1.value\n TableScan: issue341_t1 projection=[timestamp, value, tag1], full_filters=[issue341_t1.tag1 = Utf8(\"t3\")]"), -String("physical_plan"),String("ProjectionExec: expr=[timestamp@0 as timestamp, value@1 as value]\n ScanTable: table=issue341_t1, parallelism=8\n"), +String("physical_plan"),String("ProjectionExec: expr=[timestamp@0 as timestamp, value@1 as value]\n ScanTable: table=issue341_t1, parallelism=8, priority=Low\n"), -- Repeat operations above, but with overwrite table @@ -116,7 +116,7 @@ WHERE plan_type,plan, String("logical_plan"),String("Filter: issue341_t2.value = Float64(3)\n TableScan: issue341_t2 projection=[timestamp, value], partial_filters=[issue341_t2.value = Float64(3)]"), -String("physical_plan"),String("CoalesceBatchesExec: target_batch_size=8192\n FilterExec: value@1 = 3\n ScanTable: table=issue341_t2, parallelism=8\n"), +String("physical_plan"),String("CoalesceBatchesExec: target_batch_size=8192\n FilterExec: value@1 = 3\n ScanTable: table=issue341_t2, parallelism=8, priority=Low\n"), -- When using tag as filter, FilterExec node should not be in plan. 
@@ -130,7 +130,7 @@ WHERE plan_type,plan, String("logical_plan"),String("Projection: issue341_t2.timestamp, issue341_t2.value\n TableScan: issue341_t2 projection=[timestamp, value, tag1], full_filters=[issue341_t2.tag1 = Utf8(\"t3\")]"), -String("physical_plan"),String("ProjectionExec: expr=[timestamp@0 as timestamp, value@1 as value]\n ScanTable: table=issue341_t2, parallelism=8\n"), +String("physical_plan"),String("ProjectionExec: expr=[timestamp@0 as timestamp, value@1 as value]\n ScanTable: table=issue341_t2, parallelism=8, priority=Low\n"), DROP TABLE IF EXISTS `issue341_t1`; diff --git a/integration_tests/cases/common/dml/issue-59.result b/integration_tests/cases/common/dml/issue-59.result index d2bdb35f99..549c7019cd 100644 --- a/integration_tests/cases/common/dml/issue-59.result +++ b/integration_tests/cases/common/dml/issue-59.result @@ -25,7 +25,7 @@ GROUP BY id+1; plan_type,plan, String("logical_plan"),String("Projection: group_alias_0 AS issue59.id + Int64(1), COUNT(alias1) AS COUNT(DISTINCT issue59.account)\n Aggregate: groupBy=[[group_alias_0]], aggr=[[COUNT(alias1)]]\n Projection: group_alias_0, alias1\n Aggregate: groupBy=[[CAST(issue59.id AS Int64) + Int64(1) AS group_alias_0, issue59.account AS alias1]], aggr=[[]]\n TableScan: issue59 projection=[id, account]"), -String("physical_plan"),String("ProjectionExec: expr=[group_alias_0@0 as issue59.id + Int64(1), COUNT(alias1)@1 as COUNT(DISTINCT issue59.account)]\n AggregateExec: mode=FinalPartitioned, gby=[group_alias_0@0 as group_alias_0], aggr=[COUNT(alias1)]\n CoalesceBatchesExec: target_batch_size=8192\n RepartitionExec: partitioning=Hash([group_alias_0@0], 8), input_partitions=8\n AggregateExec: mode=Partial, gby=[group_alias_0@0 as group_alias_0], aggr=[COUNT(alias1)]\n ProjectionExec: expr=[group_alias_0@0 as group_alias_0, alias1@1 as alias1]\n AggregateExec: mode=FinalPartitioned, gby=[group_alias_0@0 as group_alias_0, alias1@1 as alias1], aggr=[]\n CoalesceBatchesExec: target_batch_size=8192\n RepartitionExec: partitioning=Hash([group_alias_0@0, alias1@1], 8), input_partitions=8\n AggregateExec: mode=Partial, gby=[CAST(id@0 AS Int64) + 1 as group_alias_0, account@1 as alias1], aggr=[]\n ScanTable: table=issue59, parallelism=8\n"), +String("physical_plan"),String("ProjectionExec: expr=[group_alias_0@0 as issue59.id + Int64(1), COUNT(alias1)@1 as COUNT(DISTINCT issue59.account)]\n AggregateExec: mode=FinalPartitioned, gby=[group_alias_0@0 as group_alias_0], aggr=[COUNT(alias1)]\n CoalesceBatchesExec: target_batch_size=8192\n RepartitionExec: partitioning=Hash([group_alias_0@0], 8), input_partitions=8\n AggregateExec: mode=Partial, gby=[group_alias_0@0 as group_alias_0], aggr=[COUNT(alias1)]\n ProjectionExec: expr=[group_alias_0@0 as group_alias_0, alias1@1 as alias1]\n AggregateExec: mode=FinalPartitioned, gby=[group_alias_0@0 as group_alias_0, alias1@1 as alias1], aggr=[]\n CoalesceBatchesExec: target_batch_size=8192\n RepartitionExec: partitioning=Hash([group_alias_0@0, alias1@1], 8), input_partitions=8\n AggregateExec: mode=Partial, gby=[CAST(id@0 AS Int64) + 1 as group_alias_0, account@1 as alias1], aggr=[]\n ScanTable: table=issue59, parallelism=8, priority=Low\n"), DROP TABLE IF EXISTS issue59; diff --git a/integration_tests/cases/common/explain/explain.result b/integration_tests/cases/common/explain/explain.result index ba651ecacb..0cd06380d5 100644 --- a/integration_tests/cases/common/explain/explain.result +++ b/integration_tests/cases/common/explain/explain.result @@ -10,7 +10,7 @@ EXPLAIN SELECT t FROM 
`04_explain_t`; plan_type,plan, String("logical_plan"),String("TableScan: 04_explain_t projection=[t]"), -String("physical_plan"),String("ScanTable: table=04_explain_t, parallelism=8\n"), +String("physical_plan"),String("ScanTable: table=04_explain_t, parallelism=8, priority=Low\n"), DROP TABLE `04_explain_t`; diff --git a/integration_tests/cases/common/optimizer/optimizer.result b/integration_tests/cases/common/optimizer/optimizer.result index 8551f96c52..f9cfac2de9 100644 --- a/integration_tests/cases/common/optimizer/optimizer.result +++ b/integration_tests/cases/common/optimizer/optimizer.result @@ -10,7 +10,7 @@ EXPLAIN SELECT max(value) AS c1, avg(value) AS c2 FROM `07_optimizer_t` GROUP BY plan_type,plan, String("logical_plan"),String("Projection: MAX(07_optimizer_t.value) AS c1, AVG(07_optimizer_t.value) AS c2\n Aggregate: groupBy=[[07_optimizer_t.name]], aggr=[[MAX(07_optimizer_t.value), AVG(07_optimizer_t.value)]]\n TableScan: 07_optimizer_t projection=[name, value]"), -String("physical_plan"),String("ProjectionExec: expr=[MAX(07_optimizer_t.value)@1 as c1, AVG(07_optimizer_t.value)@2 as c2]\n AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[MAX(07_optimizer_t.value), AVG(07_optimizer_t.value)]\n CoalesceBatchesExec: target_batch_size=8192\n RepartitionExec: partitioning=Hash([name@0], 8), input_partitions=8\n AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[MAX(07_optimizer_t.value), AVG(07_optimizer_t.value)]\n ScanTable: table=07_optimizer_t, parallelism=8\n"), +String("physical_plan"),String("ProjectionExec: expr=[MAX(07_optimizer_t.value)@1 as c1, AVG(07_optimizer_t.value)@2 as c2]\n AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[MAX(07_optimizer_t.value), AVG(07_optimizer_t.value)]\n CoalesceBatchesExec: target_batch_size=8192\n RepartitionExec: partitioning=Hash([name@0], 8), input_partitions=8\n AggregateExec: mode=Partial, gby=[name@0 as name], aggr=[MAX(07_optimizer_t.value), AVG(07_optimizer_t.value)]\n ScanTable: table=07_optimizer_t, parallelism=8, priority=Low\n"), DROP TABLE `07_optimizer_t`; diff --git a/integration_tests/cases/env/cluster/ddl/partition_table.result b/integration_tests/cases/env/cluster/ddl/partition_table.result index 18e023c006..e8feacbcbb 100644 --- a/integration_tests/cases/env/cluster/ddl/partition_table.result +++ b/integration_tests/cases/env/cluster/ddl/partition_table.result @@ -83,7 +83,7 @@ UInt64(4860320137932382618),Timestamp(1651737067000),String("ceresdb9"),Int32(0) EXPLAIN ANALYZE SELECT * from partition_table_t where name = "ceresdb0"; plan_type,plan, -String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:1, metrics=[\npartition_table_t:\n __partition_table_t_1:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_1:\nCoalescePartitionsExec, metrics=[output_rows=1, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_1, parallelism=8, metrics=[\nPredicate { exprs:[name = Utf8(\"ceresdb0\")], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1:\n=0]\n=0]\n"), +String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:1, 
metrics=[\npartition_table_t:\n __partition_table_t_1:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_1:\nCoalescePartitionsExec, metrics=[output_rows=1, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_1, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name = Utf8(\"ceresdb0\")], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n=0]\n"), -- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx @@ -92,7 +92,7 @@ String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:f EXPLAIN ANALYZE SELECT * from partition_table_t where name in ("ceresdb0", "ceresdb1", "ceresdb2", "ceresdb3", "ceresdb4"); plan_type,plan, -String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:3, metrics=[\npartition_table_t:\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=1, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=2\n scan_memtable_1:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=2\n scan_memtable_1:\n=0]\n=0]\n"), +String("Plan 
with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:3, metrics=[\npartition_table_t:\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=1, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=2\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=2\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n=0]\n"), ALTER TABLE partition_table_t ADD COLUMN (b string); diff --git a/integration_tests/cases/env/local/ddl/query-plan.result b/integration_tests/cases/env/local/ddl/query-plan.result index 26dcf9098e..be858c1540 100644 --- a/integration_tests/cases/env/local/ddl/query-plan.result +++ b/integration_tests/cases/env/local/ddl/query-plan.result @@ -31,7 +31,16 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348001000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=1\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n 
times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1, fetched_columns:[tsid,t]:\n=0]\n"), +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=1\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1, fetched_columns:[tsid,t]:\n=0]\n"), + + +-- This query should have higher priority +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +explain analyze select t from `03_dml_select_real_time_range` +where t >= 1695348001000 and t < 1695348002000; + +plan_type,plan, +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, priority=High, metrics=[\nPredicate { exprs:[t >= TimestampMillisecond(1695348001000, None), t < TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001000), exclusive_end: Timestamp(1695348002000) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=1\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1, fetched_columns:[tsid,t]:\n=0]\n"), -- This query should not include memtable @@ -40,7 +49,7 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348002000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348002001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=0\n=0]\n"), +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348002001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=0\n=0]\n"), -- SQLNESS ARG pre_cmd=flush @@ -51,7 +60,7 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348001000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=1\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_sst_1, fetched_columns:[tsid,t]:\n meta_data_cache_hit=false\n parallelism=1\n project_record_batch=xxs\n read_meta_data_duration=xxs\n row_mem=320\n row_num=3\n prune_row_groups:\n pruned_by_custom_filter=0\n pruned_by_min_max=0\n row_groups_after_prune=1\n total_row_groups=1\n use_custom_filter=false\n=0]\n"), +String("Plan with Metrics"),String("ScanTable: 
table=03_dml_select_real_time_range, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348001000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348001001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=1\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_sst_1, fetched_columns:[tsid,t]:\n meta_data_cache_hit=false\n parallelism=1\n project_record_batch=xxs\n read_meta_data_duration=xxs\n row_mem=320\n row_num=3\n prune_row_groups:\n pruned_by_custom_filter=0\n pruned_by_min_max=0\n row_groups_after_prune=1\n total_row_groups=1\n use_custom_filter=false\n=0]\n"), -- This query should not include SST @@ -59,7 +68,7 @@ explain analyze select t from `03_dml_select_real_time_range` where t > 1695348002000; plan_type,plan, -String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348002001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=0\n=0]\n"), +String("Plan with Metrics"),String("ScanTable: table=03_dml_select_real_time_range, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[t > TimestampMillisecond(1695348002000, None)], time_range:TimeRange { inclusive_start: Timestamp(1695348002001), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=0\n=0]\n"), -- Table with an 'append' update mode @@ -92,7 +101,7 @@ explain analyze select t from `03_append_mode_table` where t >= 1695348001000 and name = 'ceresdb'; plan_type,plan, -String("Plan with Metrics"),String("ProjectionExec: expr=[t@0 as t], metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=03_append_mode_table, parallelism=8, metrics=[\nPredicate { exprs:[t >= TimestampMillisecond(1695348001000, None), name = Utf8(\"ceresdb\")], time_range:TimeRange { inclusive_start: Timestamp(1695348001000), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=false\n chain_iter_0:\n num_memtables=1\n num_ssts=0\n scan_duration=xxs\n since_create=xxs\n since_init=xxs\n total_batch_fetched=1\n total_rows_fetched=2\n scan_memtable_1, fetched_columns:[t,name]:\n=0]\n"), +String("Plan with Metrics"),String("ProjectionExec: expr=[t@0 as t], metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=03_append_mode_table, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[t >= TimestampMillisecond(1695348001000, None), name = Utf8(\"ceresdb\")], time_range:TimeRange { inclusive_start: Timestamp(1695348001000), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=false\n chain_iter_0:\n num_memtables=1\n num_ssts=0\n scan_duration=xxs\n since_create=xxs\n since_init=xxs\n total_batch_fetched=1\n total_rows_fetched=2\n scan_memtable_1, fetched_columns:[t,name]:\n=0]\n"), -- Should just fetch projected columns from SST @@ -106,7 +115,7 @@ explain analyze select t from `03_append_mode_table` where t >= 1695348001000 and name = 'ceresdb'; plan_type,plan, -String("Plan with Metrics"),String("ProjectionExec: expr=[t@0 as t], metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=03_append_mode_table, parallelism=8, metrics=[\nPredicate { exprs:[t >= 
TimestampMillisecond(1695348001000, None), name = Utf8(\"ceresdb\")], time_range:TimeRange { inclusive_start: Timestamp(1695348001000), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=false\n chain_iter_0:\n num_memtables=0\n num_ssts=1\n scan_duration=xxs\n since_create=xxs\n since_init=xxs\n total_batch_fetched=1\n total_rows_fetched=2\n scan_sst_1, fetched_columns:[t,name]:\n meta_data_cache_hit=false\n parallelism=1\n project_record_batch=xxs\n read_meta_data_duration=xxs\n row_mem=408\n row_num=3\n prune_row_groups:\n pruned_by_custom_filter=0\n pruned_by_min_max=0\n row_groups_after_prune=1\n total_row_groups=1\n use_custom_filter=false\n=0]\n"), +String("Plan with Metrics"),String("ProjectionExec: expr=[t@0 as t], metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=03_append_mode_table, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[t >= TimestampMillisecond(1695348001000, None), name = Utf8(\"ceresdb\")], time_range:TimeRange { inclusive_start: Timestamp(1695348001000), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=false\n chain_iter_0:\n num_memtables=0\n num_ssts=1\n scan_duration=xxs\n since_create=xxs\n since_init=xxs\n total_batch_fetched=1\n total_rows_fetched=2\n scan_sst_1, fetched_columns:[t,name]:\n meta_data_cache_hit=false\n parallelism=1\n project_record_batch=xxs\n read_meta_data_duration=xxs\n row_mem=408\n row_num=3\n prune_row_groups:\n pruned_by_custom_filter=0\n pruned_by_min_max=0\n row_groups_after_prune=1\n total_row_groups=1\n use_custom_filter=false\n=0]\n"), DROP TABLE `03_dml_select_real_time_range`; diff --git a/integration_tests/cases/env/local/ddl/query-plan.sql b/integration_tests/cases/env/local/ddl/query-plan.sql index a0baff5b81..d1dbdaf963 100644 --- a/integration_tests/cases/env/local/ddl/query-plan.sql +++ b/integration_tests/cases/env/local/ddl/query-plan.sql @@ -21,6 +21,11 @@ INSERT INTO `03_dml_select_real_time_range` (t, name, value) explain analyze select t from `03_dml_select_real_time_range` where t > 1695348001000; +-- This query should have higher priority +-- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx +explain analyze select t from `03_dml_select_real_time_range` +where t >= 1695348001000 and t < 1695348002000; + -- This query should not include memtable -- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx explain analyze select t from `03_dml_select_real_time_range` diff --git a/interpreters/Cargo.toml b/interpreters/Cargo.toml index 94b9f4ea41..7687b97fe5 100644 --- a/interpreters/Cargo.toml +++ b/interpreters/Cargo.toml @@ -40,12 +40,15 @@ df_operator = { workspace = true } futures = { workspace = true } generic_error = { workspace = true } hash_ext = { workspace = true } +lazy_static = { workspace = true } logger = { workspace = true } macros = { workspace = true } meta_client = { workspace = true } +prometheus = { workspace = true } query_engine = { workspace = true } query_frontend = { workspace = true } regex = { workspace = true } +runtime = { workspace = true } snafu = { workspace = true } table_engine = { workspace = true } diff --git a/interpreters/src/context.rs b/interpreters/src/context.rs index 34e76d44c9..ab72ca3887 100644 --- a/interpreters/src/context.rs +++ b/interpreters/src/context.rs @@ -19,6 +19,7 @@ use std::{sync::Arc, time::Instant}; use common_types::request_id::RequestId; use macros::define_result; use query_engine::context::{Context as QueryContext, ContextRef as QueryContextRef}; +use runtime::Priority; 
use snafu::Snafu; #[derive(Debug, Snafu)] @@ -36,6 +37,9 @@ pub struct Context { default_catalog: String, default_schema: String, enable_partition_table_access: bool, + /// If time range exceeds this threshold, the query will be marked as + /// expensive + expensive_query_threshold: u64, } impl Context { @@ -46,16 +50,18 @@ impl Context { default_catalog: String::new(), default_schema: String::new(), enable_partition_table_access: false, + expensive_query_threshold: 24 * 3600 * 1000, // default 24 hours } } /// Create a new context of query executor - pub fn new_query_context(&self) -> Result { + pub fn new_query_context(&self, priority: Priority) -> Result { let ctx = QueryContext { request_id: self.request_id.clone(), deadline: self.deadline, default_catalog: self.default_catalog.clone(), default_schema: self.default_schema.clone(), + priority, }; Ok(Arc::new(ctx)) } @@ -79,6 +85,11 @@ impl Context { pub fn enable_partition_table_access(&self) -> bool { self.enable_partition_table_access } + + #[inline] + pub fn expensive_query_threshold(&self) -> u64 { + self.expensive_query_threshold + } } #[must_use] @@ -88,6 +99,7 @@ pub struct Builder { default_catalog: String, default_schema: String, enable_partition_table_access: bool, + expensive_query_threshold: u64, } impl Builder { @@ -102,6 +114,11 @@ impl Builder { self } + pub fn expensive_query_threshold(mut self, threshold: u64) -> Self { + self.expensive_query_threshold = threshold; + self + } + pub fn build(self) -> Context { Context { request_id: self.request_id, @@ -109,6 +126,7 @@ impl Builder { default_catalog: self.default_catalog, default_schema: self.default_schema, enable_partition_table_access: self.enable_partition_table_access, + expensive_query_threshold: self.expensive_query_threshold, } } } diff --git a/interpreters/src/factory.rs b/interpreters/src/factory.rs index 6216b6c92f..7217b5d15b 100644 --- a/interpreters/src/factory.rs +++ b/interpreters/src/factory.rs @@ -17,6 +17,7 @@ use catalog::manager::ManagerRef; use query_engine::{executor::ExecutorRef, physical_planner::PhysicalPlannerRef}; use query_frontend::plan::Plan; +use runtime::PriorityRuntime; use table_engine::engine::TableEngineRef; use crate::{ @@ -37,6 +38,7 @@ use crate::{ /// A factory to create interpreters pub struct Factory { query_executor: ExecutorRef, + query_runtime: PriorityRuntime, physical_planner: PhysicalPlannerRef, catalog_manager: ManagerRef, table_engine: TableEngineRef, @@ -50,9 +52,11 @@ impl Factory { catalog_manager: ManagerRef, table_engine: TableEngineRef, table_manipulator: TableManipulatorRef, + query_runtime: PriorityRuntime, ) -> Self { Self { query_executor, + query_runtime, physical_planner, catalog_manager, table_engine, @@ -68,9 +72,13 @@ impl Factory { validator.validate(&plan)?; let interpreter = match plan { - Plan::Query(p) => { - SelectInterpreter::create(ctx, p, self.query_executor, self.physical_planner) - } + Plan::Query(p) => SelectInterpreter::create( + ctx, + p, + self.query_executor, + self.physical_planner, + self.query_runtime, + ), Plan::Insert(p) => InsertInterpreter::create(ctx, p), Plan::Create(p) => { CreateInterpreter::create(ctx, p, self.table_engine, self.table_manipulator) diff --git a/interpreters/src/lib.rs b/interpreters/src/lib.rs index b7b8ce35bc..5304ab7dbd 100644 --- a/interpreters/src/lib.rs +++ b/interpreters/src/lib.rs @@ -29,13 +29,13 @@ pub mod exists; pub mod factory; pub mod insert; pub mod interpreter; +mod metrics; pub mod select; pub mod show; +mod show_create; pub mod table_manipulator; 
pub mod validator; -mod show_create; - #[cfg(test)] mod tests; diff --git a/query_engine/src/datafusion_impl/logical_optimizer/mod.rs b/interpreters/src/metrics.rs similarity index 66% rename from query_engine/src/datafusion_impl/logical_optimizer/mod.rs rename to interpreters/src/metrics.rs index a3b5130f3b..5f9b3028fe 100644 --- a/query_engine/src/datafusion_impl/logical_optimizer/mod.rs +++ b/interpreters/src/metrics.rs @@ -12,8 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Logical optimizer +use lazy_static::lazy_static; +use prometheus::{register_int_counter_vec, IntCounterVec}; -#[cfg(test)] -pub mod tests; -pub mod type_conversion; +lazy_static! { + pub static ref ENGINE_QUERY_COUNTER: IntCounterVec = register_int_counter_vec!( + "engine_query_counter", + "engine_query_counter", + &["priority"] + ) + .unwrap(); +} diff --git a/interpreters/src/select.rs b/interpreters/src/select.rs index eb701663b8..c33c7fdf8a 100644 --- a/interpreters/src/select.rs +++ b/interpreters/src/select.rs @@ -19,15 +19,19 @@ use futures::TryStreamExt; use generic_error::{BoxError, GenericError}; use logger::debug; use macros::define_result; -use query_engine::{executor::ExecutorRef, physical_planner::PhysicalPlannerRef}; -use query_frontend::plan::QueryPlan; +use query_engine::{ + context::ContextRef as QueryContextRef, + executor::ExecutorRef, + physical_planner::{PhysicalPlanRef, PhysicalPlannerRef}, +}; +use query_frontend::plan::{PriorityContext, QueryPlan}; +use runtime::{Priority, PriorityRuntime}; use snafu::{ResultExt, Snafu}; -use table_engine::stream::SendableRecordBatchStream; use crate::{ context::Context, interpreter::{Interpreter, InterpreterPtr, Output, Result as InterpreterResult, Select}, - RecordBatchVec, + metrics::ENGINE_QUERY_COUNTER, }; #[derive(Debug, Snafu)] @@ -37,6 +41,9 @@ pub enum Error { #[snafu(display("Failed to execute physical plan, msg:{}, err:{}", msg, source))] ExecutePlan { msg: String, source: GenericError }, + + #[snafu(display("Failed to spawn task, err:{}", source))] + Spawn { source: runtime::Error }, } define_result!(Error); @@ -47,6 +54,7 @@ pub struct SelectInterpreter { plan: QueryPlan, executor: ExecutorRef, physical_planner: PhysicalPlannerRef, + query_runtime: PriorityRuntime, } impl SelectInterpreter { @@ -55,12 +63,14 @@ impl SelectInterpreter { plan: QueryPlan, executor: ExecutorRef, physical_planner: PhysicalPlannerRef, + query_runtime: PriorityRuntime, ) -> InterpreterPtr { Box::new(Self { ctx, plan, executor, physical_planner, + query_runtime, }) } } @@ -69,21 +79,37 @@ impl SelectInterpreter { impl Interpreter for SelectInterpreter { async fn execute(self: Box) -> InterpreterResult { let request_id = self.ctx.request_id(); - debug!( - "Interpreter execute select begin, request_id:{}, plan:{:?}", - request_id, self.plan - ); + let plan = self.plan; + let priority = match plan.decide_query_priority(PriorityContext { + time_range_threshold: self.ctx.expensive_query_threshold(), + }) { + Some(v) => v, + None => { + debug!( + "Query has invalid query range, return empty result directly, id:{request_id}, plan:{plan:?}" + ); + return Ok(Output::Records(Vec::new())); + } + }; + + ENGINE_QUERY_COUNTER + .with_label_values(&[priority.as_str()]) + .inc(); let query_ctx = self .ctx - .new_query_context() + .new_query_context(priority) .context(CreateQueryContext) .context(Select)?; + debug!( + "Interpreter execute select begin, request_id:{request_id}, plan:{plan:?}, priority:{priority:?}" 
+ ); + // Create physical plan. let physical_plan = self .physical_planner - .plan(&query_ctx, self.plan) + .plan(&query_ctx, plan) .await .box_err() .context(ExecutePlan { @@ -91,34 +117,50 @@ impl Interpreter for SelectInterpreter { }) .context(Select)?; - let record_batch_stream = self - .executor - .execute(&query_ctx, physical_plan) + if matches!(priority, Priority::Low) { + let executor = self.executor; + return self + .query_runtime + .spawn_with_priority( + async move { + execute_and_collect(query_ctx, executor, physical_plan) + .await + .context(Select) + }, + Priority::Low, + ) + .await + .context(Spawn) + .context(Select)?; + } + + execute_and_collect(query_ctx, self.executor, physical_plan) .await - .box_err() - .context(ExecutePlan { - msg: "failed to execute physical plan", - }) - .context(Select)?; - - debug!( - "Interpreter execute select finish, request_id:{}", - request_id - ); - - let record_batches = collect(record_batch_stream).await?; - - Ok(Output::Records(record_batches)) + .context(Select) } } -async fn collect(stream: SendableRecordBatchStream) -> InterpreterResult { - stream - .try_collect() +async fn execute_and_collect( + query_ctx: QueryContextRef, + executor: ExecutorRef, + physical_plan: PhysicalPlanRef, +) -> Result { + let record_batch_stream = executor + .execute(&query_ctx, physical_plan) .await .box_err() .context(ExecutePlan { - msg: "failed to collect execution results", - }) - .context(Select) + msg: "failed to execute physical plan", + })?; + + let record_batches = + record_batch_stream + .try_collect() + .await + .box_err() + .context(ExecutePlan { + msg: "failed to collect execution results", + })?; + + Ok(Output::Records(record_batches)) } diff --git a/interpreters/src/tests.rs b/interpreters/src/tests.rs index ed5081d4fb..1b11f5c9bc 100644 --- a/interpreters/src/tests.rs +++ b/interpreters/src/tests.rs @@ -29,6 +29,7 @@ use query_frontend::{ config::DynamicConfig, parser::Parser, plan::Plan, planner::Planner, provider::MetaProvider, tests::MockMetaProvider, }; +use runtime::{Builder, PriorityRuntime}; use table_engine::{engine::TableEngineRef, memory::MockRemoteEngine}; use crate::{ @@ -62,6 +63,7 @@ where pub catalog_manager: ManagerRef, pub table_manipulator: TableManipulatorRef, pub query_engine: QueryEngineRef, + pub read_runtime: PriorityRuntime, } impl Env @@ -84,6 +86,7 @@ where self.catalog_manager.clone(), self.engine(), self.table_manipulator.clone(), + self.read_runtime.clone(), ) } @@ -236,6 +239,7 @@ where catalog_manager.clone(), self.engine(), table_manipulator.clone(), + self.read_runtime.clone(), ); let insert_sql = "INSERT INTO test_missing_columns_table(key1, key2, field4) VALUES('tagk', 1638428434000, 1), ('tagk2', 1638428434000, 10);"; @@ -256,6 +260,7 @@ where catalog_manager, self.engine(), table_manipulator, + self.read_runtime.clone(), ); let ctx = Context::builder(RequestId::next_id(), None) .default_catalog_and_schema(DEFAULT_CATALOG.to_string(), DEFAULT_SCHEMA.to_string()) @@ -356,14 +361,21 @@ where } } -#[tokio::test] -async fn test_interpreters_rocks() { - test_util::init_log_for_test(); - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_interpreters(rocksdb_ctx).await; +#[test] +fn test_interpreters_rocks() { + let rt = Arc::new(Builder::default().build().unwrap()); + let read_runtime = PriorityRuntime::new(rt.clone(), rt.clone()); + rt.block_on(async { + test_util::init_log_for_test(); + let rocksdb_ctx = RocksDBEngineBuildContext::default(); + test_interpreters(rocksdb_ctx, read_runtime).await; + 
}) } -async fn test_interpreters(engine_context: T) { +async fn test_interpreters( + engine_context: T, + read_runtime: PriorityRuntime, +) { let env = TestEnv::builder().build(); let mut test_ctx = env.new_context(engine_context); test_ctx.open().await; @@ -391,6 +403,7 @@ async fn test_interpreters(engine_context: T) { catalog_manager, table_manipulator, query_engine, + read_runtime, }; env.test_create_table().await; diff --git a/proxy/src/instance.rs b/proxy/src/instance.rs index ec0d2e4890..00087d40d1 100644 --- a/proxy/src/instance.rs +++ b/proxy/src/instance.rs @@ -21,6 +21,7 @@ use df_operator::registry::FunctionRegistryRef; use interpreters::table_manipulator::TableManipulatorRef; use query_engine::QueryEngineRef; use query_frontend::config::DynamicConfig as FrontendDynamicConfig; +use runtime::PriorityRuntime; use table_engine::{engine::TableEngineRef, remote::RemoteEngineRef}; use crate::limiter::Limiter; @@ -29,6 +30,7 @@ use crate::limiter::Limiter; pub struct Instance { pub catalog_manager: ManagerRef, pub query_engine: QueryEngineRef, + pub query_runtime: PriorityRuntime, pub table_engine: TableEngineRef, pub partition_table_engine: TableEngineRef, // User defined functions registry. diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 74d268138b..93b53590fb 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -39,7 +39,6 @@ mod write; pub const FORWARDED_FROM: &str = "forwarded-from"; use std::{ - ops::Bound, sync::Arc, time::{Duration, Instant}, }; @@ -55,14 +54,9 @@ use ceresdbproto::storage::{ storage_service_client::StorageServiceClient, PrometheusRemoteQueryRequest, PrometheusRemoteQueryResponse, Route, }; -use common_types::{request_id::RequestId, table::DEFAULT_SHARD_ID, ENABLE_TTL, TTL}; -use datafusion::{ - prelude::{Column, Expr}, - scalar::ScalarValue, -}; +use common_types::{request_id::RequestId, table::DEFAULT_SHARD_ID}; use futures::FutureExt; use generic_error::BoxError; -use influxql_query::logical_optimizer::range_predicate::find_time_range; use interpreters::{ context::Context as InterpreterContext, factory::Factory, @@ -80,7 +74,6 @@ use table_engine::{ table::{TableId, TableRef}, PARTITION_TABLE_ENGINE_TYPE, }; -use time_ext::{current_time_millis, parse_duration}; use tonic::{transport::Channel, IntoRequest}; use crate::{ @@ -92,9 +85,6 @@ use crate::{ schema_config_provider::SchemaConfigProviderRef, }; -// Because the clock may have errors, choose 1 hour as the error buffer -const QUERY_EXPIRED_BUFFER: Duration = Duration::from_secs(60 * 60); - #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(default)] pub struct SubTableAccessPerm { @@ -123,6 +113,7 @@ pub struct Proxy { cluster_with_meta: bool, sub_table_access_perm: SubTableAccessPerm, request_notifiers: Option, + expensive_query_threshold: u64, } impl Proxy { @@ -140,6 +131,7 @@ impl Proxy { cluster_with_meta: bool, sub_table_access_perm: SubTableAccessPerm, request_notifiers: Option, + expensive_query_threshold: u64, ) -> Self { let forwarder = Arc::new(Forwarder::new( forward_config, @@ -159,6 +151,7 @@ impl Proxy { cluster_with_meta, sub_table_access_perm, request_notifiers, + expensive_query_threshold, } } @@ -211,70 +204,6 @@ impl Proxy { }) } - /// Returns true when query range maybe exceeding ttl, - /// Note: False positive is possible - // TODO(tanruixiang): Add integration testing when supported by the testing - // framework - fn is_plan_expired( - &self, - plan: &Plan, - catalog_name: &str, - schema_name: &str, - table_name: &str, - ) -> Result { - if let Plan::Query(query) = 
&plan { - let catalog = self.get_catalog(catalog_name)?; - let schema = self.get_schema(&catalog, schema_name)?; - let table_ref = match self.get_table(&schema, table_name) { - Ok(Some(v)) => v, - _ => return Ok(false), - }; - if let Some(value) = table_ref.options().get(ENABLE_TTL) { - if value == "false" { - return Ok(false); - } - } - let ttl_duration = if let Some(ttl) = table_ref.options().get(TTL) { - if let Ok(ttl) = parse_duration(ttl) { - ttl - } else { - return Ok(false); - } - } else { - return Ok(false); - }; - - let timestamp_name = &table_ref - .schema() - .column(table_ref.schema().timestamp_index()) - .name - .clone(); - let ts_col = Column::from_name(timestamp_name); - let range = find_time_range(&query.df_plan, &ts_col) - .box_err() - .context(Internal { - msg: "Failed to find time range", - })?; - match range.end { - Bound::Included(x) | Bound::Excluded(x) => { - if let Expr::Literal(ScalarValue::Int64(Some(x))) = x { - let now = current_time_millis() as i64; - let deadline = now - - ttl_duration.as_millis() as i64 - - QUERY_EXPIRED_BUFFER.as_millis() as i64; - - if x * 1_000 <= deadline { - return Ok(true); - } - } - } - Bound::Unbounded => (), - } - } - - Ok(false) - } - fn get_catalog(&self, catalog_name: &str) -> Result { let catalog = self .instance @@ -554,6 +483,7 @@ impl Proxy { // Use current ctx's catalog and schema as default catalog and schema .default_catalog_and_schema(catalog.to_string(), schema.to_string()) .enable_partition_table_access(enable_partition_table_access) + .expensive_query_threshold(self.expensive_query_threshold) .build(); let interpreter_factory = Factory::new( self.instance.query_engine.executor(), @@ -561,6 +491,7 @@ impl Proxy { self.instance.catalog_manager.clone(), self.instance.table_engine.clone(), self.instance.table_manipulator.clone(), + self.instance.query_runtime.clone(), ); interpreter_factory .create(interpreter_ctx, plan) diff --git a/proxy/src/read.rs b/proxy/src/read.rs index 9d93221cae..effb88c086 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -28,6 +28,7 @@ use notifier::notifier::{ExecutionGuard, RequestNotifiers, RequestResult}; use query_frontend::{ frontend, frontend::{Context as SqlContext, Frontend}, + plan::{Plan, PriorityContext}, provider::CatalogMetaProvider, }; use router::endpoint::Endpoint; @@ -176,7 +177,7 @@ impl Proxy { .slow_threshold .load(std::sync::atomic::Ordering::Relaxed); let slow_threshold = Duration::from_secs(slow_threshold_secs); - let slow_timer = SlowTimer::new(request_id.as_str(), sql, slow_threshold); + let mut slow_timer = SlowTimer::new(request_id.as_str(), sql, slow_threshold); let deadline = ctx.timeout.map(|t| slow_timer.start_time() + t); let catalog = self.instance.catalog_manager.default_catalog_name(); @@ -243,13 +244,11 @@ impl Proxy { })?; } - let mut plan_maybe_expired = false; - if let Some(table_name) = &table_name { - match self.is_plan_expired(&plan, catalog, schema, table_name) { - Ok(v) => plan_maybe_expired = v, - Err(err) => { - warn!("Plan expire check failed, err:{err}"); - } + if let Plan::Query(plan) = &plan { + if let Some(priority) = plan.decide_query_priority(PriorityContext { + time_range_threshold: self.expensive_query_threshold, + }) { + slow_timer.priority(priority); } } @@ -276,27 +275,7 @@ impl Proxy { "Handle sql query finished, sql:{sql}, elapsed:{cost:?}, catalog:{catalog}, schema:{schema}, ctx:{ctx:?}", ); - match &output { - Output::AffectedRows(_) => Ok(output), - Output::Records(v) => { - if plan_maybe_expired { - let num_rows = v - 
.iter() - .fold(0_usize, |acc, record_batch| acc + record_batch.num_rows()); - if num_rows == 0 { - warn!("Query time range maybe exceed TTL, sql:{sql}"); - - // TODO: Cannot return this error directly, empty query - // should return 200, not 4xx/5xx - // All protocols should recognize this error. - // return Err(Error::QueryMaybeExceedTTL { - // msg: format!("Query time range maybe exceed TTL, - // sql:{sql}"), }); - } - } - Ok(output) - } - } + Ok(output) } async fn maybe_forward_sql_query( diff --git a/query_engine/Cargo.toml b/query_engine/Cargo.toml index df2e3240ca..4d9388fa45 100644 --- a/query_engine/Cargo.toml +++ b/query_engine/Cargo.toml @@ -46,6 +46,7 @@ logger = { workspace = true } macros = { workspace = true } prost = { workspace = true } query_frontend = { workspace = true } +runtime = { workspace = true } serde = { workspace = true } snafu = { workspace = true } table_engine = { workspace = true } diff --git a/query_engine/src/config.rs b/query_engine/src/config.rs index 1e6350bac7..61462f3c76 100644 --- a/query_engine/src/config.rs +++ b/query_engine/src/config.rs @@ -13,6 +13,7 @@ // limitations under the License. use serde::{Deserialize, Serialize}; +use time_ext::ReadableDuration; // FIXME: Use cpu number as the default parallelism const DEFAULT_READ_PARALLELISM: usize = 8; @@ -21,12 +22,14 @@ const DEFAULT_READ_PARALLELISM: usize = 8; #[serde(default)] pub struct Config { pub read_parallelism: usize, + pub expensive_query_threshold: ReadableDuration, } impl Default for Config { fn default() -> Self { Self { read_parallelism: DEFAULT_READ_PARALLELISM, + expensive_query_threshold: ReadableDuration::hours(24), } } } diff --git a/query_engine/src/context.rs b/query_engine/src/context.rs index 2bf6aebf9b..55493aeca8 100644 --- a/query_engine/src/context.rs +++ b/query_engine/src/context.rs @@ -17,6 +17,7 @@ use std::{sync::Arc, time::Instant}; use common_types::request_id::RequestId; +use runtime::Priority; pub type ContextRef = Arc; @@ -27,4 +28,5 @@ pub struct Context { pub deadline: Option, pub default_catalog: String, pub default_schema: String, + pub priority: Priority, } diff --git a/query_engine/src/datafusion_impl/mod.rs b/query_engine/src/datafusion_impl/mod.rs index ed8d963ffe..dfab1fd0b2 100644 --- a/query_engine/src/datafusion_impl/mod.rs +++ b/query_engine/src/datafusion_impl/mod.rs @@ -12,17 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
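
The new `expensive_query_threshold` option (24h by default) and the `priority` field on the query `Context` drive the `priority=High`/`priority=Low` labels seen in the explain plans above. The actual decision is made by `QueryPlan::decide_query_priority` in `query_frontend`, which is not shown in this hunk; the sketch below only restates the rule the integration tests exercise, with an illustrative helper name:

```rust
use runtime::Priority;

/// Illustrative restatement: a query whose scanned time range is bounded and
/// no wider than `threshold_ms` (the `expensive_query_threshold`, 24h =
/// 24 * 3600 * 1000 ms by default) is treated as cheap; open-ended or wider
/// ranges fall back to the low-priority tier.
fn classify_by_time_range(range_width_ms: Option<u64>, threshold_ms: u64) -> Priority {
    match range_width_ms {
        Some(width) if width <= threshold_ms => Priority::High,
        _ => Priority::Low,
    }
}
```
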
-use std::{fmt, sync::Arc, time::Instant}; +use std::{sync::Arc, time::Instant}; use catalog::manager::ManagerRef as CatalogManager; use datafusion::{ execution::{ - context::{QueryPlanner, SessionState}, + context::SessionState, runtime_env::{RuntimeConfig, RuntimeEnv}, FunctionRegistry, }, - optimizer::analyzer::Analyzer, - physical_optimizer::PhysicalOptimizerRule, prelude::{SessionConfig, SessionContext}, }; use df_engine_extensions::codec::PhysicalExtensionCodecImpl; @@ -31,8 +29,7 @@ use table_engine::{provider::CeresdbOptions, remote::RemoteEngineRef}; use crate::{ context::Context, datafusion_impl::{ - executor::DatafusionExecutorImpl, logical_optimizer::type_conversion::TypeConversion, - physical_planner::DatafusionPhysicalPlannerImpl, + executor::DatafusionExecutorImpl, physical_planner::DatafusionPhysicalPlannerImpl, physical_planner_extension::QueryPlannerAdapter, task_context::Preprocessor, }, executor::ExecutorRef, @@ -41,7 +38,6 @@ use crate::{ }; pub mod executor; -pub mod logical_optimizer; pub mod physical_optimizer; pub mod physical_plan; pub mod physical_plan_extension; @@ -67,15 +63,12 @@ impl DatafusionQueryEngineImpl { ) -> Result { let runtime_env = Arc::new(RuntimeEnv::new(runtime_config).unwrap()); let df_physical_planner = Arc::new(QueryPlannerAdapter); - let df_ctx_builder = Arc::new(DfContextBuilder::new( - config, - runtime_env.clone(), + let df_ctx_builder = Arc::new(DfContextBuilder::new(config, runtime_env.clone())); + let physical_planner = Arc::new(DatafusionPhysicalPlannerImpl::new( + df_ctx_builder.clone(), df_physical_planner, )); - // Physical planner - let physical_planner = Arc::new(DatafusionPhysicalPlannerImpl::new(df_ctx_builder.clone())); - // Executor let extension_codec = Arc::new(PhysicalExtensionCodecImpl::new()); let preprocessor = Arc::new(Preprocessor::new( @@ -105,33 +98,17 @@ impl QueryEngine for DatafusionQueryEngineImpl { } /// Datafusion context builder -#[derive(Clone)] +#[derive(Debug, Clone)] pub struct DfContextBuilder { config: Config, runtime_env: Arc, - physical_planner: Arc, -} - -impl fmt::Debug for DfContextBuilder { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("DfContextBuilder") - .field("config", &self.config) - .field("runtime_env", &self.runtime_env) - .field("physical_planner", &"QueryPlannerAdapter") - .finish() - } } impl DfContextBuilder { - pub fn new( - config: Config, - runtime_env: Arc, - physical_planner: Arc, - ) -> Self { + pub fn new(config: Config, runtime_env: Arc) -> Self { Self { config, runtime_env, - physical_planner, } } @@ -144,6 +121,7 @@ impl DfContextBuilder { request_timeout: timeout, default_catalog: ctx.default_catalog.clone(), default_schema: ctx.default_schema.clone(), + priority: ctx.priority, }; let mut df_session_config = SessionConfig::new() .with_default_catalog_and_schema( @@ -159,40 +137,7 @@ impl DfContextBuilder { // Using default logcial optimizer, if want to add more custom rule, using // `add_optimizer_rule` to add. - let state = SessionState::with_config_rt(df_session_config, self.runtime_env.clone()) - .with_query_planner(self.physical_planner.clone()); - - // Register analyzer rules - let state = Self::register_analyzer_rules(state); - - // Register iox optimizers, used by influxql. 
- let state = influxql_query::logical_optimizer::register_iox_logical_optimizers(state); - + let state = SessionState::with_config_rt(df_session_config, self.runtime_env.clone()); SessionContext::with_state(state) } - - // TODO: this is not used now, bug of RepartitionAdapter is already fixed in - // datafusion itself. Remove this code in future. - #[allow(dead_code)] - fn apply_adapters_for_physical_optimize_rules( - default_rules: &[Arc], - ) -> Vec> { - let mut new_rules = Vec::with_capacity(default_rules.len()); - for rule in default_rules { - new_rules.push(physical_optimizer::may_adapt_optimize_rule(rule.clone())) - } - - new_rules - } - - fn register_analyzer_rules(mut state: SessionState) -> SessionState { - // Our analyzer has high priority, so first add we custom rules, then add the - // default ones. - state = state.with_analyzer_rules(vec![Arc::new(TypeConversion)]); - for rule in Analyzer::new().rules { - state = state.add_analyzer_rule(rule); - } - - state - } } diff --git a/query_engine/src/datafusion_impl/physical_planner.rs b/query_engine/src/datafusion_impl/physical_planner.rs index 5828733909..8ee4198fd8 100644 --- a/query_engine/src/datafusion_impl/physical_planner.rs +++ b/query_engine/src/datafusion_impl/physical_planner.rs @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; +use std::{fmt, sync::Arc}; use async_trait::async_trait; +use datafusion::execution::context::QueryPlanner; use generic_error::BoxError; -use query_frontend::{plan::QueryPlan, provider::CatalogProviderAdapter}; +use query_frontend::plan::QueryPlan; use snafu::ResultExt; use crate::{ @@ -30,14 +31,29 @@ use crate::{ }; /// Physical planner based on datafusion -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct DatafusionPhysicalPlannerImpl { df_ctx_builder: Arc, + physical_planner: Arc, +} + +impl fmt::Debug for DatafusionPhysicalPlannerImpl { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("DfContextBuilder") + .field("df_ctx_builder", &self.df_ctx_builder) + .finish() + } } impl DatafusionPhysicalPlannerImpl { - pub fn new(df_ctx_builder: Arc) -> Self { - Self { df_ctx_builder } + pub fn new( + df_ctx_builder: Arc, + physical_planner: Arc, + ) -> Self { + Self { + df_ctx_builder, + physical_planner, + } } fn has_partitioned_table(logical_plan: &QueryPlan) -> bool { @@ -59,21 +75,16 @@ impl DatafusionPhysicalPlannerImpl { impl PhysicalPlanner for DatafusionPhysicalPlannerImpl { // TODO: we should modify `QueryPlan` to support create remote plan here. async fn plan(&self, ctx: &Context, logical_plan: QueryPlan) -> Result { - // Register catalogs to datafusion execution context. - let catalogs = CatalogProviderAdapter::new_adapters(logical_plan.tables.clone()); // TODO: maybe we should not build `SessionContext` in each physical plan's // building. We need to do so because we place some dynamic // information(such as `timeout`) in `SessionConfig`, maybe it is better // to remove it to `TaskContext`. let df_ctx = self.df_ctx_builder.build(ctx); - for (name, catalog) in catalogs { - df_ctx.register_catalog(&name, Arc::new(catalog)); - } + let state = df_ctx.state(); - // Generate physical plan. 
- let exec_plan = df_ctx - .state() - .create_physical_plan(&logical_plan.df_plan) + let exec_plan = self + .physical_planner + .create_physical_plan(&logical_plan.df_plan, &state) .await .box_err() .context(PhysicalPlannerWithCause { msg: None })?; diff --git a/query_engine/src/datafusion_impl/task_context.rs b/query_engine/src/datafusion_impl/task_context.rs index f19c5dde35..8aefd563c1 100644 --- a/query_engine/src/datafusion_impl/task_context.rs +++ b/query_engine/src/datafusion_impl/task_context.rs @@ -38,6 +38,7 @@ use df_engine_extensions::dist_sql_query::{ use futures::future::BoxFuture; use generic_error::BoxError; use prost::Message; +use runtime::Priority; use snafu::ResultExt; use table_engine::{ provider::{CeresdbOptions, ScanTable, SCAN_TABLE_METRICS_COLLECTOR_NAME}, @@ -191,6 +192,7 @@ impl RemotePhysicalPlanExecutor for RemotePhysicalPlanExecutorImpl { .map(|n| Instant::now() + Duration::from_millis(n)); let default_catalog = ceresdb_options.default_catalog.clone(); let default_schema = ceresdb_options.default_schema.clone(); + let priority = ceresdb_options.priority; let display_plan = DisplayableExecutionPlan::new(plan.as_ref()); let exec_ctx = ExecContext { @@ -199,6 +201,7 @@ impl RemotePhysicalPlanExecutor for RemotePhysicalPlanExecutorImpl { default_catalog, default_schema, query: display_plan.indent(true).to_string(), + priority, }; // Encode plan and schema @@ -261,6 +264,7 @@ impl DistQueryResolverBuilder { self.remote_executor.clone(), self.catalog_manager.clone(), scan_builder, + ctx.priority, ) } } @@ -278,6 +282,7 @@ impl ExecutableScanBuilder for ExecutableScanBuilderImpl { &self, table: TableRef, ctx: TableScanContext, + priority: Priority, ) -> DfResult> { let read_opts = ReadOptions { batch_size: ctx.batch_size, @@ -291,6 +296,7 @@ impl ExecutableScanBuilder for ExecutableScanBuilderImpl { projected_schema: ctx.projected_schema, predicate: ctx.predicate, metrics_collector: MetricsCollector::new(SCAN_TABLE_METRICS_COLLECTOR_NAME.to_string()), + priority, }; let mut scan = ScanTable::new(table, read_request); diff --git a/query_frontend/Cargo.toml b/query_frontend/Cargo.toml index 998beb2b9f..51705ab26f 100644 --- a/query_frontend/Cargo.toml +++ b/query_frontend/Cargo.toml @@ -36,6 +36,7 @@ arrow = { workspace = true } async-trait = { workspace = true } catalog = { workspace = true } ceresdbproto = { workspace = true } +chrono = { workspace = true } cluster = { workspace = true } codec = { workspace = true } common_types = { workspace = true } @@ -46,6 +47,7 @@ generic_error = { workspace = true } hash_ext = { workspace = true } influxql-logical-planner = { workspace = true } influxql-parser = { workspace = true } +influxql-query = { workspace = true } influxql-schema = { workspace = true } itertools = { workspace = true } lazy_static = { workspace = true } @@ -56,6 +58,7 @@ paste = { workspace = true } prom-remote-api = { workspace = true } regex = { workspace = true } regex-syntax = "0.6.28" +runtime = { workspace = true } snafu = { workspace = true } sqlparser = { workspace = true } table_engine = { workspace = true } diff --git a/query_frontend/src/frontend.rs b/query_frontend/src/frontend.rs index 7208cfbedf..aff6eabb08 100644 --- a/query_frontend/src/frontend.rs +++ b/query_frontend/src/frontend.rs @@ -212,50 +212,53 @@ impl Frontend

{ } } -pub fn parse_table_name(statements: &StatementVec) -> Option { - // maybe have empty sql - if statements.is_empty() { - return None; - } - match &statements[0] { - Statement::Standard(s) => match *s.clone() { - SqlStatement::Insert { table_name, .. } => { - Some(TableName::from(table_name).to_string()) - } - SqlStatement::Explain { statement, .. } => { - if let SqlStatement::Query(q) = *statement { - match *q.body { - SetExpr::Select(select) => { - if select.from.len() != 1 { - None - } else if let TableFactor::Table { name, .. } = &select.from[0].relation - { - Some(TableName::from(name.clone()).to_string()) - } else { - None - } +pub fn parse_table_name_with_standard(sql_statement: &SqlStatement) -> Option { + match sql_statement.clone() { + SqlStatement::Insert { table_name, .. } => { + Some(TableName::from(table_name.clone()).to_string()) + } + SqlStatement::Explain { statement, .. } => { + if let SqlStatement::Query(q) = *statement { + match *q.body { + SetExpr::Select(select) => { + if select.from.len() != 1 { + None + } else if let TableFactor::Table { name, .. } = &select.from[0].relation { + Some(TableName::from(name.clone()).to_string()) + } else { + None } - // TODO: return unsupported error rather than none. - _ => None, } + // TODO: return unsupported error rather than none. + _ => None, + } + } else { + None + } + } + SqlStatement::Query(q) => match *q.body { + SetExpr::Select(select) => { + if select.from.len() != 1 { + None + } else if let TableFactor::Table { name, .. } = &select.from[0].relation { + Some(TableName::from(name.clone()).to_string()) } else { None } } - SqlStatement::Query(q) => match *q.body { - SetExpr::Select(select) => { - if select.from.len() != 1 { - None - } else if let TableFactor::Table { name, .. } = &select.from[0].relation { - Some(TableName::from(name.clone()).to_string()) - } else { - None - } - } - _ => None, - }, _ => None, }, + _ => None, + } +} + +pub fn parse_table_name(statements: &StatementVec) -> Option { + // maybe have empty sql + if statements.is_empty() { + return None; + } + match &statements[0] { + Statement::Standard(s) => parse_table_name_with_standard(s), Statement::Create(s) => Some(s.table_name.to_string()), Statement::Drop(s) => Some(s.table_name.to_string()), Statement::Describe(s) => Some(s.table_name.to_string()), diff --git a/query_frontend/src/influxql/planner.rs b/query_frontend/src/influxql/planner.rs index dc3b3693e5..960da30776 100644 --- a/query_frontend/src/influxql/planner.rs +++ b/query_frontend/src/influxql/planner.rs @@ -37,6 +37,7 @@ use table_engine::table::TableRef; use crate::{ influxql::error::*, + logical_optimizer::optimize_plan, plan::{Plan, QueryPlan, QueryType, ShowPlan, ShowTablesPlan}, provider::{ContextProviderAdapter, MetaProvider}, }; @@ -171,6 +172,12 @@ impl<'a, P: MetaProvider> Planner<'a, P> { .context(BuildPlanWithCause { msg: "planner stmt to plan", })?; + let df_plan = optimize_plan(&df_plan) + .box_err() + .context(BuildPlanWithCause { + msg: "optimize plan", + })?; + let tables = Arc::new( self.schema_provider .context_provider @@ -180,7 +187,11 @@ impl<'a, P: MetaProvider> Planner<'a, P> { msg: "get tables from context_provider", })?, ); - Ok(Plan::Query(QueryPlan { df_plan, tables })) + Ok(Plan::Query(QueryPlan { + df_plan, + tables, + table_name: None, + })) } } } diff --git a/query_frontend/src/lib.rs b/query_frontend/src/lib.rs index ee96af794a..c8a2617c22 100644 --- a/query_frontend/src/lib.rs +++ b/query_frontend/src/lib.rs @@ -23,6 +23,7 @@ pub mod config; pub mod 
container; pub mod frontend; pub mod influxql; +mod logical_optimizer; pub mod parser; mod partition; pub mod plan; diff --git a/query_frontend/src/logical_optimizer/mod.rs b/query_frontend/src/logical_optimizer/mod.rs new file mode 100644 index 0000000000..9fcd46b331 --- /dev/null +++ b/query_frontend/src/logical_optimizer/mod.rs @@ -0,0 +1,49 @@ +// Copyright 2023 The HoraeDB Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Logical optimizer + +mod type_conversion; +use std::sync::Arc; + +use datafusion::{ + error::Result, + execution::{context::SessionState, runtime_env::RuntimeEnv}, + logical_expr::LogicalPlan, + optimizer::analyzer::Analyzer, + prelude::SessionConfig, +}; +use type_conversion::TypeConversion; + +pub fn optimize_plan(plan: &LogicalPlan) -> Result { + let state = SessionState::with_config_rt(SessionConfig::new(), Arc::new(RuntimeEnv::default())); + let state = register_analyzer_rules(state); + // Register iox optimizers, used by influxql. + let state = influxql_query::logical_optimizer::register_iox_logical_optimizers(state); + + let plan = state.optimize(plan)?; + + Ok(plan) +} + +fn register_analyzer_rules(mut state: SessionState) -> SessionState { + // Our analyzer has high priority, so first add we custom rules, then add the + // default ones. 
+ state = state.with_analyzer_rules(vec![Arc::new(crate::logical_optimizer::TypeConversion)]); + for rule in Analyzer::new().rules { + state = state.add_analyzer_rule(rule); + } + + state +} diff --git a/query_engine/src/datafusion_impl/logical_optimizer/type_conversion.rs b/query_frontend/src/logical_optimizer/type_conversion.rs similarity index 98% rename from query_engine/src/datafusion_impl/logical_optimizer/type_conversion.rs rename to query_frontend/src/logical_optimizer/type_conversion.rs index 29c1e95684..f2b0992aa3 100644 --- a/query_engine/src/datafusion_impl/logical_optimizer/type_conversion.rs +++ b/query_frontend/src/logical_optimizer/type_conversion.rs @@ -372,7 +372,6 @@ mod tests { }; use super::*; - use crate::datafusion_impl::logical_optimizer::type_conversion; fn expr_test_schema() -> DFSchemaRef { Arc::new( @@ -591,7 +590,7 @@ mod tests { "2021-09-07 16:00:00Z", ]; for string in date_string { - let result = type_conversion::string_to_timestamp_ms_workaround(string); + let result = string_to_timestamp_ms_workaround(string); assert!(result.is_err()); } @@ -600,7 +599,7 @@ mod tests { let t = NaiveTime::from_hms_milli_opt(16, 0, 0, 0).unwrap(); let dt = NaiveDateTime::new(d, t); let expect = naive_datetime_to_timestamp(&date_string, dt).unwrap(); - let result = type_conversion::string_to_timestamp_ms_workaround(&date_string); + let result = string_to_timestamp_ms_workaround(&date_string); if let Ok(ScalarValue::TimestampMillisecond(Some(mills), _)) = result { assert_eq!(mills, expect) } diff --git a/query_frontend/src/plan.rs b/query_frontend/src/plan.rs index f2dd864e3d..94c8bca6b3 100644 --- a/query_frontend/src/plan.rs +++ b/query_frontend/src/plan.rs @@ -18,14 +18,21 @@ use std::{ collections::{BTreeMap, HashMap}, fmt, fmt::{Debug, Formatter}, + ops::Bound, sync::Arc, }; -use common_types::{column_schema::ColumnSchema, row::RowGroup, schema::Schema}; -use datafusion::logical_expr::{ - expr::Expr as DfLogicalExpr, logical_plan::LogicalPlan as DataFusionLogicalPlan, +use common_types::{column_schema::ColumnSchema, row::RowGroup, schema::Schema, time::TimeRange}; +use datafusion::{ + logical_expr::{ + expr::Expr as DfLogicalExpr, logical_plan::LogicalPlan as DataFusionLogicalPlan, + }, + prelude::Column, + scalar::ScalarValue, }; +use logger::{debug, warn}; use macros::define_result; +use runtime::Priority; use snafu::Snafu; use table_engine::{partition::PartitionInfo, table::TableRef}; @@ -70,14 +77,132 @@ pub enum Plan { Exists(ExistsTablePlan), } +pub struct PriorityContext { + pub time_range_threshold: u64, +} + pub struct QueryPlan { pub df_plan: DataFusionLogicalPlan, + pub table_name: Option, // Contains the TableProviders so we can register the them to ExecutionContext later. // Use TableProviderAdapter here so we can get the underlying TableRef and also be // able to cast to Arc pub tables: Arc, } +impl QueryPlan { + fn find_timestamp_column(&self) -> Option { + let table_name = self.table_name.as_ref()?; + let table_ref = self.tables.get(table_name.into())?; + let schema = table_ref.table.schema(); + let timestamp_name = schema.timestamp_name(); + Some(Column::from_name(timestamp_name)) + } + + /// This function is used to extract time range from the query plan. + /// It will return max possible time range. 
For example, if the query + /// contains no timestmap filter, it will return + /// `TimeRange::min_to_max()` + /// + /// Note: When it timestamp filter evals to false(such as ts < 10 and ts > + /// 100), it will return None, which means no valid time range for this + /// query. + fn extract_time_range(&self) -> Option { + let ts_column = if let Some(v) = self.find_timestamp_column() { + v + } else { + warn!( + "Couldn't find time column, plan:{:?}, table_name:{:?}", + self.df_plan, self.table_name + ); + return Some(TimeRange::min_to_max()); + }; + let time_range = match influxql_query::logical_optimizer::range_predicate::find_time_range( + &self.df_plan, + &ts_column, + ) { + Ok(v) => v, + Err(e) => { + warn!( + "Couldn't find time range, plan:{:?}, err:{}", + self.df_plan, e + ); + return Some(TimeRange::min_to_max()); + } + }; + + debug!( + "Extract time range, value:{time_range:?}, plan:{:?}", + self.df_plan + ); + let mut start = i64::MIN; + match time_range.start { + Bound::Included(inclusive_start) => { + if let DfLogicalExpr::Literal(ScalarValue::TimestampMillisecond(Some(x), _)) = + inclusive_start + { + start = start.max(x); + } + } + Bound::Excluded(exclusive_start) => { + if let DfLogicalExpr::Literal(ScalarValue::TimestampMillisecond(Some(x), _)) = + exclusive_start + { + start = start.max(x + 1); + } + } + Bound::Unbounded => {} + } + let mut end = i64::MAX; + match time_range.end { + Bound::Included(inclusive_end) => { + if let DfLogicalExpr::Literal(ScalarValue::TimestampMillisecond(Some(x), _)) = + inclusive_end + { + end = end.min(x + 1); + } + } + Bound::Excluded(exclusive_start) => { + if let DfLogicalExpr::Literal(ScalarValue::TimestampMillisecond(Some(x), _)) = + exclusive_start + { + end = end.min(x); + } + } + Bound::Unbounded => {} + } + + TimeRange::new(start.into(), end.into()) + } + + /// Decide the query priority based on the query plan. + /// When query contains invalid time range, it will return None. + // TODO: Currently we only consider the time range, consider other factors, such + // as the number of series, or slow log metrics. + pub fn decide_query_priority(&self, ctx: PriorityContext) -> Option { + let threshold = ctx.time_range_threshold; + let time_range = self.extract_time_range()?; + let is_expensive = if let Some(v) = time_range + .exclusive_end() + .as_i64() + .checked_sub(time_range.inclusive_start().as_i64()) + { + v as u64 >= threshold + } else { + // When overflow, we treat it as expensive query. 
+ true + }; + + let priority = if is_expensive { + Priority::Low + } else { + Priority::High + }; + + Some(priority) + } +} + impl Debug for QueryPlan { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("QueryPlan") @@ -200,3 +325,81 @@ pub enum ShowPlan { pub struct ExistsTablePlan { pub exists: bool, } + +#[cfg(test)] +mod tests { + + use super::*; + use crate::planner::tests::sql_to_logical_plan; + + #[test] + fn test_extract_time_range() { + // key2 is timestamp column + let testcases = [ + ( + "select * from test_table where key2 > 1 and key2 < 10", + Some((2, 10)), + ), + ( + "select field1 from test_table where key2 > 1 and key2 < 10", + Some((2, 10)), + ), + ( + "select * from test_table where key2 >= 1 and key2 <= 10", + Some((1, 11)), + ), + ( + "select * from test_table where key2 < 1 and key2 > 10", + None, + ), + ( + "select * from test_table where key2 < 1 ", + Some((i64::MIN, 1)) + ), + ( + "select * from test_table where key2 > 1 ", + Some((2, i64::MAX)) + ), + // date literals + ( + r#"select * from test_table where key2 >= "2023-11-21 14:12:00+08:00" and key2 < "2023-11-21 14:22:00+08:00" "#, + Some((1700547120000, 1700547720000)) + ), + // no timestamp filter + ("select * from test_table", Some((i64::MIN, i64::MAX))), + // aggr + ( + "select key2, sum(field1) from test_table where key2 > 1 and key2 < 10 group by key2", + Some((2, 10)), + ), + // aggr & sort + ( + "select key2, sum(field1) from test_table where key2 > 1 and key2 < 10 group by key2 order by key2", Some((2, 10)), + ), + // explain + ( + "explain select * from test_table where key2 > 1 and key2 < 10", + Some((2, 10)), + ), + // analyze + ( + "explain analyze select * from test_table where key2 > 1 and key2 < 10", + Some((2, 10)), + ), + ]; + + for case in testcases { + let sql = case.0; + let plan = sql_to_logical_plan(sql).unwrap(); + let plan = match plan { + Plan::Query(v) => v, + _ => unreachable!(), + }; + let expected = case + .1 + .map(|v| TimeRange::new_unchecked(v.0.into(), v.1.into())); + + assert_eq!(plan.extract_time_range(), expected, "sql:{}", sql); + } + } +} diff --git a/query_frontend/src/planner.rs b/query_frontend/src/planner.rs index 7b1fa82ede..559fbcd8b7 100644 --- a/query_frontend/src/planner.rs +++ b/query_frontend/src/planner.rs @@ -67,6 +67,8 @@ use crate::{ }, config::DynamicConfig, container::TableReference, + frontend::parse_table_name_with_standard, + logical_optimizer::optimize_plan, parser, partition::PartitionParser, plan::{ @@ -613,18 +615,20 @@ impl<'a, P: MetaProvider> PlannerDelegate<'a, P> { fn sql_statement_to_datafusion_plan(self, sql_stmt: SqlStatement) -> Result { let df_planner = SqlToRel::new_with_options(&self.meta_provider, DEFAULT_PARSER_OPTS); + let table_name = parse_table_name_with_standard(&sql_stmt); let df_plan = df_planner .sql_statement_to_plan(sql_stmt) .context(DatafusionPlan)?; + let df_plan = optimize_plan(&df_plan).context(DatafusionPlan)?; debug!("Sql statement to datafusion plan, df_plan:\n{:#?}", df_plan); // Get all tables needed in the plan let tables = self.meta_provider.try_into_container().context(FindMeta)?; - Ok(Plan::Query(QueryPlan { df_plan, + table_name, tables: Arc::new(tables), })) } @@ -1389,7 +1393,7 @@ pub fn get_table_ref(table_name: &str) -> TableReference { } #[cfg(test)] -mod tests { +pub mod tests { use ceresdbproto::storage::{ value, Field, FieldGroup, Tag, Value as PbValue, WriteSeriesEntry, @@ -1416,7 +1420,7 @@ mod tests { Ok(()) } - fn sql_to_logical_plan(sql: &str) -> Result { + pub fn 
sql_to_logical_plan(sql: &str) -> Result { let dyn_config = DynamicConfig::default(); sql_to_logical_plan_with_config(sql, &dyn_config) } @@ -1644,10 +1648,9 @@ mod tests { let sql = "select * from test_table;"; quick_test( sql, - "Query( + r"Query( QueryPlan { - df_plan: Projection: test_table.key1, test_table.key2, test_table.field1, test_table.field2, test_table.field3, test_table.field4 - TableScan: test_table, + df_plan: TableScan: test_table projection=[key1, key2, field1, field2, field3, field4], }, )", ) diff --git a/query_frontend/src/promql/convert.rs b/query_frontend/src/promql/convert.rs index e8ba58518f..0109f9ac1d 100644 --- a/query_frontend/src/promql/convert.rs +++ b/query_frontend/src/promql/convert.rs @@ -154,7 +154,7 @@ impl Expr { meta_provider: ContextProviderAdapter<'_, P>, read_parallelism: usize, ) -> Result<(Plan, Arc)> { - let (logic_plan, column_name, _) = + let (logic_plan, column_name, table_name) = self.build_plan_iter(&meta_provider, INIT_LEVEL, read_parallelism)?; let tables = Arc::new( meta_provider @@ -167,6 +167,7 @@ impl Expr { Plan::Query(QueryPlan { df_plan: logic_plan, tables, + table_name: Some(table_name), }), column_name, )) diff --git a/query_frontend/src/promql/remote.rs b/query_frontend/src/promql/remote.rs index 6b8c8f8f3f..4343048ca9 100644 --- a/query_frontend/src/promql/remote.rs +++ b/query_frontend/src/promql/remote.rs @@ -27,6 +27,7 @@ use prom_remote_api::types::{label_matcher, LabelMatcher, Query}; use snafu::{OptionExt, ResultExt}; use crate::{ + logical_optimizer::optimize_plan, plan::{Plan, QueryPlan}, promql::{ convert::Selector, @@ -81,6 +82,7 @@ pub fn remote_query_to_plan( .sort(sort_exprs)? .build() .context(BuildPlanError)?; + let df_plan = optimize_plan(&df_plan).context(BuildPlanError)?; let tables = Arc::new( meta_provider @@ -90,7 +92,11 @@ pub fn remote_query_to_plan( })?, ); Ok(RemoteQueryPlan { - plan: Plan::Query(QueryPlan { df_plan, tables }), + plan: Plan::Query(QueryPlan { + df_plan, + tables, + table_name: Some(metric), + }), field_col_name: field, timestamp_col_name: timestamp_col_name.to_string(), }) @@ -185,8 +191,8 @@ mod tests { r#" Query(QueryPlan { df_plan: Sort: cpu.tsid ASC NULLS FIRST, cpu.time ASC NULLS FIRST Projection: cpu.tag1, cpu.tag2, cpu.time, cpu.tsid, cpu.value - Filter: cpu.tag1 = Utf8("some-value") AND cpu.time BETWEEN Int64(1000) AND Int64(2000) - TableScan: cpu })"# + Filter: cpu.tag1 = Utf8("some-value") AND cpu.time >= TimestampMillisecond(1000, None) AND cpu.time <= TimestampMillisecond(2000, None) + TableScan: cpu projection=[tsid, time, tag1, tag2, value], partial_filters=[cpu.tag1 = Utf8("some-value"), cpu.time >= TimestampMillisecond(1000, None), cpu.time <= TimestampMillisecond(2000, None)] })"# .to_string() ); assert_eq!(&field_col_name, "value"); @@ -217,8 +223,8 @@ Query(QueryPlan { df_plan: Sort: cpu.tsid ASC NULLS FIRST, cpu.time ASC NULLS FI r#" Query(QueryPlan { df_plan: Sort: cpu.tsid ASC NULLS FIRST, cpu.time ASC NULLS FIRST Projection: cpu.tag1, cpu.tag2, cpu.time, cpu.tsid, cpu.field2 - Filter: cpu.tag1 = Utf8("some-value") AND cpu.time BETWEEN Int64(1000) AND Int64(2000) - TableScan: cpu })"# + Filter: cpu.tag1 = Utf8("some-value") AND cpu.time >= TimestampMillisecond(1000, None) AND cpu.time <= TimestampMillisecond(2000, None) + TableScan: cpu projection=[tsid, time, tag1, tag2, field2], partial_filters=[cpu.tag1 = Utf8("some-value"), cpu.time >= TimestampMillisecond(1000, None), cpu.time <= TimestampMillisecond(2000, None)] })"# .to_string() ); 
assert_eq!(&field_col_name, "field2"); diff --git a/server/src/grpc/remote_engine_service/metrics.rs b/server/src/grpc/remote_engine_service/metrics.rs new file mode 100644 index 0000000000..c6c8124a02 --- /dev/null +++ b/server/src/grpc/remote_engine_service/metrics.rs @@ -0,0 +1,25 @@ +// Copyright 2023 The HoraeDB Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use lazy_static::lazy_static; +use prometheus::{register_int_counter_vec, IntCounterVec}; + +lazy_static! { + pub static ref REMOTE_ENGINE_QUERY_COUNTER: IntCounterVec = register_int_counter_vec!( + "remote_engine_query_counter", + "remote_engine_query_counter", + &["priority"] + ) + .unwrap(); +} diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index 1135930014..eda9a61ada 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -32,8 +32,8 @@ use ceresdbproto::{ remote_engine_service_server::RemoteEngineService, row_group, AlterTableOptionsRequest, AlterTableOptionsResponse, AlterTableSchemaRequest, AlterTableSchemaResponse, ExecContext, ExecutePlanRequest, GetTableInfoRequest, - GetTableInfoResponse, MetricPayload, ReadRequest, ReadResponse, WriteBatchRequest, - WriteRequest, WriteResponse, + GetTableInfoResponse, MetricPayload, QueryPriority, ReadRequest, ReadResponse, + WriteBatchRequest, WriteRequest, WriteResponse, }, storage::{arrow_payload, ArrowPayload}, }; @@ -43,7 +43,7 @@ use futures::{ Future, }; use generic_error::BoxError; -use logger::{error, info, slow_query}; +use logger::{debug, error, info, slow_query}; use notifier::notifier::{ExecutionGuard, RequestNotifiers, RequestResult}; use proxy::{ hotspot::{HotspotRecorder, Message}, @@ -55,6 +55,7 @@ use query_engine::{ physical_planner::PhysicalPlanRef, QueryEngineRef, QueryEngineType, }; +use runtime::{Priority, RuntimeRef}; use snafu::{OptionExt, ResultExt}; use table_engine::{ engine::EngineRuntimes, @@ -76,11 +77,15 @@ use crate::{ REMOTE_ENGINE_GRPC_HANDLER_DURATION_HISTOGRAM_VEC, REMOTE_ENGINE_WRITE_BATCH_NUM_ROWS_HISTOGRAM, }, - remote_engine_service::error::{ErrNoCause, ErrWithCause, Result, StatusCode}, + remote_engine_service::{ + error::{ErrNoCause, ErrWithCause, Result, StatusCode}, + metrics::REMOTE_ENGINE_QUERY_COUNTER, + }, }, }; pub mod error; +mod metrics; const STREAM_QUERY_CHANNEL_LEN: usize = 200; const DEFAULT_COMPRESS_MIN_LENGTH: usize = 80 * 1024; @@ -139,15 +144,22 @@ struct ExecutePlanMetricCollector { query: String, request_id: RequestId, slow_threshold: Duration, + priority: Priority, } impl ExecutePlanMetricCollector { - fn new(request_id: String, query: String, slow_threshold_secs: u64) -> Self { + fn new( + request_id: String, + query: String, + slow_threshold_secs: u64, + priority: Priority, + ) -> Self { Self { start: Instant::now(), query, request_id: request_id.into(), slow_threshold: Duration::from_secs(slow_threshold_secs), + priority, } } } @@ -157,12 +169,17 @@ impl MetricCollector for 
ExecutePlanMetricCollector { let cost = self.start.elapsed(); if cost > self.slow_threshold { slow_query!( - "Remote query elapsed:{:?}, id:{}, query:{}", + "Remote query elapsed:{:?}, id:{}, priority:{}, query:{}", cost, self.request_id, + self.priority.as_str(), self.query ); } + + REMOTE_ENGINE_QUERY_COUNTER + .with_label_values(&[self.priority.as_str()]) + .inc(); REMOTE_ENGINE_GRPC_HANDLER_DURATION_HISTOGRAM_VEC .execute_physical_plan .observe(cost.as_secs_f64()); @@ -415,6 +432,8 @@ impl RemoteEngineServiceImpl { query, request_notifiers.clone(), config.notify_timeout.0, + // TODO: decide runtime from request priority. + self.runtimes.read_runtime.high(), ) .await?; } @@ -435,6 +454,7 @@ impl RemoteEngineServiceImpl { query: F, notifiers: Arc>>>, notify_timeout: Duration, + rt: &RuntimeRef, ) -> Result<()> where K: Hash + PartialEq + Eq, @@ -444,7 +464,7 @@ impl RemoteEngineServiceImpl { let mut guard = ExecutionGuard::new(|| { notifiers.take_notifiers(&request_key); }); - let handle = self.runtimes.read_runtime.spawn(query); + let handle = rt.spawn(query); let streams = handle.await.box_err().context(ErrWithCause { code: StatusCode::Internal, msg: "fail to join task", @@ -459,7 +479,7 @@ impl RemoteEngineServiceImpl { }) }); - let handle = self.runtimes.read_runtime.spawn(async move { + let handle = rt.spawn(async move { let mut batches = Vec::new(); while let Some(batch) = stream.next().await { batches.push(batch) @@ -486,7 +506,7 @@ impl RemoteEngineServiceImpl { let notifiers = notifiers.take_notifiers(&request_key).unwrap(); // Do send in background to avoid blocking the rpc procedure. - self.runtimes.read_runtime.spawn(async move { + rt.spawn(async move { Self::send_dedupped_resps(resps, notifiers, notify_timeout).await; }); @@ -676,26 +696,36 @@ impl RemoteEngineServiceImpl { .slow_threshold .load(std::sync::atomic::Ordering::Relaxed); - let metric = ExecutePlanMetricCollector::new( - ctx.request_id_str.clone(), - ctx.displayable_query, - slow_threshold_secs, - ); + let priority = ctx.priority(); let query_ctx = create_query_ctx( ctx.request_id_str, ctx.default_catalog, ctx.default_schema, ctx.timeout_ms, + priority, ); + debug!( + "Execute remote query, ctx:{query_ctx:?}, query:{}", + &ctx.displayable_query + ); + let metric = ExecutePlanMetricCollector::new( + ctx.request_id.to_string(), + ctx.displayable_query, + slow_threshold_secs, + query_ctx.priority, + ); let physical_plan = Arc::new(DataFusionPhysicalPlanAdapter::new(TypedPlan::Remote( encoded_plan, ))); - let physical_plan_clone = physical_plan.clone(); - let stream = self + let rt = self .runtimes .read_runtime + .choose_runtime(&query_ctx.priority); + let physical_plan_clone = physical_plan.clone(); + + let stream = rt .spawn(async move { handle_execute_plan(query_ctx, physical_plan, query_engine).await }) .await .box_err() @@ -730,18 +760,20 @@ impl RemoteEngineServiceImpl { .dyn_config .slow_threshold .load(std::sync::atomic::Ordering::Relaxed); - let metric = ExecutePlanMetricCollector::new( - ctx.request_id_str.clone(), - ctx.displayable_query, - slow_threshold_secs, - ); + let priority = ctx.priority(); let query_ctx = create_query_ctx( ctx.request_id_str, ctx.default_catalog, ctx.default_schema, ctx.timeout_ms, + priority, + ); + let metric = ExecutePlanMetricCollector::new( + ctx.request_id.to_string(), + ctx.displayable_query, + slow_threshold_secs, + query_ctx.priority, ); - let key = PhysicalPlanKey { encoded_plan: encoded_plan.clone(), }; @@ -758,6 +790,10 @@ impl RemoteEngineServiceImpl { .. 
} = query_dedup; + let rt = self + .runtimes + .read_runtime + .choose_runtime(&query_ctx.priority); let (tx, rx) = mpsc::channel(config.notify_queue_cap); match physical_plan_notifiers.insert_notifier(key.clone(), tx) { // The first request, need to handle it, and then notify the other requests. @@ -772,6 +808,7 @@ impl RemoteEngineServiceImpl { query, physical_plan_notifiers, config.notify_timeout.0, + rt, ) .await?; } @@ -1190,6 +1227,7 @@ fn create_query_ctx( default_catalog: String, default_schema: String, timeout_ms: i64, + priority: QueryPriority, ) -> QueryContext { let request_id = RequestId::from(request_id); let deadline = if timeout_ms >= 0 { @@ -1197,12 +1235,17 @@ fn create_query_ctx( } else { None }; + let priority = match priority { + QueryPriority::Low => Priority::Low, + QueryPriority::High => Priority::High, + }; QueryContext { request_id, deadline, default_catalog, default_schema, + priority, } } diff --git a/server/src/http.rs b/server/src/http.rs index 068f21fa44..389e771224 100644 --- a/server/src/http.rs +++ b/server/src/http.rs @@ -43,7 +43,7 @@ use proxy::{ Proxy, }; use router::endpoint::Endpoint; -use runtime::{Runtime, RuntimeRef}; +use runtime::{PriorityRuntime, Runtime}; use serde::Serialize; use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; use table_engine::{engine::EngineRuntimes, table::FlushRequest}; @@ -310,7 +310,7 @@ impl Service { .and(self.with_proxy()) .and(self.with_read_runtime()) .and_then( - |req, mut ctx: RequestContext, proxy: Arc, runtime: RuntimeRef| async move { + |req, mut ctx: RequestContext, proxy: Arc, runtime: PriorityRuntime| async move { // We don't timeout http api since it's mainly used for debugging. ctx.timeout = None; @@ -774,7 +774,7 @@ impl Service { fn with_read_runtime( &self, - ) -> impl Filter,), Error = Infallible> + Clone { + ) -> impl Filter + Clone { let runtime = self.engine_runtimes.read_runtime.clone(); warp::any().map(move || runtime.clone()) } diff --git a/server/src/server.rs b/server/src/server.rs index 27f59c1819..9d418a32ec 100644 --- a/server/src/server.rs +++ b/server/src/server.rs @@ -378,6 +378,7 @@ impl Builder { let config_content = self.config_content.context(MissingConfigContent)?; let query_engine_config = self.query_engine_config.context(MissingQueryEngineConfig)?; let datafusion_context = self.datatfusion_context.context(MissingDatafusionContext)?; + let expensive_query_threshold = query_engine_config.expensive_query_threshold.as_millis(); let hotspot_recorder = Arc::new(HotspotRecorder::new( self.server_config.hotspot, @@ -412,6 +413,7 @@ impl Builder { let instance = Instance { catalog_manager, query_engine, + query_runtime: engine_runtimes.read_runtime.clone(), table_engine, partition_table_engine, function_registry, @@ -459,6 +461,7 @@ impl Builder { self.cluster.is_some(), self.server_config.sub_table_access_perm, request_notifiers, + expensive_query_threshold, )); let http_service = http::Builder::new(http_config) diff --git a/src/ceresdb/src/config.rs b/src/ceresdb/src/config.rs index b23476d480..676df8ef2c 100644 --- a/src/ceresdb/src/config.rs +++ b/src/ceresdb/src/config.rs @@ -91,7 +91,7 @@ pub enum ClusterDeployment { #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(default)] pub struct RuntimeConfig { - /// Runtime for reading data + /// High priority runtime for reading data pub read_thread_num: usize, /// The size of the stack used by the read thread /// @@ -99,6 +99,8 @@ pub struct RuntimeConfig { /// TODO: this config may be removed in the future when the complex query 
/// won't overflow the stack. pub read_thread_stack_size: ReadableSize, + /// Low priority runtime for reading data + pub low_read_thread_num: usize, /// Runtime for writing data pub write_thread_num: usize, /// Runtime for communicating with meta cluster @@ -116,6 +118,7 @@ impl Default for RuntimeConfig { Self { read_thread_num: 8, read_thread_stack_size: ReadableSize::mb(16), + low_read_thread_num: 1, write_thread_num: 8, meta_thread_num: 2, compact_thread_num: 4, diff --git a/src/ceresdb/src/setup.rs b/src/ceresdb/src/setup.rs index de78aa3db6..a7ae6a3160 100644 --- a/src/ceresdb/src/setup.rs +++ b/src/ceresdb/src/setup.rs @@ -32,6 +32,7 @@ use proxy::{ }, }; use router::{rule_based::ClusterView, ClusterBasedRouter, RuleBasedRouter}; +use runtime::PriorityRuntime; use server::{ config::{StaticRouteConfig, StaticTopologyConfig}, local_tables::LocalTablesRecoverer, @@ -86,12 +87,20 @@ fn build_runtime(name: &str, threads_num: usize) -> runtime::Runtime { } fn build_engine_runtimes(config: &RuntimeConfig) -> EngineRuntimes { + let read_stack_size = config.read_thread_stack_size.as_byte() as usize; EngineRuntimes { - read_runtime: Arc::new(build_runtime_with_stack_size( - "ceres-read", - config.read_thread_num, - Some(config.read_thread_stack_size.as_byte() as usize), - )), + read_runtime: PriorityRuntime::new( + Arc::new(build_runtime_with_stack_size( + "read-low", + config.low_read_thread_num, + Some(read_stack_size), + )), + Arc::new(build_runtime_with_stack_size( + "read-high", + config.read_thread_num, + Some(read_stack_size), + )), + ), write_runtime: Arc::new(build_runtime("ceres-write", config.write_thread_num)), compact_runtime: Arc::new(build_runtime("ceres-compact", config.compact_thread_num)), meta_runtime: Arc::new(build_runtime("ceres-meta", config.meta_thread_num)), @@ -266,7 +275,8 @@ async fn build_table_engine_proxy(engine_builder: EngineBuilder<'_>) -> Arc) -> WalRuntimes { WalRuntimes { write_runtime: runtimes.write_runtime.clone(), - read_runtime: runtimes.read_runtime.clone(), + // TODO: remove read_runtime from WalRuntimes + read_runtime: runtimes.read_runtime.high().clone(), default_runtime: runtimes.default_runtime.clone(), } } diff --git a/system_catalog/src/sys_catalog_table.rs b/system_catalog/src/sys_catalog_table.rs index 0daacb03c9..00f7b061c4 100644 --- a/system_catalog/src/sys_catalog_table.rs +++ b/system_catalog/src/sys_catalog_table.rs @@ -545,6 +545,7 @@ impl SysCatalogTable { projected_schema: ProjectedSchema::no_projection(self.table.schema()), predicate: PredicateBuilder::default().build(), metrics_collector: MetricsCollector::default(), + priority: Default::default(), }; let mut batch_stream = self.table.read(read_request).await.context(ReadTable)?; diff --git a/table_engine/src/engine.rs b/table_engine/src/engine.rs index 9946d0cbae..3411c7672e 100644 --- a/table_engine/src/engine.rs +++ b/table_engine/src/engine.rs @@ -24,7 +24,7 @@ use common_types::{ }; use generic_error::{GenericError, GenericResult}; use macros::define_result; -use runtime::RuntimeRef; +use runtime::{PriorityRuntime, RuntimeRef}; use snafu::{ensure, Backtrace, Snafu}; use crate::{ @@ -346,7 +346,7 @@ pub type TableEngineRef = Arc; #[derive(Clone, Debug)] pub struct EngineRuntimes { /// Runtime for reading data - pub read_runtime: RuntimeRef, + pub read_runtime: PriorityRuntime, /// Runtime for writing data pub write_runtime: RuntimeRef, /// Runtime for compacting data diff --git a/table_engine/src/provider.rs b/table_engine/src/provider.rs index fb3cb41d55..cbc4137240 100644 
--- a/table_engine/src/provider.rs +++ b/table_engine/src/provider.rs @@ -39,6 +39,7 @@ use datafusion::{ }; use df_operator::visitor; use logger::debug; +use runtime::Priority; use trace_metric::{collector::FormatCollectorVisitor, MetricsCollector}; use crate::{ @@ -55,12 +56,19 @@ pub struct CeresdbOptions { pub request_timeout: Option, pub default_schema: String, pub default_catalog: String, + pub priority: Priority, } impl ConfigExtension for CeresdbOptions { const PREFIX: &'static str = "ceresdb"; } +impl CeresdbOptions { + const REQUEST_ID_KEY: &'static str = "request_id"; + const REQUEST_PRIORITY_KEY: &'static str = "request_priority"; + const REQUEST_TIMEOUT_KEY: &'static str = "request_timeout"; +} + impl ExtensionOptions for CeresdbOptions { fn as_any(&self) -> &dyn Any { self @@ -76,33 +84,57 @@ impl ExtensionOptions for CeresdbOptions { fn set(&mut self, key: &str, value: &str) -> Result<()> { match key { - "request_id" => self.request_id = value.to_string(), - "request_timeout" => { + Self::REQUEST_ID_KEY => self.request_id = value.to_string(), + Self::REQUEST_TIMEOUT_KEY => { self.request_timeout = Some(value.parse::().map_err(|e| { DataFusionError::External( format!("could not parse request_timeout, input:{value}, err:{e:?}").into(), ) })?) } + Self::REQUEST_PRIORITY_KEY => { + self.priority = value + .parse::() + .map_err(|e| { + DataFusionError::External( + format!("request_priority should be u8, input:{value}, err:{e:?}") + .into(), + ) + }) + .and_then(|value| { + Priority::try_from(value).map_err(|e| { + DataFusionError::External( + format!("parse request_priority failed, input:{value}, err:{e:?}") + .into(), + ) + }) + })? + } _ => Err(DataFusionError::External( format!("could not find key, key:{key}").into(), ))?, } + Ok(()) } fn entries(&self) -> Vec { vec![ ConfigEntry { - key: "request_id".to_string(), + key: Self::REQUEST_ID_KEY.to_string(), value: Some(self.request_id.to_string()), description: "", }, ConfigEntry { - key: "request_timeout".to_string(), + key: Self::REQUEST_TIMEOUT_KEY.to_string(), value: self.request_timeout.map(|v| v.to_string()), description: "", }, + ConfigEntry { + key: Self::REQUEST_PRIORITY_KEY.to_string(), + value: Some(self.priority.as_u8().to_string()), + description: "", + }, ] } } @@ -182,8 +214,9 @@ impl TableProviderAdapter { .request_timeout .map(|n| Instant::now() + Duration::from_millis(n)); let read_parallelism = state.config().target_partitions(); + let priority = ceresdb_options.priority; debug!( - "TableProvider scan table, table:{}, request_id:{}, projection:{:?}, filters:{:?}, limit:{:?}, deadline:{:?}, parallelism:{}", + "TableProvider scan table, table:{}, request_id:{}, projection:{:?}, filters:{:?}, limit:{:?}, deadline:{:?}, parallelism:{}, priority:{:?}", self.table.name(), request_id, projection, @@ -191,6 +224,7 @@ impl TableProviderAdapter { limit, deadline, read_parallelism, + priority, ); let predicate = self.check_and_build_predicate_from_filters(filters); @@ -216,6 +250,7 @@ impl TableProviderAdapter { projected_schema, predicate, metrics_collector: MetricsCollector::new(SCAN_TABLE_METRICS_COLLECTOR_NAME.to_string()), + priority, }; self.builder.build(request).await @@ -439,9 +474,10 @@ impl DisplayAs for ScanTable { fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "ScanTable: table={}, parallelism={}", + "ScanTable: table={}, parallelism={}, priority={:?}", self.table.name(), self.request.opts.read_parallelism, + self.request.priority ) } } diff --git 
a/table_engine/src/remote/model.rs b/table_engine/src/remote/model.rs index bd99670375..9073e75ec6 100644 --- a/table_engine/src/remote/model.rs +++ b/table_engine/src/remote/model.rs @@ -22,7 +22,7 @@ use std::{ use bytes_ext::{ByteVec, Bytes}; use ceresdbproto::remote_engine::{ - self, execute_plan_request, row_group::Rows::Contiguous, ColumnDesc, + self, execute_plan_request, row_group::Rows::Contiguous, ColumnDesc, QueryPriority, }; use common_types::{ request_id::RequestId, @@ -35,6 +35,7 @@ use common_types::{ use generic_error::{BoxError, GenericError, GenericResult}; use itertools::Itertools; use macros::define_result; +use runtime::Priority; use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; use crate::{ @@ -454,6 +455,7 @@ pub struct ExecContext { pub default_catalog: String, pub default_schema: String, pub query: String, + pub priority: Priority, } pub enum PhysicalPlan { @@ -474,7 +476,7 @@ impl From for ceresdbproto::remote_engine::ExecutePlanRequ default_catalog: value.context.default_catalog, default_schema: value.context.default_schema, timeout_ms: rest_duration_ms, - priority: 0, // not used now + priority: value.context.priority.as_u8() as i32, displayable_query: value.context.query, }; @@ -510,6 +512,10 @@ impl TryFrom for RemoteExecuteR let pb_exec_ctx = value.context.context(ConvertRemoteExecuteRequest { msg: "missing exec ctx", })?; + let priority = match pb_exec_ctx.priority() { + QueryPriority::Low => Priority::Low, + QueryPriority::High => Priority::High, + }; let ceresdbproto::remote_engine::ExecContext { request_id_str, default_catalog, @@ -532,6 +538,7 @@ impl TryFrom for RemoteExecuteR default_catalog, default_schema, query: displayable_query, + priority, }; // Plan diff --git a/table_engine/src/table.rs b/table_engine/src/table.rs index 4f3e854f49..6c38044142 100644 --- a/table_engine/src/table.rs +++ b/table_engine/src/table.rs @@ -36,6 +36,7 @@ use common_types::{ }; use generic_error::{BoxError, GenericError}; use macros::define_result; +use runtime::Priority; use serde::Deserialize; use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; use trace_metric::MetricsCollector; @@ -387,6 +388,7 @@ pub struct ReadRequest { pub predicate: PredicateRef, /// Collector for metrics of this read request. pub metrics_collector: MetricsCollector, + pub priority: Priority, } impl fmt::Debug for ReadRequest { @@ -415,6 +417,7 @@ impl fmt::Debug for ReadRequest { .field("opts", &self.opts) .field("projected", &projected) .field("predicate", &predicate) + .field("priority", &self.priority) .finish() } } @@ -470,6 +473,8 @@ impl TryFrom for ReadRequest { projected_schema, predicate, metrics_collector: MetricsCollector::default(), + // TODO: pass priority from request. + priority: Default::default(), }) } } From 61b123ab4006942bf4261e7880e1233d220f7deb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=B2=8D=E9=87=91=E6=97=A5?= Date: Wed, 3 Jan 2024 10:23:37 +0800 Subject: [PATCH 26/38] refactor: avoid returning metrics in non-analyze sql (#1410) ## Rationale In #1260, we implemented distributed analyze, but for query that are not analyze, metrics will be returned, which will lead to a decrease in query performance. Therefore, we will fix it in this PR, and metrics will not be returned for normal queries. 
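The intended behavior, as a simplified sketch (the names and types below are illustrative; the real code streams record batches rather than collecting them):

```rust
// Sketch: a remote-execution response is the query's record batches, plus a
// trailing metrics payload only when the request is an explain-analyze query.
enum ResponsePayload {
    Batch(Vec<u8>),   // stand-in for an encoded Arrow record batch
    Metrics(String),  // formatted execution metrics
}

fn build_response(batches: Vec<Vec<u8>>, is_analyze: bool, metrics: String) -> Vec<ResponsePayload> {
    let mut out: Vec<ResponsePayload> = batches.into_iter().map(ResponsePayload::Batch).collect();
    if is_analyze {
        // Only analyze requests pay the cost of carrying metrics back to the caller.
        out.push(ResponsePayload::Metrics(metrics));
    }
    out
}
```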
## Detailed Changes - Add is_analyze field to determine whether it is analyze ## Test Plan Existing tests --------- Co-authored-by: jiacai2050 --- Cargo.lock | 32 +++++----- Cargo.toml | 2 +- .../src/dist_sql_query/mod.rs | 8 ++- .../src/dist_sql_query/physical_plan.rs | 3 +- .../src/datafusion_impl/task_context.rs | 1 + server/src/grpc/remote_engine_service/mod.rs | 59 +++++++++++-------- table_engine/src/remote/model.rs | 12 ++++ 7 files changed, 72 insertions(+), 45 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 271041656a..b1f6b823ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,7 +96,7 @@ dependencies = [ "atomic_enum", "base64 0.13.1", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "codec", "common_types", "datafusion", @@ -1345,7 +1345,7 @@ dependencies = [ [[package]] name = "ceresdbproto" version = "1.0.23" -source = "git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4#d849fa44e29ea04c7d99c082a38efb8ce5200d5e" +source = "git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc#cfdacccebb7c609cb1aac791b73ba9a838d7ade6" dependencies = [ "prost", "protoc-bin-vendored", @@ -1528,7 +1528,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "common_types", "etcd-client", "future_ext", @@ -1606,7 +1606,7 @@ dependencies = [ "arrow 43.0.0", "arrow_ext", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "chrono", "datafusion", "hash_ext", @@ -2362,7 +2362,7 @@ dependencies = [ "async-recursion", "async-trait", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "common_types", "datafusion", "datafusion-proto", @@ -3921,7 +3921,7 @@ name = "meta_client" version = "1.2.6-alpha" dependencies = [ "async-trait", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "common_types", "futures 0.3.28", "generic_error", @@ -4446,7 +4446,7 @@ version = "1.2.6-alpha" dependencies = [ "async-trait", "bytes", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "chrono", "clru", "crc", @@ -5323,7 +5323,7 @@ dependencies = [ "async-trait", "bytes", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "clru", "cluster", "common_types", @@ -5451,7 +5451,7 @@ dependencies = [ "arrow 43.0.0", "async-trait", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "chrono", "cluster", "codec", @@ -5765,7 +5765,7 @@ version = "1.2.6-alpha" dependencies = [ "arrow_ext", "async-trait", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 
(git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "common_types", "futures 0.3.28", "generic_error", @@ -5894,7 +5894,7 @@ name = "router" version = "1.2.6-alpha" dependencies = [ "async-trait", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "cluster", "common_types", "generic_error", @@ -6269,7 +6269,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "clru", "cluster", "common_types", @@ -6795,7 +6795,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "codec", "common_types", "futures 0.3.28", @@ -6817,7 +6817,7 @@ dependencies = [ "arrow_ext", "async-trait", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "common_types", "datafusion", "datafusion-proto", @@ -7020,7 +7020,7 @@ dependencies = [ name = "time_ext" version = "1.2.6-alpha" dependencies = [ - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "chrono", "common_types", "macros", @@ -7672,7 +7672,7 @@ version = "1.2.6-alpha" dependencies = [ "async-trait", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=d849fa4)", + "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", "chrono", "codec", "common_types", diff --git a/Cargo.toml b/Cargo.toml index c99bbaf56b..1588c237de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,7 +94,7 @@ bytes = "1" bytes_ext = { path = "components/bytes_ext" } catalog = { path = "catalog" } catalog_impls = { path = "catalog_impls" } -ceresdbproto = { git = "https://github.com/CeresDB/horaedbproto.git", rev = "d849fa4" } +ceresdbproto = { git = "https://github.com/CeresDB/horaedbproto.git", rev = "cfdaccc" } codec = { path = "components/codec" } chrono = "0.4" clap = "3.0" diff --git a/df_engine_extensions/src/dist_sql_query/mod.rs b/df_engine_extensions/src/dist_sql_query/mod.rs index abfc0cca1d..c2cd825092 100644 --- a/df_engine_extensions/src/dist_sql_query/mod.rs +++ b/df_engine_extensions/src/dist_sql_query/mod.rs @@ -66,13 +66,19 @@ type ExecutableScanBuilderRef = Box; pub struct RemoteTaskContext { pub task_ctx: Arc, pub remote_metrics: Arc>>, + pub is_analyze: bool, } impl RemoteTaskContext { - pub fn new(task_ctx: Arc, remote_metrics: Arc>>) -> Self { + pub fn new( + task_ctx: Arc, + remote_metrics: Arc>>, + is_analyze: bool, + ) -> Self { Self { task_ctx, remote_metrics, + is_analyze, } } } diff --git a/df_engine_extensions/src/dist_sql_query/physical_plan.rs b/df_engine_extensions/src/dist_sql_query/physical_plan.rs index 87cd18bdcd..1ebe669fc7 100644 --- a/df_engine_extensions/src/dist_sql_query/physical_plan.rs +++ b/df_engine_extensions/src/dist_sql_query/physical_plan.rs @@ -349,7 +349,8 @@ impl ExecutionPlan for ResolvedPartitionedScan { remote_metrics, } = &self.remote_exec_ctx.plan_ctxs[partition]; - let remote_task_ctx = RemoteTaskContext::new(context, 
remote_metrics.clone()); + let remote_task_ctx = + RemoteTaskContext::new(context, remote_metrics.clone(), self.is_analyze); // Send plan for remote execution. let stream_future = self.remote_exec_ctx.executor.execute( diff --git a/query_engine/src/datafusion_impl/task_context.rs b/query_engine/src/datafusion_impl/task_context.rs index 8aefd563c1..5e34cdc746 100644 --- a/query_engine/src/datafusion_impl/task_context.rs +++ b/query_engine/src/datafusion_impl/task_context.rs @@ -202,6 +202,7 @@ impl RemotePhysicalPlanExecutor for RemotePhysicalPlanExecutorImpl { default_schema, query: display_plan.indent(true).to_string(), priority, + is_analyze: task_context.is_analyze, }; // Encode plan and schema diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index eda9a61ada..df201a1257 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -220,17 +220,17 @@ impl Drop for StreamWithMetric { struct RemoteExecStream { inner: BoxStream<'static, Result>, - physical_plan: Option, + physical_plan_for_explain: Option, } impl RemoteExecStream { fn new( inner: BoxStream<'static, Result>, - physical_plan: Option, + physical_plan_for_explain: Option, ) -> Self { Self { inner, - physical_plan, + physical_plan_for_explain, } } } @@ -240,19 +240,25 @@ impl Stream for RemoteExecStream { fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let this = self.get_mut(); - match this.inner.poll_next_unpin(cx) { - Poll::Ready(Some(res)) => { - Poll::Ready(Some(res.map(RecordBatchWithMetric::RecordBatch))) - } - Poll::Ready(None) => match &this.physical_plan { - Some(physical_plan) => { - let metrics = physical_plan.metrics_to_string(); - this.physical_plan = None; - Poll::Ready(Some(Ok(RecordBatchWithMetric::Metric(metrics)))) + let is_explain = this.physical_plan_for_explain.is_some(); + loop { + match this.inner.poll_next_unpin(cx) { + Poll::Ready(Some(res)) => { + // If the request is explain, we try drain the stream to get the metrics. 
+ if !is_explain { + return Poll::Ready(Some(res.map(RecordBatchWithMetric::RecordBatch))); + } } - None => Poll::Ready(None), - }, - Poll::Pending => Poll::Pending, + Poll::Ready(None) => match &this.physical_plan_for_explain { + Some(physical_plan) => { + let metrics = physical_plan.metrics_to_string(); + this.physical_plan_for_explain = None; + return Poll::Ready(Some(Ok(RecordBatchWithMetric::Metric(metrics)))); + } + None => return Poll::Ready(None), + }, + Poll::Pending => return Poll::Pending, + } } } } @@ -715,15 +721,16 @@ impl RemoteEngineServiceImpl { slow_threshold_secs, query_ctx.priority, ); - let physical_plan = Arc::new(DataFusionPhysicalPlanAdapter::new(TypedPlan::Remote( - encoded_plan, - ))); + let physical_plan: PhysicalPlanRef = Arc::new(DataFusionPhysicalPlanAdapter::new( + TypedPlan::Remote(encoded_plan), + )); + // TODO: Use in handle_execute_plan fn to build stream with metrics + let physical_plan_for_explain = ctx.explain.map(|_| physical_plan.clone()); let rt = self .runtimes .read_runtime .choose_runtime(&query_ctx.priority); - let physical_plan_clone = physical_plan.clone(); let stream = rt .spawn(async move { handle_execute_plan(query_ctx, physical_plan, query_engine).await }) @@ -743,7 +750,7 @@ impl RemoteEngineServiceImpl { let stream = StreamWithMetric::new(Box::pin(stream), metric); Ok(RemoteExecStream::new( Box::pin(stream), - Some(physical_plan_clone), + physical_plan_for_explain, )) } @@ -778,11 +785,11 @@ impl RemoteEngineServiceImpl { encoded_plan: encoded_plan.clone(), }; - let physical_plan = Arc::new(DataFusionPhysicalPlanAdapter::new(TypedPlan::Remote( - encoded_plan, - ))); - - let physical_plan_clone = physical_plan.clone(); + let physical_plan: PhysicalPlanRef = Arc::new(DataFusionPhysicalPlanAdapter::new( + TypedPlan::Remote(encoded_plan), + )); + // TODO: Use in handle_execute_plan fn to build stream with metrics + let physical_plan_for_explain = ctx.explain.map(|_| physical_plan.clone()); let QueryDedup { config, @@ -822,7 +829,7 @@ impl RemoteEngineServiceImpl { let stream = StreamWithMetric::new(Box::pin(ReceiverStream::new(rx)), metric); Ok(RemoteExecStream::new( Box::pin(stream), - Some(physical_plan_clone), + physical_plan_for_explain, )) } diff --git a/table_engine/src/remote/model.rs b/table_engine/src/remote/model.rs index 9073e75ec6..deb28538ac 100644 --- a/table_engine/src/remote/model.rs +++ b/table_engine/src/remote/model.rs @@ -456,6 +456,9 @@ pub struct ExecContext { pub default_schema: String, pub query: String, pub priority: Priority, + // TOOO: there are many explain types, we need to support them all. + // A proper way is to define a enum for all explain types. + pub is_analyze: bool, } pub enum PhysicalPlan { @@ -470,6 +473,11 @@ impl From for ceresdbproto::remote_engine::ExecutePlanRequ NO_TIMEOUT }; + let explain = if value.context.is_analyze { + Some(ceresdbproto::remote_engine::Explain::Analyze) + } else { + None + }; let pb_context = ceresdbproto::remote_engine::ExecContext { request_id: 0, // not used any more request_id_str: value.context.request_id.to_string(), @@ -478,6 +486,7 @@ impl From for ceresdbproto::remote_engine::ExecutePlanRequ timeout_ms: rest_duration_ms, priority: value.context.priority.as_u8() as i32, displayable_query: value.context.query, + explain: explain.map(|v| v as i32), }; let pb_plan = match value.physical_plan { @@ -522,8 +531,10 @@ impl TryFrom for RemoteExecuteR default_schema, timeout_ms, displayable_query, + explain, .. 
} = pb_exec_ctx; + let is_analyze = explain == Some(ceresdbproto::remote_engine::Explain::Analyze as i32); let request_id = RequestId::from(request_id_str); let deadline = if timeout_ms >= 0 { @@ -539,6 +550,7 @@ impl TryFrom for RemoteExecuteR default_schema, query: displayable_query, priority, + is_analyze, }; // Plan From 904e2d5ede4d1a68c9437b6143065f283f81fbd0 Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Thu, 4 Jan 2024 14:42:18 +0800 Subject: [PATCH 27/38] feat: block rules support query (#1420) ## Rationale Query with long time range usually cost too much resources, which affect stable of the whole cluster ## Detailed Changes - Support block query by query range ## Test Plan Manually ```bash curl 0:5000/admin/block -H 'content-type: application/json' -d ' { "operation": "Set", "write_block_list": [], "read_block_list": [], "block_rules": [ {"type": "QueryRange", "content": "24h"} ] }' ``` --- proxy/src/limiter.rs | 44 +++++++++++++++++++++++++++++++++++--- proxy/src/metrics.rs | 6 ++++++ proxy/src/read.rs | 2 +- query_frontend/src/plan.rs | 27 +++++++++++++++++++++++ 4 files changed, 75 insertions(+), 4 deletions(-) diff --git a/proxy/src/limiter.rs b/proxy/src/limiter.rs index 9405dd9ec4..ea6b697de9 100644 --- a/proxy/src/limiter.rs +++ b/proxy/src/limiter.rs @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::{collections::HashSet, sync::RwLock}; +use std::{collections::HashSet, str::FromStr, sync::RwLock}; use datafusion::logical_expr::logical_plan::LogicalPlan; use macros::define_result; use query_frontend::plan::Plan; use serde::{Deserialize, Serialize}; use snafu::Snafu; +use time_ext::ReadableDuration; + +use crate::metrics::BLOCKED_REQUEST_COUNTER_VEC_GLOBAL; #[derive(Snafu, Debug)] #[snafu(visibility(pub))] @@ -33,12 +36,26 @@ pub enum Error { define_result!(Error); #[derive(Clone, Copy, Deserialize, Debug, PartialEq, Eq, Hash, Serialize, PartialOrd, Ord)] +#[serde(tag = "type", content = "content")] pub enum BlockRule { QueryWithoutPredicate, + /// Max time range a query can scan. + #[serde(deserialize_with = "deserialize_readable_duration")] + QueryRange(i64), AnyQuery, AnyInsert, } +fn deserialize_readable_duration<'de, D>(deserializer: D) -> std::result::Result +where + D: serde::Deserializer<'de>, +{ + let s: &str = Deserialize::deserialize(deserializer)?; + ReadableDuration::from_str(s) + .map(|d| d.0.as_millis() as i64) + .map_err(serde::de::Error::custom) +} + #[derive(Default, Clone, Deserialize, Debug, Serialize)] #[serde(default)] pub struct LimiterConfig { @@ -52,6 +69,17 @@ impl BlockRule { match self { BlockRule::QueryWithoutPredicate => self.is_query_without_predicate(plan), BlockRule::AnyQuery => matches!(plan, Plan::Query(_)), + BlockRule::QueryRange(threshold) => { + if let Plan::Query(plan) = plan { + if let Some(range) = plan.query_range() { + if range > *threshold { + return true; + } + } + } + + false + } BlockRule::AnyInsert => matches!(plan, Plan::Insert(_)), } } @@ -159,8 +187,18 @@ impl Limiter { /// /// Error will throws if the plan is forbidden to execute. 
pub fn try_limit(&self, plan: &Plan) -> Result<()> { - self.try_limit_by_block_list(plan)?; - self.try_limit_by_rules(plan) + let result = { + self.try_limit_by_block_list(plan)?; + self.try_limit_by_rules(plan) + }; + + if result.is_err() { + BLOCKED_REQUEST_COUNTER_VEC_GLOBAL + .with_label_values(&[plan.plan_type()]) + .inc(); + } + + result } pub fn add_write_block_list(&self, block_list: Vec) { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index c55c47c27b..3478b6bdd7 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -61,6 +61,12 @@ lazy_static! { pub static ref HTTP_HANDLER_COUNTER_VEC_GLOBAL: IntCounterVec = register_int_counter_vec!("http_handler_counter", "Http handler counter", &["type"]) .unwrap(); + pub static ref BLOCKED_REQUEST_COUNTER_VEC_GLOBAL: IntCounterVec = register_int_counter_vec!( + "blocked_request_counter", + "Blocked request counter", + &["type"] + ) + .unwrap(); } lazy_static! { diff --git a/proxy/src/read.rs b/proxy/src/read.rs index effb88c086..e821750cde 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -240,7 +240,7 @@ impl Proxy { .try_limit(&plan) .box_err() .context(Internal { - msg: "Request is blocked", + msg: format!("Request is blocked, table_name:{table_name:?}"), })?; } diff --git a/query_frontend/src/plan.rs b/query_frontend/src/plan.rs index 94c8bca6b3..597c6810be 100644 --- a/query_frontend/src/plan.rs +++ b/query_frontend/src/plan.rs @@ -77,6 +77,21 @@ pub enum Plan { Exists(ExistsTablePlan), } +impl Plan { + pub fn plan_type(&self) -> &str { + match self { + Self::Query(_) => "query", + Self::Insert(_) => "insert", + Self::Create(_) + | Self::Drop(_) + | Self::Describe(_) + | Self::AlterTable(_) + | Self::Show(_) + | Self::Exists(_) => "other", + } + } +} + pub struct PriorityContext { pub time_range_threshold: u64, } @@ -201,6 +216,18 @@ impl QueryPlan { Some(priority) } + + /// When query contains invalid time range such as `[200, 100]`, it will + /// return None. 
+ pub fn query_range(&self) -> Option { + self.extract_time_range().map(|time_range| { + time_range + .exclusive_end() + .as_i64() + .checked_sub(time_range.inclusive_start().as_i64()) + .unwrap_or(i64::MAX) + }) + } } impl Debug for QueryPlan { From 94673bd18585ce3f5478a07c735558a31eed301a Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Thu, 4 Jan 2024 16:24:26 +0800 Subject: [PATCH 28/38] chore: fix error message (#1412) ## Rationale ## Detailed Changes - Attach endpoint to remote error ## Test Plan CI --- .github/workflows/ci.yml | 1 + .../env/cluster/ddl/partition_table.result | 3 +- .../cases/env/cluster/ddl/partition_table.sql | 1 + .../src/table_manipulator/meta_based.rs | 2 +- remote_engine_client/src/client.rs | 36 ++++++++++++++----- remote_engine_client/src/lib.rs | 5 ++- 6 files changed, 36 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 13b4773a34..25ea726dfb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,6 +20,7 @@ on: push: branches: - main + - dev paths-ignore: - 'docs/**' - 'etc/**' diff --git a/integration_tests/cases/env/cluster/ddl/partition_table.result b/integration_tests/cases/env/cluster/ddl/partition_table.result index e8feacbcbb..5ae20b6418 100644 --- a/integration_tests/cases/env/cluster/ddl/partition_table.result +++ b/integration_tests/cases/env/cluster/ddl/partition_table.result @@ -99,9 +99,10 @@ ALTER TABLE partition_table_t ADD COLUMN (b string); affected_rows: 0 +-- SQLNESS REPLACE endpoint:(.*?), endpoint:xx, INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, "ceresdb0", 100); -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute insert, err:Failed to write table, err:Failed to write tables in batch, tables:[\"__partition_table_t_1\"], err:Failed to query from table in server, table_idents:[TableIdentifier { catalog: \"ceresdb\", schema: \"public\", table: \"__partition_table_t_1\" }], code:401, msg:failed to decode row group payload. Caused by: Schema mismatch with the write request, msg:expect 6 columns, but got 5. sql:INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, \"ceresdb0\", 100);" }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute insert, err:Failed to write table, err:Failed to write tables in batch, tables:[\"__partition_table_t_1\"], err:Failed to query from table in server, table_idents:[TableIdentifier { catalog: \"ceresdb\", schema: \"public\", table: \"__partition_table_t_1\" }], endpoint:xx, code:401, msg:failed to decode row group payload. Caused by: Schema mismatch with the write request, msg:expect 6 columns, but got 5. 
sql:INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, \"ceresdb0\", 100);" }) ALTER TABLE partition_table_t MODIFY SETTING enable_ttl='true'; diff --git a/integration_tests/cases/env/cluster/ddl/partition_table.sql b/integration_tests/cases/env/cluster/ddl/partition_table.sql index 46be8e1b69..855e2c9a6e 100644 --- a/integration_tests/cases/env/cluster/ddl/partition_table.sql +++ b/integration_tests/cases/env/cluster/ddl/partition_table.sql @@ -46,6 +46,7 @@ EXPLAIN ANALYZE SELECT * from partition_table_t where name in ("ceresdb0", "cere ALTER TABLE partition_table_t ADD COLUMN (b string); +-- SQLNESS REPLACE endpoint:(.*?), endpoint:xx, INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, "ceresdb0", 100); ALTER TABLE partition_table_t MODIFY SETTING enable_ttl='true'; diff --git a/interpreters/src/table_manipulator/meta_based.rs b/interpreters/src/table_manipulator/meta_based.rs index bdecfd3e34..2e6f8e69bb 100644 --- a/interpreters/src/table_manipulator/meta_based.rs +++ b/interpreters/src/table_manipulator/meta_based.rs @@ -126,7 +126,7 @@ impl TableManipulator for TableManipulatorImpl { .await .box_err() .context(DropWithCause { - msg: format!("failed to create table by meta client, req:{req:?}"), + msg: format!("failed to drop table by meta client, req:{req:?}"), })?; info!( diff --git a/remote_engine_client/src/client.rs b/remote_engine_client/src/client.rs index 456de91413..291c01d568 100644 --- a/remote_engine_client/src/client.rs +++ b/remote_engine_client/src/client.rs @@ -37,7 +37,7 @@ use common_types::{record_batch::RecordBatch, schema::RecordSchema}; use futures::{Stream, StreamExt}; use generic_error::BoxError; use logger::{error, info}; -use router::RouterRef; +use router::{endpoint::Endpoint, RouterRef}; use runtime::Runtime; use snafu::{ensure, OptionExt, ResultExt}; use table_engine::{ @@ -120,6 +120,7 @@ impl Client { // evict cache entry. let response = response.into_inner(); let remote_read_record_batch_stream = ClientReadRecordBatchStream::new( + route_context.endpoint, table_ident, response, record_schema, @@ -135,6 +136,7 @@ impl Client { // Write to remote. let table_ident = request.table.clone(); + let endpoint = route_context.endpoint.clone(); let request_pb = request.convert_into_pb().box_err().context(Convert { msg: "Failed to convert WriteRequest to pb", })?; @@ -152,6 +154,7 @@ impl Client { let response = response.into_inner(); if let Some(header) = &response.header && !status_code::is_ok(header.code) { Server { + endpoint, table_idents: vec![table_ident.clone()], code: header.code, msg: header.error.clone(), @@ -187,9 +190,9 @@ impl Client { } // Merge according to endpoint. - let mut remote_writes = Vec::with_capacity(write_batch_contexts_by_endpoint.len()); + let mut write_handles = Vec::with_capacity(write_batch_contexts_by_endpoint.len()); let mut written_tables = Vec::with_capacity(write_batch_contexts_by_endpoint.len()); - for (_, context) in write_batch_contexts_by_endpoint { + for (endpoint, context) in write_batch_contexts_by_endpoint { // Write to remote. 
let WriteBatchContext { table_idents, @@ -204,18 +207,18 @@ impl Client { rpc_client .write_batch(Request::new(batch_request_pb)) .await + .map(|v| (v, endpoint.clone())) .box_err() }); - remote_writes.push(handle); + write_handles.push(handle); written_tables.push(table_idents); } - let mut results = Vec::with_capacity(remote_writes.len()); - for (table_idents, remote_write) in written_tables.into_iter().zip(remote_writes) { - let batch_result = remote_write.await; + let mut results = Vec::with_capacity(write_handles.len()); + for (table_idents, handle) in written_tables.into_iter().zip(write_handles) { // If it's runtime error, don't evict entires from route cache. - let batch_result = match batch_result.box_err() { + let batch_result = match handle.await.box_err() { Ok(result) => result, Err(e) => { results.push(WriteBatchResult { @@ -227,10 +230,12 @@ impl Client { }; // Check remote write result then. - let result = batch_result.and_then(|response| { + let result = batch_result.and_then(|result| { + let (response, endpoint) = result; let response = response.into_inner(); if let Some(header) = &response.header && !status_code::is_ok(header.code) { Server { + endpoint, table_idents: table_idents.clone(), code: header.code, msg: header.error.clone(), @@ -260,6 +265,7 @@ impl Client { let route_context = self.cached_router.route(&request.table_ident).await?; let table_ident = request.table_ident.clone(); + let endpoint = route_context.endpoint.clone(); let request_pb: ceresdbproto::remote_engine::AlterTableSchemaRequest = request.into(); let mut rpc_client = RemoteEngineServiceClient::::new(route_context.channel); @@ -279,6 +285,7 @@ impl Client { let response = response.into_inner(); if let Some(header) = &response.header && !status_code::is_ok(header.code) { Server { + endpoint:endpoint.clone(), table_idents: vec![table_ident.clone()], code: header.code, msg: header.error.clone(), @@ -318,6 +325,7 @@ impl Client { let route_context = self.cached_router.route(&request.table_ident).await?; let table_ident = request.table_ident.clone(); + let endpoint = route_context.endpoint.clone(); let request_pb: ceresdbproto::remote_engine::AlterTableOptionsRequest = request.into(); let mut rpc_client = RemoteEngineServiceClient::::new(route_context.channel); @@ -336,6 +344,7 @@ impl Client { let response = response.into_inner(); if let Some(header) = &response.header && !status_code::is_ok(header.code) { Server { + endpoint:endpoint.clone(), table_idents: vec![table_ident.clone()], code: header.code, msg: header.error.clone(), @@ -371,6 +380,7 @@ impl Client { // Find the channel from router firstly. let route_context = self.cached_router.route(&request.table).await?; let table_ident = request.table.clone(); + let endpoint = route_context.endpoint.clone(); let request_pb = ceresdbproto::remote_engine::GetTableInfoRequest::try_from(request) .box_err() .context(Convert { @@ -391,6 +401,7 @@ impl Client { let response = response.into_inner(); if let Some(header) = &response.header && !status_code::is_ok(header.code) { Server { + endpoint:endpoint.clone(), table_idents: vec![table_ident.clone()], code: header.code, msg: header.error.clone(), @@ -403,6 +414,7 @@ impl Client { match result { Ok(response) => { let table_info = response.table_info.context(Server { + endpoint: endpoint.clone(), table_idents: vec![table_ident.clone()], code: status_code::StatusCode::Internal.as_u32(), msg: "Table info is empty", @@ -423,6 +435,7 @@ impl Client { msg: "Failed to covert table schema", })? 
.with_context(|| Server { + endpoint, table_idents: vec![table_ident], code: status_code::StatusCode::Internal.as_u32(), msg: "Table schema is empty", @@ -490,6 +503,7 @@ impl Client { // evict cache entry. let response = response.into_inner(); let remote_execute_plan_stream = ClientReadRecordBatchStream::new( + route_context.endpoint, table_ident, response, plan_schema, @@ -509,6 +523,7 @@ impl Client { } pub struct ClientReadRecordBatchStream { + endpoint: Endpoint, pub table_ident: TableIdentifier, pub response_stream: Streaming, pub record_schema: RecordSchema, @@ -517,12 +532,14 @@ pub struct ClientReadRecordBatchStream { impl ClientReadRecordBatchStream { pub fn new( + endpoint: Endpoint, table_ident: TableIdentifier, response_stream: Streaming, record_schema: RecordSchema, remote_metrics: Arc>>, ) -> Self { Self { + endpoint, table_ident, response_stream, record_schema, @@ -541,6 +558,7 @@ impl Stream for ClientReadRecordBatchStream { // Check header. if let Some(header) = response.header && !status_code::is_ok(header.code) { return Poll::Ready(Some(Server { + endpoint: this.endpoint.clone(), table_idents: vec![this.table_ident.clone()], code: header.code, msg: header.error, diff --git a/remote_engine_client/src/lib.rs b/remote_engine_client/src/lib.rs index c88026f3d6..055f408313 100644 --- a/remote_engine_client/src/lib.rs +++ b/remote_engine_client/src/lib.rs @@ -54,6 +54,7 @@ use self::client::{Client, ClientReadRecordBatchStream}; pub mod error { use generic_error::GenericError; use macros::define_result; + use router::endpoint::Endpoint; use snafu::{Backtrace, Snafu}; use table_engine::remote::model::TableIdentifier; @@ -93,12 +94,14 @@ pub mod error { }, #[snafu(display( - "Failed to query from table in server, table_idents:{:?}, code:{}, msg:{}", + "Failed to query from table in server, table_idents:{:?}, endpoint:{}, code:{}, msg:{}", table_idents, + endpoint.to_string(), code, msg ))] Server { + endpoint: Endpoint, table_idents: Vec, code: u32, msg: String, From 5377dfd5d078ec394d9401703ee5b62bf19fd94d Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Thu, 4 Jan 2024 17:04:17 +0800 Subject: [PATCH 29/38] feat: try load page indexes (#1425) ## Rationale See #1040 ## Detailed Changes - Try load page indexes ## Test Plan CI --- .../src/sst/parquet/async_reader.rs | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index be98479619..0b7ffdc273 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -37,7 +37,7 @@ use datafusion::{ }; use futures::{Stream, StreamExt}; use generic_error::{BoxError, GenericResult}; -use logger::{debug, error}; +use logger::{debug, error, warn}; use object_store::{ObjectStoreRef, Path}; use parquet::{ arrow::{arrow_reader::RowSelection, ParquetRecordBatchStreamBuilder, ProjectionMask}, @@ -397,7 +397,27 @@ impl<'a> Reader<'a> { file_path: self.path.to_string(), })?; - // TODO: Support page index until https://github.com/CeresDB/ceresdb/issues/1040 is fixed. 
+ let mut parquet_meta_data = Arc::new(parquet_meta_data); + let object_store_reader = parquet_ext::reader::ObjectStoreReader::new( + self.store.clone(), + self.path.clone(), + parquet_meta_data.clone(), + ); + + if let Ok(meta_data) = parquet_ext::meta_data::meta_with_page_indexes(object_store_reader) + .await + .map_err(|e| { + // When loading page indexes failed, we just log the error and continue querying + // TODO: Fix this in stream. https://github.com/apache/incubator-horaedb/issues/1040 + warn!( + "Fail to load page indexes, path:{}, err:{:?}.", + self.path, e + ); + e + }) + { + parquet_meta_data = meta_data; + } MetaData::try_new(&parquet_meta_data, ignore_sst_filter, self.store.clone()) .await From dc10253f31d7bac3ab46c44424584ebd2fb99b8f Mon Sep 17 00:00:00 2001 From: CooooolFrog Date: Mon, 8 Jan 2024 09:57:15 +0800 Subject: [PATCH 30/38] feat: add table status check (#1418) ## Rationale Refer to this issue https://github.com/apache/incubator-horaedb/issues/1386, currently, if the status of the shard is abnormal, we cannot get any valid exception information from the error message `table not found`. ## Detailed Changes * Add `TableStatus` in `cluster`, you can use it to get the status of the table in the current cluster.. * Add `SchemaWithCluster`, It wraps the schema inside the cluster, through which the state of the cluster and schema can be combined. ## Test Plan Pass CI. --- catalog/src/schema.rs | 3 + catalog_impls/src/cluster_based.rs | 115 +++++++++++++++++++++++++++++ catalog_impls/src/lib.rs | 1 + catalog_impls/src/volatile.rs | 20 ++++- cluster/src/cluster_impl.rs | 21 +++++- cluster/src/lib.rs | 24 +++++- cluster/src/shard_set.rs | 10 +++ meta_client/src/types.rs | 5 ++ router/src/cluster_based.rs | 5 ++ src/ceresdb/src/setup.rs | 7 +- 10 files changed, 202 insertions(+), 9 deletions(-) create mode 100644 catalog_impls/src/cluster_based.rs diff --git a/catalog/src/schema.rs b/catalog/src/schema.rs index 51fb7f82d2..c3997ca495 100644 --- a/catalog/src/schema.rs +++ b/catalog/src/schema.rs @@ -181,6 +181,9 @@ pub enum Error { table: String, backtrace: Backtrace, }, + + #[snafu(display("Table is not ready, err:{}", source))] + TableNotReady { source: GenericError }, } define_result!(Error); diff --git a/catalog_impls/src/cluster_based.rs b/catalog_impls/src/cluster_based.rs new file mode 100644 index 0000000000..650d201957 --- /dev/null +++ b/catalog_impls/src/cluster_based.rs @@ -0,0 +1,115 @@ +// Copyright 2023 The HoraeDB Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use async_trait::async_trait; +use catalog::{ + schema, + schema::{ + CreateOptions, CreateTableRequest, DropOptions, DropTableRequest, NameRef, Schema, + SchemaRef, TableNotReady, + }, +}; +use cluster::{ClusterRef, TableStatus}; +use generic_error::BoxError; +use snafu::{ResultExt, Snafu}; +use table_engine::table::{SchemaId, TableRef}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Invalid table status, status:{:?}", status))] + InvalidTableStatus { status: TableStatus }, +} + +/// A cluster-based implementation for [`schema`]. + +/// Schema with cluster. +/// It binds cluster and schema and will detect the health status of the cluster +/// when calling the schema interface. +pub(crate) struct SchemaWithCluster { + internal: SchemaRef, + + cluster: ClusterRef, +} + +impl SchemaWithCluster { + pub(crate) fn new(internal: SchemaRef, cluster: ClusterRef) -> SchemaWithCluster { + SchemaWithCluster { internal, cluster } + } + + // Get table status, return None when table not found in shard. + fn table_status(&self, table_name: NameRef) -> Option { + self.cluster.get_table_status(self.name(), table_name) + } +} + +#[async_trait] +impl Schema for SchemaWithCluster { + fn name(&self) -> NameRef { + self.internal.name() + } + + fn id(&self) -> SchemaId { + self.internal.id() + } + + fn table_by_name(&self, name: NameRef) -> schema::Result> { + let find_table_result = self.internal.table_by_name(name)?; + + if find_table_result.is_none() { + return match self.table_status(name) { + // Table not found in schema and shard not contains this table. + None => Ok(None), + // Table not found in schema but shard contains this table. + // Check the status of the shard. + Some(table_status) => InvalidTableStatus { + status: table_status, + } + .fail() + .box_err() + .with_context(|| TableNotReady {})?, + }; + } + + Ok(find_table_result) + } + + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> schema::Result { + self.internal.create_table(request, opts).await + } + + async fn drop_table( + &self, + request: DropTableRequest, + opts: DropOptions, + ) -> schema::Result { + self.internal.drop_table(request, opts).await + } + + fn all_tables(&self) -> schema::Result> { + self.internal.all_tables() + } + + fn register_table(&self, table: TableRef) { + self.internal.register_table(table) + } + + fn unregister_table(&self, table_name: &str) { + self.internal.unregister_table(table_name) + } +} diff --git a/catalog_impls/src/lib.rs b/catalog_impls/src/lib.rs index 2abbda9528..90edc1b118 100644 --- a/catalog_impls/src/lib.rs +++ b/catalog_impls/src/lib.rs @@ -24,6 +24,7 @@ use system_catalog::{tables::Tables, SystemTableAdapter}; use crate::system_tables::{SystemTables, SystemTablesBuilder}; +mod cluster_based; mod system_tables; pub mod table_based; pub mod volatile; diff --git a/catalog_impls/src/volatile.rs b/catalog_impls/src/volatile.rs index e217d157fa..a2aa7b9973 100644 --- a/catalog_impls/src/volatile.rs +++ b/catalog_impls/src/volatile.rs @@ -32,7 +32,7 @@ use catalog::{ }, Catalog, CatalogRef, CreateSchemaWithCause, }; -use cluster::shard_set::ShardSet; +use cluster::{shard_set::ShardSet, ClusterRef}; use common_types::schema::SchemaName; use generic_error::BoxError; use logger::{debug, info}; @@ -41,19 +41,23 @@ use snafu::{ensure, OptionExt, ResultExt}; use table_engine::table::{SchemaId, TableRef}; use tokio::sync::Mutex; +use crate::cluster_based::SchemaWithCluster; + /// ManagerImpl manages multiple volatile 
catalogs. pub struct ManagerImpl { catalogs: HashMap>, shard_set: ShardSet, meta_client: MetaClientRef, + cluster: ClusterRef, } impl ManagerImpl { - pub fn new(shard_set: ShardSet, meta_client: MetaClientRef) -> Self { + pub fn new(shard_set: ShardSet, meta_client: MetaClientRef, cluster: ClusterRef) -> Self { let mut manager = ManagerImpl { catalogs: HashMap::new(), shard_set, meta_client, + cluster, }; manager.maybe_create_default_catalog(); @@ -101,6 +105,7 @@ impl ManagerImpl { schemas: RwLock::new(HashMap::new()), shard_set: self.shard_set.clone(), meta_client: self.meta_client.clone(), + cluster: self.cluster.clone(), }); self.catalogs.insert(catalog_name, catalog.clone()); @@ -121,6 +126,7 @@ struct CatalogImpl { schemas: RwLock>, shard_set: ShardSet, meta_client: MetaClientRef, + cluster: ClusterRef, } #[async_trait] @@ -171,7 +177,10 @@ impl Catalog for CatalogImpl { self.shard_set.clone(), )); - schemas.insert(name.to_string(), schema); + let cluster_based: SchemaRef = + Arc::new(SchemaWithCluster::new(schema, self.cluster.clone())); + + schemas.insert(name.to_string(), cluster_based); info!( "create schema success, catalog:{}, schema:{}", @@ -282,7 +291,10 @@ impl Schema for SchemaImpl { } fn table_by_name(&self, name: NameRef) -> schema::Result> { - let table = self.tables.read().unwrap().get(name).cloned(); + let table = self + .get_table(self.catalog_name.as_str(), self.schema_name.as_str(), name) + .unwrap() + .clone(); Ok(table) } diff --git a/cluster/src/cluster_impl.rs b/cluster/src/cluster_impl.rs index 6804081a4e..3ec73a00c8 100644 --- a/cluster/src/cluster_impl.rs +++ b/cluster/src/cluster_impl.rs @@ -44,7 +44,7 @@ use crate::{ topology::ClusterTopology, Cluster, ClusterNodesNotFound, ClusterNodesResp, EtcdClientFailureWithCause, InitEtcdClientConfig, InvalidArguments, MetaClientFailure, OpenShard, OpenShardWithCause, - Result, ShardNotFound, + Result, ShardNotFound, TableStatus, }; /// ClusterImpl is an implementation of [`Cluster`] based [`MetaClient`]. @@ -311,6 +311,19 @@ impl Inner { self.shard_set.get(shard_id) } + /// Get shard by table name. + /// + /// This method is similar to `route_tables`, but it will not send request + /// to meta server, it only load data from local cache. + /// If target table is not found in any shards in this cluster, return None. + /// Otherwise, return the shard where this table is exists. 
+ fn get_shard_by_table_name(&self, schema_name: &str, table_name: &str) -> Option { + let shards = self.shard_set.all_shards(); + shards + .into_iter() + .find(|shard| shard.find_table(schema_name, table_name).is_some()) + } + fn close_shard(&self, shard_id: ShardId) -> Result { info!("Remove shard from shard_set, id:{shard_id}"); self.shard_set @@ -368,6 +381,12 @@ impl Cluster for ClusterImpl { self.inner.shard(shard_id) } + fn get_table_status(&self, schema_name: &str, table_name: &str) -> Option { + self.inner + .get_shard_by_table_name(schema_name, table_name) + .map(|shard| TableStatus::from(shard.get_status())) + } + async fn close_shard(&self, shard_id: ShardId) -> Result { self.inner.close_shard(shard_id) } diff --git a/cluster/src/lib.rs b/cluster/src/lib.rs index be7374d7e0..f13265290d 100644 --- a/cluster/src/lib.rs +++ b/cluster/src/lib.rs @@ -29,7 +29,8 @@ use common_types::schema::SchemaName; use generic_error::GenericError; use macros::define_result; use meta_client::types::{ - ClusterNodesRef, RouteTablesRequest, RouteTablesResponse, ShardId, ShardInfo, ShardVersion, + ClusterNodesRef, RouteTablesRequest, RouteTablesResponse, ShardId, ShardInfo, ShardStatus, + ShardVersion, }; use shard_lock_manager::ShardLockManagerRef; use snafu::{Backtrace, Snafu}; @@ -161,6 +162,23 @@ pub enum Error { define_result!(Error); +#[derive(Debug)] +pub enum TableStatus { + Ready, + Recovering, + Frozen, +} + +impl From for TableStatus { + fn from(value: ShardStatus) -> Self { + match value { + ShardStatus::Init | ShardStatus::Opening => TableStatus::Recovering, + ShardStatus::Ready => TableStatus::Ready, + ShardStatus::Frozen => TableStatus::Frozen, + } + } +} + pub type ClusterRef = Arc; #[derive(Clone, Debug)] @@ -184,12 +202,14 @@ pub trait Cluster { /// None. fn shard(&self, shard_id: ShardId) -> Option; + fn get_table_status(&self, schema_name: &str, table_name: &str) -> Option; + /// Close shard. /// /// Return error if the shard is not found. async fn close_shard(&self, shard_id: ShardId) -> Result; - /// list shards + /// list loaded shards in current node. 
fn list_shards(&self) -> Vec; async fn route_tables(&self, req: &RouteTablesRequest) -> Result; diff --git a/cluster/src/shard_set.rs b/cluster/src/shard_set.rs index b815c6047f..00bbb9eff1 100644 --- a/cluster/src/shard_set.rs +++ b/cluster/src/shard_set.rs @@ -132,11 +132,21 @@ impl Shard { ret } + pub fn get_status(&self) -> ShardStatus { + let data = self.data.read().unwrap(); + data.shard_info.status.clone() + } + pub fn is_opened(&self) -> bool { let data = self.data.read().unwrap(); data.is_opened() } + pub fn is_frozen(&self) -> bool { + let data = self.data.read().unwrap(); + data.is_frozen() + } + pub async fn close(&self, ctx: CloseContext) -> Result<()> { let operator = self.operator.lock().await; operator.close(ctx).await diff --git a/meta_client/src/types.rs b/meta_client/src/types.rs index f822428ed5..f23e8c547c 100644 --- a/meta_client/src/types.rs +++ b/meta_client/src/types.rs @@ -226,6 +226,11 @@ impl ShardInfo { pub fn is_opened(&self) -> bool { matches!(self.status, ShardStatus::Ready | ShardStatus::Frozen) } + + #[inline] + pub fn is_ready(&self) -> bool { + matches!(self.status, ShardStatus::Ready) + } } #[derive(Debug, Default, Copy, Clone, Eq, PartialEq, Serialize)] diff --git a/router/src/cluster_based.rs b/router/src/cluster_based.rs index 83d2b266b4..04f30c8409 100644 --- a/router/src/cluster_based.rs +++ b/router/src/cluster_based.rs @@ -201,6 +201,7 @@ mod tests { use ceresdbproto::storage::{RequestContext, RouteRequest as RouteRequestPb}; use cluster::{ shard_lock_manager::ShardLockManagerRef, shard_set::ShardRef, Cluster, ClusterNodesResp, + TableStatus, }; use common_types::table::ShardId; use meta_client::types::{ @@ -230,6 +231,10 @@ mod tests { unimplemented!(); } + fn get_table_status(&self, _: &str, _: &str) -> Option { + unimplemented!() + } + async fn close_shard(&self, _: ShardId) -> cluster::Result { unimplemented!(); } diff --git a/src/ceresdb/src/setup.rs b/src/ceresdb/src/setup.rs index a7ae6a3160..d6d4e770fe 100644 --- a/src/ceresdb/src/setup.rs +++ b/src/ceresdb/src/setup.rs @@ -334,8 +334,11 @@ async fn build_with_meta( }; let engine_proxy = build_table_engine_proxy(engine_builder).await; - let meta_based_manager_ref = - Arc::new(volatile::ManagerImpl::new(shard_set, meta_client.clone())); + let meta_based_manager_ref = Arc::new(volatile::ManagerImpl::new( + shard_set, + meta_client.clone(), + cluster.clone(), + )); // Build catalog manager. let catalog_manager = Arc::new(CatalogManagerImpl::new(meta_based_manager_ref)); From 53df95ac507d7658b7e6abb44820a49cc62bf6d4 Mon Sep 17 00:00:00 2001 From: kamille <34352236+Rachelint@users.noreply.github.com> Date: Mon, 8 Jan 2024 15:41:20 +0800 Subject: [PATCH 31/38] feat: impl layered memtable to reduce duplicated encode during scan (#1271) ## Rationale Conversion from row format in memtable to record batch in datafusion has been found a cpu bottleneck in production. For reduce the cpu cost, I impl the layered memtable framework to support gradually conversion during normal write path(and before flush). ## Detailed Changes + Impl layered memtable framework + Integrate it into the write path. ## Test Plan Test by new ut and it. 
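To make the layering idea concrete, here is a minimal, self-contained Rust sketch of the switch-on-threshold behaviour described above. The names (`Row`, `ImmutableSegment`, `LayeredMemTableSketch`) and the byte-size accounting are illustrative assumptions, not the engine's APIs; the real `LayeredMemTable` in the diff below freezes a mutable segment into Arrow record batches through its scan path and tracks memory via the memtable arena.

```rust
/// A row is just raw bytes in this sketch; the engine has its own `Row` type.
type Row = Vec<u8>;

/// A frozen, read-only segment. In the engine this holds Arrow `RecordBatch`es
/// that were already converted from rows when the segment was switched.
struct ImmutableSegment {
    rows: Vec<Row>,
}

struct LayeredMemTableSketch {
    mutable: Vec<Row>,
    mutable_bytes: usize,
    immutables: Vec<ImmutableSegment>,
    /// Freeze the mutable segment once it grows past this many bytes
    /// (analogous to `mutable_segment_switch_threshold`).
    switch_threshold: usize,
}

impl LayeredMemTableSketch {
    fn new(switch_threshold: usize) -> Self {
        Self {
            mutable: Vec::new(),
            mutable_bytes: 0,
            immutables: Vec::new(),
            switch_threshold,
        }
    }

    /// Write path: append to the mutable segment, and freeze it when it grows
    /// past the threshold so the row-to-columnar conversion happens gradually
    /// instead of all at once during flush.
    fn put(&mut self, row: Row) {
        self.mutable_bytes += row.len();
        self.mutable.push(row);
        if self.mutable_bytes > self.switch_threshold {
            let frozen = std::mem::take(&mut self.mutable);
            self.mutable_bytes = 0;
            // The real implementation scans `frozen` and stores the resulting
            // record batches; here we just keep the rows as-is.
            self.immutables.push(ImmutableSegment { rows: frozen });
        }
    }

    /// Read path: chain the already-frozen segments with the current mutable one.
    fn scan(&self) -> impl Iterator<Item = &Row> {
        self.immutables
            .iter()
            .flat_map(|seg| seg.rows.iter())
            .chain(self.mutable.iter())
    }
}

fn main() {
    let mut table = LayeredMemTableSketch::new(16);
    for i in 0..10u8 {
        table.put(vec![i; 4]);
    }
    // 10 rows of 4 bytes each trip the 16-byte threshold twice.
    assert_eq!(table.immutables.len(), 2);
    assert_eq!(table.scan().count(), 10);
}
```

The point of the threshold is amortization: readers consume the pre-converted immutable segments cheaply and only the small mutable tail still needs row-to-columnar conversion at query time.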
--- Cargo.lock | 32 +- Cargo.toml | 2 +- .../src/instance/flush_compaction.rs | 30 +- analytic_engine/src/lib.rs | 6 + .../src/memtable/columnar/factory.rs | 5 +- analytic_engine/src/memtable/factory.rs | 9 +- .../src/memtable/layered/factory.rs | 51 ++ analytic_engine/src/memtable/layered/iter.rs | 120 +++ analytic_engine/src/memtable/layered/mod.rs | 729 ++++++++++++++++++ analytic_engine/src/memtable/mod.rs | 73 +- .../src/memtable/skiplist/factory.rs | 4 +- analytic_engine/src/memtable/skiplist/mod.rs | 133 ++-- analytic_engine/src/memtable/test_util.rs | 42 + .../src/row_iter/record_batch_stream.rs | 1 + .../src/sst/parquet/async_reader.rs | 19 +- analytic_engine/src/table/data.rs | 49 +- analytic_engine/src/table_meta_set_impl.rs | 1 + analytic_engine/src/table_options.rs | 62 +- benchmarks/src/scan_memtable_bench.rs | 6 +- common_types/src/lib.rs | 1 + common_types/src/projected_schema.rs | 2 +- common_types/src/record_batch.rs | 27 +- common_types/src/time.rs | 7 + system_catalog/src/sys_catalog_table.rs | 5 + 24 files changed, 1281 insertions(+), 135 deletions(-) create mode 100644 analytic_engine/src/memtable/layered/factory.rs create mode 100644 analytic_engine/src/memtable/layered/iter.rs create mode 100644 analytic_engine/src/memtable/layered/mod.rs create mode 100644 analytic_engine/src/memtable/test_util.rs diff --git a/Cargo.lock b/Cargo.lock index b1f6b823ce..088009fa3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,7 +96,7 @@ dependencies = [ "atomic_enum", "base64 0.13.1", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "codec", "common_types", "datafusion", @@ -1345,7 +1345,7 @@ dependencies = [ [[package]] name = "ceresdbproto" version = "1.0.23" -source = "git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc#cfdacccebb7c609cb1aac791b73ba9a838d7ade6" +source = "git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32#cb36a32b827b3afe88ca04420f7fd21518f15293" dependencies = [ "prost", "protoc-bin-vendored", @@ -1528,7 +1528,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "common_types", "etcd-client", "future_ext", @@ -1606,7 +1606,7 @@ dependencies = [ "arrow 43.0.0", "arrow_ext", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "chrono", "datafusion", "hash_ext", @@ -2362,7 +2362,7 @@ dependencies = [ "async-recursion", "async-trait", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "common_types", "datafusion", "datafusion-proto", @@ -3921,7 +3921,7 @@ name = "meta_client" version = "1.2.6-alpha" dependencies = [ "async-trait", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "common_types", "futures 0.3.28", "generic_error", @@ -4446,7 +4446,7 @@ version = "1.2.6-alpha" dependencies = [ "async-trait", "bytes", - "ceresdbproto 1.0.23 
(git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "chrono", "clru", "crc", @@ -5323,7 +5323,7 @@ dependencies = [ "async-trait", "bytes", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "clru", "cluster", "common_types", @@ -5451,7 +5451,7 @@ dependencies = [ "arrow 43.0.0", "async-trait", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "chrono", "cluster", "codec", @@ -5765,7 +5765,7 @@ version = "1.2.6-alpha" dependencies = [ "arrow_ext", "async-trait", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "common_types", "futures 0.3.28", "generic_error", @@ -5894,7 +5894,7 @@ name = "router" version = "1.2.6-alpha" dependencies = [ "async-trait", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "cluster", "common_types", "generic_error", @@ -6269,7 +6269,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "clru", "cluster", "common_types", @@ -6795,7 +6795,7 @@ dependencies = [ "async-trait", "bytes_ext", "catalog", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "codec", "common_types", "futures 0.3.28", @@ -6817,7 +6817,7 @@ dependencies = [ "arrow_ext", "async-trait", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "common_types", "datafusion", "datafusion-proto", @@ -7020,7 +7020,7 @@ dependencies = [ name = "time_ext" version = "1.2.6-alpha" dependencies = [ - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "chrono", "common_types", "macros", @@ -7672,7 +7672,7 @@ version = "1.2.6-alpha" dependencies = [ "async-trait", "bytes_ext", - "ceresdbproto 1.0.23 (git+https://github.com/CeresDB/horaedbproto.git?rev=cfdaccc)", + "ceresdbproto 1.0.23 (git+https://github.com/apache/incubator-horaedb-proto.git?rev=cb36a32)", "chrono", "codec", "common_types", diff --git a/Cargo.toml b/Cargo.toml index 1588c237de..e20984f616 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,7 +94,7 @@ bytes = "1" bytes_ext = { path = "components/bytes_ext" } catalog = { path = "catalog" } catalog_impls = { path = "catalog_impls" } -ceresdbproto = { git = "https://github.com/CeresDB/horaedbproto.git", rev = "cfdaccc" } +ceresdbproto = { git = "https://github.com/apache/incubator-horaedb-proto.git", rev = "cb36a32" } codec = { path = "components/codec" } chrono = "0.4" clap = "3.0" diff --git a/analytic_engine/src/instance/flush_compaction.rs 
b/analytic_engine/src/instance/flush_compaction.rs index 880eb10ca9..b051fb04b4 100644 --- a/analytic_engine/src/instance/flush_compaction.rs +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -63,7 +63,7 @@ use crate::{ factory::{self, ColumnStats, ScanOptions, SstWriteOptions}, file::{FileMeta, Level}, meta_data::{SstMetaData, SstMetaReader}, - writer::{MetaData, RecordBatchStream}, + writer::MetaData, }, table::{ data::{self, TableData, TableDataRef}, @@ -367,7 +367,7 @@ impl FlushTask { let mut last_sequence = table_data.last_sequence(); // Switch (freeze) all mutable memtables. And update segment duration if // suggestion is returned. - let mut need_reorder = false; + let mut need_reorder = table_data.enable_layered_memtable; if let Some(suggest_segment_duration) = current_version.suggest_duration() { info!( "Update segment duration, table:{}, table_id:{}, segment_duration:{:?}", @@ -483,7 +483,9 @@ impl FlushTask { } } for mem in &mems_to_flush.memtables { - let file = self.dump_normal_memtable(request_id.clone(), mem).await?; + let file = self + .dump_normal_memtable(request_id.clone(), mem, need_reorder) + .await?; if let Some(file) = file { let sst_size = file.size; files_to_level0.push(AddFile { @@ -728,6 +730,7 @@ impl FlushTask { &self, request_id: RequestId, memtable_state: &MemTableState, + need_reorder: bool, ) -> Result> { let (min_key, max_key) = match (memtable_state.mem.min_key(), memtable_state.mem.max_key()) { @@ -778,8 +781,24 @@ impl FlushTask { let iter = build_mem_table_iter(memtable_state.mem.clone(), &self.table_data)?; - let record_batch_stream: RecordBatchStream = - Box::new(stream::iter(iter).map_err(|e| Box::new(e) as _)); + let record_batch_stream = if need_reorder { + let schema = self.table_data.schema(); + let primary_key_indexes = schema.primary_key_indexes().to_vec(); + let reorder = Reorder { + iter, + schema, + order_by_col_indexes: primary_key_indexes, + }; + Box::new( + reorder + .into_stream() + .await + .context(ReorderMemIter)? + .map(|batch| batch.box_err()), + ) as _ + } else { + Box::new(stream::iter(iter).map(|batch| batch.box_err())) as _ + }; let sst_info = writer .write(request_id, &sst_meta, record_batch_stream) @@ -1231,6 +1250,7 @@ fn build_mem_table_iter( need_dedup: table_data.dedup(), reverse: false, metrics_collector: None, + time_range: TimeRange::min_to_max(), }; memtable .scan(scan_ctx, scan_req) diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs index 68f796837e..43515a6e7a 100644 --- a/analytic_engine/src/lib.rs +++ b/analytic_engine/src/lib.rs @@ -83,6 +83,11 @@ pub struct Config { /// The ratio of table's write buffer size to trigger preflush, and it /// should be in the range (0, 1]. pub preflush_write_buffer_size_ratio: f32, + + /// The threshold to trigger switching mutable segment of memtable. + /// If it is zero, disable the layered memtable. 
+ pub mutable_segment_switch_threshold: ReadableSize, + pub enable_primary_key_sampling: bool, // Iterator scanning options @@ -200,6 +205,7 @@ impl Default for Config { remote_engine_client: remote_engine_client::config::Config::default(), recover_mode: RecoverMode::TableBased, metrics: MetricsOptions::default(), + mutable_segment_switch_threshold: ReadableSize::mb(3), } } } diff --git a/analytic_engine/src/memtable/columnar/factory.rs b/analytic_engine/src/memtable/columnar/factory.rs index 1e8f0604b3..6ce4f99643 100644 --- a/analytic_engine/src/memtable/columnar/factory.rs +++ b/analytic_engine/src/memtable/columnar/factory.rs @@ -24,10 +24,9 @@ use std::{ use crate::memtable::{ columnar::ColumnarMemTable, - factory::{Factory, Options, Result}, - MemTableRef, + factory::{Factory, Options}, + MemTableRef, Result, }; - /// Factory to create memtable #[derive(Debug)] pub struct ColumnarMemTableFactory; diff --git a/analytic_engine/src/memtable/factory.rs b/analytic_engine/src/memtable/factory.rs index 97a7bd4371..b6952e6ac5 100644 --- a/analytic_engine/src/memtable/factory.rs +++ b/analytic_engine/src/memtable/factory.rs @@ -18,15 +18,8 @@ use std::{fmt, sync::Arc}; use arena::CollectorRef; use common_types::{schema::Schema, SequenceNumber}; -use macros::define_result; -use snafu::Snafu; -use crate::memtable::MemTableRef; - -#[derive(Debug, Snafu)] -pub enum Error {} - -define_result!(Error); +use crate::memtable::{MemTableRef, Result}; /// MemTable options #[derive(Clone)] diff --git a/analytic_engine/src/memtable/layered/factory.rs b/analytic_engine/src/memtable/layered/factory.rs new file mode 100644 index 0000000000..03c793b9c5 --- /dev/null +++ b/analytic_engine/src/memtable/layered/factory.rs @@ -0,0 +1,51 @@ +// Copyright 2023 The HoraeDB Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
Skiplist memtable factory + +use std::sync::Arc; + +use crate::memtable::{ + factory::{Factory, FactoryRef, Options}, + layered::LayeredMemTable, + MemTableRef, Result, +}; + +/// Factory to create memtable +#[derive(Debug)] +pub struct LayeredMemtableFactory { + inner_memtable_factory: FactoryRef, + mutable_switch_threshold: usize, +} + +impl LayeredMemtableFactory { + pub fn new(inner_memtable_factory: FactoryRef, mutable_switch_threshold: usize) -> Self { + Self { + inner_memtable_factory, + mutable_switch_threshold, + } + } +} + +impl Factory for LayeredMemtableFactory { + fn create_memtable(&self, opts: Options) -> Result { + let memtable = LayeredMemTable::new( + &opts, + self.inner_memtable_factory.clone(), + self.mutable_switch_threshold, + )?; + + Ok(Arc::new(memtable)) + } +} diff --git a/analytic_engine/src/memtable/layered/iter.rs b/analytic_engine/src/memtable/layered/iter.rs new file mode 100644 index 0000000000..6e1f303083 --- /dev/null +++ b/analytic_engine/src/memtable/layered/iter.rs @@ -0,0 +1,120 @@ +// Copyright 2023 The HoraeDB Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Skiplist memtable iterator + +use common_types::{record_batch::FetchedRecordBatch, schema::Schema, time::TimeRange}; +use generic_error::BoxError; +use snafu::ResultExt; + +use crate::memtable::{ + layered::{ImmutableSegment, MutableSegment}, + ColumnarIterPtr, Internal, ProjectSchema, Result, ScanContext, ScanRequest, +}; + +/// Columnar iterator for [LayeredMemTable] +pub(crate) struct ColumnarIterImpl { + selected_batch_iter: ColumnarIterPtr, +} + +impl ColumnarIterImpl { + pub fn new( + memtable_schema: &Schema, + ctx: ScanContext, + request: ScanRequest, + mutable: &MutableSegment, + immutables: &[ImmutableSegment], + ) -> Result { + // Create projection for the memtable schema + let row_projector = request + .row_projector_builder + .build(memtable_schema) + .context(ProjectSchema)?; + + let (maybe_mutable, selected_immutables) = + Self::filter_by_time_range(mutable, immutables, request.time_range); + + let immutable_batches = selected_immutables + .flat_map(|imm| { + imm.record_batches().iter().map(|batch| { + // TODO: reduce clone here. 
+ let fetched_schema = row_projector.fetched_schema().clone(); + let primary_key_indexes = row_projector + .primary_key_indexes() + .map(|idxs| idxs.to_vec()); + let fetched_column_indexes = row_projector.fetched_source_column_indexes(); + FetchedRecordBatch::try_new( + fetched_schema, + primary_key_indexes, + fetched_column_indexes, + batch.clone(), + ) + .box_err() + .with_context(|| Internal { + msg: format!("row_projector:{row_projector:?}",), + }) + }) + }) + .collect::>(); + let immutable_iter = immutable_batches.into_iter(); + + let maybe_mutable_iter = match maybe_mutable { + Some(mutable) => Some(mutable.scan(ctx, request)?), + None => None, + }; + + let maybe_chained_iter = match maybe_mutable_iter { + Some(mutable_iter) => Box::new(mutable_iter.chain(immutable_iter)) as _, + None => Box::new(immutable_iter) as _, + }; + + Ok(Self { + selected_batch_iter: maybe_chained_iter, + }) + } + + fn filter_by_time_range<'a>( + mutable: &'a MutableSegment, + immutables: &'a [ImmutableSegment], + time_range: TimeRange, + ) -> ( + Option<&'a MutableSegment>, + impl Iterator, + ) { + let maybe_mutable = { + let mutable_time_range = mutable.time_range(); + mutable_time_range.and_then(|range| { + if range.intersect_with(time_range) { + Some(mutable) + } else { + None + } + }) + }; + + let selected_immutables = immutables + .iter() + .filter(move |imm| imm.time_range().intersect_with(time_range)); + + (maybe_mutable, selected_immutables) + } +} + +impl Iterator for ColumnarIterImpl { + type Item = Result; + + fn next(&mut self) -> Option { + self.selected_batch_iter.next() + } +} diff --git a/analytic_engine/src/memtable/layered/mod.rs b/analytic_engine/src/memtable/layered/mod.rs new file mode 100644 index 0000000000..92087e1e10 --- /dev/null +++ b/analytic_engine/src/memtable/layered/mod.rs @@ -0,0 +1,729 @@ +// Copyright 2023 The HoraeDB Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! MemTable based on skiplist + +pub mod factory; +pub mod iter; + +use std::{ + mem, + ops::{Bound, Deref}, + sync::{ + atomic::{self, AtomicU64}, + RwLock, + }, +}; + +use arena::CollectorRef; +use arrow::record_batch::RecordBatch as ArrowRecordBatch; +use bytes_ext::Bytes; +use common_types::{ + projected_schema::RowProjectorBuilder, row::Row, schema::Schema, time::TimeRange, + SequenceNumber, +}; +use generic_error::BoxError; +use logger::debug; +use skiplist::{BytewiseComparator, KeyComparator}; +use snafu::{OptionExt, ResultExt}; + +use crate::memtable::{ + factory::{FactoryRef, Options}, + key::KeySequence, + layered::iter::ColumnarIterImpl, + ColumnarIterPtr, Internal, InternalNoCause, MemTable, MemTableRef, Metrics as MemtableMetrics, + PutContext, Result, ScanContext, ScanRequest, +}; + +/// MemTable implementation based on skiplist +pub(crate) struct LayeredMemTable { + /// Schema of this memtable, is immutable. + schema: Schema, + + /// The last sequence of the rows in this memtable. Update to this field + /// require external synchronization. 
+ last_sequence: AtomicU64, + + inner: RwLock, + + mutable_switch_threshold: usize, +} + +impl LayeredMemTable { + pub fn new( + opts: &Options, + inner_memtable_factory: FactoryRef, + mutable_switch_threshold: usize, + ) -> Result { + let inner = Inner::new(inner_memtable_factory, opts)?; + + Ok(Self { + schema: opts.schema.clone(), + last_sequence: AtomicU64::new(opts.creation_sequence), + inner: RwLock::new(inner), + mutable_switch_threshold, + }) + } + + // Used for testing only + #[cfg(test)] + fn force_switch_mutable_segment(&self) -> Result<()> { + let inner = &mut *self.inner.write().unwrap(); + inner.switch_mutable_segment(self.schema.clone()) + } +} + +impl MemTable for LayeredMemTable { + fn schema(&self) -> &Schema { + &self.schema + } + + fn min_key(&self) -> Option { + self.inner.read().unwrap().min_key() + } + + fn max_key(&self) -> Option { + self.inner.read().unwrap().max_key() + } + + fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + ) -> Result<()> { + let memory_usage = { + let inner = self.inner.read().unwrap(); + inner.put(ctx, sequence, row, schema)?; + inner.mutable_segment.0.approximate_memory_usage() + }; + + if memory_usage > self.mutable_switch_threshold { + debug!( + "LayeredMemTable put, memory_usage:{memory_usage}, mutable_switch_threshold:{}", + self.mutable_switch_threshold + ); + let inner = &mut *self.inner.write().unwrap(); + inner.switch_mutable_segment(self.schema.clone())?; + } + + Ok(()) + } + + fn scan(&self, ctx: ScanContext, request: ScanRequest) -> Result { + let inner = self.inner.read().unwrap(); + inner.scan(&self.schema, ctx, request) + } + + fn approximate_memory_usage(&self) -> usize { + self.inner.read().unwrap().approximate_memory_usage() + } + + fn set_last_sequence(&self, sequence: SequenceNumber) -> Result<()> { + self.last_sequence + .store(sequence, atomic::Ordering::Relaxed); + Ok(()) + } + + fn last_sequence(&self) -> SequenceNumber { + self.last_sequence.load(atomic::Ordering::Relaxed) + } + + fn time_range(&self) -> Option { + let inner = self.inner.read().unwrap(); + inner.time_range() + } + + fn metrics(&self) -> MemtableMetrics { + // FIXME: stats and return metrics + MemtableMetrics::default() + } +} + +/// Layered memtable inner +struct Inner { + mutable_segment_builder: MutableSegmentBuilder, + mutable_segment: MutableSegment, + immutable_segments: Vec, +} + +impl Inner { + fn new(memtable_factory: FactoryRef, opts: &Options) -> Result { + let builder_opts = MutableBuilderOptions { + schema: opts.schema.clone(), + arena_block_size: opts.arena_block_size, + collector: opts.collector.clone(), + }; + let mutable_segment_builder = MutableSegmentBuilder::new(memtable_factory, builder_opts); + + // Build the first mutable batch. + let init_mutable_segment = mutable_segment_builder.build()?; + + Ok(Self { + mutable_segment_builder, + mutable_segment: init_mutable_segment, + immutable_segments: vec![], + }) + } + + /// Scan batches including `mutable` and `immutable`s. 
+ #[inline] + fn scan( + &self, + schema: &Schema, + ctx: ScanContext, + request: ScanRequest, + ) -> Result { + let iter = ColumnarIterImpl::new( + schema, + ctx, + request, + &self.mutable_segment, + &self.immutable_segments, + )?; + Ok(Box::new(iter)) + } + + #[inline] + fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + ) -> Result<()> { + self.mutable_segment.put(ctx, sequence, row, schema) + } + + fn switch_mutable_segment(&mut self, schema: Schema) -> Result<()> { + let imm_num = self.immutable_segments.len(); + debug!("LayeredMemTable switch_mutable_segment, imm_num:{imm_num}"); + + // Build a new mutable segment, and replace current's. + let new_mutable = self.mutable_segment_builder.build()?; + let current_mutable = mem::replace(&mut self.mutable_segment, new_mutable); + let fetched_schema = schema.to_record_schema(); + + // Convert current's to immutable. + let scan_ctx = ScanContext::default(); + let row_projector_builder = RowProjectorBuilder::new(fetched_schema, schema, None); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: common_types::MAX_SEQUENCE_NUMBER, + need_dedup: false, + reverse: false, + metrics_collector: None, + time_range: TimeRange::min_to_max(), + row_projector_builder, + }; + + let immutable_batches = current_mutable + .scan(scan_ctx, scan_req)? + .map(|batch_res| batch_res.map(|batch| batch.into_arrow_record_batch())) + .collect::>>()?; + + let time_range = current_mutable.time_range().context(InternalNoCause { + msg: "failed to get time range from mutable segment", + })?; + let max_key = current_mutable.max_key().context(InternalNoCause { + msg: "failed to get max key from mutable segment", + })?; + let min_key = current_mutable.min_key().context(InternalNoCause { + msg: "failed to get min key from mutable segment", + })?; + let immutable = ImmutableSegment::new(immutable_batches, time_range, min_key, max_key); + + self.immutable_segments.push(immutable); + + Ok(()) + } + + pub fn min_key(&self) -> Option { + let comparator = BytewiseComparator; + + let mutable_min_key = self.mutable_segment.min_key(); + + let immutable_min_key = if self.immutable_segments.is_empty() { + None + } else { + let mut min_key = self.immutable_segments.first().unwrap().min_key(); + let mut imm_iter = self.immutable_segments.iter(); + let _ = imm_iter.next(); + for imm in imm_iter { + if let std::cmp::Ordering::Greater = comparator.compare_key(&min_key, &imm.min_key) + { + min_key = imm.min_key(); + } + } + + Some(min_key) + }; + + match (mutable_min_key, immutable_min_key) { + (None, None) => None, + (None, Some(key)) | (Some(key), None) => Some(key), + (Some(key1), Some(key2)) => Some(match comparator.compare_key(&key1, &key2) { + std::cmp::Ordering::Greater => key2, + std::cmp::Ordering::Less | std::cmp::Ordering::Equal => key1, + }), + } + } + + pub fn max_key(&self) -> Option { + let comparator = BytewiseComparator; + + let mutable_max_key = self.mutable_segment.max_key(); + + let immutable_max_key = if self.immutable_segments.is_empty() { + None + } else { + let mut max_key = self.immutable_segments.first().unwrap().max_key(); + let mut imm_iter = self.immutable_segments.iter(); + let _ = imm_iter.next(); + for imm in imm_iter { + if let std::cmp::Ordering::Less = comparator.compare_key(&max_key, &imm.max_key) { + max_key = imm.max_key(); + } + } + + Some(max_key) + }; + + match (mutable_max_key, immutable_max_key) { + (None, None) => None, + (None, Some(key)) | 
(Some(key), None) => Some(key), + (Some(key1), Some(key2)) => Some(match comparator.compare_key(&key1, &key2) { + std::cmp::Ordering::Less => key2, + std::cmp::Ordering::Greater | std::cmp::Ordering::Equal => key1, + }), + } + } + + pub fn time_range(&self) -> Option { + let mutable_time_range = self.mutable_segment.time_range(); + + let immutable_time_range = if self.immutable_segments.is_empty() { + None + } else { + let mut time_range = self.immutable_segments.first().unwrap().time_range(); + let mut imm_iter = self.immutable_segments.iter(); + let _ = imm_iter.next(); + for imm in imm_iter { + time_range = time_range.merge_range(imm.time_range()); + } + + Some(time_range) + }; + + match (mutable_time_range, immutable_time_range) { + (None, None) => None, + (None, Some(range)) | (Some(range), None) => Some(range), + (Some(range1), Some(range2)) => Some(range1.merge_range(range2)), + } + } + + fn approximate_memory_usage(&self) -> usize { + let mutable_mem_usage = self.mutable_segment.approximate_memory_usage(); + + let immutable_mem_usage = self + .immutable_segments + .iter() + .map(|imm| imm.approximate_memory_usage()) + .sum::(); + + mutable_mem_usage + immutable_mem_usage + } +} + +/// Mutable batch +pub(crate) struct MutableSegment(MemTableRef); + +impl Deref for MutableSegment { + type Target = MemTableRef; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// Builder for `MutableBatch` +struct MutableSegmentBuilder { + memtable_factory: FactoryRef, + opts: MutableBuilderOptions, +} + +impl MutableSegmentBuilder { + fn new(memtable_factory: FactoryRef, opts: MutableBuilderOptions) -> Self { + Self { + memtable_factory, + opts, + } + } + + fn build(&self) -> Result { + let memtable_opts = Options { + schema: self.opts.schema.clone(), + arena_block_size: self.opts.arena_block_size, + // `creation_sequence` is meaningless in inner memtable, just set it to min. + creation_sequence: SequenceNumber::MIN, + collector: self.opts.collector.clone(), + }; + + let memtable = self + .memtable_factory + .create_memtable(memtable_opts) + .box_err() + .context(Internal { + msg: "failed to build mutable segment", + })?; + + Ok(MutableSegment(memtable)) + } +} + +struct MutableBuilderOptions { + pub schema: Schema, + + /// Block size of arena in bytes. + pub arena_block_size: u32, + + /// Memory usage collector + pub collector: CollectorRef, +} + +/// Immutable batch +pub(crate) struct ImmutableSegment { + /// Record batch converted from `MutableBatch` + record_batches: Vec, + + /// Min time of source `MutableBatch` + time_range: TimeRange, + + /// Min key of source `MutableBatch` + min_key: Bytes, + + /// Max key of source `MutableBatch` + max_key: Bytes, + + approximate_memory_size: usize, +} + +impl ImmutableSegment { + fn new( + record_batches: Vec, + time_range: TimeRange, + min_key: Bytes, + max_key: Bytes, + ) -> Self { + let approximate_memory_size = record_batches + .iter() + .map(|batch| batch.get_array_memory_size()) + .sum(); + + Self { + record_batches, + time_range, + min_key, + max_key, + approximate_memory_size, + } + } + + pub fn time_range(&self) -> TimeRange { + self.time_range + } + + pub fn min_key(&self) -> Bytes { + self.min_key.clone() + } + + pub fn max_key(&self) -> Bytes { + self.max_key.clone() + } + + // TODO: maybe return a iterator? 
+ pub fn record_batches(&self) -> &[ArrowRecordBatch] { + &self.record_batches + } + + pub fn approximate_memory_usage(&self) -> usize { + self.approximate_memory_size + } +} + +#[cfg(test)] +mod tests { + + use std::{ops::Bound, sync::Arc}; + + use arena::NoopCollector; + use bytes_ext::ByteVec; + use codec::{memcomparable::MemComparable, Encoder}; + use common_types::{ + datum::Datum, + projected_schema::{ProjectedSchema, RowProjectorBuilder}, + record_batch::FetchedRecordBatch, + row::Row, + schema::IndexInWriterSchema, + tests::{build_row, build_schema}, + }; + + use super::*; + use crate::memtable::{ + factory::Options, + key::ComparableInternalKey, + skiplist::factory::SkiplistMemTableFactory, + test_util::{TestMemtableBuilder, TestUtil}, + MemTableRef, + }; + + struct TestMemtableBuilderImpl; + + impl TestMemtableBuilder for TestMemtableBuilderImpl { + fn build(&self, data: &[(KeySequence, Row)]) -> MemTableRef { + let schema = build_schema(); + let factory = SkiplistMemTableFactory; + let opts = Options { + schema: schema.clone(), + arena_block_size: 512, + creation_sequence: 1, + collector: Arc::new(NoopCollector {}), + }; + let memtable = LayeredMemTable::new(&opts, Arc::new(factory), usize::MAX).unwrap(); + + let mut ctx = + PutContext::new(IndexInWriterSchema::for_same_schema(schema.num_columns())); + let partitioned_data = data.chunks(3).collect::>(); + let chunk_num = partitioned_data.len(); + + for chunk in partitioned_data.iter().take(chunk_num - 1) { + for (seq, row) in *chunk { + memtable.put(&mut ctx, *seq, row, &schema).unwrap(); + } + memtable.force_switch_mutable_segment().unwrap(); + } + + let last_chunk = partitioned_data[chunk_num - 1]; + for (seq, row) in last_chunk { + memtable.put(&mut ctx, *seq, row, &schema).unwrap(); + } + + Arc::new(memtable) + } + } + + fn test_data() -> Vec<(KeySequence, Row)> { + vec![ + ( + KeySequence::new(1, 1), + build_row(b"a", 1, 10.0, "v1", 1000, 1_000_000), + ), + ( + KeySequence::new(1, 2), + build_row(b"b", 2, 10.0, "v2", 2000, 2_000_000), + ), + ( + KeySequence::new(1, 4), + build_row(b"c", 3, 10.0, "v3", 3000, 3_000_000), + ), + ( + KeySequence::new(2, 1), + build_row(b"d", 4, 10.0, "v4", 4000, 4_000_000), + ), + ( + KeySequence::new(2, 1), + build_row(b"e", 5, 10.0, "v5", 5000, 5_000_000), + ), + ( + KeySequence::new(2, 3), + build_row(b"f", 6, 10.0, "v6", 6000, 6_000_000), + ), + ( + KeySequence::new(3, 4), + build_row(b"g", 7, 10.0, "v7", 7000, 7_000_000), + ), + ] + } + + #[test] + fn test_memtable_scan() { + let builder = TestMemtableBuilderImpl; + let data = test_data(); + let test_util = TestUtil::new(builder, data); + let memtable = test_util.memtable(); + let schema = memtable.schema().clone(); + + // No projection. + let projection = (0..schema.num_columns()).collect::>(); + let expected = test_util.data(); + test_memtable_scan_internal( + schema.clone(), + projection, + TimeRange::min_to_max(), + memtable.clone(), + expected, + ); + + // Projection to first three. + let projection = vec![0, 1, 3]; + let expected = test_util + .data() + .iter() + .map(|row| { + let datums = vec![row[0].clone(), row[1].clone(), row[3].clone()]; + Row::from_datums(datums) + }) + .collect(); + test_memtable_scan_internal( + schema.clone(), + projection, + TimeRange::min_to_max(), + memtable.clone(), + expected, + ); + + // No projection. 
+ let projection = (0..schema.num_columns()).collect::>(); + let time_range = TimeRange::new(2.into(), 7.into()).unwrap(); + // Memtable data after switching may be like(just showing timestamp column using + // to filter): [1, 2, 3], [4, 5, 6], [7] + // + // And the target time range is: [2, 7) + // + // So the filter result should be: [1, 2, 3], [4, 5, 6] + let expected = test_util + .data() + .iter() + .enumerate() + .filter_map(|(idx, row)| if idx < 6 { Some(row.clone()) } else { None }) + .collect(); + test_memtable_scan_internal( + schema.clone(), + projection, + time_range, + memtable.clone(), + expected, + ); + } + + #[test] + fn test_time_range() { + let builder = TestMemtableBuilderImpl; + let data = test_data(); + let test_util = TestUtil::new(builder, data); + let memtable = test_util.memtable(); + + assert_eq!(TimeRange::new(1.into(), 8.into()), memtable.time_range()); + } + + #[test] + fn test_min_max_key() { + let builder = TestMemtableBuilderImpl; + let data = test_data(); + let test_util = TestUtil::new(builder, data.clone()); + let memtable = test_util.memtable(); + let schema = memtable.schema(); + + // Min key + let key_encoder = ComparableInternalKey::new(data[0].0, schema); + let mut min_key = Vec::new(); + min_key.reserve(key_encoder.estimate_encoded_size(&data[0].1)); + key_encoder.encode(&mut min_key, &data[0].1).unwrap(); + let key_encoder = ComparableInternalKey::new(data[0].0, schema); + let mut min_key = Vec::new(); + min_key.reserve(key_encoder.estimate_encoded_size(&data[0].1)); + key_encoder.encode(&mut min_key, &data[0].1).unwrap(); + + // Max key + let key_encoder = ComparableInternalKey::new(data[6].0, schema); + let mut max_key = Vec::new(); + max_key.reserve(key_encoder.estimate_encoded_size(&data[6].1)); + key_encoder.encode(&mut max_key, &data[6].1).unwrap(); + let key_encoder = ComparableInternalKey::new(data[6].0, schema); + let mut max_key = Vec::new(); + max_key.reserve(key_encoder.estimate_encoded_size(&data[6].1)); + key_encoder.encode(&mut max_key, &data[6].1).unwrap(); + + assert_eq!(min_key, memtable.min_key().unwrap().to_vec()); + assert_eq!(max_key, memtable.max_key().unwrap().to_vec()); + } + + fn test_memtable_scan_internal( + schema: Schema, + projection: Vec, + time_range: TimeRange, + memtable: Arc, + expected: Vec, + ) { + let projected_schema = ProjectedSchema::new(schema, Some(projection)).unwrap(); + let fetched_schema = projected_schema.to_record_schema(); + let table_schema = projected_schema.table_schema(); + let row_projector_builder = + RowProjectorBuilder::new(fetched_schema, table_schema.clone(), None); + + // limited by sequence + let scan_request = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: SequenceNumber::MAX, + row_projector_builder, + need_dedup: false, + reverse: false, + metrics_collector: None, + time_range, + }; + let scan_ctx = ScanContext::default(); + let iter = memtable.scan(scan_ctx, scan_request).unwrap(); + check_iterator(iter, expected); + } + + fn check_iterator>>( + iter: T, + expected_rows: Vec, + ) { + // sort it first. 
+ let mut rows = Vec::new(); + for batch in iter { + let batch = batch.unwrap(); + for row_idx in 0..batch.num_rows() { + rows.push(batch.clone_row_at(row_idx)); + } + } + + rows.sort_by(|a, b| { + let key1 = build_scan_key( + &String::from_utf8_lossy(a[0].as_varbinary().unwrap()), + a[1].as_timestamp().unwrap().as_i64(), + ); + let key2 = build_scan_key( + &String::from_utf8_lossy(b[0].as_varbinary().unwrap()), + b[1].as_timestamp().unwrap().as_i64(), + ); + BytewiseComparator.compare_key(&key1, &key2) + }); + + assert_eq!(rows, expected_rows); + } + + fn build_scan_key(c1: &str, c2: i64) -> Bytes { + let mut buf = ByteVec::new(); + let encoder = MemComparable; + encoder.encode(&mut buf, &Datum::from(c1)).unwrap(); + encoder.encode(&mut buf, &Datum::from(c2)).unwrap(); + + Bytes::from(buf) + } +} diff --git a/analytic_engine/src/memtable/mod.rs b/analytic_engine/src/memtable/mod.rs index ed3b20d348..906936bb0c 100644 --- a/analytic_engine/src/memtable/mod.rs +++ b/analytic_engine/src/memtable/mod.rs @@ -17,24 +17,28 @@ pub mod columnar; pub mod factory; pub mod key; +pub mod layered; mod reversed_iter; pub mod skiplist; +pub mod test_util; -use std::{ops::Bound, sync::Arc, time::Instant}; +use std::{collections::HashMap, ops::Bound, sync::Arc, time::Instant}; use bytes_ext::{ByteVec, Bytes}; +use ceresdbproto::manifest; use common_types::{ projected_schema::RowProjectorBuilder, record_batch::FetchedRecordBatch, row::Row, schema::{IndexInWriterSchema, Schema}, time::TimeRange, - SequenceNumber, + SequenceNumber, MUTABLE_SEGMENT_SWITCH_THRESHOLD, }; -use generic_error::GenericError; +use generic_error::{BoxError, GenericError}; use macros::define_result; use serde::{Deserialize, Serialize}; -use snafu::{Backtrace, Snafu}; +use size_ext::ReadableSize; +use snafu::{Backtrace, ResultExt, Snafu}; use trace_metric::MetricsCollector; use crate::memtable::key::KeySequence; @@ -46,13 +50,13 @@ const MEMTABLE_TYPE_COLUMNAR: &str = "columnar"; #[derive(Debug, Clone, Deserialize, Eq, PartialEq, Serialize)] pub enum MemtableType { SkipList, - Columnar, + Column, } impl MemtableType { pub fn parse_from(s: &str) -> Self { if s.eq_ignore_ascii_case(MEMTABLE_TYPE_COLUMNAR) { - MemtableType::Columnar + MemtableType::Column } else { MemtableType::SkipList } @@ -63,7 +67,54 @@ impl ToString for MemtableType { fn to_string(&self) -> String { match self { MemtableType::SkipList => MEMTABLE_TYPE_SKIPLIST.to_string(), - MemtableType::Columnar => MEMTABLE_TYPE_COLUMNAR.to_string(), + MemtableType::Column => MEMTABLE_TYPE_COLUMNAR.to_string(), + } + } +} + +/// Layered memtable options +/// If `mutable_segment_switch_threshold` is set zero, layered memtable will be +/// disable. 
+#[derive(Debug, Clone, Deserialize, PartialEq, Serialize)] +#[serde(default)] +pub struct LayeredMemtableOptions { + pub mutable_segment_switch_threshold: ReadableSize, +} + +impl Default for LayeredMemtableOptions { + fn default() -> Self { + Self { + mutable_segment_switch_threshold: ReadableSize::mb(3), + } + } +} + +impl LayeredMemtableOptions { + pub fn parse_from(opts: &HashMap) -> Result { + let mut options = LayeredMemtableOptions::default(); + if let Some(v) = opts.get(MUTABLE_SEGMENT_SWITCH_THRESHOLD) { + let threshold = v.parse::().box_err().context(Internal { + msg: format!("invalid mutable segment switch threshold:{v}"), + })?; + options.mutable_segment_switch_threshold = ReadableSize(threshold); + } + + Ok(options) + } +} + +impl From for LayeredMemtableOptions { + fn from(value: manifest::LayeredMemtableOptions) -> Self { + Self { + mutable_segment_switch_threshold: ReadableSize(value.mutable_segment_switch_threshold), + } + } +} + +impl From for manifest::LayeredMemtableOptions { + fn from(value: LayeredMemtableOptions) -> Self { + Self { + mutable_segment_switch_threshold: value.mutable_segment_switch_threshold.0, } } } @@ -147,6 +198,11 @@ pub enum Error { max: usize, backtrace: Backtrace, }, + #[snafu(display("Factory err, msg:{msg}, err:{source}"))] + Factory { msg: String, source: GenericError }, + + #[snafu(display("Factory err, msg:{msg}.\nBacktrace:\n{backtrace}"))] + FactoryNoCause { msg: String, backtrace: Backtrace }, } pub const TOO_LARGE_MESSAGE: &str = "Memtable key length is too large"; @@ -208,6 +264,7 @@ pub struct ScanRequest { pub reverse: bool, /// Collector for scan metrics. pub metrics_collector: Option, + pub time_range: TimeRange, } /// In memory storage for table's data. @@ -277,7 +334,7 @@ pub trait MemTable { fn metrics(&self) -> Metrics; } -#[derive(Debug)] +#[derive(Debug, Default)] pub struct Metrics { /// Size of original rows. 
pub row_raw_size: usize, diff --git a/analytic_engine/src/memtable/skiplist/factory.rs b/analytic_engine/src/memtable/skiplist/factory.rs index 1e6e8b1ddc..3f90bcdebe 100644 --- a/analytic_engine/src/memtable/skiplist/factory.rs +++ b/analytic_engine/src/memtable/skiplist/factory.rs @@ -20,9 +20,9 @@ use arena::MonoIncArena; use skiplist::{BytewiseComparator, Skiplist}; use crate::memtable::{ - factory::{Factory, Options, Result}, + factory::{Factory, Options}, skiplist::SkiplistMemTable, - MemTableRef, + MemTableRef, Result, }; /// Factory to create memtable diff --git a/analytic_engine/src/memtable/skiplist/mod.rs b/analytic_engine/src/memtable/skiplist/mod.rs index a71a82a612..c0a47cf3b1 100644 --- a/analytic_engine/src/memtable/skiplist/mod.rs +++ b/analytic_engine/src/memtable/skiplist/mod.rs @@ -286,8 +286,35 @@ mod tests { use crate::memtable::{ factory::{Factory, Options}, skiplist::factory::SkiplistMemTableFactory, + test_util::{TestMemtableBuilder, TestUtil}, + MemTableRef, }; + struct TestMemtableBuilderImpl; + + impl TestMemtableBuilder for TestMemtableBuilderImpl { + fn build(&self, data: &[(KeySequence, Row)]) -> MemTableRef { + let schema = build_schema(); + let factory = SkiplistMemTableFactory; + let memtable = factory + .create_memtable(Options { + schema: schema.clone(), + arena_block_size: 512, + creation_sequence: 1, + collector: Arc::new(NoopCollector {}), + }) + .unwrap(); + + let mut ctx = + PutContext::new(IndexInWriterSchema::for_same_schema(schema.num_columns())); + for (seq, row) in data { + memtable.put(&mut ctx, *seq, row, &schema).unwrap(); + } + + memtable + } + } + fn test_memtable_scan_for_scan_request( schema: Schema, memtable: Arc, @@ -309,6 +336,7 @@ mod tests { need_dedup: true, reverse: false, metrics_collector: None, + time_range: TimeRange::min_to_max(), }, vec![ build_row(b"a", 1, 10.0, "v1", 1000, 1_000_000), @@ -329,6 +357,7 @@ mod tests { need_dedup: true, reverse: false, metrics_collector: None, + time_range: TimeRange::min_to_max(), }, vec![ build_row(b"a", 1, 10.0, "v1", 1000, 1_000_000), @@ -348,6 +377,7 @@ mod tests { need_dedup: true, reverse: false, metrics_collector: None, + time_range: TimeRange::min_to_max(), }, vec![ build_row(b"a", 1, 10.0, "v1", 1000, 1_000_000), @@ -383,6 +413,7 @@ mod tests { need_dedup: true, reverse: false, metrics_collector: None, + time_range: TimeRange::min_to_max(), }, vec![ build_row_for_two_column(b"a", 1), @@ -401,19 +432,52 @@ mod tests { #[test] fn test_memtable_scan() { - let schema = build_schema(); - let factory = SkiplistMemTableFactory; - let memtable = factory - .create_memtable(Options { - schema: schema.clone(), - arena_block_size: 512, - creation_sequence: 1, - collector: Arc::new(NoopCollector {}), - }) - .unwrap(); - - let mut ctx = PutContext::new(IndexInWriterSchema::for_same_schema(schema.num_columns())); - let input = vec![ + let data = test_data(); + let builder = TestMemtableBuilderImpl; + let test_util = TestUtil::new(builder, data); + let memtable = test_util.memtable(); + let schema = memtable.schema().clone(); + + test_memtable_scan_for_scan_request(schema.clone(), memtable.clone()); + test_memtable_scan_for_projection(schema, memtable); + } + + fn check_iterator>>( + iter: T, + expected_rows: Vec, + ) { + let mut visited_rows = 0; + for batch in iter { + let batch = batch.unwrap(); + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); + } + + fn 
build_scan_key(c1: &str, c2: i64) -> Bytes { + let mut buf = ByteVec::new(); + let encoder = MemComparable; + encoder.encode(&mut buf, &Datum::from(c1)).unwrap(); + encoder.encode(&mut buf, &Datum::from(c2)).unwrap(); + + Bytes::from(buf) + } + + pub fn build_row_for_two_column(key1: &[u8], key2: i64) -> Row { + let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + ]; + + Row::from_datums(datums) + } + + fn test_data() -> Vec<(KeySequence, Row)> { + vec![ ( KeySequence::new(1, 1), build_row(b"a", 1, 10.0, "v1", 1000, 1_000_000), @@ -453,47 +517,6 @@ mod tests { KeySequence::new(3, 4), build_row(b"g", 7, 10.0, "v7", 7000, 7_000_000), ), - ]; - - for (seq, row) in input { - memtable.put(&mut ctx, seq, &row, &schema).unwrap(); - } - - test_memtable_scan_for_scan_request(schema.clone(), memtable.clone()); - test_memtable_scan_for_projection(schema, memtable); - } - - fn check_iterator>>( - iter: T, - expected_rows: Vec, - ) { - let mut visited_rows = 0; - for batch in iter { - let batch = batch.unwrap(); - for row_idx in 0..batch.num_rows() { - assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); - visited_rows += 1; - } - } - - assert_eq!(visited_rows, expected_rows.len()); - } - - fn build_scan_key(c1: &str, c2: i64) -> Bytes { - let mut buf = ByteVec::new(); - let encoder = MemComparable; - encoder.encode(&mut buf, &Datum::from(c1)).unwrap(); - encoder.encode(&mut buf, &Datum::from(c2)).unwrap(); - - Bytes::from(buf) - } - - pub fn build_row_for_two_column(key1: &[u8], key2: i64) -> Row { - let datums = vec![ - Datum::Varbinary(Bytes::copy_from_slice(key1)), - Datum::Timestamp(Timestamp::new(key2)), - ]; - - Row::from_datums(datums) + ] } } diff --git a/analytic_engine/src/memtable/test_util.rs b/analytic_engine/src/memtable/test_util.rs new file mode 100644 index 0000000000..72364b1854 --- /dev/null +++ b/analytic_engine/src/memtable/test_util.rs @@ -0,0 +1,42 @@ +// Copyright 2023 The HoraeDB Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_types::row::Row; + +use crate::memtable::*; + +pub trait TestMemtableBuilder { + fn build(&self, data: &[(KeySequence, Row)]) -> MemTableRef; +} + +pub struct TestUtil { + memtable: MemTableRef, + data: Vec<(KeySequence, Row)>, +} + +impl TestUtil { + pub fn new(builder: B, data: Vec<(KeySequence, Row)>) -> Self { + let memtable = builder.build(&data); + + Self { memtable, data } + } + + pub fn memtable(&self) -> MemTableRef { + self.memtable.clone() + } + + pub fn data(&self) -> Vec { + self.data.iter().map(|d| d.1.clone()).collect() + } +} diff --git a/analytic_engine/src/row_iter/record_batch_stream.rs b/analytic_engine/src/row_iter/record_batch_stream.rs index dd0f4d132e..15ecc8e33a 100644 --- a/analytic_engine/src/row_iter/record_batch_stream.rs +++ b/analytic_engine/src/row_iter/record_batch_stream.rs @@ -256,6 +256,7 @@ pub fn stream_from_memtable( need_dedup: ctx.need_dedup, reverse: ctx.reverse, metrics_collector, + time_range: ctx.predicate.time_range(), }; let iter = memtable.scan(scan_ctx, scan_req).context(ScanMemtable)?; diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index 0b7ffdc273..305bc93746 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -560,10 +560,21 @@ impl Stream for RecordBatchProjector { } projector.metrics.row_num += record_batch.num_rows(); - let projected_batch = - FetchedRecordBatch::try_new(&projector.row_projector, record_batch) - .box_err() - .context(DecodeRecordBatch {}); + let fetched_schema = projector.row_projector.fetched_schema().clone(); + let primary_key_indexes = projector + .row_projector + .primary_key_indexes() + .map(|idxs| idxs.to_vec()); + let fetching_column_indexes = + projector.row_projector.target_record_projection_remapping(); + let projected_batch = FetchedRecordBatch::try_new( + fetched_schema, + primary_key_indexes, + fetching_column_indexes, + record_batch, + ) + .box_err() + .context(DecodeRecordBatch {}); Poll::Ready(Some(projected_batch)) } diff --git a/analytic_engine/src/table/data.rs b/analytic_engine/src/table/data.rs index 998c5c3b14..4d26448f5d 100644 --- a/analytic_engine/src/table/data.rs +++ b/analytic_engine/src/table/data.rs @@ -54,6 +54,7 @@ use crate::{ memtable::{ columnar::factory::ColumnarMemTableFactory, factory::{FactoryRef as MemTableFactoryRef, Options as MemTableOptions}, + layered::factory::LayeredMemtableFactory, skiplist::factory::SkiplistMemTableFactory, MemtableType, }, @@ -70,9 +71,7 @@ use crate::{ #[derive(Debug, Snafu)] pub enum Error { #[snafu(display("Failed to create memtable, err:{}", source))] - CreateMemTable { - source: crate::memtable::factory::Error, - }, + CreateMemTable { source: crate::memtable::Error }, #[snafu(display( "Failed to find or create memtable, timestamp overflow, timestamp:{:?}, duration:{:?}.\nBacktrace:\n{}", @@ -232,6 +231,9 @@ pub struct TableData { /// Whether enable primary key sampling enable_primary_key_sampling: bool, + /// Whether enable layered memtable + pub enable_layered_memtable: bool, + /// Metrics of this table pub metrics: Metrics, @@ -315,7 +317,22 @@ impl TableData { let memtable_factory: MemTableFactoryRef = match opts.memtable_type { MemtableType::SkipList => Arc::new(SkiplistMemTableFactory), - MemtableType::Columnar => Arc::new(ColumnarMemTableFactory), + MemtableType::Column => Arc::new(ColumnarMemTableFactory), + }; + + // Wrap it by `LayeredMemtable`. 
+ let mutable_segment_switch_threshold = opts + .layered_memtable_opts + .mutable_segment_switch_threshold + .0 as usize; + let enable_layered_memtable = mutable_segment_switch_threshold > 0; + let memtable_factory = if enable_layered_memtable { + Arc::new(LayeredMemtableFactory::new( + memtable_factory, + mutable_segment_switch_threshold, + )) + } else { + memtable_factory }; let purge_queue = purger.create_purge_queue(space_id, id); @@ -355,6 +372,7 @@ impl TableData { manifest_updates: AtomicUsize::new(0), manifest_snapshot_every_n_updates, enable_primary_key_sampling, + enable_layered_memtable, }) } @@ -376,7 +394,27 @@ impl TableData { metrics_opt, enable_primary_key_sampling, } = config; - let memtable_factory = Arc::new(SkiplistMemTableFactory); + + let memtable_factory: MemTableFactoryRef = match add_meta.opts.memtable_type { + MemtableType::SkipList => Arc::new(SkiplistMemTableFactory), + MemtableType::Column => Arc::new(ColumnarMemTableFactory), + }; + // Maybe wrap it by `LayeredMemtable`. + let mutable_segment_switch_threshold = add_meta + .opts + .layered_memtable_opts + .mutable_segment_switch_threshold + .0 as usize; + let enable_layered_memtable = mutable_segment_switch_threshold > 0; + let memtable_factory = if enable_layered_memtable { + Arc::new(LayeredMemtableFactory::new( + memtable_factory, + mutable_segment_switch_threshold, + )) as _ + } else { + memtable_factory as _ + }; + let purge_queue = purger.create_purge_queue(add_meta.space_id, add_meta.table_id); let current_version = TableVersion::new(mem_size_options.size_sampling_interval, purge_queue); @@ -410,6 +448,7 @@ impl TableData { manifest_updates: AtomicUsize::new(0), manifest_snapshot_every_n_updates, enable_primary_key_sampling, + enable_layered_memtable, }) } diff --git a/analytic_engine/src/table_meta_set_impl.rs b/analytic_engine/src/table_meta_set_impl.rs index 00ead80aae..fa69642b3d 100644 --- a/analytic_engine/src/table_meta_set_impl.rs +++ b/analytic_engine/src/table_meta_set_impl.rs @@ -134,6 +134,7 @@ impl TableMetaSetImpl { collector: space.mem_usage_collector.clone(), size_sampling_interval: space.mem_usage_sampling_interval, }; + let table_data = Arc::new( TableData::new( TableDesc { diff --git a/analytic_engine/src/table_options.rs b/analytic_engine/src/table_options.rs index 35b0afc63e..c33a75dbf3 100644 --- a/analytic_engine/src/table_options.rs +++ b/analytic_engine/src/table_options.rs @@ -33,7 +33,7 @@ use crate::{ compaction::{ self, CompactionStrategy, SizeTieredCompactionOptions, TimeWindowCompactionOptions, }, - memtable::MemtableType, + memtable::{LayeredMemtableOptions, MemtableType}, }; const UPDATE_MODE_OVERWRITE: &str = "OVERWRITE"; @@ -98,6 +98,7 @@ pub enum Error { backtrace ))] ParseUpdateMode { s: String, backtrace: Backtrace }, + #[snafu(display( "Failed to parse compression, name:{}.\nBacktrace:\n{}", name, @@ -134,6 +135,17 @@ pub enum Error { backtrace ))] HybridDeprecated { backtrace: Backtrace }, + + #[snafu(display( + "Failed to parse layered memtable options, err:{source}.\nBacktrace:\n{backtrace}", + ))] + ParseLayeredMemtableOptions { + source: crate::memtable::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Layered memtable options is missing.\nBacktrace:\n{backtrace}",))] + MissingLayeredMemtableOptions { backtrace: Backtrace }, } define_result!(Error); @@ -396,8 +408,11 @@ pub struct TableOptions { pub num_rows_per_row_group: usize, /// Table Compression pub compression: Compression, + /// Memtable type pub memtable_type: MemtableType, + /// Layered 
memtable options + pub layered_memtable_opts: LayeredMemtableOptions, } impl TableOptions { @@ -583,6 +598,8 @@ impl From for manifest_pb::TableOptions { ), }; + let layered_memtable_opts = opts.layered_memtable_opts.into(); + manifest_pb::TableOptions { segment_duration, enable_ttl: opts.enable_ttl, @@ -598,6 +615,7 @@ impl From for manifest_pb::TableOptions { storage_format_hint: Some(manifest_pb::StorageFormatHint::from( opts.storage_format_hint, )), + layered_memtable_options: Some(layered_memtable_opts), // TODO: persist `memtable_type` in PB. } } @@ -659,6 +677,11 @@ impl TryFrom for TableOptions { }; let storage_format_hint = opts.storage_format_hint.context(MissingStorageFormatHint)?; + let layered_memtable_opts = opts + .layered_memtable_options + .context(MissingLayeredMemtableOptions)? + .into(); + let table_opts = Self { segment_duration, enable_ttl: opts.enable_ttl, @@ -671,6 +694,7 @@ impl TryFrom for TableOptions { compression: Compression::from(compression), storage_format_hint: StorageFormatHint::try_from(storage_format_hint)?, memtable_type: MemtableType::SkipList, + layered_memtable_opts, }; Ok(table_opts) @@ -691,6 +715,7 @@ impl Default for TableOptions { compression: Compression::Zstd, storage_format_hint: StorageFormatHint::default(), memtable_type: MemtableType::SkipList, + layered_memtable_opts: LayeredMemtableOptions::default(), } } } @@ -712,54 +737,59 @@ pub fn merge_table_options_for_alter( /// The options will override the old options. fn merge_table_options( options: &HashMap, - table_old_opts: &TableOptions, + base_table_opts: &TableOptions, is_create: bool, ) -> Result { - let mut table_opts = table_old_opts.clone(); + let mut base_table_opts = base_table_opts.clone(); if is_create { if let Some(v) = options.get(SEGMENT_DURATION) { if v.is_empty() { - table_opts.segment_duration = None; + base_table_opts.segment_duration = None; } else { - table_opts.segment_duration = Some(parse_duration(v).context(ParseDuration)?); + base_table_opts.segment_duration = Some(parse_duration(v).context(ParseDuration)?); } } if let Some(v) = options.get(UPDATE_MODE) { - table_opts.update_mode = UpdateMode::parse_from(v)?; + base_table_opts.update_mode = UpdateMode::parse_from(v)?; } } if let Some(v) = options.get(TTL) { - table_opts.ttl = parse_duration(v).context(ParseDuration)?; + base_table_opts.ttl = parse_duration(v).context(ParseDuration)?; } if let Some(v) = options.get(OPTION_KEY_ENABLE_TTL) { - table_opts.enable_ttl = v.parse::().context(ParseBool)?; + base_table_opts.enable_ttl = v.parse::().context(ParseBool)?; } if let Some(v) = options.get(ARENA_BLOCK_SIZE) { let size = parse_size(v)?; - table_opts.arena_block_size = size.0 as u32; + base_table_opts.arena_block_size = size.0 as u32; } if let Some(v) = options.get(WRITE_BUFFER_SIZE) { let size = parse_size(v)?; - table_opts.write_buffer_size = size.0 as u32; + base_table_opts.write_buffer_size = size.0 as u32; } if let Some(v) = options.get(COMPACTION_STRATEGY) { - table_opts.compaction_strategy = + base_table_opts.compaction_strategy = CompactionStrategy::parse_from(v, options).context(ParseStrategy { value: v })?; } if let Some(v) = options.get(NUM_ROWS_PER_ROW_GROUP) { - table_opts.num_rows_per_row_group = v.parse().context(ParseInt)?; + base_table_opts.num_rows_per_row_group = v.parse().context(ParseInt)?; } if let Some(v) = options.get(COMPRESSION) { - table_opts.compression = Compression::parse_from(v)?; + base_table_opts.compression = Compression::parse_from(v)?; } if let Some(v) = 
options.get(STORAGE_FORMAT) { - table_opts.storage_format_hint = v.as_str().try_into()?; + base_table_opts.storage_format_hint = v.as_str().try_into()?; } if let Some(v) = options.get(MEMTABLE_TYPE) { - table_opts.memtable_type = MemtableType::parse_from(v); + base_table_opts.memtable_type = MemtableType::parse_from(v); } - Ok(table_opts) + + let layered_memtable_opts = + LayeredMemtableOptions::parse_from(options).context(ParseLayeredMemtableOptions)?; + base_table_opts.layered_memtable_opts = layered_memtable_opts; + + Ok(base_table_opts) } fn parse_size(v: &str) -> Result { diff --git a/benchmarks/src/scan_memtable_bench.rs b/benchmarks/src/scan_memtable_bench.rs index 72e09a054c..79c9378c96 100644 --- a/benchmarks/src/scan_memtable_bench.rs +++ b/benchmarks/src/scan_memtable_bench.rs @@ -25,7 +25,10 @@ use analytic_engine::{ sst::meta_data::cache::MetaCacheRef, }; use arena::NoopCollector; -use common_types::projected_schema::{ProjectedSchema, RowProjectorBuilder}; +use common_types::{ + projected_schema::{ProjectedSchema, RowProjectorBuilder}, + time::TimeRange, +}; use logger::info; use object_store::{LocalFileSystem, Path}; @@ -103,6 +106,7 @@ impl ScanMemTableBench { reverse: false, metrics_collector: None, row_projector_builder, + time_range: TimeRange::min_to_max(), }; let iter = self.memtable.scan(scan_ctx, scan_req).unwrap(); diff --git a/common_types/src/lib.rs b/common_types/src/lib.rs index 78995e1217..f54e92ae52 100644 --- a/common_types/src/lib.rs +++ b/common_types/src/lib.rs @@ -50,6 +50,7 @@ pub const UPDATE_MODE: &str = "update_mode"; pub const COMPRESSION: &str = "compression"; pub const STORAGE_FORMAT: &str = "storage_format"; pub const MEMTABLE_TYPE: &str = "memtable_type"; +pub const MUTABLE_SEGMENT_SWITCH_THRESHOLD: &str = "mutable_segment_switch_threshold"; #[cfg(any(test, feature = "test"))] pub mod tests; diff --git a/common_types/src/projected_schema.rs b/common_types/src/projected_schema.rs index d0f780d8b6..1ef458e9f9 100644 --- a/common_types/src/projected_schema.rs +++ b/common_types/src/projected_schema.rs @@ -224,7 +224,7 @@ impl RowProjector { /// The projected indexes of all columns(existed and not exist) in the /// projected source schema. 
- pub fn fetched_projected_source_column_indexes(&self) -> &[Option] { + pub fn target_record_projection_remapping(&self) -> &[Option] { &self.target_record_projection_remapping } diff --git a/common_types/src/record_batch.rs b/common_types/src/record_batch.rs index 1b7d610d8e..d4a3b24adc 100644 --- a/common_types/src/record_batch.rs +++ b/common_types/src/record_batch.rs @@ -29,7 +29,7 @@ use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; use crate::{ column_block::{cast_nanosecond_to_mills, ColumnBlock, ColumnBlockBuilder}, datum::DatumKind, - projected_schema::{ProjectedSchema, RowProjector}, + projected_schema::ProjectedSchema, row::{ contiguous::{ContiguousRow, ProjectedContiguousRow}, Row, RowViewOnBatch, @@ -279,6 +279,11 @@ impl RecordBatch { pub fn into_arrow_record_batch(self) -> ArrowRecordBatch { self.data.arrow_record_batch } + + #[inline] + pub fn into_record_batch_data(self) -> RecordBatchData { + self.data + } } impl TryFrom for RecordBatch { @@ -371,14 +376,16 @@ pub struct FetchedRecordBatch { } impl FetchedRecordBatch { - pub fn try_new(ctx: &RowProjector, arrow_record_batch: ArrowRecordBatch) -> Result { - let column_indexes = ctx.fetched_projected_source_column_indexes(); - let schema = ctx.fetched_schema().clone(); - let mut column_blocks = Vec::with_capacity(schema.num_columns()); - + pub fn try_new( + fetched_schema: RecordSchema, + primary_key_indexes: Option>, + column_indexes: &[Option], + arrow_record_batch: ArrowRecordBatch, + ) -> Result { + let mut column_blocks = Vec::with_capacity(fetched_schema.num_columns()); let num_rows = arrow_record_batch.num_rows(); let num_columns = arrow_record_batch.num_columns(); - for (col_idx_opt, col_schema) in column_indexes.iter().zip(schema.columns()) { + for (col_idx_opt, col_schema) in column_indexes.iter().zip(fetched_schema.columns()) { match col_idx_opt { Some(col_idx) => { ensure!( @@ -409,11 +416,11 @@ impl FetchedRecordBatch { } } - let data = RecordBatchData::new(schema.to_arrow_schema_ref(), column_blocks)?; + let data = RecordBatchData::new(fetched_schema.to_arrow_schema_ref(), column_blocks)?; Ok(FetchedRecordBatch { - schema, - primary_key_indexes: ctx.primary_key_indexes().map(|idxs| idxs.to_vec()), + schema: fetched_schema, + primary_key_indexes, data, }) } diff --git a/common_types/src/time.rs b/common_types/src/time.rs index 342a5509fd..14266d673d 100644 --- a/common_types/src/time.rs +++ b/common_types/src/time.rs @@ -300,6 +300,13 @@ impl TimeRange { self.exclusive_end.min(other.exclusive_end), ) } + + pub fn merge_range(&self, other: TimeRange) -> TimeRange { + TimeRange { + inclusive_start: self.inclusive_start.min(other.inclusive_start), + exclusive_end: self.exclusive_end.max(other.exclusive_end), + } + } } impl From for time_range::TimeRange { diff --git a/system_catalog/src/sys_catalog_table.rs b/system_catalog/src/sys_catalog_table.rs index 00f7b061c4..0eb8c523ca 100644 --- a/system_catalog/src/sys_catalog_table.rs +++ b/system_catalog/src/sys_catalog_table.rs @@ -309,6 +309,11 @@ impl SysCatalogTable { common_types::OPTION_KEY_ENABLE_TTL.to_string(), DEFAULT_ENABLE_TTL.to_string(), ); + // Disable layered memtable for system catalog table. 
+ options.insert( + common_types::MUTABLE_SEGMENT_SWITCH_THRESHOLD.to_string(), + 0.to_string(), + ); let params = CreateTableParams { catalog_name: consts::SYSTEM_CATALOG.to_string(), schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), From 2e900eaf6e6d3aaa954e7c70f5e723743ab699b7 Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Tue, 9 Jan 2024 14:40:19 +0800 Subject: [PATCH 32/38] chore: skip wal seq check when wal is disabled (#1430) ## Rationale ## Detailed Changes - Disable seq check when wal is disabled - Fix request id in remote query. ## Test Plan --- analytic_engine/src/instance/write.rs | 25 ++++++++++---------- server/src/grpc/remote_engine_service/mod.rs | 18 +++++++------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/analytic_engine/src/instance/write.rs b/analytic_engine/src/instance/write.rs index 2e5d60d07c..244399c54b 100644 --- a/analytic_engine/src/instance/write.rs +++ b/analytic_engine/src/instance/write.rs @@ -25,7 +25,6 @@ use codec::{ use common_types::{ row::RowGroup, schema::{IndexInWriterSchema, Schema}, - MIN_SEQUENCE_NUMBER, }; use itertools::Itertools; use logger::{debug, error, info, trace, warn}; @@ -529,17 +528,19 @@ impl<'a> Writer<'a> { e })?; - // When seq is MIN_SEQUENCE_NUMBER, it means the wal used for write is not - // normal, ignore check in this case. - // NOTE: Currently write wal will only increment seq by one, - // this may change in future. - if sequence != MIN_SEQUENCE_NUMBER && table_data.last_sequence() + 1 != sequence { - warn!( - "Sequence must be consecutive, table:{}, table_id:{}, last_sequence:{}, wal_sequence:{}", - table_data.name,table_data.id, - table_data.last_sequence(), - sequence - ); + // When wal is disabled, there is no need to do this check. + if !self.instance.disable_wal { + // NOTE: Currently write wal will only increment seq by one, + // this may change in future. 
+ let last_seq = table_data.last_sequence(); + if sequence != last_seq + 1 { + warn!( + "Sequence must be consecutive, table:{}, table_id:{}, last_sequence:{}, wal_sequence:{}", + table_data.name,table_data.id, + table_data.last_sequence(), + sequence + ); + } } debug!( diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index df201a1257..20b0b3da32 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -149,7 +149,7 @@ struct ExecutePlanMetricCollector { impl ExecutePlanMetricCollector { fn new( - request_id: String, + request_id: RequestId, query: String, slow_threshold_secs: u64, priority: Priority, @@ -157,7 +157,7 @@ impl ExecutePlanMetricCollector { Self { start: Instant::now(), query, - request_id: request_id.into(), + request_id, slow_threshold: Duration::from_secs(slow_threshold_secs), priority, } @@ -711,12 +711,10 @@ impl RemoteEngineServiceImpl { priority, ); - debug!( - "Execute remote query, ctx:{query_ctx:?}, query:{}", - &ctx.displayable_query - ); + debug!("Execute remote query, id:{}", query_ctx.request_id.as_str()); + let metric = ExecutePlanMetricCollector::new( - ctx.request_id.to_string(), + query_ctx.request_id.clone(), ctx.displayable_query, slow_threshold_secs, query_ctx.priority, @@ -775,8 +773,12 @@ impl RemoteEngineServiceImpl { ctx.timeout_ms, priority, ); + debug!( + "Execute dedupped remote query, id:{}", + query_ctx.request_id.as_str() + ); let metric = ExecutePlanMetricCollector::new( - ctx.request_id.to_string(), + query_ctx.request_id.clone(), ctx.displayable_query, slow_threshold_secs, query_ctx.priority, From 6cbf8c4a18502462bd85f14a692d161446a1ac47 Mon Sep 17 00:00:00 2001 From: Jiacai Liu Date: Tue, 9 Jan 2024 16:46:19 +0800 Subject: [PATCH 33/38] feat: update disk cache in another thread to avoid blocking normal query process (#1431) ## Rationale When there is a cache miss in disk cache, it will 1. Fetch data from remote 2. Insert data to cache, which will incur disk IO 3. Return the data for query. We can move the second step to another thread to avoid it blocking the normal query process. ## Detailed Changes - Make write disk nonblocking - Block on test explicitly, otherwise it will throw errors below > Cannot drop a runtime in a context where blocking is not allowed. This happens when a runtime is dropped from within an asynchronous context. 
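
For illustration, a minimal sketch of the idea described in the Rationale above (plain `tokio` types stand in for the engine's `RuntimeRef` and disk cache; all names here are simplified stand-ins, see the diff below for the real wiring through `engine_runtimes.io_runtime`):

```rust
// Sketch only: simplified stand-ins for DiskCacheStore and its page cache.
use std::sync::Arc;

use bytes::Bytes;
use tokio::runtime::Runtime;

struct PageCache;

impl PageCache {
    async fn insert_data(&self, _key: String, _bytes: Bytes) {
        // Persists the page to disk; this is the step that incurs disk IO.
    }
}

struct Store {
    cache: Arc<PageCache>,
    io_runtime: Arc<Runtime>,
}

impl Store {
    async fn on_cache_miss(&self, key: String) -> Bytes {
        // 1. Fetch the page from the remote store (stubbed here).
        let bytes = Bytes::from_static(b"page data");

        // 2. Write it to the disk cache on the io runtime instead of awaiting
        //    the write inline, so the query path is not blocked by disk IO.
        let cache = self.cache.clone();
        let cached = bytes.clone();
        let handle = self
            .io_runtime
            .spawn(async move { cache.insert_data(key, cached).await });
        // Tests would await `handle` so the write is observable before asserting;
        // the normal query path just detaches it.
        drop(handle);

        // 3. Return the data to the caller immediately.
        bytes
    }
}
```

Detaching the write is safe for a read-through cache: if the process dies before the page reaches disk, the only cost is a future cache miss.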
## Test Plan CI --- analytic_engine/src/setup.rs | 1 + components/object_store/src/disk_cache.rs | 699 ++++++++++++---------- 2 files changed, 388 insertions(+), 312 deletions(-) diff --git a/analytic_engine/src/setup.rs b/analytic_engine/src/setup.rs index 371fadc33a..1c55c92565 100644 --- a/analytic_engine/src/setup.rs +++ b/analytic_engine/src/setup.rs @@ -241,6 +241,7 @@ fn open_storage( opts.disk_cache_page_size.as_byte() as usize, store, opts.disk_cache_partition_bits, + engine_runtimes.io_runtime.clone(), ) .await .context(OpenObjectStore)?, diff --git a/components/object_store/src/disk_cache.rs b/components/object_store/src/disk_cache.rs index 53d537ffa6..937181a69b 100644 --- a/components/object_store/src/disk_cache.rs +++ b/components/object_store/src/disk_cache.rs @@ -31,6 +31,7 @@ use logger::{debug, warn}; use lru::LruCache; use notifier::notifier::{ExecutionGuard, RequestNotifiers}; use partitioned_lock::PartitionedMutex; +use runtime::RuntimeRef; use serde::{Deserialize, Serialize}; use snafu::{ensure, Backtrace, ResultExt, Snafu}; use time_ext; @@ -261,10 +262,10 @@ struct PageMeta { // TODO: Introduce the CRC for integration check. } -#[derive(Debug)] +#[derive(Debug, Clone)] struct DiskCache { root_dir: String, - meta_cache: PartitionedMutex, + meta_cache: Arc>, } #[derive(Debug, Clone)] @@ -292,7 +293,11 @@ impl DiskCache { Ok(Self { root_dir, - meta_cache: PartitionedMutex::try_new(init_lru, partition_bits, SeaHasherBuilder {})?, + meta_cache: Arc::new(PartitionedMutex::try_new( + init_lru, + partition_bits, + SeaHasherBuilder {}, + )?), }) } @@ -509,6 +514,7 @@ pub struct DiskCacheStore { meta_cache: PartitionedMutex, SeaHasherBuilder>, underlying_store: Arc, request_notifiers: Arc>>>, + runtime: RuntimeRef, } impl DiskCacheStore { @@ -518,6 +524,7 @@ impl DiskCacheStore { page_size: usize, underlying_store: Arc, partition_bits: usize, + runtime: RuntimeRef, ) -> Result { let page_num = cap / page_size; ensure!(page_num != 0, InvalidCapacity); @@ -550,6 +557,7 @@ impl DiskCacheStore { meta_cache, underlying_store, request_notifiers, + runtime, }) } @@ -744,7 +752,17 @@ impl DiskCacheStore { .zip(notifiers_vec.into_iter()) .zip(need_fetch_block_cache_key.into_iter()) { - self.cache.insert_data(cache_key, bytes.clone()).await; + { + let cache = self.cache.clone(); + let bytes = bytes.clone(); + let handle = self + .runtime + .spawn(async move { cache.insert_data(cache_key, bytes).await }); + // In test, wait the handle to finish, otherwise the test may fail. 
+ if cfg!(test) { + let _ = handle.await; + } + } for notifier in notifiers { if notifier.send(Ok(bytes.clone())).is_err() { // The error contains sent bytes, which maybe very large, @@ -992,6 +1010,7 @@ impl ObjectStore for DiskCacheStore { #[cfg(test)] mod test { + use runtime::{Builder, RuntimeRef}; use tempfile::{tempdir, TempDir}; use upstream::local::LocalFileSystem; @@ -1007,6 +1026,7 @@ mod test { page_size: usize, cap: usize, partition_bits: usize, + runtime: RuntimeRef, ) -> StoreWithCacheDir { let local_store = Arc::new(MemoryStore::default()); @@ -1017,6 +1037,7 @@ mod test { page_size, local_store, partition_bits, + runtime, ) .await .unwrap(); @@ -1034,341 +1055,392 @@ mod test { .exists() } - #[tokio::test] - async fn test_disk_cache_out_of_range() { - let page_size = 16; - // 51 byte - let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; - let location = Path::from("out_of_range_test.sst"); - let store = prepare_store(page_size, 32, 0).await; - let buf = Bytes::from_static(data); - store.inner.put(&location, buf.clone()).await.unwrap(); - - // Read one page out of range. - let res = store.inner.get_range(&location, 48..54).await; - assert!(res.is_err()); - - // Read multiple pages out of range. - let res = store.inner.get_range(&location, 24..54).await; - assert!(res.is_err()); + #[test] + fn test_disk_cache_out_of_range() { + let rt = Arc::new(Builder::default().build().unwrap()); + rt.block_on(async { + let page_size = 16; + // 51 byte + let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; + let location = Path::from("out_of_range_test.sst"); + let store = prepare_store(page_size, 32, 0, rt.clone()).await; + let buf = Bytes::from_static(data); + store.inner.put(&location, buf.clone()).await.unwrap(); + + // Read one page out of range. + let res = store.inner.get_range(&location, 48..54).await; + assert!(res.is_err()); + + // Read multiple pages out of range. 
+ let res = store.inner.get_range(&location, 24..54).await; + assert!(res.is_err()); + }); } - #[tokio::test] - async fn test_disk_cache_store_get_range() { - let page_size = 16; - // 51 byte - let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; - let location = Path::from("1.sst"); - let store = prepare_store(page_size, 1024, 0).await; - - let mut buf = BytesMut::with_capacity(data.len() * 4); - // extend 4 times, then location will contain 200 bytes - for _ in 0..4 { - buf.extend_from_slice(data); - } - store.inner.put(&location, buf.freeze()).await.unwrap(); - - let testcases = vec![ - (0..6, "a b c "), - (0..16, "a b c d e f g h "), - // len of aligned ranges will be 2 - (0..17, "a b c d e f g h i"), - (16..17, "i"), - // len of aligned ranges will be 6 - (16..100, "i j k l m n o p q r s t u v w x y za b c d e f g h i j k l m n o p q r s t u v w x y"), - ]; - - for (input, expected) in testcases { - assert_eq!( - store.inner.get_range(&location, input).await.unwrap(), - Bytes::copy_from_slice(expected.as_bytes()) - ); - } - - // remove cached values, then get again - { - for range in [0..16, 16..32, 32..48, 48..64, 64..80, 80..96, 96..112] { - let data_cache = store - .inner - .cache - .meta_cache - .lock(&DiskCacheStore::page_cache_name(&location, &range).as_str()); - assert!(data_cache - .contains(DiskCacheStore::page_cache_name(&location, &range).as_str())); - assert!(test_file_exists(&store.cache_dir, &location, &range)); + #[test] + fn test_disk_cache_store_get_range() { + let rt = Arc::new(Builder::default().build().unwrap()); + rt.block_on(async { + let page_size = 16; + // 51 byte + let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; + let location = Path::from("1.sst"); + let store = prepare_store(page_size, 1024, 0, rt.clone()).await; + + let mut buf = BytesMut::with_capacity(data.len() * 4); + // extend 4 times, then location will contain 200 bytes + for _ in 0..4 { + buf.extend_from_slice(data); + } + store.inner.put(&location, buf.freeze()).await.unwrap(); + + let testcases = vec![ + (0..6, "a b c "), + (0..16, "a b c d e f g h "), + // len of aligned ranges will be 2 + (0..17, "a b c d e f g h i"), + (16..17, "i"), + // len of aligned ranges will be 6 + (16..100, "i j k l m n o p q r s t u v w x y za b c d e f g h i j k l m n o p q r s t u v w x y"), + ]; + + for (input, expected) in testcases { + assert_eq!( + store.inner.get_range(&location, input).await.unwrap(), + Bytes::copy_from_slice(expected.as_bytes()) + ); } - for range in [16..32, 48..64, 80..96] { - let mut data_cache = store - .inner - .cache - .meta_cache - .lock(&DiskCacheStore::page_cache_name(&location, &range).as_str()); - assert!(data_cache - .pop(&DiskCacheStore::page_cache_name(&location, &range)) - .is_some()); + // remove cached values, then get again + { + for range in [0..16, 16..32, 32..48, 48..64, 64..80, 80..96, 96..112] { + let data_cache = store + .inner + .cache + .meta_cache + .lock(&DiskCacheStore::page_cache_name(&location, &range).as_str()); + assert!(data_cache + .contains(DiskCacheStore::page_cache_name(&location, &range).as_str())); + assert!(test_file_exists(&store.cache_dir, &location, &range)); + } + + for range in [16..32, 48..64, 80..96] { + let mut data_cache = store + .inner + .cache + .meta_cache + .lock(&DiskCacheStore::page_cache_name(&location, &range).as_str()); + assert!(data_cache + .pop(&DiskCacheStore::page_cache_name(&location, &range)) + .is_some()); + } } - } - assert_eq!( - store.inner.get_range(&location, 16..100).await.unwrap(), - 
Bytes::copy_from_slice( - b"i j k l m n o p q r s t u v w x y za b c d e f g h i j k l m n o p q r s t u v w x y" - ) - ); + assert_eq!( + store.inner.get_range(&location, 16..100).await.unwrap(), + Bytes::copy_from_slice( + b"i j k l m n o p q r s t u v w x y za b c d e f g h i j k l m n o p q r s t u v w x y" + ) + ); + + }); } - #[tokio::test] - async fn test_disk_cache_multi_thread_fetch_same_block() { - let page_size = 16; - // 51 byte - let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; - let location = Path::from("1.sst"); - let store = Arc::new(prepare_store(page_size, 32, 0).await); - - let mut buf = BytesMut::with_capacity(data.len() * 4); - // extend 4 times, then location will contain 200 bytes - for _ in 0..4 { - buf.extend_from_slice(data); - } - store.inner.put(&location, buf.freeze()).await.unwrap(); - - let testcases = vec![ - (0..6, "a b c "), - (0..16, "a b c d e f g h "), - (0..17, "a b c d e f g h i"), - (16..17, "i"), - (16..100, "i j k l m n o p q r s t u v w x y za b c d e f g h i j k l m n o p q r s t u v w x y"), - ]; - let testcases = testcases - .iter() - .cycle() - .take(testcases.len() * 100) - .cloned() - .collect::>(); - - let mut tasks = Vec::with_capacity(testcases.len()); - for (input, _) in &testcases { - let store = store.clone(); - let location = location.clone(); - let input = input.clone(); - - tasks.push(tokio::spawn(async move { - store.inner.get_range(&location, input).await.unwrap() - })); - } + #[test] + fn test_disk_cache_multi_thread_fetch_same_block() { + let rt = Arc::new(Builder::default().build().unwrap()); + rt.block_on(async { + let page_size = 16; + // 51 byte + let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; + let location = Path::from("1.sst"); + let store = Arc::new(prepare_store(page_size, 32, 0,rt.clone()).await); + + let mut buf = BytesMut::with_capacity(data.len() * 4); + // extend 4 times, then location will contain 200 bytes + for _ in 0..4 { + buf.extend_from_slice(data); + } + store.inner.put(&location, buf.freeze()).await.unwrap(); + + let testcases = vec![ + (0..6, "a b c "), + (0..16, "a b c d e f g h "), + (0..17, "a b c d e f g h i"), + (16..17, "i"), + (16..100, "i j k l m n o p q r s t u v w x y za b c d e f g h i j k l m n o p q r s t u v w x y"), + ]; + let testcases = testcases + .iter() + .cycle() + .take(testcases.len() * 100) + .cloned() + .collect::>(); + + let mut tasks = Vec::with_capacity(testcases.len()); + for (input, _) in &testcases { + let store = store.clone(); + let location = location.clone(); + let input = input.clone(); + + tasks.push(tokio::spawn(async move { + store.inner.get_range(&location, input).await.unwrap() + })); + } - let actual = futures::future::join_all(tasks).await; - for (actual, (_, expected)) in actual.into_iter().zip(testcases.into_iter()) { - assert_eq!(actual.unwrap(), Bytes::from(expected)) - } + let actual = futures::future::join_all(tasks).await; + for (actual, (_, expected)) in actual.into_iter().zip(testcases.into_iter()) { + assert_eq!(actual.unwrap(), Bytes::from(expected)) + } + }); } - #[tokio::test] - async fn test_disk_cache_remove_cache_file() { - let page_size = 16; - // 51 byte - let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; - let location = Path::from("remove_cache_file.sst"); - let store = prepare_store(page_size, 32, 0).await; - let mut buf = BytesMut::with_capacity(data.len() * 4); - // extend 4 times, then location will contain 200 bytes, but cache cap is 32 - for _ in 0..4 { - 
buf.extend_from_slice(data); - } - store.inner.put(&location, buf.freeze()).await.unwrap(); - - let _ = store.inner.get_range(&location, 0..16).await.unwrap(); - let _ = store.inner.get_range(&location, 16..32).await.unwrap(); - // cache is full now - assert!(test_file_exists(&store.cache_dir, &location, &(0..16))); - assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); - - // insert new cache, evict oldest entry - let _ = store.inner.get_range(&location, 32..48).await.unwrap(); - assert!(!test_file_exists(&store.cache_dir, &location, &(0..16))); - assert!(test_file_exists(&store.cache_dir, &location, &(32..48))); - - // insert new cache, evict oldest entry - let _ = store.inner.get_range(&location, 48..64).await.unwrap(); - assert!(!test_file_exists(&store.cache_dir, &location, &(16..32))); - assert!(test_file_exists(&store.cache_dir, &location, &(48..64))); + #[test] + fn test_disk_cache_remove_cache_file() { + let rt = Arc::new(Builder::default().build().unwrap()); + rt.block_on(async { + let page_size = 16; + // 51 byte + let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; + let location = Path::from("remove_cache_file.sst"); + let store = prepare_store(page_size, 32, 0, rt.clone()).await; + let mut buf = BytesMut::with_capacity(data.len() * 4); + // extend 4 times, then location will contain 200 bytes, but cache cap is 32 + for _ in 0..4 { + buf.extend_from_slice(data); + } + store.inner.put(&location, buf.freeze()).await.unwrap(); + + let _ = store.inner.get_range(&location, 0..16).await.unwrap(); + let _ = store.inner.get_range(&location, 16..32).await.unwrap(); + // cache is full now + assert!(test_file_exists(&store.cache_dir, &location, &(0..16))); + assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); + + // insert new cache, evict oldest entry + let _ = store.inner.get_range(&location, 32..48).await.unwrap(); + assert!(!test_file_exists(&store.cache_dir, &location, &(0..16))); + assert!(test_file_exists(&store.cache_dir, &location, &(32..48))); + + // insert new cache, evict oldest entry + let _ = store.inner.get_range(&location, 48..64).await.unwrap(); + assert!(!test_file_exists(&store.cache_dir, &location, &(16..32))); + assert!(test_file_exists(&store.cache_dir, &location, &(48..64))); + }); } - #[tokio::test] - async fn test_disk_cache_remove_cache_file_two_partition() { - let page_size = 16; - // 51 byte - let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; - let location = Path::from("remove_cache_file_two_partition.sst"); - // partition_cap: 64 / 16 / 2 = 2 - let store = prepare_store(page_size, 64, 1).await; - let mut buf = BytesMut::with_capacity(data.len() * 8); - // extend 8 times - for _ in 0..8 { - buf.extend_from_slice(data); - } - store.inner.put(&location, buf.freeze()).await.unwrap(); - // use seahash - // 0..16: partition 1 - // 16..32 partition 1 - // 32..48 partition 0 - // 48..64 partition 1 - // 64..80 partition 1 - // 80..96 partition 0 - // 96..112 partition 0 - // 112..128 partition 0 - // 128..144 partition 0 - let _ = store.inner.get_range(&location, 0..16).await.unwrap(); - let _ = store.inner.get_range(&location, 16..32).await.unwrap(); - // partition 1 cache is full now - assert!(test_file_exists(&store.cache_dir, &location, &(0..16))); - assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); - - let _ = store.inner.get_range(&location, 32..48).await.unwrap(); - let _ = store.inner.get_range(&location, 80..96).await.unwrap(); - // partition 0 cache is full now - - 
assert!(test_file_exists(&store.cache_dir, &location, &(32..48))); - assert!(test_file_exists(&store.cache_dir, &location, &(80..96))); - - // insert new entry into partition 0, evict partition 0's oldest entry - let _ = store.inner.get_range(&location, 96..112).await.unwrap(); - assert!(!test_file_exists(&store.cache_dir, &location, &(32..48))); - assert!(test_file_exists(&store.cache_dir, &location, &(80..96))); - - assert!(test_file_exists(&store.cache_dir, &location, &(0..16))); - assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); - - // insert new entry into partition 0, evict partition 0's oldest entry - let _ = store.inner.get_range(&location, 128..144).await.unwrap(); - assert!(!test_file_exists(&store.cache_dir, &location, &(80..96))); - assert!(test_file_exists(&store.cache_dir, &location, &(96..112))); - assert!(test_file_exists(&store.cache_dir, &location, &(128..144))); - - assert!(test_file_exists(&store.cache_dir, &location, &(0..16))); - assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); - - // insert new entry into partition 1, evict partition 1's oldest entry - let _ = store.inner.get_range(&location, 64..80).await.unwrap(); - assert!(!test_file_exists(&store.cache_dir, &location, &(0..16))); - assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); - assert!(test_file_exists(&store.cache_dir, &location, &(64..80))); - - assert!(test_file_exists(&store.cache_dir, &location, &(96..112))); - assert!(test_file_exists(&store.cache_dir, &location, &(128..144))); + #[test] + fn test_disk_cache_remove_cache_file_two_partition() { + let rt = Arc::new(Builder::default().build().unwrap()); + rt.block_on(async { + let page_size = 16; + // 51 byte + let data = b"a b c d e f g h i j k l m n o p q r s t u v w x y z"; + let location = Path::from("remove_cache_file_two_partition.sst"); + // partition_cap: 64 / 16 / 2 = 2 + let store = prepare_store(page_size, 64, 1, rt.clone()).await; + let mut buf = BytesMut::with_capacity(data.len() * 8); + // extend 8 times + for _ in 0..8 { + buf.extend_from_slice(data); + } + store.inner.put(&location, buf.freeze()).await.unwrap(); + // use seahash + // 0..16: partition 1 + // 16..32 partition 1 + // 32..48 partition 0 + // 48..64 partition 1 + // 64..80 partition 1 + // 80..96 partition 0 + // 96..112 partition 0 + // 112..128 partition 0 + // 128..144 partition 0 + let _ = store.inner.get_range(&location, 0..16).await.unwrap(); + let _ = store.inner.get_range(&location, 16..32).await.unwrap(); + // partition 1 cache is full now + assert!(test_file_exists(&store.cache_dir, &location, &(0..16))); + assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); + + let _ = store.inner.get_range(&location, 32..48).await.unwrap(); + let _ = store.inner.get_range(&location, 80..96).await.unwrap(); + // partition 0 cache is full now + + assert!(test_file_exists(&store.cache_dir, &location, &(32..48))); + assert!(test_file_exists(&store.cache_dir, &location, &(80..96))); + + // insert new entry into partition 0, evict partition 0's oldest entry + let _ = store.inner.get_range(&location, 96..112).await.unwrap(); + assert!(!test_file_exists(&store.cache_dir, &location, &(32..48))); + assert!(test_file_exists(&store.cache_dir, &location, &(80..96))); + + assert!(test_file_exists(&store.cache_dir, &location, &(0..16))); + assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); + + // insert new entry into partition 0, evict partition 0's oldest entry + let _ = store.inner.get_range(&location, 
128..144).await.unwrap(); + assert!(!test_file_exists(&store.cache_dir, &location, &(80..96))); + assert!(test_file_exists(&store.cache_dir, &location, &(96..112))); + assert!(test_file_exists(&store.cache_dir, &location, &(128..144))); + + assert!(test_file_exists(&store.cache_dir, &location, &(0..16))); + assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); + + // insert new entry into partition 1, evict partition 1's oldest entry + let _ = store.inner.get_range(&location, 64..80).await.unwrap(); + assert!(!test_file_exists(&store.cache_dir, &location, &(0..16))); + assert!(test_file_exists(&store.cache_dir, &location, &(16..32))); + assert!(test_file_exists(&store.cache_dir, &location, &(64..80))); + + assert!(test_file_exists(&store.cache_dir, &location, &(96..112))); + assert!(test_file_exists(&store.cache_dir, &location, &(128..144))); + }); } - #[tokio::test] - async fn test_disk_cache_manifest() { - let cache_dir = tempdir().unwrap(); - let cache_root_dir = cache_dir.as_ref().to_string_lossy().to_string(); - let page_size = 8; - let first_create_time = { - let _store = { - let local_path = tempdir().unwrap(); - let local_store = - Arc::new(LocalFileSystem::new_with_prefix(local_path.path()).unwrap()); - DiskCacheStore::try_new(cache_root_dir.clone(), 160, 8, local_store, 0) + #[test] + fn test_disk_cache_manifest() { + let rt = Arc::new(Builder::default().build().unwrap()); + rt.block_on(async { + let cache_dir = tempdir().unwrap(); + let cache_root_dir = cache_dir.as_ref().to_string_lossy().to_string(); + let page_size = 8; + let first_create_time = { + let _store = { + let local_path = tempdir().unwrap(); + let local_store = + Arc::new(LocalFileSystem::new_with_prefix(local_path.path()).unwrap()); + DiskCacheStore::try_new( + cache_root_dir.clone(), + 160, + 8, + local_store, + 0, + rt.clone(), + ) .await .unwrap() + }; + let manifest = + DiskCacheStore::create_manifest_if_not_exists(&cache_root_dir, page_size) + .await + .unwrap(); + + assert_eq!(manifest.page_size, 8); + assert_eq!(manifest.version, Manifest::CURRENT_VERSION); + manifest.create_at }; - let manifest = - DiskCacheStore::create_manifest_if_not_exists(&cache_root_dir, page_size) + + // open again + { + let _store = { + let local_path = tempdir().unwrap(); + let local_store = + Arc::new(LocalFileSystem::new_with_prefix(local_path.path()).unwrap()); + DiskCacheStore::try_new( + cache_root_dir.clone(), + 160, + 8, + local_store, + 0, + rt.clone(), + ) .await - .unwrap(); + .unwrap() + }; - assert_eq!(manifest.page_size, 8); - assert_eq!(manifest.version, Manifest::CURRENT_VERSION); - manifest.create_at - }; + let manifest = + DiskCacheStore::create_manifest_if_not_exists(&cache_root_dir, page_size) + .await + .unwrap(); + assert_eq!(manifest.create_at, first_create_time); + assert_eq!(manifest.page_size, 8); + assert_eq!(manifest.version, Manifest::CURRENT_VERSION); + } - // open again - { - let _store = { + // open again, but with different page_size + { let local_path = tempdir().unwrap(); let local_store = Arc::new(LocalFileSystem::new_with_prefix(local_path.path()).unwrap()); - DiskCacheStore::try_new(cache_root_dir.clone(), 160, 8, local_store, 0) - .await - .unwrap() - }; - - let manifest = - DiskCacheStore::create_manifest_if_not_exists(&cache_root_dir, page_size) - .await - .unwrap(); - assert_eq!(manifest.create_at, first_create_time); - assert_eq!(manifest.page_size, 8); - assert_eq!(manifest.version, Manifest::CURRENT_VERSION); - } + let store = DiskCacheStore::try_new( + 
cache_dir.as_ref().to_string_lossy().to_string(), + 160, + page_size * 2, + local_store, + 0, + rt.clone(), + ) + .await; - // open again, but with different page_size - { - let local_path = tempdir().unwrap(); - let local_store = - Arc::new(LocalFileSystem::new_with_prefix(local_path.path()).unwrap()); - let store = DiskCacheStore::try_new( - cache_dir.as_ref().to_string_lossy().to_string(), - 160, - page_size * 2, - local_store, - 0, - ) - .await; - - assert!(store.is_err()) - } + assert!(store.is_err()) + } + }); } - #[tokio::test] - async fn test_disk_cache_recovery() { - let cache_dir = tempdir().unwrap(); - let cache_root_dir = cache_dir.as_ref().to_string_lossy().to_string(); - let page_size = 16; - let location = Path::from("recovery.sst"); - { - let store = { - let local_path = tempdir().unwrap(); - let local_store = - Arc::new(LocalFileSystem::new_with_prefix(local_path.path()).unwrap()); - DiskCacheStore::try_new(cache_root_dir.clone(), 10240, page_size, local_store, 0) + #[test] + fn test_disk_cache_recovery() { + let rt = Arc::new(Builder::default().build().unwrap()); + rt.block_on(async { + let cache_dir = tempdir().unwrap(); + let cache_root_dir = cache_dir.as_ref().to_string_lossy().to_string(); + let page_size = 16; + let location = Path::from("recovery.sst"); + { + let store = { + let local_path = tempdir().unwrap(); + let local_store = + Arc::new(LocalFileSystem::new_with_prefix(local_path.path()).unwrap()); + DiskCacheStore::try_new( + cache_root_dir.clone(), + 10240, + page_size, + local_store, + 0, + rt.clone(), + ) .await .unwrap() + }; + let data = b"abcd"; + let mut buf = BytesMut::with_capacity(data.len() * 1024); + for _ in 0..1024 { + buf.extend_from_slice(data); + } + let buf = buf.freeze(); + store.put(&location, buf.clone()).await.unwrap(); + let read_range = 16..100; + let bytes = store + .get_range(&location, read_range.clone()) + .await + .unwrap(); + assert_eq!(bytes.len(), read_range.len()); + assert_eq!(bytes[..], buf[read_range]) }; - let data = b"abcd"; - let mut buf = BytesMut::with_capacity(data.len() * 1024); - for _ in 0..1024 { - buf.extend_from_slice(data); - } - let buf = buf.freeze(); - store.put(&location, buf.clone()).await.unwrap(); - let read_range = 16..100; - let bytes = store - .get_range(&location, read_range.clone()) - .await - .unwrap(); - assert_eq!(bytes.len(), read_range.len()); - assert_eq!(bytes[..], buf[read_range]) - }; - // recover - { - let store = { - let local_path = tempdir().unwrap(); - let local_store = - Arc::new(LocalFileSystem::new_with_prefix(local_path.path()).unwrap()); - DiskCacheStore::try_new(cache_root_dir.clone(), 160, page_size, local_store, 0) + // recover + { + let store = { + let local_path = tempdir().unwrap(); + let local_store = + Arc::new(LocalFileSystem::new_with_prefix(local_path.path()).unwrap()); + DiskCacheStore::try_new( + cache_root_dir.clone(), + 160, + page_size, + local_store, + 0, + rt.clone(), + ) .await .unwrap() + }; + for range in [16..32, 32..48, 48..64, 64..80, 80..96, 96..112] { + let filename = DiskCacheStore::page_cache_name(&location, &range); + let cache = store.cache.meta_cache.lock(&filename); + assert!(cache.contains(&filename)); + assert!(test_file_exists(&cache_dir, &location, &range)); + } }; - for range in [16..32, 32..48, 48..64, 64..80, 80..96, 96..112] { - let filename = DiskCacheStore::page_cache_name(&location, &range); - let cache = store.cache.meta_cache.lock(&filename); - assert!(cache.contains(&filename)); - assert!(test_file_exists(&cache_dir, &location, 
&range)); - } - }; + }); } #[test] @@ -1381,18 +1453,21 @@ mod test { } } - #[tokio::test] - async fn test_corrupt_disk_cache() { - for page_size in [1, 2, 4, 8, 16, 32, 64, 128] { - corrupt_disk_cache(page_size).await; - } + #[test] + fn test_corrupt_disk_cache() { + let rt = Arc::new(Builder::default().build().unwrap()); + rt.block_on(async { + for page_size in [1, 2, 4, 8, 16, 32, 64, 128] { + corrupt_disk_cache(page_size, rt.clone()).await; + } + }); } - async fn corrupt_disk_cache(page_size: usize) { + async fn corrupt_disk_cache(page_size: usize, rt: RuntimeRef) { let StoreWithCacheDir { inner: store, cache_dir, - } = prepare_store(page_size, 1024, 0).await; + } = prepare_store(page_size, 1024, 0, rt).await; let test_file_name = "corrupted_disk_cache_file"; let test_file_path = Path::from(test_file_name); let test_file_bytes = Bytes::from("corrupted_disk_cache_file_data"); From 74c56412b07770f0d58533cdd9192a093ee21ca1 Mon Sep 17 00:00:00 2001 From: kamille <34352236+Rachelint@users.noreply.github.com> Date: Tue, 9 Jan 2024 16:59:13 +0800 Subject: [PATCH 34/38] fix: compatible for old table options (#1432) ## Rationale Make it compatible for old table options. ## Detailed Changes When `layered_memtable_opts` not found in `TableOptions`, we disable `layered_memtable`. ## Test Plan Test manually. --- analytic_engine/src/table_options.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/analytic_engine/src/table_options.rs b/analytic_engine/src/table_options.rs index c33a75dbf3..dfdbd1a289 100644 --- a/analytic_engine/src/table_options.rs +++ b/analytic_engine/src/table_options.rs @@ -677,10 +677,14 @@ impl TryFrom for TableOptions { }; let storage_format_hint = opts.storage_format_hint.context(MissingStorageFormatHint)?; - let layered_memtable_opts = opts - .layered_memtable_options - .context(MissingLayeredMemtableOptions)? - .into(); + // For compatible with old `table_options`, `layered_memtable_options` is + // allowed to be `None`, and when found `None`, we disable `layered_memtable`. + let layered_memtable_opts = match opts.layered_memtable_options { + Some(v) => v.into(), + None => LayeredMemtableOptions { + mutable_segment_switch_threshold: ReadableSize(0), + }, + }; let table_opts = Self { segment_duration, From a7fe3a40cf8af5644e58d439b5ceea86c34cf9b0 Mon Sep 17 00:00:00 2001 From: "xikai.wxk" Date: Tue, 9 Jan 2024 20:46:15 +0800 Subject: [PATCH 35/38] fix: invalid license header --- .../src/memtable/layered/factory.rs | 25 +++++++++++-------- analytic_engine/src/memtable/layered/iter.rs | 25 +++++++++++-------- analytic_engine/src/memtable/layered/mod.rs | 25 +++++++++++-------- analytic_engine/src/memtable/test_util.rs | 25 +++++++++++-------- catalog_impls/src/cluster_based.rs | 25 +++++++++++-------- components/runtime/src/priority_runtime.rs | 25 +++++++++++-------- query_frontend/src/logical_optimizer/mod.rs | 25 +++++++++++-------- .../src/grpc/remote_engine_service/metrics.rs | 25 +++++++++++-------- 8 files changed, 112 insertions(+), 88 deletions(-) diff --git a/analytic_engine/src/memtable/layered/factory.rs b/analytic_engine/src/memtable/layered/factory.rs index 03c793b9c5..002943ab20 100644 --- a/analytic_engine/src/memtable/layered/factory.rs +++ b/analytic_engine/src/memtable/layered/factory.rs @@ -1,16 +1,19 @@ -// Copyright 2023 The HoraeDB Authors +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 // -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. //! Skiplist memtable factory diff --git a/analytic_engine/src/memtable/layered/iter.rs b/analytic_engine/src/memtable/layered/iter.rs index 6e1f303083..be32b77e02 100644 --- a/analytic_engine/src/memtable/layered/iter.rs +++ b/analytic_engine/src/memtable/layered/iter.rs @@ -1,16 +1,19 @@ -// Copyright 2023 The HoraeDB Authors +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 // -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. //! Skiplist memtable iterator diff --git a/analytic_engine/src/memtable/layered/mod.rs b/analytic_engine/src/memtable/layered/mod.rs index 92087e1e10..0f4b697cfc 100644 --- a/analytic_engine/src/memtable/layered/mod.rs +++ b/analytic_engine/src/memtable/layered/mod.rs @@ -1,16 +1,19 @@ -// Copyright 2023 The HoraeDB Authors +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 // -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. //! MemTable based on skiplist diff --git a/analytic_engine/src/memtable/test_util.rs b/analytic_engine/src/memtable/test_util.rs index 72364b1854..18a26f5eed 100644 --- a/analytic_engine/src/memtable/test_util.rs +++ b/analytic_engine/src/memtable/test_util.rs @@ -1,16 +1,19 @@ -// Copyright 2023 The HoraeDB Authors +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 // -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. use common_types::row::Row; diff --git a/catalog_impls/src/cluster_based.rs b/catalog_impls/src/cluster_based.rs index 650d201957..c1208f678c 100644 --- a/catalog_impls/src/cluster_based.rs +++ b/catalog_impls/src/cluster_based.rs @@ -1,16 +1,19 @@ -// Copyright 2023 The HoraeDB Authors +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 // -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. use async_trait::async_trait; use catalog::{ diff --git a/components/runtime/src/priority_runtime.rs b/components/runtime/src/priority_runtime.rs index 1f69bd8a1c..922b80ea5a 100644 --- a/components/runtime/src/priority_runtime.rs +++ b/components/runtime/src/priority_runtime.rs @@ -1,16 +1,19 @@ -// Copyright 2023 The HoraeDB Authors +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 // -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. use std::future::Future; diff --git a/query_frontend/src/logical_optimizer/mod.rs b/query_frontend/src/logical_optimizer/mod.rs index 9fcd46b331..4d62e87750 100644 --- a/query_frontend/src/logical_optimizer/mod.rs +++ b/query_frontend/src/logical_optimizer/mod.rs @@ -1,16 +1,19 @@ -// Copyright 2023 The HoraeDB Authors +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 // -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. //! Logical optimizer diff --git a/server/src/grpc/remote_engine_service/metrics.rs b/server/src/grpc/remote_engine_service/metrics.rs index c6c8124a02..ae91914db2 100644 --- a/server/src/grpc/remote_engine_service/metrics.rs +++ b/server/src/grpc/remote_engine_service/metrics.rs @@ -1,16 +1,19 @@ -// Copyright 2023 The HoraeDB Authors +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at // -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 // -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
use lazy_static::lazy_static; use prometheus::{register_int_counter_vec, IntCounterVec}; From f5b6b4f655fe18c99848d07cca1c74a0b8054305 Mon Sep 17 00:00:00 2001 From: "xikai.wxk" Date: Wed, 10 Jan 2024 10:26:19 +0800 Subject: [PATCH 36/38] fix: upgrade the horaedb proto to latest version --- Cargo.lock | 2 +- Cargo.toml | 2 +- server/src/grpc/remote_engine_service/mod.rs | 4 ++-- table_engine/src/remote/model.rs | 3 +-- table_engine/src/table.rs | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 968217f04b..40c74b63a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3079,7 +3079,7 @@ dependencies = [ [[package]] name = "horaedbproto" version = "2.0.0" -source = "git+https://github.com/apache/incubator-horaedb-proto.git?rev=4a6f323b892c5944acdcf5447a3cc1e0c18f6e16#4a6f323b892c5944acdcf5447a3cc1e0c18f6e16" +source = "git+https://github.com/apache/incubator-horaedb-proto.git?rev=19ece8f771fc0b3e8e734072cc3d8040de6c74cb#19ece8f771fc0b3e8e734072cc3d8040de6c74cb" dependencies = [ "prost", "protoc-bin-vendored", diff --git a/Cargo.toml b/Cargo.toml index 809409daf1..fe02d11e71 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,7 +97,7 @@ bytes = "1" bytes_ext = { path = "components/bytes_ext" } catalog = { path = "catalog" } catalog_impls = { path = "catalog_impls" } -horaedbproto = { git = "https://github.com/apache/incubator-horaedb-proto.git", rev = "4a6f323b892c5944acdcf5447a3cc1e0c18f6e16" } +horaedbproto = { git = "https://github.com/apache/incubator-horaedb-proto.git", rev = "19ece8f771fc0b3e8e734072cc3d8040de6c74cb" } codec = { path = "components/codec" } chrono = "0.4" clap = "3.0" diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index 34909add50..8a4ad57b99 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -707,7 +707,7 @@ impl RemoteEngineServiceImpl { let priority = ctx.priority(); let query_ctx = create_query_ctx( - ctx.request_id_str, + ctx.request_id, ctx.default_catalog, ctx.default_schema, ctx.timeout_ms, @@ -770,7 +770,7 @@ impl RemoteEngineServiceImpl { .load(std::sync::atomic::Ordering::Relaxed); let priority = ctx.priority(); let query_ctx = create_query_ctx( - ctx.request_id_str, + ctx.request_id, ctx.default_catalog, ctx.default_schema, ctx.timeout_ms, diff --git a/table_engine/src/remote/model.rs b/table_engine/src/remote/model.rs index 358ef3669d..a779eb3169 100644 --- a/table_engine/src/remote/model.rs +++ b/table_engine/src/remote/model.rs @@ -482,8 +482,7 @@ impl From for horaedbproto::remote_engine::ExecutePlanRequ None }; let pb_context = horaedbproto::remote_engine::ExecContext { - request_id: 0, - request_id_str: String::from(value.context.request_id), + request_id: String::from(value.context.request_id), default_catalog: value.context.default_catalog, default_schema: value.context.default_schema, timeout_ms: rest_duration_ms, diff --git a/table_engine/src/table.rs b/table_engine/src/table.rs index 208d47ff16..7365ca66a4 100644 --- a/table_engine/src/table.rs +++ b/table_engine/src/table.rs @@ -443,7 +443,7 @@ impl TryFrom for horaedbproto::remote_engine::TableReadRequest { })?; Ok(Self { - request_id: 0, // this field not used any more + request_id: String::from(request.request_id), opts: Some(request.opts.into()), projected_schema: Some(request.projected_schema.into()), predicate: Some(predicate_pb), From 8675544eec0de786855d25fbffc31ec9b1045243 Mon Sep 17 00:00:00 2001 From: "xikai.wxk" Date: Wed, 10 Jan 
2024 17:08:30 +0800 Subject: [PATCH 37/38] fix: update the integration test results --- .../cases/env/cluster/ddl/partition_table.result | 9 +++++---- .../cases/env/cluster/ddl/partition_table.sql | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/integration_tests/cases/env/cluster/ddl/partition_table.result b/integration_tests/cases/env/cluster/ddl/partition_table.result index 77636b2207..d376718cc7 100644 --- a/integration_tests/cases/env/cluster/ddl/partition_table.result +++ b/integration_tests/cases/env/cluster/ddl/partition_table.result @@ -83,7 +83,7 @@ UInt64(16367588166920223437),Timestamp(1651737067000),String("horaedb9"),Int32(0 EXPLAIN ANALYZE SELECT * from partition_table_t where name = "ceresdb0"; plan_type,plan, -String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:1, metrics=[\npartition_table_t:\n __partition_table_t_1:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_1:\nCoalescePartitionsExec, metrics=[output_rows=1, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_1, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name = Utf8(\"ceresdb0\")], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n=0]\n"), +String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:1, metrics=[\npartition_table_t:\n __partition_table_t_1:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_1:\nCoalescePartitionsExec, metrics=[output_rows=0, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_1, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name = Utf8(\"ceresdb0\")], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=1\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=0\n total_rows_fetch_from_one=0\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n=0]\n"), -- SQLNESS REPLACE duration=\d+.?\d*(ยต|m|n) duration=xx @@ -92,21 +92,22 @@ String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:f EXPLAIN ANALYZE SELECT * from partition_table_t where name in ("ceresdb0", "ceresdb1", "ceresdb2", "ceresdb3", "ceresdb4"); plan_type,plan, -String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:3, metrics=[\npartition_table_t:\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=1, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { 
inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=1\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=2\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=2, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=2\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=1\n total_rows_fetch_from_one=2\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n=0]\n"), +String("Plan with Metrics"),String("ResolvedPartitionedScan: pushdown_continue:false, partition_count:3, metrics=[\npartition_table_t:\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n __partition_table_t_x:\n poll_duration=xxs\n total_duration=xxs\n wait_duration=xxs\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=0, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=1\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=0\n total_rows_fetch_from_one=0\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=0, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n 
num_memtables=0\n num_ssts=0\n scan_count=1\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=0\n total_rows_fetch_from_one=0\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n\n__partition_table_t_x:\nCoalescePartitionsExec, metrics=[output_rows=0, elapsed_compute=xxs]\n ScanTable: table=__partition_table_t_x, parallelism=8, priority=Low, metrics=[\nPredicate { exprs:[name IN ([Utf8(\"ceresdb0\"), Utf8(\"ceresdb1\"), Utf8(\"ceresdb2\"), Utf8(\"ceresdb3\"), Utf8(\"ceresdb4\")])], time_range:TimeRange { inclusive_start: Timestamp(-9223372036854775808), exclusive_end: Timestamp(9223372036854775807) } }\nscan_table:\n do_merge_sort=true\n iter_num=1\n merge_iter_0:\n init_duration=xxs\n num_memtables=0\n num_ssts=0\n scan_count=1\n scan_duration=xxs\n times_fetch_row_from_multiple=0\n times_fetch_rows_from_one=0\n total_rows_fetch_from_one=0\n scan_memtable_1, fetched_columns:[tsid,t,name,id,value]:\n=0]\n=0]\n"), ALTER TABLE partition_table_t ADD COLUMN (b string); affected_rows: 0 +-- SQLNESS REPLACE endpoint:(.*?), endpoint:xx, INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, "horaedb0", 100); -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute insert, err:Failed to write table, err:Failed to write tables in batch, tables:[\"__partition_table_t_1\"], err:Failed to query from table in server, table_idents:[TableIdentifier { catalog: \"horaedb\", schema: \"public\", table: \"__partition_table_t_1\" }], code:401, msg:failed to decode row group payload. Caused by: Schema mismatch with the write request, msg:expect 6 columns, but got 5. sql:INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, \"horaedb0\", 100);" }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute insert, err:Failed to write table, err:Failed to write tables in batch, tables:[\"__partition_table_t_1\"], err:Failed to query from table in server, table_idents:[TableIdentifier { catalog: \"horaedb\", schema: \"public\", table: \"__partition_table_t_1\" }], endpoint:xx, code:401, msg:failed to decode row group payload. Caused by: Schema mismatch with the write request, msg:expect 6 columns, but got 5. sql:INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, \"horaedb0\", 100);" }) -- SQLNESS REPLACE endpoint:(.*?), endpoint:xx, INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, "ceresdb0", 100); -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute insert, err:Failed to write table, err:Failed to write tables in batch, tables:[\"__partition_table_t_1\"], err:Failed to query from table in server, table_idents:[TableIdentifier { catalog: \"ceresdb\", schema: \"public\", table: \"__partition_table_t_1\" }], endpoint:xx, code:401, msg:failed to decode row group payload. Caused by: Schema mismatch with the write request, msg:expect 6 columns, but got 5. sql:INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, \"ceresdb0\", 100);" }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan. 
Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute insert, err:Failed to write table, err:Failed to write tables in batch, tables:[\"__partition_table_t_1\"], err:Failed to query from table in server, table_idents:[TableIdentifier { catalog: \"horaedb\", schema: \"public\", table: \"__partition_table_t_1\" }], endpoint:xx, code:401, msg:failed to decode row group payload. Caused by: Schema mismatch with the write request, msg:expect 6 columns, but got 5. sql:INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, \"ceresdb0\", 100);" }) ALTER TABLE partition_table_t MODIFY SETTING enable_ttl='true'; diff --git a/integration_tests/cases/env/cluster/ddl/partition_table.sql b/integration_tests/cases/env/cluster/ddl/partition_table.sql index de2f4148d5..a36b59ac2d 100644 --- a/integration_tests/cases/env/cluster/ddl/partition_table.sql +++ b/integration_tests/cases/env/cluster/ddl/partition_table.sql @@ -46,6 +46,7 @@ EXPLAIN ANALYZE SELECT * from partition_table_t where name in ("ceresdb0", "cere ALTER TABLE partition_table_t ADD COLUMN (b string); +-- SQLNESS REPLACE endpoint:(.*?), endpoint:xx, INSERT INTO partition_table_t (t, id, name, value) VALUES (1651737067000, 10, "horaedb0", 100); -- SQLNESS REPLACE endpoint:(.*?), endpoint:xx, From 96ae42055c6f653ece09ccc831d54f8a38ce890c Mon Sep 17 00:00:00 2001 From: WEI Xikai Date: Thu, 11 Jan 2024 10:36:04 +0800 Subject: [PATCH 38/38] chore: enable merge on github (#1435) ## Rationale Sometimes merging a branch with some reviewed PRs into another is necessary, but it is disabled now. ## Detailed Changes Enable to merge some branch to another without squashing. ## Test Plan Nothing to test. --- .asf.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.asf.yaml b/.asf.yaml index a3004e81a2..bf79facaad 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -34,7 +34,7 @@ github: - horaedb enabled_merge_buttons: squash: true - merge: false + merge: true rebase: true protected_branches: main:
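For readers skimming the final hunk above, the following is a minimal sketch of how the merge-button section of `.asf.yaml` reads once PATCH 38 is applied. It is reconstructed only from the keys visible in the hunk; the indentation and the omitted sibling keys (such as the `protected_branches` block) are assumptions, not taken from the full file.

```yaml
# .asf.yaml (excerpt, post-patch sketch; surrounding keys assumed from hunk context)
github:
  enabled_merge_buttons:
    squash: true   # squash-merge remains enabled
    merge: true    # plain merge commits are now allowed (previously false)
    rebase: true   # rebase-merge remains enabled
```

With `merge: true`, a branch carrying several already-reviewed commits can be landed as a merge commit instead of being squashed into one, which is the workflow the rationale of this patch describes.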