Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add more details about the sst in sst-metadata tool #1019

Merged
merged 5 commits into from
Jun 25, 2023
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion tools/src/bin/sst-metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

//! A CLI to query SST metadata

use std::sync::Arc;
use std::{collections::HashMap, sync::Arc};

use analytic_engine::sst::{meta_data::cache::MetaData, parquet::async_reader::ChunkReaderAdapter};
use anyhow::{Context, Result};
Expand Down Expand Up @@ -36,6 +36,34 @@ struct Args {
page_indexes: bool,
}

/// Running totals accumulated across every SST file processed by `run`.
///
/// All fields start at zero (via `Default`) and are incremented once per file.
/// NOTE(review): the size fields are presumably byte counts, since they are
/// passed through `as_mb` when printed; confirm against `as_mb`.
#[derive(Default, Debug)]
struct FileStatistics {
/// Number of SST files processed.
file_count: u64,
/// Sum of the object sizes (`ObjectMeta::size`) of all files.
size: usize,
/// Sum of the per-file `metadata_size` values.
metadata_size: usize,
/// Sum of the per-file `kv_size` values.
kv_size: usize,
/// Sum of the per-file `filter_size` values.
filter_size: usize,
/// Sum of the row counts reported by each file's parquet file metadata.
row_num: i64,
}

/// Renders the aggregated statistics as a multi-line, human-readable report.
///
/// `file_count` and `row_num` are printed as-is; every size field is converted
/// with `as_mb` and printed with two decimal places.
///
/// This is implemented as `Display` rather than a hand-written `ToString`: the
/// standard library's blanket `impl<T: Display> ToString for T` keeps
/// `to_string()` available to existing callers, and the value can additionally
/// be used directly with `{}` in formatting macros.
impl std::fmt::Display for FileStatistics {
    // `std::fmt::Result` is spelled out in full because this file imports
    // `anyhow::Result`, which would otherwise shadow the intended type.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "FileStatistics {{\n\tfile_count: {},\n\tsize: {:.2},\n\tmetadata_size: {:.2}, \n\tkv_size: {:.2},\n\tfilter_size: {:.2},\n\trow_num: {},\n}}",
            self.file_count,
            as_mb(self.size),
            as_mb(self.metadata_size),
            as_mb(self.kv_size),
            as_mb(self.filter_size),
            self.row_num
        )
    }
}

/// Per-field (column) size totals, accumulated over every row group of every
/// SST file processed by `run` and keyed there by field name.
#[derive(Default, Debug)]
struct FieldStatistics {
/// Sum of the column chunks' `compressed_size()` values for this field.
compressed_size: i64,
/// Sum of the column chunks' `uncompressed_size()` values for this field.
uncompressed_size: i64,
}

fn new_runtime(thread_num: usize) -> Runtime {
runtime::Builder::default()
.thread_name("sst-metadata")
Expand Down Expand Up @@ -99,6 +127,8 @@ async fn run(args: Args) -> Result<()> {
.cmp(&b.1.custom().time_range.inclusive_start())
});

let mut file_stats = FileStatistics::default();
let mut field_stats_map = HashMap::new();
for (object_meta, sst_metadata, metadata_size, kv_size) in metas {
let ObjectMeta { location, size, .. } = &object_meta;
let custom_meta = sst_metadata.custom();
Expand All @@ -114,6 +144,28 @@ async fn run(args: Args) -> Result<()> {
.unwrap_or(0);
let file_metadata = parquet_meta.file_metadata();
let row_num = file_metadata.num_rows();

file_stats.file_count += 1;
file_stats.size += object_meta.size;
file_stats.metadata_size += metadata_size;
file_stats.kv_size += kv_size;
file_stats.filter_size += filter_size;
file_stats.row_num += row_num;

let fields = file_metadata.schema().get_fields();
for (_, row_group) in parquet_meta.row_groups().iter().enumerate() {
for i in 0..fields.len() {
let column_meta = row_group.column(i);
let field_name = fields.get(i).unwrap().get_basic_info().name().to_string();
if !field_stats_map.contains_key(&field_name) {
tanruixiang marked this conversation as resolved.
Show resolved Hide resolved
field_stats_map.insert(field_name.clone(), FieldStatistics::default());
}
let field_stats = field_stats_map.get_mut(&field_name).unwrap();
field_stats.compressed_size += column_meta.compressed_size();
field_stats.uncompressed_size += column_meta.uncompressed_size();
}
}

if verbose {
println!("object_meta:{object_meta:?}, parquet_meta:{parquet_meta:?}, custom_meta:{custom_meta:?}");
} else {
Expand All @@ -127,6 +179,17 @@ async fn run(args: Args) -> Result<()> {
}
}

println!("{}", file_stats.to_string());
println!("FieldStatistics: ");
for (k, v) in field_stats_map.iter() {
println!(
"{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}",
k,
as_mb(v.compressed_size as usize),
as_mb(v.uncompressed_size as usize),
v.uncompressed_size as f64 / v.compressed_size as f64
);
}
Ok(())
}

Expand Down