feat: add more details about the sst in sst-metadata tool #1019

Merged · 5 commits · Jun 25, 2023
Changes from 1 commit

80 changes: 79 additions & 1 deletion tools/src/bin/sst-metadata.rs
@@ -2,7 +2,7 @@

//! A cli to query sst meta data

use std::sync::Arc;
use std::{collections::HashMap, sync::Arc};

use analytic_engine::sst::{meta_data::cache::MetaData, parquet::async_reader::ChunkReaderAdapter};
use anyhow::{Context, Result};
@@ -27,6 +27,10 @@ struct Args {
#[clap(short, long, required(false))]
verbose: bool,

/// Print file & field statistics
#[clap(short, long, required(false))]
stats: bool,

/// Thread num, 0 means cpu num
#[clap(short, long, default_value_t = 0)]
threads: usize,
Expand All @@ -36,6 +40,34 @@ struct Args {
page_indexes: bool,
}

#[derive(Default, Debug)]
struct FileStatistics {
file_count: u64,
size: usize,
metadata_size: usize,
kv_size: usize,
filter_size: usize,
row_num: i64,
}

impl ToString for FileStatistics {
fn to_string(&self) -> String {
format!("FileStatistics {{\n\tfile_count: {},\n\tsize: {:.2},\n\tmetadata_size: {:.2}, \n\tkv_size: {:.2},\n\tfilter_size: {:.2},\n\trow_num: {},\n}}",
self.file_count,
as_mb(self.size),
as_mb(self.metadata_size),
as_mb(self.kv_size),
as_mb(self.filter_size),
self.row_num)
}
}
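
A side note on the design: implementing ToString directly compiles, but the conventional Rust pattern is to implement std::fmt::Display, which provides to_string through the standard blanket impl and lets the value be printed with {} directly. A minimal sketch of the equivalent, assuming the same as_mb helper:

use std::fmt;

impl fmt::Display for FileStatistics {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Produces the same output as the to_string impl above.
        write!(
            f,
            "FileStatistics {{\n\tfile_count: {},\n\tsize: {:.2},\n\tmetadata_size: {:.2},\n\tkv_size: {:.2},\n\tfilter_size: {:.2},\n\trow_num: {},\n}}",
            self.file_count,
            as_mb(self.size),
            as_mb(self.metadata_size),
            as_mb(self.kv_size),
            as_mb(self.filter_size),
            self.row_num,
        )
    }
}

With that in place, the later println!("{}", file_stats.to_string()) could simply be println!("{}", file_stats).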

#[derive(Default, Debug)]
struct FieldStatistics {
compressed_size: i64,
uncompressed_size: i64,
}
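
The as_mb helper called in the impl above (and again in the per-field printout below) is not shown in this hunk; it is presumably defined elsewhere in sst-metadata.rs or in another commit of this PR. Judging from the {:.2} formatting and the "mb" suffix printed later, it most likely converts a byte count to megabytes, along the lines of this hypothetical stand-in:

// Hypothetical sketch only: bytes -> megabytes as f64, suitable for {:.2} formatting.
fn as_mb(size: usize) -> f64 {
    size as f64 / 1024.0 / 1024.0
}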

fn new_runtime(thread_num: usize) -> Runtime {
runtime::Builder::default()
.thread_name("sst-metadata")
@@ -91,6 +123,52 @@ async fn run(args: Args) -> Result<()> {
metas.push(meta);
}

if args.stats {
let mut file_stats = FileStatistics::default();
let mut field_stats_map = HashMap::new();
for (object_meta, sst_metadata, metadata_size, kv_size) in metas {
let parquet_meta = sst_metadata.parquet();

file_stats.file_count += 1;
file_stats.size += object_meta.size;
file_stats.metadata_size += metadata_size;
file_stats.kv_size += kv_size;
let filter_size = sst_metadata
.custom()
.parquet_filter
.as_ref()
.map(|f| f.size())
.unwrap_or(0);
file_stats.filter_size += filter_size;
file_stats.row_num += parquet_meta.file_metadata().num_rows();

let fields = parquet_meta.file_metadata().schema().get_fields();
for (_, row_group) in parquet_meta.row_groups().iter().enumerate() {
for i in 0..fields.len() {
let column_meta = row_group.column(i);
let field_name = fields.get(i).unwrap().get_basic_info().name().to_string();
if !field_stats_map.contains_key(&field_name) {
field_stats_map.insert(field_name.clone(), FieldStatistics::default());
}
let field_stats = field_stats_map.get_mut(&field_name).unwrap();
field_stats.compressed_size += column_meta.compressed_size();
field_stats.uncompressed_size += column_meta.uncompressed_size();
}
}
}
println!("{}", file_stats.to_string());

println!("FieldStatistics: ");
for (k, v) in field_stats_map.iter() {
println!("{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}",
k,
as_mb(v.compressed_size as usize),
as_mb(v.uncompressed_size as usize),
v.uncompressed_size as f64 / v.compressed_size as f64);
}
return Ok(());
}
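
On the per-field accumulation in the loop above: a minimal sketch of an equivalent formulation using HashMap's entry API (same fields, field_stats_map, and parquet_meta bindings assumed), which avoids the separate contains_key / insert / get_mut lookups:

for row_group in parquet_meta.row_groups() {
    for (i, field) in fields.iter().enumerate() {
        let column_meta = row_group.column(i);
        // One lookup per column: create a default entry on first sight, then accumulate.
        // or_insert_with also pins the map's value type, so HashMap::new() needs no annotation.
        let field_stats = field_stats_map
            .entry(field.get_basic_info().name().to_string())
            .or_insert_with(FieldStatistics::default);
        field_stats.compressed_size += column_meta.compressed_size();
        field_stats.uncompressed_size += column_meta.uncompressed_size();
    }
}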

// sort by time_range asc
metas.sort_by(|a, b| {
a.1.custom()