Skip to content

Commit

Permalink
get page info on every page
Browse files Browse the repository at this point in the history
  • Loading branch information
XiangpengHao committed Dec 24, 2024
1 parent ab0f69a commit 31d43a0
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 10 deletions.
6 changes: 5 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ impl ParquetReader {
/// Returns a shared reference to this reader's `parquet_info` field.
///
/// Callers that need an owned copy clone the result (e.g. `info().clone()`).
fn info(&self) -> &ParquetInfo {
&self.parquet_info
}

/// Returns a shared reference to this reader's `bytes` field.
///
/// NOTE(review): presumably the raw contents of the loaded Parquet file —
/// confirm against where `bytes` is populated.
fn bytes(&self) -> &Bytes {
&self.bytes
}
}

impl AsyncFileReader for ParquetReader {
Expand Down Expand Up @@ -448,7 +452,7 @@ fn App() -> impl IntoView {
view! {
<div class="space-y-6">
<div class="w-full">
<MetadataSection parquet_info=info.info().clone() />
<MetadataSection parquet_reader=info.clone() />
</div>
<div class="w-full">
<SchemaSection parquet_info=info.info().clone() />
Expand Down
7 changes: 4 additions & 3 deletions src/metadata.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use leptos::prelude::*;

#[component]
pub fn MetadataSection(parquet_info: super::ParquetInfo) -> impl IntoView {
pub fn MetadataSection(parquet_reader: super::ParquetReader) -> impl IntoView {
let parquet_info = parquet_reader.info().clone();
let created_by = parquet_info
.metadata
.file_metadata()
Expand Down Expand Up @@ -108,8 +109,8 @@ pub fn MetadataSection(parquet_info: super::ParquetInfo) -> impl IntoView {
Some(
view! {
<div>
<super::row_group_column::RowGroupColumn parquet_info=parquet_info
.clone() />
<super::row_group_column::RowGroupColumn parquet_reader=parquet_reader.clone()
/>
</div>
},
)
Expand Down
62 changes: 56 additions & 6 deletions src/row_group_column.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::sync::Arc;

use leptos::prelude::*;
use parquet::file::statistics::Statistics;
use parquet::file::{reader::SerializedPageReader, statistics::Statistics};

use crate::format_rows;

Expand Down Expand Up @@ -101,11 +103,11 @@ fn stats_to_string(stats: Option<Statistics>) -> String {
}

#[component]
pub fn RowGroupColumn(parquet_info: super::ParquetInfo) -> impl IntoView {
pub fn RowGroupColumn(parquet_reader: super::ParquetReader) -> impl IntoView {
let (selected_row_group, set_selected_row_group) = signal(0);
let (selected_column, set_selected_column) = signal(0);

let parquet_info_clone = parquet_info.clone();
let parquet_info_clone = parquet_reader.info().clone();
let row_group_info = move || {
let rg = parquet_info_clone
.metadata
Expand All @@ -117,26 +119,43 @@ pub fn RowGroupColumn(parquet_info: super::ParquetInfo) -> impl IntoView {
(compressed_size, uncompressed_size, num_rows, compression)
};

let parquet_info_clone = parquet_info.clone();
let parquet_info_clone = parquet_reader.info().clone();
let parquet_bytes = parquet_reader.bytes().clone();
// Gathers the details for the currently selected column of the currently
// selected row group. Both selections are read through `.get()` on the
// `selected_row_group` / `selected_column` signals.
//
// Returns a tuple of: compressed size, uncompressed size, compression,
// statistics, bloom-filter flag, encodings, and a per-page list of
// (page type, page size, number of values).
//
// Panics if the page reader cannot be constructed (see the `unwrap` below).
let column_info = move || {
let rg = parquet_info_clone
.metadata
.row_group(selected_row_group.get());
let col = rg.column(selected_column.get());
let row_count = rg.num_rows();
// Sizes are divided by 1_048_576 (2^20) — presumably bytes to MiB; confirm units.
let compressed_size = col.compressed_size() as f64 / 1_048_576.0;
let uncompressed_size = col.uncompressed_size() as f64 / 1_048_576.0;
let compression = col.compression();
let statistics = col.statistics().cloned();
let has_bloom_filter = col.bloom_filter_offset().is_some();
let encodings = col.encodings().clone();

// The captured bytes are cloned and wrapped in a fresh `Arc` on every call.
// NOTE(review): cheap only if `Bytes` is a ref-counted buffer — confirm.
let parquet_bytes = Arc::new(parquet_bytes.clone());
// NOTE(review): `unwrap` panics if the reader cannot be built for this
// column chunk; consider surfacing the error to the UI instead.
let page_reader =
SerializedPageReader::new(parquet_bytes, col, row_count as usize, None).unwrap();

// Walk every page of the column chunk. Pages that come back as `Err` are
// skipped silently, so the resulting list may be shorter than the real
// page count.
let mut page_info = Vec::new();
for page in page_reader {
if let Ok(page) = page {
let page_type = page.page_type();
// Buffer length divided by 1024; the view formats this value with a "KB" suffix.
let page_size = page.buffer().len() as f64 / 1024.0;
// This is the page's value count; the view lists it under a "Rows" header.
let num_values = page.num_values();
page_info.push((page_type, page_size, num_values));
}
}

(
compressed_size,
uncompressed_size,
compression,
statistics,
has_bloom_filter,
encodings,
page_info,
)
};

Expand All @@ -156,7 +175,7 @@ pub fn RowGroupColumn(parquet_info: super::ParquetInfo) -> impl IntoView {
.set(event_target_value(&ev).parse::<usize>().unwrap_or(0))
}
>
{(0..parquet_info.row_group_count)
{(0..parquet_reader.info().row_group_count)
.map(|i| {
view! {
<option value=i.to_string() class="py-2">
Expand Down Expand Up @@ -217,7 +236,7 @@ pub fn RowGroupColumn(parquet_info: super::ParquetInfo) -> impl IntoView {
.set(event_target_value(&ev).parse::<usize>().unwrap_or(0))
}
>
{parquet_info
{parquet_reader.info()
.schema
.fields
.iter()
Expand All @@ -241,6 +260,7 @@ pub fn RowGroupColumn(parquet_info: super::ParquetInfo) -> impl IntoView {
statistics,
has_bloom_filter,
encodings,
page_info,
) = column_info();
view! {
<div class="grid grid-cols-2 gap-4 bg-gray-50 p-4 rounded-md">
Expand Down Expand Up @@ -280,6 +300,36 @@ pub fn RowGroupColumn(parquet_info: super::ParquetInfo) -> impl IntoView {
<div class="text-sm text-gray-500">"Statistics"</div>
<div class="font-medium text-sm">{stats_to_string(statistics)}</div>
</div>
<div class="col-span-2 space-y-1">
<div class="space-y-0.5">
<div class="flex gap-4 text-sm text-gray-500">
<span class="w-16">Page #</span>
<span class="w-32">Type</span>
<span class="w-24">Size</span>
<span>Rows</span>
</div>
<div class="max-h-[250px] overflow-y-auto pr-2">
{page_info
.into_iter()
.enumerate()
.map(|(i, (page_type, size, values))| {
view! {
<div class="flex gap-4 text-sm">
<span class="w-16">{format!("{}.", i)}</span>
<span class="w-32">{format!("{:?}", page_type)}</span>
<span class="w-24 text-gray-600">
{format!("{:.1} KB", size)}
</span>
<span class="text-gray-600">
{format_rows(values as u64)}
</span>
</div>
}
})
.collect::<Vec<_>>()}
</div>
</div>
</div>
</div>
}
}}
Expand Down

0 comments on commit 31d43a0

Please sign in to comment.