From eae516d903f299bf49fe36e6f896bdcca140043f Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Tue, 3 Dec 2024 21:54:14 -0600 Subject: [PATCH] support more data types and handle empty file --- src/info.rs | 38 +++++++++++++++++++++++------------ src/main.rs | 29 ++++++++++++++++----------- src/query_input.rs | 2 +- src/query_results.rs | 47 +++++++++++++++++++++++++++++++++++++++++++- src/schema.rs | 16 ++++++++++++--- 5 files changed, 102 insertions(+), 30 deletions(-) diff --git a/src/info.rs b/src/info.rs index 1ccc233..e6d3ed7 100644 --- a/src/info.rs +++ b/src/info.rs @@ -9,6 +9,10 @@ pub fn InfoSection(parquet_info: super::ParquetInfo) -> impl IntoView { .unwrap_or("Unknown") .to_string(); let version = parquet_info.metadata.file_metadata().version(); + let has_bloom_filter = parquet_info.has_bloom_filter; + let has_page_index = parquet_info.has_page_index; + let has_column_index = parquet_info.has_column_index; + let has_row_group_stats = parquet_info.has_row_group_stats; // Create a signal for the selected row group let (selected_row_group, set_selected_row_group) = create_signal(0); @@ -65,44 +69,52 @@ pub fn InfoSection(parquet_info: super::ParquetInfo) -> impl IntoView { - + {move || { + if parquet_info.row_group_count > 0 { + Some(view! { + + }) + } else { + None + } + }}

"Features"

- {if parquet_info.has_row_group_stats { "✓" } else { "✗" }} + {if has_row_group_stats { "✓" } else { "✗" }} " Row Group Statistics"
- {if parquet_info.has_column_index { "✓" } else { "✗" }} " Column Index" + {if has_column_index { "✓" } else { "✗" }} " Column Index"
{if parquet_info.has_page_index { "✓" } else { "✗" }} " Page Index"
+ }>{if has_page_index { "✓" } else { "✗" }} " Page Index"
- {if parquet_info.has_bloom_filter { "✓" } else { "✗" }} " Bloom Filter" + {if has_bloom_filter { "✓" } else { "✗" }} " Bloom Filter"
diff --git a/src/main.rs b/src/main.rs index a51ea3b..d020050 100644 --- a/src/main.rs +++ b/src/main.rs @@ -59,6 +59,8 @@ impl ParquetInfo { metadata.file_metadata().schema_descr(), metadata.file_metadata().key_value_metadata(), )?; + let first_row_group = metadata.row_groups().first(); + let first_column = first_row_group.map(|rg| rg.columns().first()).flatten(); Ok(Self { file_size: compressed_size, @@ -67,14 +69,14 @@ impl ParquetInfo { row_group_count: metadata.num_row_groups() as u64, row_count: metadata.file_metadata().num_rows() as u64, columns: schema.fields.len() as u64, - has_row_group_stats: metadata.row_group(0).column(0).statistics().is_some(), + has_row_group_stats: first_column + .map(|c| c.statistics().is_some()) + .unwrap_or(false), has_column_index: metadata.column_index().is_some(), has_page_index: metadata.offset_index().is_some(), - has_bloom_filter: metadata - .row_group(0) - .column(0) - .bloom_filter_offset() - .is_some(), + has_bloom_filter: first_column + .map(|c| c.bloom_filter_offset().is_some()) + .unwrap_or(false), schema: Arc::new(schema), metadata: Arc::new(metadata), metadata_len, @@ -284,16 +286,19 @@ fn App() -> impl IntoView { .map(|_| { match file_content.get_untracked() { Some(info) => { - - view! { - 0 { + view! { + + schema=info.schema + error_message=set_error_message + /> + } + } else { + view! {}.into_view() } } None => view! {}.into_view(), diff --git a/src/query_input.rs b/src/query_input.rs index 5ea9dd1..e5c21c9 100644 --- a/src/query_input.rs +++ b/src/query_input.rs @@ -220,7 +220,7 @@ fn process_user_input( web_sys::console::log_1(&format!("Processing user input: {}", input).into()); let prompt = format!( - "Generate a SQL query to answer the following question: {}. You should generate PostgreSQL SQL dialect, all field names should be double quoted, and the output SQL should be executable, be careful about the available columns. The table name is: {}, the schema of the table is: {}. ", + "Generate a SQL query to answer the following question: {}. You should generate PostgreSQL SQL dialect, all field names and table names should be double quoted, and the output SQL should be executable, be careful about the available columns. The table name is: {}, the schema of the table is: {}. ", input, file_name, schema_str ); web_sys::console::log_1(&prompt.clone().into()); diff --git a/src/query_results.rs b/src/query_results.rs index 41e25b7..ca05836 100644 --- a/src/query_results.rs +++ b/src/query_results.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use arrow::array::Array; +use arrow::array::{types::*, Array}; use arrow::datatypes::DataType; use arrow::record_batch::RecordBatch; use datafusion::{ @@ -163,6 +163,51 @@ impl ArrayExt for dyn Array { let value = array.value(index); String::from_utf8_lossy(value).to_string() } + DataType::Dictionary(key_type, _) => { + match key_type.as_ref() { + DataType::Int8 => { + let array = as_dictionary_array::(array); + let values = array.values(); + values.value_to_string(array.key(index).unwrap_or_default()) + } + DataType::Int16 => { + let array = as_dictionary_array::(array); + let values = array.values(); + values.value_to_string(array.key(index).unwrap_or_default()) + } + DataType::Int32 => { + let array = as_dictionary_array::(array); + let values = array.values(); + values.value_to_string(array.key(index).unwrap_or_default()) + } + DataType::Int64 => { + let array = as_dictionary_array::(array); + let values = array.values(); + values.value_to_string(array.key(index).unwrap_or_default()) + } + DataType::UInt8 => { + let array = as_dictionary_array::(array); + let values = array.values(); + values.value_to_string(array.key(index).unwrap_or_default()) + } + DataType::UInt16 => { + let array = as_dictionary_array::(array); + let values = array.values(); + values.value_to_string(array.key(index).unwrap_or_default()) + } + DataType::UInt32 => { + let array = as_dictionary_array::(array); + let values = array.values(); + values.value_to_string(array.key(index).unwrap_or_default()) + } + DataType::UInt64 => { + let array = as_dictionary_array::(array); + let values = array.values(); + values.value_to_string(array.key(index).unwrap_or_default()) + } + _ => format!("Unsupported dictionary key type {}", key_type), + } + } t => format!("Unsupported datatype {}", t) ) } diff --git a/src/schema.rs b/src/schema.rs index 9a09df5..bceda0c 100644 --- a/src/schema.rs +++ b/src/schema.rs @@ -24,13 +24,23 @@ enum SortField { pub fn SchemaSection(parquet_info: super::ParquetInfo) -> impl IntoView { let schema = parquet_info.schema.clone(); let metadata = parquet_info.metadata.clone(); - let mut column_info = - vec![(0, 0, metadata.row_group(0).column(0).compression()); schema.fields.len()]; + let mut column_info = vec![ + ( + 0, + 0, + metadata + .row_groups() + .first() + .map(|rg| rg.columns().first().map(|c| c.compression())) + .flatten(), + ); + schema.fields.len() + ]; for rg in metadata.row_groups() { for (i, col) in rg.columns().iter().enumerate() { column_info[i].0 += col.compressed_size() as u64; column_info[i].1 += col.uncompressed_size() as u64; - column_info[i].2 = col.compression(); + column_info[i].2 = Some(col.compression()); } }