Skip to content

Commit

Permalink
support more data types and handle empty file
Browse files (browse the repository at this point in the history)
  • Loading branch information
XiangpengHao committed Dec 4, 2024
1 parent b6402a7 commit eae516d
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 30 deletions.
38 changes: 25 additions & 13 deletions src/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ pub fn InfoSection(parquet_info: super::ParquetInfo) -> impl IntoView {
.unwrap_or("Unknown")
.to_string();
let version = parquet_info.metadata.file_metadata().version();
let has_bloom_filter = parquet_info.has_bloom_filter;
let has_page_index = parquet_info.has_page_index;
let has_column_index = parquet_info.has_column_index;
let has_row_group_stats = parquet_info.has_row_group_stats;

// Create a signal for the selected row group
let (selected_row_group, set_selected_row_group) = create_signal(0);
Expand Down Expand Up @@ -65,44 +69,52 @@ pub fn InfoSection(parquet_info: super::ParquetInfo) -> impl IntoView {
</div>
</div>

<super::row_group::RowGroupSection
parquet_info=parquet_info.clone()
selected_row_group=selected_row_group
set_selected_row_group=set_selected_row_group
/>
{move || {
if parquet_info.row_group_count > 0 {
Some(view! {
<super::row_group::RowGroupSection
parquet_info=parquet_info.clone()
selected_row_group=selected_row_group
set_selected_row_group=set_selected_row_group
/>
})
} else {
None
}
}}

<h2 class="text-xl font-semibold mt-6 mb-4">"Features"</h2>
<div class="grid grid-cols-2 gap-2">
<div class="p-2 rounded ".to_owned()
+ if parquet_info.has_row_group_stats {
+ if has_row_group_stats {
"bg-green-100 text-green-800"
} else {
"bg-gray-100 text-gray-800"
}>
{if parquet_info.has_row_group_stats { "✓" } else { "✗" }}
{if has_row_group_stats { "✓" } else { "✗" }}
" Row Group Statistics"
</div>
<div class="p-2 rounded ".to_owned()
+ if parquet_info.has_column_index {
+ if has_column_index {
"bg-green-100 text-green-800"
} else {
"bg-gray-100 text-gray-800"
}>
{if parquet_info.has_column_index { "✓" } else { "✗" }} " Column Index"
{if has_column_index { "✓" } else { "✗" }} " Column Index"
</div>
<div class="p-2 rounded ".to_owned()
+ if parquet_info.has_page_index {
+ if has_page_index {
"bg-green-100 text-green-800"
} else {
"bg-gray-100 text-gray-800"
}>{if parquet_info.has_page_index { "✓" } else { "✗" }} " Page Index"</div>
}>{if has_page_index { "✓" } else { "✗" }} " Page Index"</div>
<div class="p-2 rounded ".to_owned()
+ if parquet_info.has_bloom_filter {
+ if has_bloom_filter {
"bg-green-100 text-green-800"
} else {
"bg-gray-100 text-gray-800"
}>
{if parquet_info.has_bloom_filter { "✓" } else { "✗" }} " Bloom Filter"
{if has_bloom_filter { "✓" } else { "✗" }} " Bloom Filter"
</div>
</div>
</div>
Expand Down
29 changes: 17 additions & 12 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ impl ParquetInfo {
metadata.file_metadata().schema_descr(),
metadata.file_metadata().key_value_metadata(),
)?;
let first_row_group = metadata.row_groups().first();
let first_column = first_row_group.map(|rg| rg.columns().first()).flatten();

Ok(Self {
file_size: compressed_size,
Expand All @@ -67,14 +69,14 @@ impl ParquetInfo {
row_group_count: metadata.num_row_groups() as u64,
row_count: metadata.file_metadata().num_rows() as u64,
columns: schema.fields.len() as u64,
has_row_group_stats: metadata.row_group(0).column(0).statistics().is_some(),
has_row_group_stats: first_column
.map(|c| c.statistics().is_some())
.unwrap_or(false),
has_column_index: metadata.column_index().is_some(),
has_page_index: metadata.offset_index().is_some(),
has_bloom_filter: metadata
.row_group(0)
.column(0)
.bloom_filter_offset()
.is_some(),
has_bloom_filter: first_column
.map(|c| c.bloom_filter_offset().is_some())
.unwrap_or(false),
schema: Arc::new(schema),
metadata: Arc::new(metadata),
metadata_len,
Expand Down Expand Up @@ -284,16 +286,19 @@ fn App() -> impl IntoView {
.map(|_| {
match file_content.get_untracked() {
Some(info) => {

view! {
<QueryInput
if info.row_group_count > 0 {
view! {
<QueryInput
user_query=user_query
set_user_query=set_user_query
file_name=file_name
execute_query=Arc::new(execute_query)
schema=info.schema
error_message=set_error_message
/>
schema=info.schema
error_message=set_error_message
/>
}
} else {
view! {}.into_view()
}
}
None => view! {}.into_view(),
Expand Down
2 changes: 1 addition & 1 deletion src/query_input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ fn process_user_input(
web_sys::console::log_1(&format!("Processing user input: {}", input).into());

let prompt = format!(
"Generate a SQL query to answer the following question: {}. You should generate PostgreSQL SQL dialect, all field names should be double quoted, and the output SQL should be executable, be careful about the available columns. The table name is: {}, the schema of the table is: {}. ",
"Generate a SQL query to answer the following question: {}. You should generate PostgreSQL SQL dialect, all field names and table names should be double quoted, and the output SQL should be executable, be careful about the available columns. The table name is: {}, the schema of the table is: {}. ",
input, file_name, schema_str
);
web_sys::console::log_1(&prompt.clone().into());
Expand Down
47 changes: 46 additions & 1 deletion src/query_results.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::sync::Arc;

use arrow::array::Array;
use arrow::array::{types::*, Array};
use arrow::datatypes::DataType;
use arrow::record_batch::RecordBatch;
use datafusion::{
Expand Down Expand Up @@ -163,6 +163,51 @@ impl ArrayExt for dyn Array {
let value = array.value(index);
String::from_utf8_lossy(value).to_string()
}
// Dictionary-encoded column: dispatch on the integer key type, downcast the
// array to the matching typed dictionary array, and render the entry of the
// dictionary's values array selected by the key stored at `index`.
//
// NOTE(review): the result of `key(index)` is unwrapped with
// `unwrap_or_default()`, so a missing key silently becomes the default value
// (presumably 0) and the row would render as the first dictionary entry
// instead of a null marker. Confirm whether null rows are filtered out before
// this arm is reached.
DataType::Dictionary(key_type, _) => {
match key_type.as_ref() {
// Signed integer key types.
DataType::Int8 => {
let array = as_dictionary_array::<Int8Type>(array);
let values = array.values();
values.value_to_string(array.key(index).unwrap_or_default())
}
DataType::Int16 => {
let array = as_dictionary_array::<Int16Type>(array);
let values = array.values();
values.value_to_string(array.key(index).unwrap_or_default())
}
DataType::Int32 => {
let array = as_dictionary_array::<Int32Type>(array);
let values = array.values();
values.value_to_string(array.key(index).unwrap_or_default())
}
DataType::Int64 => {
let array = as_dictionary_array::<Int64Type>(array);
let values = array.values();
values.value_to_string(array.key(index).unwrap_or_default())
}
// Unsigned integer key types.
DataType::UInt8 => {
let array = as_dictionary_array::<UInt8Type>(array);
let values = array.values();
values.value_to_string(array.key(index).unwrap_or_default())
}
DataType::UInt16 => {
let array = as_dictionary_array::<UInt16Type>(array);
let values = array.values();
values.value_to_string(array.key(index).unwrap_or_default())
}
DataType::UInt32 => {
let array = as_dictionary_array::<UInt32Type>(array);
let values = array.values();
values.value_to_string(array.key(index).unwrap_or_default())
}
DataType::UInt64 => {
let array = as_dictionary_array::<UInt64Type>(array);
let values = array.values();
values.value_to_string(array.key(index).unwrap_or_default())
}
// Any other key type is reported in the returned string.
_ => format!("Unsupported dictionary key type {}", key_type),
}
}
t => format!("Unsupported datatype {}", t)
)
}
Expand Down
16 changes: 13 additions & 3 deletions src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,23 @@ enum SortField {
pub fn SchemaSection(parquet_info: super::ParquetInfo) -> impl IntoView {
let schema = parquet_info.schema.clone();
let metadata = parquet_info.metadata.clone();
let mut column_info =
vec![(0, 0, metadata.row_group(0).column(0).compression()); schema.fields.len()];
let mut column_info = vec![
(
0,
0,
metadata
.row_groups()
.first()
.map(|rg| rg.columns().first().map(|c| c.compression()))
.flatten(),
);
schema.fields.len()
];
for rg in metadata.row_groups() {
for (i, col) in rg.columns().iter().enumerate() {
column_info[i].0 += col.compressed_size() as u64;
column_info[i].1 += col.uncompressed_size() as u64;
column_info[i].2 = col.compression();
column_info[i].2 = Some(col.compression());
}
}

Expand Down

0 comments on commit eae516d

Please sign in to comment.