fix: Parquet several smaller issues #18325

Merged 9 commits on Aug 23, 2024
26 changes: 7 additions & 19 deletions crates/polars-io/src/parquet/read/read_impl.rs
```diff
@@ -266,8 +266,8 @@ fn rg_to_dfs_prefiltered(
     // column indexes of the schema.
     let mut live_idx_to_col_idx = Vec::with_capacity(num_live_columns);
     let mut dead_idx_to_col_idx = Vec::with_capacity(num_dead_columns);
-    for (i, col) in file_metadata.schema().columns().iter().enumerate() {
-        if live_variables.contains(col.path_in_schema[0].deref()) {
+    for (i, field) in schema.fields.iter().enumerate() {
+        if live_variables.contains(&field.name[..]) {
             live_idx_to_col_idx.push(i);
         } else {
             dead_idx_to_col_idx.push(i);
```
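
The point of this hunk is where the column names come from: the live/dead indices are now computed over the arrow schema's top-level fields instead of the parquet metadata's leaf-column paths. The partition itself is unchanged; a minimal, self-contained sketch of it (function and parameter names hypothetical):

```rust
use std::collections::HashSet;

// Walk the fields in schema order and split their positions by whether
// the predicate references them ("live") or not ("dead").
fn partition_columns(fields: &[&str], live_variables: &HashSet<&str>) -> (Vec<usize>, Vec<usize>) {
    let mut live_idx_to_col_idx = Vec::with_capacity(fields.len());
    let mut dead_idx_to_col_idx = Vec::with_capacity(fields.len());
    for (i, name) in fields.iter().enumerate() {
        if live_variables.contains(name) {
            live_idx_to_col_idx.push(i);
        } else {
            dead_idx_to_col_idx.push(i);
        }
    }
    (live_idx_to_col_idx, dead_idx_to_col_idx)
}

fn main() {
    let live: HashSet<&str> = HashSet::from(["a", "c"]);
    let (live_idx, dead_idx) = partition_columns(&["a", "b", "c", "d"], &live);
    assert_eq!(live_idx, [0, 2]);
    assert_eq!(dead_idx, [1, 3]);
}
```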
```diff
@@ -406,22 +406,10 @@ fn rg_to_dfs_prefiltered(
         })
         .collect::<PolarsResult<Vec<_>>>()?;
 
-    let mut rearranged_schema: Schema = Schema::new();
-    if let Some(rc) = &row_index {
-        rearranged_schema.insert_at_index(
-            0,
-            SmartString::from(rc.name.deref()),
-            IdxType::get_dtype(),
-        )?;
-    }
-    for i in live_idx_to_col_idx.iter().copied() {
-        rearranged_schema.insert_at_index(
-            rearranged_schema.len(),
-            schema.fields[i].name.clone().into(),
-            schema.fields[i].data_type().into(),
-        )?;
-    }
-    rearranged_schema.merge(Schema::from(schema.as_ref()));
+    let Some(df) = dfs.first().map(|(_, df)| df) else {
+        return Ok(Vec::new());
+    };
+    let rearranged_schema = df.schema();
 
     rg_columns
         .par_chunks_exact_mut(num_dead_columns)
```
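
Instead of reassembling the schema by hand (row index column first, then each live column), the rewritten code reads it off the first materialized DataFrame, which already carries the final column layout, and returns early when no row groups survived the prefilter. A sketch of that shape under stand-in types (`Frame` and its `columns` field are hypothetical, not the polars types):

```rust
struct Frame { columns: Vec<String> }

fn result_schema(dfs: &[(u64, Frame)]) -> Vec<String> {
    // If nothing survived the prefilter there is nothing to rearrange:
    // bail out early instead of building a schema from scratch.
    let Some(df) = dfs.first().map(|(_, df)| df) else {
        return Vec::new();
    };
    // Every per-row-group frame shares the same layout, so the first
    // one's schema describes the final result.
    df.columns.clone()
}

fn main() {
    let dfs = vec![(0u64, Frame { columns: vec!["row_nr".into(), "a".into(), "b".into()] })];
    assert_eq!(result_schema(&dfs), vec!["row_nr", "a", "b"]);
    assert!(result_schema(&[]).is_empty());
}
```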
```diff
@@ -520,7 +508,7 @@ fn rg_to_dfs_optionally_par_over_columns(
         materialize_hive_partitions(&mut df, schema.as_ref(), hive_partition_columns, rg_slice.1);
         apply_predicate(&mut df, predicate, true)?;
 
-        *previous_row_count = previous_row_count.checked_add(current_row_count).ok_or(
+        *previous_row_count = previous_row_count.checked_add(current_row_count).ok_or_else(||
             polars_err!(
                 ComputeError: "Parquet file produces more than pow(2, 32) rows; \
                 consider compiling with polars-bigidx feature (polars-u64-idx package on python), \
```
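
The `ok_or` to `ok_or_else` switch is the standard laziness fix: `ok_or` evaluates its error argument on every call, even on the overwhelmingly common success path, while `ok_or_else` only constructs the error when the addition actually overflows. A standalone illustration of the difference (names hypothetical):

```rust
fn expensive_error() -> String {
    // Stand-in for error construction that formats or allocates.
    "row count overflowed IdxSize".to_string()
}

fn add_checked(a: u32, b: u32) -> Result<u32, String> {
    // `ok_or(expensive_error())` would build the error string even when
    // the addition succeeds; `ok_or_else` defers it to the overflow case.
    a.checked_add(b).ok_or_else(expensive_error)
}

fn main() {
    assert_eq!(add_checked(1, 2), Ok(3));
    assert!(add_checked(u32::MAX, 1).is_err());
}
```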
11 changes: 11 additions & 0 deletions crates/polars-parquet/src/arrow/read/deserialize/nested.rs
```diff
@@ -202,6 +202,17 @@ pub fn columns_to_iter_recursive(
             )?
             .collect_n(filter)?
         },
+        Binary | Utf8 => {
+            init.push(InitNested::Primitive(field.is_nullable));
+            types.pop();
+            PageNestedDecoder::new(
+                columns.pop().unwrap(),
+                field.data_type().clone(),
+                binary::BinaryDecoder::<i32>::default(),
+                init,
+            )?
+            .collect_n(filter)?
+        },
         _ => match field.data_type().to_logical_type() {
             ArrowDataType::Dictionary(key_type, _, _) => {
                 init.push(InitNested::Primitive(field.is_nullable));
```

Two review comments from a member were attached to the new `Binary | Utf8` arm:

Member: We don't support those types. I think we directly use BinviewDecoder here if we can.

Member: We will leave this for this PR. But we should be able to remove the code that generates (Large)Utf8, (LargeBinary) in favor of BinaryView.
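
The second comment turns on offset widths: `BinaryDecoder::<i32>` pins this arm to 32-bit offsets (arrow `Binary`/`Utf8`), `LargeBinary`/`LargeUtf8` would take `i64`, and `BinaryView` avoids monotone offset buffers altogether. A toy illustration of why one generic decoder covers both offset-based families — hypothetical types, not the polars implementation:

```rust
// Offset-parameterized growable buffer: O = i32 models Binary/Utf8,
// O = i64 models LargeBinary/LargeUtf8.
#[derive(Default)]
struct BinaryDecoder<O> {
    offsets: Vec<O>,
    values: Vec<u8>,
}

impl<O: TryFrom<usize>> BinaryDecoder<O> {
    fn push(&mut self, item: &[u8]) {
        self.values.extend_from_slice(item);
        // Offsets mark the end of each value; i32 caps the buffer at 2 GiB,
        // which is why the Large* variants (and BinaryView) exist.
        let end = O::try_from(self.values.len())
            .unwrap_or_else(|_| panic!("value buffer exceeds the offset type's range"));
        self.offsets.push(end);
    }
}

fn main() {
    let mut small: BinaryDecoder<i32> = BinaryDecoder::default();
    small.push(b"abc");
    let mut large: BinaryDecoder<i64> = BinaryDecoder::default();
    large.push(b"abc");
    assert_eq!(small.values, large.values);
}
```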