Skip to content

Commit b444ea7

Browse files
alambetseidlmbrobbel
authored
Refactor: extract FooterTail from ParquetMetadataReader (#8437)
# Which issue does this PR close? - Part of #8000 - Prep PR for #8340, to make it easier to review # Rationale for this change In #8340 I am trying to separate three pieces of logic: the "IO", the "where is the metadata in the file", and the "decode thrift into Rust structures". I want to make it as easy as possible to review, so I split it into pieces, but you can see #8340 for how it all fits together # What changes are included in this PR? This PR moves the code that parses the 8-byte Parquet file footer, `FooterTail`, into its own module with its own constructor # Are these changes tested? Yes, by CI # Are there any user-facing changes? No, this is an entirely internal reorganization and I left a `pub use` re-export --------- Co-authored-by: Ed Seidl <etseidl@users.noreply.github.com> Co-authored-by: Matthijs Brobbel <m1brobbel@gmail.com>
1 parent e2db7d4 commit b444ea7

File tree

3 files changed

+119
-50
lines changed

3 files changed

+119
-50
lines changed
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::errors::{ParquetError, Result};
19+
use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER};
20+
21+
/// Parsed Parquet footer tail (last 8 bytes of a Parquet file)
22+
///
23+
/// There are 8 bytes at the end of the Parquet footer with the following layout:
24+
/// * 4 bytes for the metadata length
25+
/// * 4 bytes for the magic bytes 'PAR1' or 'PARE' (encrypted footer)
26+
///
27+
/// ```text
28+
/// +-----+------------------+
29+
/// | len | 'PAR1' or 'PARE' |
30+
/// +-----+------------------+
31+
/// ```
32+
///
33+
/// # Examples
34+
/// ```
35+
/// # use parquet::file::metadata::FooterTail;
36+
/// // a non encrypted footer with 28 bytes of metadata
37+
/// let last_8_bytes: [u8; 8] = [0x1C, 0x00, 0x00, 0x00, b'P', b'A', b'R', b'1'];
38+
/// let footer_tail = FooterTail::try_from(last_8_bytes).unwrap();
39+
/// assert_eq!(footer_tail.metadata_length(), 28);
40+
/// assert_eq!(footer_tail.is_encrypted_footer(), false);
41+
/// ```
42+
///
43+
/// ```
44+
/// # use parquet::file::metadata::FooterTail;
45+
/// // an encrypted footer with 512 bytes of metadata
46+
/// let last_8_bytes = vec![0x00, 0x02, 0x00, 0x00, b'P', b'A', b'R', b'E'];
47+
/// let footer_tail = FooterTail::try_from(&last_8_bytes[..]).unwrap();
48+
/// assert_eq!(footer_tail.metadata_length(), 512);
49+
/// assert_eq!(footer_tail.is_encrypted_footer(), true);
50+
/// ```
51+
///
52+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
53+
pub struct FooterTail {
54+
/// Length of the footer metadata in bytes, decoded from the first four
/// bytes of the tail as a little-endian `u32`.
metadata_length: usize,
55+
/// `true` when the trailing magic bytes equal `PARQUET_MAGIC_ENCR_FOOTER`
/// (encrypted footer), `false` when they equal `PARQUET_MAGIC`.
encrypted_footer: bool,
56+
}
57+
58+
impl FooterTail {
59+
/// Try to decode the footer tail from the given 8 bytes
60+
pub fn try_new(slice: &[u8; FOOTER_SIZE]) -> Result<FooterTail> {
61+
let magic = &slice[4..];
62+
let encrypted_footer = if magic == PARQUET_MAGIC_ENCR_FOOTER {
63+
true
64+
} else if magic == PARQUET_MAGIC {
65+
false
66+
} else {
67+
return Err(general_err!("Invalid Parquet file. Corrupt footer"));
68+
};
69+
// get the metadata length from the footer
70+
let metadata_len = u32::from_le_bytes(slice[..4].try_into().unwrap());
71+
72+
Ok(FooterTail {
73+
// u32 won't be larger than usize in most cases
74+
metadata_length: metadata_len.try_into()?,
75+
encrypted_footer,
76+
})
77+
}
78+
79+
/// The length of the footer metadata in bytes
80+
pub fn metadata_length(&self) -> usize {
81+
self.metadata_length
82+
}
83+
84+
/// Whether the footer metadata is encrypted
85+
pub fn is_encrypted_footer(&self) -> bool {
86+
self.encrypted_footer
87+
}
88+
}
89+
90+
impl TryFrom<[u8; FOOTER_SIZE]> for FooterTail {
91+
type Error = ParquetError;
92+
93+
fn try_from(value: [u8; FOOTER_SIZE]) -> Result<Self> {
94+
Self::try_new(&value)
95+
}
96+
}
97+
98+
impl TryFrom<&[u8]> for FooterTail {
99+
type Error = ParquetError;
100+
101+
fn try_from(value: &[u8]) -> Result<Self> {
102+
if value.len() != FOOTER_SIZE {
103+
return Err(general_err!(
104+
"Invalid footer length {}, expected {FOOTER_SIZE}",
105+
value.len()
106+
));
107+
}
108+
let slice: &[u8; FOOTER_SIZE] = value.try_into().unwrap();
109+
Self::try_new(slice)
110+
}
111+
}

parquet/src/file/metadata/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
//!
9191
//! * Same name, different struct
9292
//! ```
93+
mod footer_tail;
9394
mod memory;
9495
mod parser;
9596
mod push_decoder;
@@ -121,8 +122,9 @@ use crate::schema::types::{
121122
};
122123
#[cfg(feature = "encryption")]
123124
use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
125+
pub use footer_tail::FooterTail;
124126
pub use push_decoder::ParquetMetaDataPushDecoder;
125-
pub use reader::{FooterTail, PageIndexPolicy, ParquetMetaDataReader};
127+
pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
126128
use std::ops::Range;
127129
use std::sync::Arc;
128130
pub use writer::ParquetMetaDataWriter;

parquet/src/file/metadata/reader.rs

Lines changed: 5 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ use std::{io::Read, ops::Range};
2020
#[cfg(feature = "encryption")]
2121
use crate::encryption::decrypt::FileDecryptionProperties;
2222
use crate::errors::{ParquetError, Result};
23-
use crate::file::metadata::ParquetMetaData;
23+
use crate::file::metadata::{FooterTail, ParquetMetaData};
2424
use crate::file::page_index::index_reader::acc_range;
2525
use crate::file::reader::ChunkReader;
26-
use crate::file::{FOOTER_SIZE, PARQUET_MAGIC, PARQUET_MAGIC_ENCR_FOOTER};
26+
use crate::file::FOOTER_SIZE;
2727

2828
#[cfg(all(feature = "async", feature = "arrow"))]
2929
use crate::arrow::async_reader::{MetadataFetch, MetadataSuffixFetch};
@@ -100,26 +100,6 @@ impl From<bool> for PageIndexPolicy {
100100
}
101101
}
102102

103-
/// Describes how the footer metadata is stored
104-
///
105-
/// This is parsed from the last 8 bytes of the Parquet file
106-
pub struct FooterTail {
107-
metadata_length: usize,
108-
encrypted_footer: bool,
109-
}
110-
111-
impl FooterTail {
112-
/// The length of the footer metadata in bytes
113-
pub fn metadata_length(&self) -> usize {
114-
self.metadata_length
115-
}
116-
117-
/// Whether the footer metadata is encrypted
118-
pub fn is_encrypted_footer(&self) -> bool {
119-
self.encrypted_footer
120-
}
121-
}
122-
123103
impl ParquetMetaDataReader {
124104
/// Create a new [`ParquetMetaDataReader`]
125105
pub fn new() -> Self {
@@ -720,39 +700,15 @@ impl ParquetMetaDataReader {
720700
}
721701
}
722702

723-
/// Decodes the end of the Parquet footer
724-
///
725-
/// There are 8 bytes at the end of the Parquet footer with the following layout:
726-
/// * 4 bytes for the metadata length
727-
/// * 4 bytes for the magic bytes 'PAR1' or 'PARE' (encrypted footer)
728-
///
729-
/// ```text
730-
/// +-----+------------------+
731-
/// | len | 'PAR1' or 'PARE' |
732-
/// +-----+------------------+
733-
/// ```
703+
/// Decodes a [`FooterTail`] from the provided 8-byte slice.
734704
pub fn decode_footer_tail(slice: &[u8; FOOTER_SIZE]) -> Result<FooterTail> {
735-
let magic = &slice[4..];
736-
let encrypted_footer = if magic == PARQUET_MAGIC_ENCR_FOOTER {
737-
true
738-
} else if magic == PARQUET_MAGIC {
739-
false
740-
} else {
741-
return Err(general_err!("Invalid Parquet file. Corrupt footer"));
742-
};
743-
// get the metadata length from the footer
744-
let metadata_len = u32::from_le_bytes(slice[..4].try_into().unwrap());
745-
Ok(FooterTail {
746-
// u32 won't be larger than usize in most cases
747-
metadata_length: metadata_len as usize,
748-
encrypted_footer,
749-
})
705+
FooterTail::try_new(slice)
750706
}
751707

752708
/// Decodes the Parquet footer, returning the metadata length in bytes
753709
#[deprecated(since = "54.3.0", note = "Use decode_footer_tail instead")]
754710
pub fn decode_footer(slice: &[u8; FOOTER_SIZE]) -> Result<usize> {
755-
Self::decode_footer_tail(slice).map(|f| f.metadata_length)
711+
Self::decode_footer_tail(slice).map(|f| f.metadata_length())
756712
}
757713

758714
/// Decodes [`ParquetMetaData`] from the provided bytes.

0 commit comments

Comments
 (0)