|
15 | 15 | // specific language governing permissions and limitations |
16 | 16 | // under the License. |
17 | 17 |
|
18 | | -// a collection of generated structs used to parse thrift metadata |
| 18 | +//! Encryption support for Thrift serialization |
19 | 19 |
|
20 | 20 | use std::io::Write; |
21 | 21 |
|
22 | 22 | use crate::{ |
23 | | - basic::{Compression, EncodingMask}, |
24 | 23 | encryption::decrypt::{FileDecryptionProperties, FileDecryptor}, |
25 | 24 | errors::{ParquetError, Result}, |
26 | 25 | file::{ |
27 | 26 | column_crypto_metadata::ColumnCryptoMetaData, |
28 | 27 | metadata::{ |
29 | | - HeapSize, LevelHistogram, PageEncodingStats, ParquetMetaData, RowGroupMetaData, |
30 | | - thrift_gen::{ |
31 | | - GeospatialStatistics, SizeStatistics, Statistics, convert_geo_stats, convert_stats, |
32 | | - parquet_metadata_from_bytes, |
33 | | - }, |
| 28 | + HeapSize, ParquetMetaData, RowGroupMetaData, |
| 29 | + thrift::{parquet_metadata_from_bytes, read_column_metadata, validate_column_metadata}, |
34 | 30 | }, |
35 | 31 | }, |
36 | 32 | parquet_thrift::{ |
37 | 33 | ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, |
38 | 34 | ThriftCompactOutputProtocol, ThriftSliceInputProtocol, WriteThrift, WriteThriftField, |
39 | | - read_thrift_vec, |
40 | 35 | }, |
41 | 36 | thrift_struct, thrift_union, |
42 | 37 | }; |
@@ -180,41 +175,9 @@ fn row_group_from_encrypted_thrift( |
180 | 175 | })?; |
181 | 176 |
|
182 | 177 | // parse decrypted buffer and then replace fields in 'c' |
183 | | - let col_meta = read_column_metadata(decrypted_cc_buf.as_slice())?; |
184 | | - |
185 | | - let ( |
186 | | - unencoded_byte_array_data_bytes, |
187 | | - repetition_level_histogram, |
188 | | - definition_level_histogram, |
189 | | - ) = if let Some(size_stats) = col_meta.size_statistics { |
190 | | - ( |
191 | | - size_stats.unencoded_byte_array_data_bytes, |
192 | | - size_stats.repetition_level_histogram, |
193 | | - size_stats.definition_level_histogram, |
194 | | - ) |
195 | | - } else { |
196 | | - (None, None, None) |
197 | | - }; |
198 | | - |
199 | | - let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from); |
200 | | - let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from); |
201 | | - |
202 | | - c.encodings = col_meta.encodings; |
203 | | - c.compression = col_meta.codec; |
204 | | - c.num_values = col_meta.num_values; |
205 | | - c.total_uncompressed_size = col_meta.total_uncompressed_size; |
206 | | - c.total_compressed_size = col_meta.total_compressed_size; |
207 | | - c.data_page_offset = col_meta.data_page_offset; |
208 | | - c.index_page_offset = col_meta.index_page_offset; |
209 | | - c.dictionary_page_offset = col_meta.dictionary_page_offset; |
210 | | - c.statistics = convert_stats(d.physical_type(), col_meta.statistics)?; |
211 | | - c.encoding_stats = col_meta.encoding_stats; |
212 | | - c.bloom_filter_offset = col_meta.bloom_filter_offset; |
213 | | - c.bloom_filter_length = col_meta.bloom_filter_length; |
214 | | - c.unencoded_byte_array_data_bytes = unencoded_byte_array_data_bytes; |
215 | | - c.repetition_level_histogram = repetition_level_histogram; |
216 | | - c.definition_level_histogram = definition_level_histogram; |
217 | | - c.geo_statistics = convert_geo_stats(col_meta.geospatial_statistics); |
| 178 | + let mut prot = ThriftSliceInputProtocol::new(&decrypted_cc_buf); |
| 179 | + let mask = read_column_metadata(&mut prot, &mut c)?; |
| 180 | + validate_column_metadata(mask)?; |
218 | 181 |
|
219 | 182 | columns.push(c); |
220 | 183 | } else { |
@@ -373,172 +336,3 @@ fn get_file_decryptor( |
373 | 336 | )), |
374 | 337 | } |
375 | 338 | } |
376 | | - |
377 | | -#[derive(Clone, Debug, Eq, PartialEq)] |
378 | | -struct ColumnMetaData<'a> { |
379 | | - encodings: EncodingMask, |
380 | | - codec: Compression, |
381 | | - num_values: i64, |
382 | | - total_uncompressed_size: i64, |
383 | | - total_compressed_size: i64, |
384 | | - data_page_offset: i64, |
385 | | - index_page_offset: Option<i64>, |
386 | | - dictionary_page_offset: Option<i64>, |
387 | | - statistics: Option<Statistics<'a>>, |
388 | | - encoding_stats: Option<Vec<PageEncodingStats>>, |
389 | | - bloom_filter_offset: Option<i64>, |
390 | | - bloom_filter_length: Option<i32>, |
391 | | - size_statistics: Option<SizeStatistics>, |
392 | | - geospatial_statistics: Option<GeospatialStatistics>, |
393 | | -} |
394 | | - |
395 | | -fn read_column_metadata<'a>(buf: &'a [u8]) -> Result<ColumnMetaData<'a>> { |
396 | | - let mut prot = ThriftSliceInputProtocol::new(buf); |
397 | | - |
398 | | - let mut encodings: Option<EncodingMask> = None; |
399 | | - let mut codec: Option<Compression> = None; |
400 | | - let mut num_values: Option<i64> = None; |
401 | | - let mut total_uncompressed_size: Option<i64> = None; |
402 | | - let mut total_compressed_size: Option<i64> = None; |
403 | | - let mut data_page_offset: Option<i64> = None; |
404 | | - let mut index_page_offset: Option<i64> = None; |
405 | | - let mut dictionary_page_offset: Option<i64> = None; |
406 | | - let mut statistics: Option<Statistics> = None; |
407 | | - let mut encoding_stats: Option<Vec<PageEncodingStats>> = None; |
408 | | - let mut bloom_filter_offset: Option<i64> = None; |
409 | | - let mut bloom_filter_length: Option<i32> = None; |
410 | | - let mut size_statistics: Option<SizeStatistics> = None; |
411 | | - let mut geospatial_statistics: Option<GeospatialStatistics> = None; |
412 | | - |
413 | | - // `ColumnMetaData`. Read inline for performance sake. |
414 | | - // struct ColumnMetaData { |
415 | | - // 1: required Type type |
416 | | - // 2: required list<Encoding> encodings |
417 | | - // 3: required list<string> path_in_schema |
418 | | - // 4: required CompressionCodec codec |
419 | | - // 5: required i64 num_values |
420 | | - // 6: required i64 total_uncompressed_size |
421 | | - // 7: required i64 total_compressed_size |
422 | | - // 8: optional list<KeyValue> key_value_metadata |
423 | | - // 9: required i64 data_page_offset |
424 | | - // 10: optional i64 index_page_offset |
425 | | - // 11: optional i64 dictionary_page_offset |
426 | | - // 12: optional Statistics statistics; |
427 | | - // 13: optional list<PageEncodingStats> encoding_stats; |
428 | | - // 14: optional i64 bloom_filter_offset; |
429 | | - // 15: optional i32 bloom_filter_length; |
430 | | - // 16: optional SizeStatistics size_statistics; |
431 | | - // 17: optional GeospatialStatistics geospatial_statistics; |
432 | | - // } |
433 | | - let mut last_field_id = 0i16; |
434 | | - loop { |
435 | | - let field_ident = prot.read_field_begin(last_field_id)?; |
436 | | - if field_ident.field_type == FieldType::Stop { |
437 | | - break; |
438 | | - } |
439 | | - match field_ident.id { |
440 | | - // 1: type is never used, we can use the column descriptor |
441 | | - 2 => { |
442 | | - let val = EncodingMask::read_thrift(&mut prot)?; |
443 | | - encodings = Some(val); |
444 | | - } |
445 | | - // 3: path_in_schema is redundant |
446 | | - 4 => { |
447 | | - codec = Some(Compression::read_thrift(&mut prot)?); |
448 | | - } |
449 | | - 5 => { |
450 | | - num_values = Some(i64::read_thrift(&mut prot)?); |
451 | | - } |
452 | | - 6 => { |
453 | | - total_uncompressed_size = Some(i64::read_thrift(&mut prot)?); |
454 | | - } |
455 | | - 7 => { |
456 | | - total_compressed_size = Some(i64::read_thrift(&mut prot)?); |
457 | | - } |
458 | | - // 8: we don't expose this key value |
459 | | - 9 => { |
460 | | - data_page_offset = Some(i64::read_thrift(&mut prot)?); |
461 | | - } |
462 | | - 10 => { |
463 | | - index_page_offset = Some(i64::read_thrift(&mut prot)?); |
464 | | - } |
465 | | - 11 => { |
466 | | - dictionary_page_offset = Some(i64::read_thrift(&mut prot)?); |
467 | | - } |
468 | | - 12 => { |
469 | | - statistics = Some(Statistics::read_thrift(&mut prot)?); |
470 | | - } |
471 | | - 13 => { |
472 | | - let val = |
473 | | - read_thrift_vec::<PageEncodingStats, ThriftSliceInputProtocol>(&mut prot)?; |
474 | | - encoding_stats = Some(val); |
475 | | - } |
476 | | - 14 => { |
477 | | - bloom_filter_offset = Some(i64::read_thrift(&mut prot)?); |
478 | | - } |
479 | | - 15 => { |
480 | | - bloom_filter_length = Some(i32::read_thrift(&mut prot)?); |
481 | | - } |
482 | | - 16 => { |
483 | | - let val = SizeStatistics::read_thrift(&mut prot)?; |
484 | | - size_statistics = Some(val); |
485 | | - } |
486 | | - 17 => { |
487 | | - let val = GeospatialStatistics::read_thrift(&mut prot)?; |
488 | | - geospatial_statistics = Some(val); |
489 | | - } |
490 | | - _ => { |
491 | | - prot.skip(field_ident.field_type)?; |
492 | | - } |
493 | | - }; |
494 | | - last_field_id = field_ident.id; |
495 | | - } |
496 | | - |
497 | | - let Some(encodings) = encodings else { |
498 | | - return Err(ParquetError::General( |
499 | | - "Required field encodings is missing".to_owned(), |
500 | | - )); |
501 | | - }; |
502 | | - let Some(codec) = codec else { |
503 | | - return Err(ParquetError::General( |
504 | | - "Required field codec is missing".to_owned(), |
505 | | - )); |
506 | | - }; |
507 | | - let Some(num_values) = num_values else { |
508 | | - return Err(ParquetError::General( |
509 | | - "Required field num_values is missing".to_owned(), |
510 | | - )); |
511 | | - }; |
512 | | - let Some(total_uncompressed_size) = total_uncompressed_size else { |
513 | | - return Err(ParquetError::General( |
514 | | - "Required field total_uncompressed_size is missing".to_owned(), |
515 | | - )); |
516 | | - }; |
517 | | - let Some(total_compressed_size) = total_compressed_size else { |
518 | | - return Err(ParquetError::General( |
519 | | - "Required field total_compressed_size is missing".to_owned(), |
520 | | - )); |
521 | | - }; |
522 | | - let Some(data_page_offset) = data_page_offset else { |
523 | | - return Err(ParquetError::General( |
524 | | - "Required field data_page_offset is missing".to_owned(), |
525 | | - )); |
526 | | - }; |
527 | | - |
528 | | - Ok(ColumnMetaData { |
529 | | - encodings, |
530 | | - num_values, |
531 | | - codec, |
532 | | - total_uncompressed_size, |
533 | | - total_compressed_size, |
534 | | - data_page_offset, |
535 | | - index_page_offset, |
536 | | - dictionary_page_offset, |
537 | | - statistics, |
538 | | - encoding_stats, |
539 | | - bloom_filter_offset, |
540 | | - bloom_filter_length, |
541 | | - size_statistics, |
542 | | - geospatial_statistics, |
543 | | - }) |
544 | | -} |
0 commit comments