Skip to content

Commit 60ac1cc

Browse files
authored
fix: Remove datafusion.execution.parquet.cache_metadata config (#17062)
* fix: Remove `datafusion.execution.parquet.cache_metadata` config * prettier * fix prettier? * fix * fix config * fix test behaviour * fix
1 parent df153c2 commit 60ac1cc

File tree

15 files changed

+10
-75
lines changed

15 files changed

+10
-75
lines changed

datafusion/common/src/config.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -559,12 +559,6 @@ config_namespace! {
559559
/// (reading) Use any available bloom filters when reading parquet files
560560
pub bloom_filter_on_read: bool, default = true
561561

562-
/// (reading) Whether or not to enable the caching of embedded metadata of Parquet files
563-
/// (footer and page metadata). Enabling it can offer substantial performance improvements
564-
/// for repeated queries over large files. By default, the cache is automatically
565-
/// invalidated when the underlying file is modified.
566-
pub cache_metadata: bool, default = false
567-
568562
// The following options affect writing to parquet files
569563
// and map to parquet::file::properties::WriterProperties
570564

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,6 @@ impl ParquetOptions {
234234
binary_as_string: _, // not used for writer props
235235
coerce_int96: _, // not used for writer props
236236
skip_arrow_metadata: _,
237-
cache_metadata: _,
238237
} = self;
239238

240239
let mut builder = WriterProperties::builder()
@@ -502,7 +501,6 @@ mod tests {
502501
binary_as_string: defaults.binary_as_string,
503502
skip_arrow_metadata: defaults.skip_arrow_metadata,
504503
coerce_int96: None,
505-
cache_metadata: defaults.cache_metadata,
506504
}
507505
}
508506

@@ -613,7 +611,6 @@ mod tests {
613611
binary_as_string: global_options_defaults.binary_as_string,
614612
skip_arrow_metadata: global_options_defaults.skip_arrow_metadata,
615613
coerce_int96: None,
616-
cache_metadata: global_options_defaults.cache_metadata,
617614
},
618615
column_specific_options,
619616
key_value_metadata,

datafusion/core/src/datasource/file_format/options.rs

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -254,11 +254,6 @@ pub struct ParquetReadOptions<'a> {
254254
pub file_sort_order: Vec<Vec<SortExpr>>,
255255
/// Properties for decryption of Parquet files that use modular encryption
256256
pub file_decryption_properties: Option<ConfigFileDecryptionProperties>,
257-
/// Whether or not to enable the caching of embedded metadata of this Parquet file (footer and
258-
/// page metadata). Enabling it can offer substantial performance improvements for repeated
259-
/// queries over large files. By default, the cache is automatically invalidated when the
260-
/// underlying file is modified.
261-
pub cache_metadata: Option<bool>,
262257
}
263258

264259
impl Default for ParquetReadOptions<'_> {
@@ -271,7 +266,6 @@ impl Default for ParquetReadOptions<'_> {
271266
schema: None,
272267
file_sort_order: vec![],
273268
file_decryption_properties: None,
274-
cache_metadata: None,
275269
}
276270
}
277271
}
@@ -331,12 +325,6 @@ impl<'a> ParquetReadOptions<'a> {
331325
self.file_decryption_properties = Some(file_decryption_properties);
332326
self
333327
}
334-
335-
/// Specify whether to enable or not metadata caching
336-
pub fn cache_metadata(mut self, cache_metadata: bool) -> Self {
337-
self.cache_metadata = Some(cache_metadata);
338-
self
339-
}
340328
}
341329

342330
/// Options that control the reading of ARROW files.
@@ -602,9 +590,6 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> {
602590
if let Some(file_decryption_properties) = &self.file_decryption_properties {
603591
options.crypto.file_decryption = Some(file_decryption_properties.clone());
604592
}
605-
if let Some(cache_metadata) = self.cache_metadata {
606-
options.global.cache_metadata = cache_metadata;
607-
}
608593
let mut file_format = ParquetFormat::new().with_options(options);
609594

610595
if let Some(parquet_pruning) = self.parquet_pruning {

datafusion/core/tests/parquet/page_pruning.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -903,8 +903,8 @@ async fn without_pushdown_filter() {
903903
)
904904
.unwrap();
905905

906-
// Without filter will not read pageIndex.
907-
assert!(bytes_scanned_with_filter > bytes_scanned_without_filter);
906+
// Same amount of bytes are scanned when defaulting to cache parquet metadata
907+
assert!(bytes_scanned_with_filter == bytes_scanned_without_filter);
908908
}
909909

910910
#[tokio::test]

datafusion/datasource-parquet/src/file_format.rs

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -447,17 +447,14 @@ impl FileFormat for ParquetFormat {
447447

448448
let mut source = ParquetSource::new(self.options.clone());
449449

450-
// Use the CachedParquetFileReaderFactory when metadata caching is enabled
451-
if self.options.global.cache_metadata {
452-
let metadata_cache =
453-
state.runtime_env().cache_manager.get_file_metadata_cache();
454-
let store = state
455-
.runtime_env()
456-
.object_store(conf.object_store_url.clone())?;
457-
let cached_parquet_read_factory =
458-
Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache));
459-
source = source.with_parquet_file_reader_factory(cached_parquet_read_factory);
460-
}
450+
// Use the CachedParquetFileReaderFactory
451+
let metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache();
452+
let store = state
453+
.runtime_env()
454+
.object_store(conf.object_store_url.clone())?;
455+
let cached_parquet_read_factory =
456+
Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache));
457+
source = source.with_parquet_file_reader_factory(cached_parquet_read_factory);
461458

462459
if let Some(metadata_size_hint) = metadata_size_hint {
463460
source = source.with_metadata_size_hint(metadata_size_hint)

datafusion/proto-common/proto/datafusion_common.proto

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,6 @@ message ParquetOptions {
530530
bool schema_force_view_types = 28; // default = false
531531
bool binary_as_string = 29; // default = false
532532
bool skip_arrow_metadata = 30; // default = false
533-
bool cache_metadata = 33; // default = false
534533

535534
oneof metadata_size_hint_opt {
536535
uint64 metadata_size_hint = 4;

datafusion/proto-common/src/from_proto/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -999,7 +999,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
999999
protobuf::parquet_options::CoerceInt96Opt::CoerceInt96(v) => Some(v),
10001000
}).unwrap_or(None),
10011001
skip_arrow_metadata: value.skip_arrow_metadata,
1002-
cache_metadata: value.cache_metadata,
10031002
})
10041003
}
10051004
}

datafusion/proto-common/src/generated/pbjson.rs

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5568,9 +5568,6 @@ impl serde::Serialize for ParquetOptions {
55685568
if self.skip_arrow_metadata {
55695569
len += 1;
55705570
}
5571-
if self.cache_metadata {
5572-
len += 1;
5573-
}
55745571
if self.dictionary_page_size_limit != 0 {
55755572
len += 1;
55765573
}
@@ -5670,9 +5667,6 @@ impl serde::Serialize for ParquetOptions {
56705667
if self.skip_arrow_metadata {
56715668
struct_ser.serialize_field("skipArrowMetadata", &self.skip_arrow_metadata)?;
56725669
}
5673-
if self.cache_metadata {
5674-
struct_ser.serialize_field("cacheMetadata", &self.cache_metadata)?;
5675-
}
56765670
if self.dictionary_page_size_limit != 0 {
56775671
#[allow(clippy::needless_borrow)]
56785672
#[allow(clippy::needless_borrows_for_generic_args)]
@@ -5810,8 +5804,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
58105804
"binaryAsString",
58115805
"skip_arrow_metadata",
58125806
"skipArrowMetadata",
5813-
"cache_metadata",
5814-
"cacheMetadata",
58155807
"dictionary_page_size_limit",
58165808
"dictionaryPageSizeLimit",
58175809
"data_page_row_count_limit",
@@ -5858,7 +5850,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
58585850
SchemaForceViewTypes,
58595851
BinaryAsString,
58605852
SkipArrowMetadata,
5861-
CacheMetadata,
58625853
DictionaryPageSizeLimit,
58635854
DataPageRowCountLimit,
58645855
MaxRowGroupSize,
@@ -5910,7 +5901,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
59105901
"schemaForceViewTypes" | "schema_force_view_types" => Ok(GeneratedField::SchemaForceViewTypes),
59115902
"binaryAsString" | "binary_as_string" => Ok(GeneratedField::BinaryAsString),
59125903
"skipArrowMetadata" | "skip_arrow_metadata" => Ok(GeneratedField::SkipArrowMetadata),
5913-
"cacheMetadata" | "cache_metadata" => Ok(GeneratedField::CacheMetadata),
59145904
"dictionaryPageSizeLimit" | "dictionary_page_size_limit" => Ok(GeneratedField::DictionaryPageSizeLimit),
59155905
"dataPageRowCountLimit" | "data_page_row_count_limit" => Ok(GeneratedField::DataPageRowCountLimit),
59165906
"maxRowGroupSize" | "max_row_group_size" => Ok(GeneratedField::MaxRowGroupSize),
@@ -5960,7 +5950,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
59605950
let mut schema_force_view_types__ = None;
59615951
let mut binary_as_string__ = None;
59625952
let mut skip_arrow_metadata__ = None;
5963-
let mut cache_metadata__ = None;
59645953
let mut dictionary_page_size_limit__ = None;
59655954
let mut data_page_row_count_limit__ = None;
59665955
let mut max_row_group_size__ = None;
@@ -6081,12 +6070,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
60816070
}
60826071
skip_arrow_metadata__ = Some(map_.next_value()?);
60836072
}
6084-
GeneratedField::CacheMetadata => {
6085-
if cache_metadata__.is_some() {
6086-
return Err(serde::de::Error::duplicate_field("cacheMetadata"));
6087-
}
6088-
cache_metadata__ = Some(map_.next_value()?);
6089-
}
60906073
GeneratedField::DictionaryPageSizeLimit => {
60916074
if dictionary_page_size_limit__.is_some() {
60926075
return Err(serde::de::Error::duplicate_field("dictionaryPageSizeLimit"));
@@ -6196,7 +6179,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
61966179
schema_force_view_types: schema_force_view_types__.unwrap_or_default(),
61976180
binary_as_string: binary_as_string__.unwrap_or_default(),
61986181
skip_arrow_metadata: skip_arrow_metadata__.unwrap_or_default(),
6199-
cache_metadata: cache_metadata__.unwrap_or_default(),
62006182
dictionary_page_size_limit: dictionary_page_size_limit__.unwrap_or_default(),
62016183
data_page_row_count_limit: data_page_row_count_limit__.unwrap_or_default(),
62026184
max_row_group_size: max_row_group_size__.unwrap_or_default(),

datafusion/proto-common/src/generated/prost.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -795,9 +795,6 @@ pub struct ParquetOptions {
795795
/// default = false
796796
#[prost(bool, tag = "30")]
797797
pub skip_arrow_metadata: bool,
798-
/// default = false
799-
#[prost(bool, tag = "33")]
800-
pub cache_metadata: bool,
801798
#[prost(uint64, tag = "12")]
802799
pub dictionary_page_size_limit: u64,
803800
#[prost(uint64, tag = "18")]

datafusion/proto-common/src/to_proto/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -842,7 +842,6 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
842842
binary_as_string: value.binary_as_string,
843843
skip_arrow_metadata: value.skip_arrow_metadata,
844844
coerce_int96_opt: value.coerce_int96.clone().map(protobuf::parquet_options::CoerceInt96Opt::CoerceInt96),
845-
cache_metadata: value.cache_metadata,
846845
})
847846
}
848847
}

0 commit comments

Comments
 (0)