Skip to content

Commit f849968

Browse files
authored
Fix cannot load parquet table from spark in datafusion-cli. (#1665)
* fix cannot load parquet table from spark * add invalid file to log. * fix fmt
1 parent 1caf52a commit f849968

File tree

9 files changed

+43
-24
lines changed

9 files changed

+43
-24
lines changed

benchmarks/src/bin/tpch.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ use datafusion::{
5454
},
5555
};
5656

57+
use datafusion::datasource::file_format::csv::DEFAULT_CSV_EXTENSION;
58+
use datafusion::datasource::file_format::parquet::DEFAULT_PARQUET_EXTENSION;
5759
use structopt::StructOpt;
5860

5961
#[cfg(feature = "snmalloc")]
@@ -652,13 +654,13 @@ fn get_table(
652654
.with_delimiter(b',')
653655
.with_has_header(true);
654656

655-
(Arc::new(format), path, ".csv")
657+
(Arc::new(format), path, DEFAULT_CSV_EXTENSION)
656658
}
657659
"parquet" => {
658660
let path = format!("{}/{}", path, table);
659661
let format = ParquetFormat::default().with_enable_pruning(true);
660662

661-
(Arc::new(format), path, ".parquet")
663+
(Arc::new(format), path, DEFAULT_PARQUET_EXTENSION)
662664
}
663665
other => {
664666
unimplemented!("Invalid file format '{}'", other);

datafusion-examples/examples/parquet_sql_multiple_files.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use datafusion::datasource::file_format::parquet::ParquetFormat;
18+
use datafusion::datasource::file_format::parquet::{
19+
ParquetFormat, DEFAULT_PARQUET_EXTENSION,
20+
};
1921
use datafusion::datasource::listing::ListingOptions;
2022
use datafusion::error::Result;
2123
use datafusion::prelude::*;
@@ -33,7 +35,7 @@ async fn main() -> Result<()> {
3335
// Configure listing options
3436
let file_format = ParquetFormat::default().with_enable_pruning(true);
3537
let listing_options = ListingOptions {
36-
file_extension: ".parquet".to_owned(),
38+
file_extension: DEFAULT_PARQUET_EXTENSION.to_owned(),
3739
format: Arc::new(file_format),
3840
table_partition_cols: vec![],
3941
collect_stat: true,

datafusion/src/datasource/file_format/avro.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ use crate::physical_plan::file_format::{AvroExec, FileScanConfig};
3434
use crate::physical_plan::ExecutionPlan;
3535
use crate::physical_plan::Statistics;
3636

37+
/// The default file extension of avro files
38+
pub const DEFAULT_AVRO_EXTENSION: &str = ".avro";
3739
/// Avro `FileFormat` implementation.
3840
#[derive(Default, Debug)]
3941
pub struct AvroFormat;

datafusion/src/datasource/file_format/csv.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ use crate::physical_plan::file_format::{CsvExec, FileScanConfig};
3333
use crate::physical_plan::ExecutionPlan;
3434
use crate::physical_plan::Statistics;
3535

36+
/// The default file extension of csv files
37+
pub const DEFAULT_CSV_EXTENSION: &str = ".csv";
3638
/// Character Separated Value `FileFormat` implementation.
3739
#[derive(Debug)]
3840
pub struct CsvFormat {

datafusion/src/datasource/file_format/json.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ use crate::physical_plan::file_format::NdJsonExec;
3737
use crate::physical_plan::ExecutionPlan;
3838
use crate::physical_plan::Statistics;
3939

40+
/// The default file extension of json files
41+
pub const DEFAULT_JSON_EXTENSION: &str = ".json";
4042
/// New line delimited JSON `FileFormat` implementation.
4143
#[derive(Debug, Default)]
4244
pub struct JsonFormat {

datafusion/src/datasource/listing/table.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,8 @@ impl ListingTable {
266266
mod tests {
267267
use arrow::datatypes::DataType;
268268

269+
use crate::datasource::file_format::avro::DEFAULT_AVRO_EXTENSION;
270+
use crate::datasource::file_format::parquet::DEFAULT_PARQUET_EXTENSION;
269271
use crate::{
270272
datasource::{
271273
file_format::{avro::AvroFormat, parquet::ParquetFormat},
@@ -318,7 +320,7 @@ mod tests {
318320
let store = TestObjectStore::new_arc(&[("table/p1=v1/file.avro", 100)]);
319321

320322
let opt = ListingOptions {
321-
file_extension: ".avro".to_owned(),
323+
file_extension: DEFAULT_AVRO_EXTENSION.to_owned(),
322324
format: Arc::new(AvroFormat {}),
323325
table_partition_cols: vec![String::from("p1")],
324326
target_partitions: 4,
@@ -419,7 +421,7 @@ mod tests {
419421
let testdata = crate::test_util::parquet_test_data();
420422
let filename = format!("{}/{}", testdata, name);
421423
let opt = ListingOptions {
422-
file_extension: "parquet".to_owned(),
424+
file_extension: DEFAULT_PARQUET_EXTENSION.to_owned(),
423425
format: Arc::new(ParquetFormat::default()),
424426
table_partition_cols: vec![],
425427
target_partitions: 2,

datafusion/src/execution/context.rs

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ use crate::{
2424
datasource::listing::{ListingOptions, ListingTable},
2525
datasource::{
2626
file_format::{
27-
avro::AvroFormat,
28-
csv::CsvFormat,
27+
avro::{AvroFormat, DEFAULT_AVRO_EXTENSION},
28+
csv::{CsvFormat, DEFAULT_CSV_EXTENSION},
2929
parquet::{ParquetFormat, DEFAULT_PARQUET_EXTENSION},
3030
FileFormat,
3131
},
@@ -218,17 +218,20 @@ impl ExecutionContext {
218218
ref file_type,
219219
ref has_header,
220220
}) => {
221-
let file_format = match file_type {
222-
FileType::CSV => {
223-
Ok(Arc::new(CsvFormat::default().with_has_header(*has_header))
224-
as Arc<dyn FileFormat>)
225-
}
226-
FileType::Parquet => {
227-
Ok(Arc::new(ParquetFormat::default()) as Arc<dyn FileFormat>)
228-
}
229-
FileType::Avro => {
230-
Ok(Arc::new(AvroFormat::default()) as Arc<dyn FileFormat>)
231-
}
221+
let (file_format, file_extension) = match file_type {
222+
FileType::CSV => Ok((
223+
Arc::new(CsvFormat::default().with_has_header(*has_header))
224+
as Arc<dyn FileFormat>,
225+
DEFAULT_CSV_EXTENSION,
226+
)),
227+
FileType::Parquet => Ok((
228+
Arc::new(ParquetFormat::default()) as Arc<dyn FileFormat>,
229+
DEFAULT_PARQUET_EXTENSION,
230+
)),
231+
FileType::Avro => Ok((
232+
Arc::new(AvroFormat::default()) as Arc<dyn FileFormat>,
233+
DEFAULT_AVRO_EXTENSION,
234+
)),
232235
_ => Err(DataFusionError::NotImplemented(format!(
233236
"Unsupported file type {:?}.",
234237
file_type
@@ -238,7 +241,7 @@ impl ExecutionContext {
238241
let options = ListingOptions {
239242
format: file_format,
240243
collect_stat: false,
241-
file_extension: String::new(),
244+
file_extension: file_extension.to_owned(),
242245
target_partitions: self
243246
.state
244247
.lock()

datafusion/src/execution/options.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use std::sync::Arc;
2121

2222
use arrow::datatypes::{Schema, SchemaRef};
2323

24+
use crate::datasource::file_format::json::DEFAULT_JSON_EXTENSION;
2425
use crate::datasource::{
2526
file_format::{avro::AvroFormat, csv::CsvFormat},
2627
listing::ListingOptions,
@@ -173,7 +174,7 @@ impl<'a> Default for NdJsonReadOptions<'a> {
173174
Self {
174175
schema: None,
175176
schema_infer_max_records: 1000,
176-
file_extension: ".json",
177+
file_extension: DEFAULT_JSON_EXTENSION,
177178
}
178179
}
179180
}

datafusion/src/physical_plan/file_format/parquet.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ impl ExecutionPlan for ParquetExec {
221221
object_store.as_ref(),
222222
file_schema_ref,
223223
partition_index,
224-
partition,
224+
&partition,
225225
metrics,
226226
&projection,
227227
&pruning_predicate,
@@ -230,7 +230,10 @@ impl ExecutionPlan for ParquetExec {
230230
limit,
231231
partition_col_proj,
232232
) {
233-
println!("Parquet reader thread terminated due to error: {:?}", e);
233+
println!(
234+
"Parquet reader thread terminated due to error: {:?} for files: {:?}",
235+
e, partition
236+
);
234237
}
235238
});
236239

@@ -445,7 +448,7 @@ fn read_partition(
445448
object_store: &dyn ObjectStore,
446449
file_schema: SchemaRef,
447450
partition_index: usize,
448-
partition: Vec<PartitionedFile>,
451+
partition: &[PartitionedFile],
449452
metrics: ExecutionPlanMetricsSet,
450453
projection: &[usize],
451454
pruning_predicate: &Option<PruningPredicate>,

0 commit comments

Comments
 (0)