@@ -128,9 +128,21 @@ impl TableProviderFactory for ListingTableFactory {
128128 // if the path is a folder, rewrite it with a glob (e.g. 'path/*.parquet')
129129 // to only read the files the reader can understand
130130 if table_path. is_folder ( ) && table_path. get_glob ( ) . is_none ( ) {
131- table_path = table_path. with_glob (
132- format ! ( "*.{}" , cmd. file_type. to_lowercase( ) ) . as_ref ( ) ,
133- ) ?;
131+ // Since there are no files yet to infer an actual extension,
132+ // derive the pattern based on compression type.
133+ // So for gzipped CSV the pattern is `*.csv.gz`
134+ let glob = match options. format . compression_type ( ) {
135+ Some ( compression) => {
136+ match options. format . get_ext_with_compression ( & compression) {
137+ // Use glob based on `FileFormat` extension
138+ Ok ( ext) => format ! ( "*.{ext}" ) ,
139+ // Fallback to `file_type`, if not supported by `FileFormat`
140+ Err ( _) => format ! ( "*.{}" , cmd. file_type. to_lowercase( ) ) ,
141+ }
142+ }
143+ None => format ! ( "*.{}" , cmd. file_type. to_lowercase( ) ) ,
144+ } ;
145+ table_path = table_path. with_glob ( glob. as_ref ( ) ) ?;
134146 }
135147 let schema = options. infer_schema ( session_state, & table_path) . await ?;
136148 let df_schema = Arc :: clone ( & schema) . to_dfschema ( ) ?;
@@ -175,13 +187,15 @@ fn get_extension(path: &str) -> String {
175187
176188#[ cfg( test) ]
177189mod tests {
190+ use glob:: Pattern ;
178191 use std:: collections:: HashMap ;
179192
180193 use super :: * ;
181194 use crate :: {
182195 datasource:: file_format:: csv:: CsvFormat , execution:: context:: SessionContext ,
183196 } ;
184197
198+ use datafusion_common:: parsers:: CompressionTypeVariant ;
185199 use datafusion_common:: { Constraints , DFSchema , TableReference } ;
186200
187201 #[ tokio:: test]
@@ -264,4 +278,101 @@ mod tests {
264278 let listing_options = listing_table. options ( ) ;
265279 assert_eq ! ( ".tbl" , listing_options. file_extension) ;
266280 }
281+
282+ /// Validates that a `CreateExternalTable` whose options set `format.compression` to gzip,
283+ /// pointed at a directory location, yields a table path with the `*.csv.gz` glob
284+ #[ tokio:: test]
285+ async fn test_create_using_folder_with_compression ( ) {
286+ let dir = tempfile:: tempdir ( ) . unwrap ( ) ;
287+
288+ let factory = ListingTableFactory :: new ( ) ;
289+ let context = SessionContext :: new ( ) ;
290+ let state = context. state ( ) ;
291+ let name = TableReference :: bare ( "foo" ) ;
292+
293+ let mut options = HashMap :: new ( ) ;
294+ options. insert ( "format.schema_infer_max_rec" . to_owned ( ) , "1000" . to_owned ( ) ) ;
295+ options. insert ( "format.has_header" . into ( ) , "true" . into ( ) ) ;
296+ options. insert ( "format.compression" . into ( ) , "gzip" . into ( ) ) ;
297+ let cmd = CreateExternalTable {
298+ name,
299+ location : dir. path ( ) . to_str ( ) . unwrap ( ) . to_string ( ) ,
300+ file_type : "csv" . to_string ( ) ,
301+ schema : Arc :: new ( DFSchema :: empty ( ) ) ,
302+ table_partition_cols : vec ! [ ] ,
303+ if_not_exists : false ,
304+ temporary : false ,
305+ definition : None ,
306+ order_exprs : vec ! [ ] ,
307+ unbounded : false ,
308+ options,
309+ constraints : Constraints :: default ( ) ,
310+ column_defaults : HashMap :: new ( ) ,
311+ } ;
312+ let table_provider = factory. create ( & state, & cmd) . await . unwrap ( ) ;
313+ let listing_table = table_provider
314+ . as_any ( )
315+ . downcast_ref :: < ListingTable > ( )
316+ . unwrap ( ) ;
317+
318+ // Verify the CSV format options carry GZIP compression
319+ let format = listing_table. options ( ) . format . clone ( ) ;
320+ let csv_format = format. as_any ( ) . downcast_ref :: < CsvFormat > ( ) . unwrap ( ) ;
321+ let csv_options = csv_format. options ( ) . clone ( ) ;
322+ assert_eq ! ( csv_options. compression, CompressionTypeVariant :: GZIP ) ;
323+
324+ let listing_options = listing_table. options ( ) ;
325+ assert_eq ! ( "" , listing_options. file_extension) ;
326+ // Glob pattern is set to search for gzipped CSV files (`*.csv.gz`)
327+ let table_path = listing_table. table_paths ( ) . first ( ) . unwrap ( ) ;
328+ assert_eq ! (
329+ table_path. get_glob( ) . clone( ) . unwrap( ) ,
330+ Pattern :: new( "*.csv.gz" ) . unwrap( )
331+ ) ;
332+ }
333+
334+ /// Validates that a `CreateExternalTable` with no compression option,
335+ /// pointed at a directory location, yields a table path with the plain `*.csv` glob
336+ #[ tokio:: test]
337+ async fn test_create_using_folder_without_compression ( ) {
338+ let dir = tempfile:: tempdir ( ) . unwrap ( ) ;
339+
340+ let factory = ListingTableFactory :: new ( ) ;
341+ let context = SessionContext :: new ( ) ;
342+ let state = context. state ( ) ;
343+ let name = TableReference :: bare ( "foo" ) ;
344+
345+ let mut options = HashMap :: new ( ) ;
346+ options. insert ( "format.schema_infer_max_rec" . to_owned ( ) , "1000" . to_owned ( ) ) ;
347+ options. insert ( "format.has_header" . into ( ) , "true" . into ( ) ) ;
348+ let cmd = CreateExternalTable {
349+ name,
350+ location : dir. path ( ) . to_str ( ) . unwrap ( ) . to_string ( ) ,
351+ file_type : "csv" . to_string ( ) ,
352+ schema : Arc :: new ( DFSchema :: empty ( ) ) ,
353+ table_partition_cols : vec ! [ ] ,
354+ if_not_exists : false ,
355+ temporary : false ,
356+ definition : None ,
357+ order_exprs : vec ! [ ] ,
358+ unbounded : false ,
359+ options,
360+ constraints : Constraints :: default ( ) ,
361+ column_defaults : HashMap :: new ( ) ,
362+ } ;
363+ let table_provider = factory. create ( & state, & cmd) . await . unwrap ( ) ;
364+ let listing_table = table_provider
365+ . as_any ( )
366+ . downcast_ref :: < ListingTable > ( )
367+ . unwrap ( ) ;
368+
369+ let listing_options = listing_table. options ( ) ;
370+ assert_eq ! ( "" , listing_options. file_extension) ;
371+ // Glob pattern is set to search for uncompressed CSV files (`*.csv`)
372+ let table_path = listing_table. table_paths ( ) . first ( ) . unwrap ( ) ;
373+ assert_eq ! (
374+ table_path. get_glob( ) . clone( ) . unwrap( ) ,
375+ Pattern :: new( "*.csv" ) . unwrap( )
376+ ) ;
377+ }
267378}
0 commit comments