@@ -28,6 +28,7 @@ use arrow::compute::SortOptions;
2828use arrow:: datatypes:: { Int32Type , SchemaRef } ;
2929use arrow_schema:: { DataType , Field , Schema } ;
3030use datafusion:: assert_batches_eq;
31+ use datafusion:: config:: SpillCompression ;
3132use datafusion:: datasource:: memory:: MemorySourceConfig ;
3233use datafusion:: datasource:: source:: DataSourceExec ;
3334use datafusion:: datasource:: { MemTable , TableProvider } ;
@@ -545,10 +546,11 @@ async fn test_external_sort_zero_merge_reservation() {
545546// Tests for disk limit (`max_temp_directory_size` in `DiskManager`)
546547// ------------------------------------------------------------------
547548
548- // Create a new `SessionContext` with speicified disk limit and memory pool limit
549+ // Create a new `SessionContext` with specified disk limit, memory pool limit, and spill compression codec
549550async fn setup_context (
550551 disk_limit : u64 ,
551552 memory_pool_limit : usize ,
553+ spill_compression : SpillCompression ,
552554) -> Result < SessionContext > {
553555 let disk_manager = DiskManagerBuilder :: default ( )
554556 . with_mode ( DiskManagerMode :: OsTmpDirectory )
@@ -570,6 +572,7 @@ async fn setup_context(
570572 let config = SessionConfig :: new ( )
571573 . with_sort_spill_reservation_bytes ( 64 * 1024 ) // 256KB
572574 . with_sort_in_place_threshold_bytes ( 0 )
575+ . with_spill_compression ( spill_compression)
573576 . with_batch_size ( 64 ) // To reduce test memory usage
574577 . with_target_partitions ( 1 ) ;
575578
@@ -580,7 +583,8 @@ async fn setup_context(
580583/// (specified by `max_temp_directory_size` in `DiskManager`)
581584#[ tokio:: test]
582585async fn test_disk_spill_limit_reached ( ) -> Result < ( ) > {
583- let ctx = setup_context ( 1024 * 1024 , 1024 * 1024 ) . await ?; // 1MB disk limit, 1MB memory limit
586+ let spill_compression = SpillCompression :: Uncompressed ;
587+ let ctx = setup_context ( 1024 * 1024 , 1024 * 1024 , spill_compression) . await ?; // 1MB disk limit, 1MB memory limit
584588
585589 let df = ctx
586590 . sql ( "select * from generate_series(1, 1000000000000) as t1(v1) order by v1" )
@@ -602,7 +606,8 @@ async fn test_disk_spill_limit_reached() -> Result<()> {
602606#[ tokio:: test]
603607async fn test_disk_spill_limit_not_reached ( ) -> Result < ( ) > {
604608 let disk_spill_limit = 1024 * 1024 ; // 1MB
605- let ctx = setup_context ( disk_spill_limit, 128 * 1024 ) . await ?; // 1MB disk limit, 128KB memory limit
609+ let spill_compression = SpillCompression :: Uncompressed ;
610+ let ctx = setup_context ( disk_spill_limit, 128 * 1024 , spill_compression) . await ?; // 1MB disk limit, 128KB memory limit
606611
607612 let df = ctx
608613 . sql ( "select * from generate_series(1, 10000) as t1(v1) order by v1" )
@@ -630,6 +635,77 @@ async fn test_disk_spill_limit_not_reached() -> Result<()> {
630635 Ok ( ( ) )
631636}
632637
638+ /// External query should succeed using zstd as spill compression codec and
639+ /// and all temporary spill files are properly cleaned up after execution.
640+ /// Note: This test does not inspect file contents (e.g. magic number),
641+ /// as spill files are automatically deleted on drop.
642+ #[ tokio:: test]
643+ async fn test_spill_file_compressed_with_zstd ( ) -> Result < ( ) > {
644+ let disk_spill_limit = 1024 * 1024 ; // 1MB
645+ let spill_compression = SpillCompression :: Zstd ;
646+ let ctx = setup_context ( disk_spill_limit, 128 * 1024 , spill_compression) . await ?; // 1MB disk limit, 128KB memory limit, zstd
647+
648+ let df = ctx
649+ . sql ( "select * from generate_series(1, 100000) as t1(v1) order by v1" )
650+ . await
651+ . unwrap ( ) ;
652+ let plan = df. create_physical_plan ( ) . await . unwrap ( ) ;
653+
654+ let task_ctx = ctx. task_ctx ( ) ;
655+ let _ = collect_batches ( Arc :: clone ( & plan) , task_ctx)
656+ . await
657+ . expect ( "Query execution failed" ) ;
658+
659+ let spill_count = plan. metrics ( ) . unwrap ( ) . spill_count ( ) . unwrap ( ) ;
660+ let spilled_bytes = plan. metrics ( ) . unwrap ( ) . spilled_bytes ( ) . unwrap ( ) ;
661+
662+ println ! ( "spill count {spill_count}" ) ;
663+ assert ! ( spill_count > 0 ) ;
664+ assert ! ( ( spilled_bytes as u64 ) < disk_spill_limit) ;
665+
666+ // Verify that all temporary files have been properly cleaned up by checking
667+ // that the total disk usage tracked by the disk manager is zero
668+ let current_disk_usage = ctx. runtime_env ( ) . disk_manager . used_disk_space ( ) ;
669+ assert_eq ! ( current_disk_usage, 0 ) ;
670+
671+ Ok ( ( ) )
672+ }
673+
674+ /// External query should succeed using lz4_frame as spill compression codec and
675+ /// and all temporary spill files are properly cleaned up after execution.
676+ /// Note: This test does not inspect file contents (e.g. magic number),
677+ /// as spill files are automatically deleted on drop.
678+ #[ tokio:: test]
679+ async fn test_spill_file_compressed_with_lz4_frame ( ) -> Result < ( ) > {
680+ let disk_spill_limit = 1024 * 1024 ; // 1MB
681+ let spill_compression = SpillCompression :: Lz4Frame ;
682+ let ctx = setup_context ( disk_spill_limit, 128 * 1024 , spill_compression) . await ?; // 1MB disk limit, 128KB memory limit, lz4_frame
683+
684+ let df = ctx
685+ . sql ( "select * from generate_series(1, 100000) as t1(v1) order by v1" )
686+ . await
687+ . unwrap ( ) ;
688+ let plan = df. create_physical_plan ( ) . await . unwrap ( ) ;
689+
690+ let task_ctx = ctx. task_ctx ( ) ;
691+ let _ = collect_batches ( Arc :: clone ( & plan) , task_ctx)
692+ . await
693+ . expect ( "Query execution failed" ) ;
694+
695+ let spill_count = plan. metrics ( ) . unwrap ( ) . spill_count ( ) . unwrap ( ) ;
696+ let spilled_bytes = plan. metrics ( ) . unwrap ( ) . spilled_bytes ( ) . unwrap ( ) ;
697+
698+ println ! ( "spill count {spill_count}" ) ;
699+ assert ! ( spill_count > 0 ) ;
700+ assert ! ( ( spilled_bytes as u64 ) < disk_spill_limit) ;
701+
702+ // Verify that all temporary files have been properly cleaned up by checking
703+ // that the total disk usage tracked by the disk manager is zero
704+ let current_disk_usage = ctx. runtime_env ( ) . disk_manager . used_disk_space ( ) ;
705+ assert_eq ! ( current_disk_usage, 0 ) ;
706+
707+ Ok ( ( ) )
708+ }
633709/// Run the query with the specified memory limit,
634710/// and verifies the expected errors are returned
635711#[ derive( Clone , Debug ) ]
0 commit comments