@@ -77,6 +77,9 @@ pub struct DataFrameWriteOptions {
7777 /// Sets which columns should be used for hive-style partitioned writes by name.
7878 /// Can be set to empty vec![] for non-partitioned writes.
7979 partition_by : Vec < String > ,
80+ /// Sets which columns should be used for sorting the output by name.
81+ /// Can be set to empty vec![] for non-sorted writes.
82+ sort_by : Vec < SortExpr > ,
8083}
8184
8285impl DataFrameWriteOptions {
@@ -86,6 +89,7 @@ impl DataFrameWriteOptions {
8689 insert_op : InsertOp :: Append ,
8790 single_file_output : false ,
8891 partition_by : vec ! [ ] ,
92+ sort_by : vec ! [ ] ,
8993 }
9094 }
9195
@@ -106,6 +110,12 @@ impl DataFrameWriteOptions {
106110 self . partition_by = partition_by;
107111 self
108112 }
113+
114+ /// Sets the sort_by columns for output sorting
115+ pub fn with_sort_by ( mut self , sort_by : Vec < SortExpr > ) -> Self {
116+ self . sort_by = sort_by;
117+ self
118+ }
109119}
110120
111121impl Default for DataFrameWriteOptions {
@@ -1517,8 +1527,17 @@ impl DataFrame {
15171527 write_options : DataFrameWriteOptions ,
15181528 ) -> Result < Vec < RecordBatch > , DataFusionError > {
15191529 let arrow_schema = Schema :: from ( self . schema ( ) ) ;
1530+
1531+ let plan = if write_options. sort_by . is_empty ( ) {
1532+ self . plan
1533+ } else {
1534+ LogicalPlanBuilder :: from ( self . plan )
1535+ . sort ( write_options. sort_by ) ?
1536+ . build ( ) ?
1537+ } ;
1538+
15201539 let plan = LogicalPlanBuilder :: insert_into (
1521- self . plan ,
1540+ plan,
15221541 table_name. to_owned ( ) ,
15231542 & arrow_schema,
15241543 write_options. insert_op ,
@@ -1577,8 +1596,16 @@ impl DataFrame {
15771596
15781597 let file_type = format_as_file_type ( format) ;
15791598
1599+ let plan = if options. sort_by . is_empty ( ) {
1600+ self . plan
1601+ } else {
1602+ LogicalPlanBuilder :: from ( self . plan )
1603+ . sort ( options. sort_by ) ?
1604+ . build ( ) ?
1605+ } ;
1606+
15801607 let plan = LogicalPlanBuilder :: copy_to (
1581- self . plan ,
1608+ plan,
15821609 path. into ( ) ,
15831610 file_type,
15841611 HashMap :: new ( ) ,
@@ -1638,8 +1665,16 @@ impl DataFrame {
16381665
16391666 let file_type = format_as_file_type ( format) ;
16401667
1668+ let plan = if options. sort_by . is_empty ( ) {
1669+ self . plan
1670+ } else {
1671+ LogicalPlanBuilder :: from ( self . plan )
1672+ . sort ( options. sort_by ) ?
1673+ . build ( ) ?
1674+ } ;
1675+
16411676 let plan = LogicalPlanBuilder :: copy_to (
1642- self . plan ,
1677+ plan,
16431678 path. into ( ) ,
16441679 file_type,
16451680 Default :: default ( ) ,
@@ -1940,6 +1975,7 @@ mod tests {
19401975 use crate :: physical_plan:: { ColumnarValue , Partitioning , PhysicalExpr } ;
19411976 use crate :: test_util:: { register_aggregate_csv, test_table, test_table_with_name} ;
19421977
1978+ use crate :: prelude:: { CsvReadOptions , NdJsonReadOptions , ParquetReadOptions } ;
19431979 use arrow:: array:: Int32Array ;
19441980 use datafusion_common:: { assert_batches_eq, Constraint , Constraints , ScalarValue } ;
19451981 use datafusion_common_runtime:: SpawnedTask ;
@@ -1954,6 +1990,7 @@ mod tests {
19541990 use datafusion_physical_expr:: expressions:: Column ;
19551991 use datafusion_physical_plan:: { get_plan_string, ExecutionPlanProperties } ;
19561992 use sqlparser:: ast:: NullTreatment ;
1993+ use tempfile:: TempDir ;
19571994
19581995 // Get string representation of the plan
19591996 async fn assert_physical_plan ( df : & DataFrame , expected : Vec < & str > ) {
@@ -4057,4 +4094,237 @@ mod tests {
40574094
40584095 Ok ( ( ) )
40594096 }
4097+
4098+ // Test issue: https://github.com/apache/datafusion/issues/13873
4099+ #[ tokio:: test]
4100+ async fn write_parquet_with_order ( ) -> Result < ( ) > {
4101+ let tmp_dir = TempDir :: new ( ) ?;
4102+ let schema = Arc :: new ( Schema :: new ( vec ! [
4103+ Field :: new( "a" , DataType :: Int32 , true ) ,
4104+ Field :: new( "b" , DataType :: Int32 , true ) ,
4105+ ] ) ) ;
4106+
4107+ let ctx = SessionContext :: new ( ) ;
4108+ let write_df = ctx. read_batch ( RecordBatch :: try_new (
4109+ schema. clone ( ) ,
4110+ vec ! [
4111+ Arc :: new( Int32Array :: from( vec![ 1 , 5 , 7 , 3 , 2 ] ) ) ,
4112+ Arc :: new( Int32Array :: from( vec![ 2 , 3 , 4 , 5 , 6 ] ) ) ,
4113+ ] ,
4114+ ) ?) ?;
4115+
4116+ let test_path = tmp_dir. path ( ) . join ( "test.parquet" ) ;
4117+
4118+ write_df
4119+ . clone ( )
4120+ . write_parquet (
4121+ test_path. to_str ( ) . unwrap ( ) ,
4122+ DataFrameWriteOptions :: new ( )
4123+ . with_sort_by ( vec ! [ col( "a" ) . sort( true , true ) ] ) ,
4124+ None ,
4125+ )
4126+ . await ?;
4127+
4128+ let ctx = SessionContext :: new ( ) ;
4129+ ctx. register_parquet (
4130+ "data" ,
4131+ test_path. to_str ( ) . unwrap ( ) ,
4132+ ParquetReadOptions :: default ( ) ,
4133+ )
4134+ . await ?;
4135+
4136+ let df = ctx. sql ( "SELECT * FROM data" ) . await ?;
4137+ let results = df. collect ( ) . await ?;
4138+
4139+ let df_explain = ctx. sql ( "explain SELECT a FROM data" ) . await ?;
4140+ let explain_result = df_explain. collect ( ) . await ?;
4141+
4142+ println ! ( "explain_result {:?}" , explain_result) ;
4143+
4144+ assert_batches_eq ! (
4145+ & [
4146+ "+---+---+" ,
4147+ "| a | b |" ,
4148+ "+---+---+" ,
4149+ "| 1 | 2 |" ,
4150+ "| 2 | 6 |" ,
4151+ "| 3 | 5 |" ,
4152+ "| 5 | 3 |" ,
4153+ "| 7 | 4 |" ,
4154+ "+---+---+" ,
4155+ ] ,
4156+ & results
4157+ ) ;
4158+ Ok ( ( ) )
4159+ }
4160+
4161+ // Test issue: https://github.com/apache/datafusion/issues/13873
4162+ #[ tokio:: test]
4163+ async fn write_csv_with_order ( ) -> Result < ( ) > {
4164+ let tmp_dir = TempDir :: new ( ) ?;
4165+ let schema = Arc :: new ( Schema :: new ( vec ! [
4166+ Field :: new( "a" , DataType :: Int32 , true ) ,
4167+ Field :: new( "b" , DataType :: Int32 , true ) ,
4168+ ] ) ) ;
4169+
4170+ let ctx = SessionContext :: new ( ) ;
4171+ let write_df = ctx. read_batch ( RecordBatch :: try_new (
4172+ schema. clone ( ) ,
4173+ vec ! [
4174+ Arc :: new( Int32Array :: from( vec![ 1 , 5 , 7 , 3 , 2 ] ) ) ,
4175+ Arc :: new( Int32Array :: from( vec![ 2 , 3 , 4 , 5 , 6 ] ) ) ,
4176+ ] ,
4177+ ) ?) ?;
4178+
4179+ let test_path = tmp_dir. path ( ) . join ( "test.csv" ) ;
4180+
4181+ write_df
4182+ . clone ( )
4183+ . write_csv (
4184+ test_path. to_str ( ) . unwrap ( ) ,
4185+ DataFrameWriteOptions :: new ( )
4186+ . with_sort_by ( vec ! [ col( "a" ) . sort( true , true ) ] ) ,
4187+ None ,
4188+ )
4189+ . await ?;
4190+
4191+ let ctx = SessionContext :: new ( ) ;
4192+ ctx. register_csv (
4193+ "data" ,
4194+ test_path. to_str ( ) . unwrap ( ) ,
4195+ CsvReadOptions :: new ( ) . schema ( & schema) ,
4196+ )
4197+ . await ?;
4198+
4199+ let df = ctx. sql ( "SELECT * FROM data" ) . await ?;
4200+ let results = df. collect ( ) . await ?;
4201+
4202+ assert_batches_eq ! (
4203+ & [
4204+ "+---+---+" ,
4205+ "| a | b |" ,
4206+ "+---+---+" ,
4207+ "| 1 | 2 |" ,
4208+ "| 2 | 6 |" ,
4209+ "| 3 | 5 |" ,
4210+ "| 5 | 3 |" ,
4211+ "| 7 | 4 |" ,
4212+ "+---+---+" ,
4213+ ] ,
4214+ & results
4215+ ) ;
4216+ Ok ( ( ) )
4217+ }
4218+
4219+ // Test issue: https://github.com/apache/datafusion/issues/13873
4220+ #[ tokio:: test]
4221+ async fn write_json_with_order ( ) -> Result < ( ) > {
4222+ let tmp_dir = TempDir :: new ( ) ?;
4223+ let schema = Arc :: new ( Schema :: new ( vec ! [
4224+ Field :: new( "a" , DataType :: Int32 , true ) ,
4225+ Field :: new( "b" , DataType :: Int32 , true ) ,
4226+ ] ) ) ;
4227+
4228+ let ctx = SessionContext :: new ( ) ;
4229+ let write_df = ctx. read_batch ( RecordBatch :: try_new (
4230+ schema. clone ( ) ,
4231+ vec ! [
4232+ Arc :: new( Int32Array :: from( vec![ 1 , 5 , 7 , 3 , 2 ] ) ) ,
4233+ Arc :: new( Int32Array :: from( vec![ 2 , 3 , 4 , 5 , 6 ] ) ) ,
4234+ ] ,
4235+ ) ?) ?;
4236+
4237+ let test_path = tmp_dir. path ( ) . join ( "test.json" ) ;
4238+
4239+ write_df
4240+ . clone ( )
4241+ . write_json (
4242+ test_path. to_str ( ) . unwrap ( ) ,
4243+ DataFrameWriteOptions :: new ( )
4244+ . with_sort_by ( vec ! [ col( "a" ) . sort( true , true ) ] ) ,
4245+ None ,
4246+ )
4247+ . await ?;
4248+
4249+ let ctx = SessionContext :: new ( ) ;
4250+ ctx. register_json (
4251+ "data" ,
4252+ test_path. to_str ( ) . unwrap ( ) ,
4253+ NdJsonReadOptions :: default ( ) . schema ( & schema) ,
4254+ )
4255+ . await ?;
4256+
4257+ let df = ctx. sql ( "SELECT * FROM data" ) . await ?;
4258+ let results = df. collect ( ) . await ?;
4259+
4260+ assert_batches_eq ! (
4261+ & [
4262+ "+---+---+" ,
4263+ "| a | b |" ,
4264+ "+---+---+" ,
4265+ "| 1 | 2 |" ,
4266+ "| 2 | 6 |" ,
4267+ "| 3 | 5 |" ,
4268+ "| 5 | 3 |" ,
4269+ "| 7 | 4 |" ,
4270+ "+---+---+" ,
4271+ ] ,
4272+ & results
4273+ ) ;
4274+ Ok ( ( ) )
4275+ }
4276+
4277+ // Test issue: https://github.com/apache/datafusion/issues/13873
4278+ #[ tokio:: test]
4279+ async fn write_table_with_order ( ) -> Result < ( ) > {
4280+ let tmp_dir = TempDir :: new ( ) ?;
4281+ let ctx = SessionContext :: new ( ) ;
4282+ let location = tmp_dir. path ( ) . join ( "test_table/" ) ;
4283+
4284+ let mut write_df = ctx
4285+ . sql ( "values ('z'), ('x'), ('a'), ('b'), ('c')" )
4286+ . await
4287+ . unwrap ( ) ;
4288+
4289+ // Ensure the column names and types match the target table
4290+ write_df = write_df
4291+ . with_column_renamed ( "column1" , "tablecol1" )
4292+ . unwrap ( ) ;
4293+ let sql_str =
4294+ "create external table data(tablecol1 varchar) stored as parquet location '"
4295+ . to_owned ( )
4296+ + location. to_str ( ) . unwrap ( )
4297+ + "'" ;
4298+
4299+ ctx. sql ( sql_str. as_str ( ) ) . await ?. collect ( ) . await ?;
4300+
4301+ // This is equivalent to INSERT INTO test.
4302+ write_df
4303+ . clone ( )
4304+ . write_table (
4305+ "data" ,
4306+ DataFrameWriteOptions :: new ( )
4307+ . with_sort_by ( vec ! [ col( "tablecol1" ) . sort( true , true ) ] ) ,
4308+ )
4309+ . await ?;
4310+
4311+ let df = ctx. sql ( "SELECT * FROM data" ) . await ?;
4312+ let results = df. collect ( ) . await ?;
4313+
4314+ assert_batches_eq ! (
4315+ & [
4316+ "+-----------+" ,
4317+ "| tablecol1 |" ,
4318+ "+-----------+" ,
4319+ "| a |" ,
4320+ "| b |" ,
4321+ "| c |" ,
4322+ "| x |" ,
4323+ "| z |" ,
4324+ "+-----------+" ,
4325+ ] ,
4326+ & results
4327+ ) ;
4328+ Ok ( ( ) )
4329+ }
40604330}
0 commit comments