Skip to content

Commit 6cfd1cf

Browse files
authored
Support (order by / sort) for DataFrameWriteOptions (#13874)
* Support (order by / sort) for DataFrameWriteOptions
* Fix fmt
* Fix import
* Add insert into example
1 parent b4b267a commit 6cfd1cf

File tree

2 files changed

+282
-4
lines changed

2 files changed

+282
-4
lines changed

datafusion/core/src/dataframe/mod.rs

Lines changed: 273 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,9 @@ pub struct DataFrameWriteOptions {
7777
/// Sets which columns should be used for hive-style partitioned writes by name.
7878
/// Can be set to empty vec![] for non-partitioned writes.
7979
partition_by: Vec<String>,
80+
/// Sets which columns should be used for sorting the output by name.
81+
/// Can be set to empty vec![] for non-sorted writes.
82+
sort_by: Vec<SortExpr>,
8083
}
8184

8285
impl DataFrameWriteOptions {
@@ -86,6 +89,7 @@ impl DataFrameWriteOptions {
8689
insert_op: InsertOp::Append,
8790
single_file_output: false,
8891
partition_by: vec![],
92+
sort_by: vec![],
8993
}
9094
}
9195

@@ -106,6 +110,12 @@ impl DataFrameWriteOptions {
106110
self.partition_by = partition_by;
107111
self
108112
}
113+
114+
/// Sets the sort_by columns for output sorting
115+
pub fn with_sort_by(mut self, sort_by: Vec<SortExpr>) -> Self {
116+
self.sort_by = sort_by;
117+
self
118+
}
109119
}
110120

111121
impl Default for DataFrameWriteOptions {
@@ -1517,8 +1527,17 @@ impl DataFrame {
15171527
write_options: DataFrameWriteOptions,
15181528
) -> Result<Vec<RecordBatch>, DataFusionError> {
15191529
let arrow_schema = Schema::from(self.schema());
1530+
1531+
let plan = if write_options.sort_by.is_empty() {
1532+
self.plan
1533+
} else {
1534+
LogicalPlanBuilder::from(self.plan)
1535+
.sort(write_options.sort_by)?
1536+
.build()?
1537+
};
1538+
15201539
let plan = LogicalPlanBuilder::insert_into(
1521-
self.plan,
1540+
plan,
15221541
table_name.to_owned(),
15231542
&arrow_schema,
15241543
write_options.insert_op,
@@ -1577,8 +1596,16 @@ impl DataFrame {
15771596

15781597
let file_type = format_as_file_type(format);
15791598

1599+
let plan = if options.sort_by.is_empty() {
1600+
self.plan
1601+
} else {
1602+
LogicalPlanBuilder::from(self.plan)
1603+
.sort(options.sort_by)?
1604+
.build()?
1605+
};
1606+
15801607
let plan = LogicalPlanBuilder::copy_to(
1581-
self.plan,
1608+
plan,
15821609
path.into(),
15831610
file_type,
15841611
HashMap::new(),
@@ -1638,8 +1665,16 @@ impl DataFrame {
16381665

16391666
let file_type = format_as_file_type(format);
16401667

1668+
let plan = if options.sort_by.is_empty() {
1669+
self.plan
1670+
} else {
1671+
LogicalPlanBuilder::from(self.plan)
1672+
.sort(options.sort_by)?
1673+
.build()?
1674+
};
1675+
16411676
let plan = LogicalPlanBuilder::copy_to(
1642-
self.plan,
1677+
plan,
16431678
path.into(),
16441679
file_type,
16451680
Default::default(),
@@ -1940,6 +1975,7 @@ mod tests {
19401975
use crate::physical_plan::{ColumnarValue, Partitioning, PhysicalExpr};
19411976
use crate::test_util::{register_aggregate_csv, test_table, test_table_with_name};
19421977

1978+
use crate::prelude::{CsvReadOptions, NdJsonReadOptions, ParquetReadOptions};
19431979
use arrow::array::Int32Array;
19441980
use datafusion_common::{assert_batches_eq, Constraint, Constraints, ScalarValue};
19451981
use datafusion_common_runtime::SpawnedTask;
@@ -1954,6 +1990,7 @@ mod tests {
19541990
use datafusion_physical_expr::expressions::Column;
19551991
use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties};
19561992
use sqlparser::ast::NullTreatment;
1993+
use tempfile::TempDir;
19571994

19581995
// Get string representation of the plan
19591996
async fn assert_physical_plan(df: &DataFrame, expected: Vec<&str>) {
@@ -4057,4 +4094,237 @@ mod tests {
40574094

40584095
Ok(())
40594096
}
4097+
4098+
// Test issue: https://github.com/apache/datafusion/issues/13873
4099+
#[tokio::test]
4100+
async fn write_parquet_with_order() -> Result<()> {
4101+
let tmp_dir = TempDir::new()?;
4102+
let schema = Arc::new(Schema::new(vec![
4103+
Field::new("a", DataType::Int32, true),
4104+
Field::new("b", DataType::Int32, true),
4105+
]));
4106+
4107+
let ctx = SessionContext::new();
4108+
let write_df = ctx.read_batch(RecordBatch::try_new(
4109+
schema.clone(),
4110+
vec![
4111+
Arc::new(Int32Array::from(vec![1, 5, 7, 3, 2])),
4112+
Arc::new(Int32Array::from(vec![2, 3, 4, 5, 6])),
4113+
],
4114+
)?)?;
4115+
4116+
let test_path = tmp_dir.path().join("test.parquet");
4117+
4118+
write_df
4119+
.clone()
4120+
.write_parquet(
4121+
test_path.to_str().unwrap(),
4122+
DataFrameWriteOptions::new()
4123+
.with_sort_by(vec![col("a").sort(true, true)]),
4124+
None,
4125+
)
4126+
.await?;
4127+
4128+
let ctx = SessionContext::new();
4129+
ctx.register_parquet(
4130+
"data",
4131+
test_path.to_str().unwrap(),
4132+
ParquetReadOptions::default(),
4133+
)
4134+
.await?;
4135+
4136+
let df = ctx.sql("SELECT * FROM data").await?;
4137+
let results = df.collect().await?;
4138+
4139+
let df_explain = ctx.sql("explain SELECT a FROM data").await?;
4140+
let explain_result = df_explain.collect().await?;
4141+
4142+
println!("explain_result {:?}", explain_result);
4143+
4144+
assert_batches_eq!(
4145+
&[
4146+
"+---+---+",
4147+
"| a | b |",
4148+
"+---+---+",
4149+
"| 1 | 2 |",
4150+
"| 2 | 6 |",
4151+
"| 3 | 5 |",
4152+
"| 5 | 3 |",
4153+
"| 7 | 4 |",
4154+
"+---+---+",
4155+
],
4156+
&results
4157+
);
4158+
Ok(())
4159+
}
4160+
4161+
// Test issue: https://github.com/apache/datafusion/issues/13873
4162+
#[tokio::test]
4163+
async fn write_csv_with_order() -> Result<()> {
4164+
let tmp_dir = TempDir::new()?;
4165+
let schema = Arc::new(Schema::new(vec![
4166+
Field::new("a", DataType::Int32, true),
4167+
Field::new("b", DataType::Int32, true),
4168+
]));
4169+
4170+
let ctx = SessionContext::new();
4171+
let write_df = ctx.read_batch(RecordBatch::try_new(
4172+
schema.clone(),
4173+
vec![
4174+
Arc::new(Int32Array::from(vec![1, 5, 7, 3, 2])),
4175+
Arc::new(Int32Array::from(vec![2, 3, 4, 5, 6])),
4176+
],
4177+
)?)?;
4178+
4179+
let test_path = tmp_dir.path().join("test.csv");
4180+
4181+
write_df
4182+
.clone()
4183+
.write_csv(
4184+
test_path.to_str().unwrap(),
4185+
DataFrameWriteOptions::new()
4186+
.with_sort_by(vec![col("a").sort(true, true)]),
4187+
None,
4188+
)
4189+
.await?;
4190+
4191+
let ctx = SessionContext::new();
4192+
ctx.register_csv(
4193+
"data",
4194+
test_path.to_str().unwrap(),
4195+
CsvReadOptions::new().schema(&schema),
4196+
)
4197+
.await?;
4198+
4199+
let df = ctx.sql("SELECT * FROM data").await?;
4200+
let results = df.collect().await?;
4201+
4202+
assert_batches_eq!(
4203+
&[
4204+
"+---+---+",
4205+
"| a | b |",
4206+
"+---+---+",
4207+
"| 1 | 2 |",
4208+
"| 2 | 6 |",
4209+
"| 3 | 5 |",
4210+
"| 5 | 3 |",
4211+
"| 7 | 4 |",
4212+
"+---+---+",
4213+
],
4214+
&results
4215+
);
4216+
Ok(())
4217+
}
4218+
4219+
// Test issue: https://github.com/apache/datafusion/issues/13873
4220+
#[tokio::test]
4221+
async fn write_json_with_order() -> Result<()> {
4222+
let tmp_dir = TempDir::new()?;
4223+
let schema = Arc::new(Schema::new(vec![
4224+
Field::new("a", DataType::Int32, true),
4225+
Field::new("b", DataType::Int32, true),
4226+
]));
4227+
4228+
let ctx = SessionContext::new();
4229+
let write_df = ctx.read_batch(RecordBatch::try_new(
4230+
schema.clone(),
4231+
vec![
4232+
Arc::new(Int32Array::from(vec![1, 5, 7, 3, 2])),
4233+
Arc::new(Int32Array::from(vec![2, 3, 4, 5, 6])),
4234+
],
4235+
)?)?;
4236+
4237+
let test_path = tmp_dir.path().join("test.json");
4238+
4239+
write_df
4240+
.clone()
4241+
.write_json(
4242+
test_path.to_str().unwrap(),
4243+
DataFrameWriteOptions::new()
4244+
.with_sort_by(vec![col("a").sort(true, true)]),
4245+
None,
4246+
)
4247+
.await?;
4248+
4249+
let ctx = SessionContext::new();
4250+
ctx.register_json(
4251+
"data",
4252+
test_path.to_str().unwrap(),
4253+
NdJsonReadOptions::default().schema(&schema),
4254+
)
4255+
.await?;
4256+
4257+
let df = ctx.sql("SELECT * FROM data").await?;
4258+
let results = df.collect().await?;
4259+
4260+
assert_batches_eq!(
4261+
&[
4262+
"+---+---+",
4263+
"| a | b |",
4264+
"+---+---+",
4265+
"| 1 | 2 |",
4266+
"| 2 | 6 |",
4267+
"| 3 | 5 |",
4268+
"| 5 | 3 |",
4269+
"| 7 | 4 |",
4270+
"+---+---+",
4271+
],
4272+
&results
4273+
);
4274+
Ok(())
4275+
}
4276+
4277+
// Test issue: https://github.com/apache/datafusion/issues/13873
4278+
#[tokio::test]
4279+
async fn write_table_with_order() -> Result<()> {
4280+
let tmp_dir = TempDir::new()?;
4281+
let ctx = SessionContext::new();
4282+
let location = tmp_dir.path().join("test_table/");
4283+
4284+
let mut write_df = ctx
4285+
.sql("values ('z'), ('x'), ('a'), ('b'), ('c')")
4286+
.await
4287+
.unwrap();
4288+
4289+
// Ensure the column names and types match the target table
4290+
write_df = write_df
4291+
.with_column_renamed("column1", "tablecol1")
4292+
.unwrap();
4293+
let sql_str =
4294+
"create external table data(tablecol1 varchar) stored as parquet location '"
4295+
.to_owned()
4296+
+ location.to_str().unwrap()
4297+
+ "'";
4298+
4299+
ctx.sql(sql_str.as_str()).await?.collect().await?;
4300+
4301+
// This is equivalent to INSERT INTO test.
4302+
write_df
4303+
.clone()
4304+
.write_table(
4305+
"data",
4306+
DataFrameWriteOptions::new()
4307+
.with_sort_by(vec![col("tablecol1").sort(true, true)]),
4308+
)
4309+
.await?;
4310+
4311+
let df = ctx.sql("SELECT * FROM data").await?;
4312+
let results = df.collect().await?;
4313+
4314+
assert_batches_eq!(
4315+
&[
4316+
"+-----------+",
4317+
"| tablecol1 |",
4318+
"+-----------+",
4319+
"| a |",
4320+
"| b |",
4321+
"| c |",
4322+
"| x |",
4323+
"| z |",
4324+
"+-----------+",
4325+
],
4326+
&results
4327+
);
4328+
Ok(())
4329+
}
40604330
}

datafusion/core/src/dataframe/parquet.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,16 @@ impl DataFrame {
7474

7575
let file_type = format_as_file_type(format);
7676

77+
let plan = if options.sort_by.is_empty() {
78+
self.plan
79+
} else {
80+
LogicalPlanBuilder::from(self.plan)
81+
.sort(options.sort_by)?
82+
.build()?
83+
};
84+
7785
let plan = LogicalPlanBuilder::copy_to(
78-
self.plan,
86+
plan,
7987
path.into(),
8088
file_type,
8189
Default::default(),

0 commit comments

Comments
 (0)