Skip to content

Commit 894a879

Browse files
authored
fix: When consuming Substrait, temporarily rename clashing duplicate columns (#11329)
* cleanup project internals * alias intermediate duplicate columns * fix test * fix clippy
1 parent 37428bb commit 894a879

File tree

2 files changed

+38
-11
lines changed

2 files changed

+38
-11
lines changed

datafusion/substrait/src/logical_plan/consumer.rs

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ use datafusion::{
6060
prelude::{Column, SessionContext},
6161
scalar::ScalarValue,
6262
};
63-
use std::collections::HashMap;
63+
use std::collections::{HashMap, HashSet};
6464
use std::str::FromStr;
6565
use std::sync::Arc;
6666
use substrait::proto::exchange_rel::ExchangeKind;
@@ -404,22 +404,33 @@ pub async fn from_substrait_rel(
404404
let mut input = LogicalPlanBuilder::from(
405405
from_substrait_rel(ctx, input, extensions).await?,
406406
);
407+
let mut names: HashSet<String> = HashSet::new();
407408
let mut exprs: Vec<Expr> = vec![];
408409
for e in &p.expressions {
409410
let x =
410411
from_substrait_rex(ctx, e, input.clone().schema(), extensions)
411412
.await?;
412413
// if the expression is WindowFunction, wrap in a Window relation
413-
// before returning and do not add to list of this Projection's expression list
414-
// otherwise, add expression to the Projection's expression list
415-
match &*x {
416-
Expr::WindowFunction(_) => {
417-
input = input.window(vec![x.as_ref().clone()])?;
418-
exprs.push(x.as_ref().clone());
419-
}
420-
_ => {
421-
exprs.push(x.as_ref().clone());
422-
}
414+
if let Expr::WindowFunction(_) = x.as_ref() {
415+
// Adding the same expression here and in the project below
416+
// works because the project's builder uses columnize_expr(..)
417+
// to transform it into a column reference
418+
input = input.window(vec![x.as_ref().clone()])?
419+
}
420+
// Ensure the expression has a unique display name, so that project's
421+
// validate_unique_names doesn't fail
422+
let name = x.display_name()?;
423+
let mut new_name = name.clone();
424+
let mut i = 0;
425+
while names.contains(&new_name) {
426+
new_name = format!("{}__temp__{}", name, i);
427+
i += 1;
428+
}
429+
names.insert(new_name.clone());
430+
if new_name != name {
431+
exprs.push(x.as_ref().clone().alias(new_name.clone()));
432+
} else {
433+
exprs.push(x.as_ref().clone());
423434
}
424435
}
425436
input.project(exprs)?.build()

datafusion/substrait/tests/cases/roundtrip_logical_plan.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,22 @@ async fn roundtrip_values_duplicate_column_join() -> Result<()> {
751751
.await
752752
}
753753

754+
#[tokio::test]
755+
async fn duplicate_column() -> Result<()> {
756+
// Substrait does not keep column names (aliases) in the plan, rather it operates on column indices
757+
// only. DataFusion however, is strict about not having duplicate column names appear in the plan.
758+
// This test confirms that we generate aliases for columns in the plan which would otherwise have
759+
// colliding names.
760+
assert_expected_plan(
761+
"SELECT a + 1 as sum_a, a + 1 as sum_a_2 FROM data",
762+
"Projection: data.a + Int64(1) AS sum_a, data.a + Int64(1) AS data.a + Int64(1)__temp__0 AS sum_a_2\
763+
\n Projection: data.a + Int64(1)\
764+
\n TableScan: data projection=[a]",
765+
true,
766+
)
767+
.await
768+
}
769+
754770
/// Construct a plan that cast columns. Only those SQL types are supported for now.
755771
#[tokio::test]
756772
async fn new_test_grammar() -> Result<()> {

0 commit comments

Comments
 (0)