Skip to content

Incorrect result when optimizing expressions with IN list containing NULL since 50.0.0 #17681

@ddupg

Description

@ddupg

Describe the bug

When using an IN clause containing NULL (e.g., col IN (1, NULL)), ExprSimplifier appears to return incorrect results.

To Reproduce

Reproduce test:

mod tests {
    use std::sync::Arc;
    use arrow_schema::{DataType, Field, SchemaRef};
    use datafusion_common::{Column, DFSchema, ScalarValue};
    use datafusion_expr::{BinaryExpr, Cast, Expr, Operator};
    use datafusion_expr::execution_props::ExecutionProps;
    use datafusion_expr::simplify::SimplifyContext;
    use datafusion_optimizer::simplify_expressions::ExprSimplifier;

    /// Runs `expr` through `ExprSimplifier::simplify` followed by
    /// `ExprSimplifier::coerce`, both against a `DFSchema` derived from `schema`,
    /// and returns the rewritten expression.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while converting the schema or while running
    /// either pass.
    pub fn optimize_expr(schema: SchemaRef, expr: Expr) -> datafusion_common::Result<Expr> {
        let df_schema = Arc::new(DFSchema::try_from(schema.as_ref().clone())?);

        // The simplifier borrows the execution properties, so they must outlive it.
        let props = ExecutionProps::default();
        let simplifier = ExprSimplifier::new(
            SimplifyContext::new(&props).with_schema(df_schema.clone()),
        );

        // Order matters for this reproduction: simplify first, then coerce.
        let simplified = simplifier.simplify(expr)?;
        simplifier.coerce(simplified, &df_schema)
    }

    /// Builds `x = 1 OR x = CAST(NULL AS Int32)` over a nullable Int32 column `x`
    /// and prints the expression before and after `optimize_expr`.
    #[test]
    fn test() {
        let schema = Arc::new(arrow_schema::Schema::new(vec![
            Field::new("x", DataType::Int32, true),
        ]));

        // Produces `x = <rhs>` on the unqualified column `x`.
        let x_equals = |rhs: Expr| {
            Expr::BinaryExpr(BinaryExpr::new(
                Box::new(Expr::Column(Column::new(None::<String>, "x"))),
                Operator::Eq,
                Box::new(rhs),
            ))
        };

        let one = Expr::Literal(ScalarValue::Int32(Some(1)), None);
        let null_as_int32 = Expr::Cast(Cast::new(
            Box::new(Expr::Literal(ScalarValue::Null, None)),
            DataType::Int32,
        ));

        // x IN (1, NULL), written out as a disjunction of equalities.
        let expr = Expr::BinaryExpr(BinaryExpr::new(
            Box::new(x_equals(one)),
            Operator::Or,
            Box::new(x_equals(null_as_int32)),
        ));
        println!("expr: {:#?}", expr);

        let logical_expr = optimize_expr(schema.clone(), expr).unwrap();
        println!("logical_expr: {:#?}", logical_expr);
    }
}

The output:

expr: BinaryExpr(
    BinaryExpr {
        left: BinaryExpr(
            BinaryExpr {
                left: Column(
                    Column {
                        relation: None,
                        name: "x",
                    },
                ),
                op: Eq,
                right: Literal(
                    Int32(1),
                    None,
                ),
            },
        ),
        op: Or,
        right: BinaryExpr(
            BinaryExpr {
                left: Column(
                    Column {
                        relation: None,
                        name: "x",
                    },
                ),
                op: Eq,
                right: Cast(
                    Cast {
                        expr: Literal(
                            NULL,
                            None,
                        ),
                        data_type: Int32,
                    },
                ),
            },
        ),
    },
)
logical_expr: BinaryExpr(
    BinaryExpr {
        left: BinaryExpr(
            BinaryExpr {
                left: Column(
                    Column {
                        relation: None,
                        name: "x",
                    },
                ),
                op: Eq,
                right: Literal(
                    Int32(1),
                    None,
                ),
            },
        ),
        op: Or,
        right: Literal(
            Boolean(NULL),
            None,
        ),
    },
)

The output of DataFusion 49.0.2 is:

logical_expr: BinaryExpr(
    BinaryExpr {
        left: BinaryExpr(
            BinaryExpr {
                left: Column(
                    Column {
                        relation: None,
                        name: "x",
                    },
                ),
                op: Eq,
                right: Literal(
                    Int32(1),
                    None,
                ),
            },
        ),
        op: Or,
        right: BinaryExpr(
            BinaryExpr {
                left: Column(
                    Column {
                        relation: None,
                        name: "x",
                    },
                ),
                op: Eq,
                right: Literal(
                    Int32(NULL),
                    None,
                ),
            },
        ),
    },
)

Expected behavior

No response

Additional context

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions