@@ -51,14 +51,18 @@ use arrow::compute::{cast, concat};
5151use arrow:: datatypes:: { DataType , Field , Schema , SchemaRef } ;
5252use datafusion_common:: config:: { CsvOptions , JsonOptions } ;
5353use datafusion_common:: {
54- exec_err, not_impl_err, plan_err, Column , DFSchema , DataFusionError , ParamValues ,
55- SchemaError , UnnestOptions ,
54+ exec_err, not_impl_err, plan_datafusion_err , plan_err, Column , DFSchema ,
55+ DataFusionError , ParamValues , ScalarValue , SchemaError , UnnestOptions ,
5656} ;
57- use datafusion_expr:: dml:: InsertOp ;
58- use datafusion_expr:: { case, is_null, lit, SortExpr } ;
5957use datafusion_expr:: {
60- utils:: COUNT_STAR_EXPANSION , TableProviderFilterPushDown , UNNAMED_TABLE ,
58+ case,
59+ dml:: InsertOp ,
60+ expr:: { Alias , ScalarFunction } ,
61+ is_null, lit,
62+ utils:: COUNT_STAR_EXPANSION ,
63+ SortExpr , TableProviderFilterPushDown , UNNAMED_TABLE ,
6164} ;
65+ use datafusion_functions:: core:: coalesce;
6266use datafusion_functions_aggregate:: expr_fn:: {
6367 avg, count, max, median, min, stddev, sum,
6468} ;
@@ -1930,6 +1934,89 @@ impl DataFrame {
19301934 plan,
19311935 } )
19321936 }
1937+
1938+ /// Fill null values in specified columns with a given value
1939+ /// If no columns are specified (empty vector), applies to all columns
1940+ /// Only fills if the value can be cast to the column's type
1941+ ///
1942+ /// # Arguments
1943+ /// * `value` - Value to fill nulls with
1944+ /// * `columns` - List of column names to fill. If empty, fills all columns.
1945+ ///
1946+ /// # Example
1947+ /// ```
1948+ /// # use datafusion::prelude::*;
1949+ /// # use datafusion::error::Result;
1950+ /// # use datafusion_common::ScalarValue;
1951+ /// # #[tokio::main]
1952+ /// # async fn main() -> Result<()> {
1953+ /// let ctx = SessionContext::new();
1954+ /// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
1955+ /// // Fill nulls in only columns "a" and "c":
1956+ /// let df = df.fill_null(ScalarValue::from(0), vec!["a".to_owned(), "c".to_owned()])?;
1957+ /// // Fill nulls across all columns:
1958+ /// let df = df.fill_null(ScalarValue::from(0), vec![])?;
1959+ /// # Ok(())
1960+ /// # }
1961+ /// ```
1962+ pub fn fill_null (
1963+ & self ,
1964+ value : ScalarValue ,
1965+ columns : Vec < String > ,
1966+ ) -> Result < DataFrame > {
1967+ let cols = if columns. is_empty ( ) {
1968+ self . logical_plan ( )
1969+ . schema ( )
1970+ . fields ( )
1971+ . iter ( )
1972+ . map ( |f| f. as_ref ( ) . clone ( ) )
1973+ . collect ( )
1974+ } else {
1975+ self . find_columns ( & columns) ?
1976+ } ;
1977+
1978+ // Create projections for each column
1979+ let projections = self
1980+ . logical_plan ( )
1981+ . schema ( )
1982+ . fields ( )
1983+ . iter ( )
1984+ . map ( |field| {
1985+ if cols. contains ( field) {
1986+ // Try to cast fill value to column type. If the cast fails, fallback to the original column.
1987+ match value. clone ( ) . cast_to ( field. data_type ( ) ) {
1988+ Ok ( fill_value) => Expr :: Alias ( Alias {
1989+ expr : Box :: new ( Expr :: ScalarFunction ( ScalarFunction {
1990+ func : coalesce ( ) ,
1991+ args : vec ! [ col( field. name( ) ) , lit( fill_value) ] ,
1992+ } ) ) ,
1993+ relation : None ,
1994+ name : field. name ( ) . to_string ( ) ,
1995+ } ) ,
1996+ Err ( _) => col ( field. name ( ) ) ,
1997+ }
1998+ } else {
1999+ col ( field. name ( ) )
2000+ }
2001+ } )
2002+ . collect :: < Vec < _ > > ( ) ;
2003+
2004+ self . clone ( ) . select ( projections)
2005+ }
2006+
2007+ // Helper to find columns from names
2008+ fn find_columns ( & self , names : & [ String ] ) -> Result < Vec < Field > > {
2009+ let schema = self . logical_plan ( ) . schema ( ) ;
2010+ names
2011+ . iter ( )
2012+ . map ( |name| {
2013+ schema
2014+ . field_with_name ( None , name)
2015+ . cloned ( )
2016+ . map_err ( |_| plan_datafusion_err ! ( "Column '{}' not found" , name) )
2017+ } )
2018+ . collect ( )
2019+ }
19332020}
19342021
19352022#[ derive( Debug ) ]
0 commit comments