Skip to content

Commit 2fd14a4

Browse files
authored
fix: Correct null_count in describe() (#10260)
* fix: Correct null_count in describe()
* chore: fix fmt
* chore: Fix ci
* fix: Update comment
* fix: refactor null_count calculation in describe() and add test
* chore
1 parent ac21e77 commit 2fd14a4

File tree

2 files changed

+42
-4
lines changed

2 files changed

+42
-4
lines changed

datafusion/core/src/dataframe/mod.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,12 @@ use datafusion_common::config::{CsvOptions, FormatOptions, JsonOptions};
4848
use datafusion_common::{
4949
plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions,
5050
};
51+
use datafusion_expr::lit;
5152
use datafusion_expr::{
52-
avg, count, is_null, max, median, min, stddev, utils::COUNT_STAR_EXPANSION,
53+
avg, count, max, median, min, stddev, utils::COUNT_STAR_EXPANSION,
5354
TableProviderFilterPushDown, UNNAMED_TABLE,
5455
};
56+
use datafusion_expr::{case, is_null, sum};
5557

5658
use async_trait::async_trait;
5759

@@ -534,7 +536,13 @@ impl DataFrame {
534536
vec![],
535537
original_schema_fields
536538
.clone()
537-
.map(|f| count(is_null(col(f.name()))).alias(f.name()))
539+
.map(|f| {
540+
sum(case(is_null(col(f.name())))
541+
.when(lit(true), lit(1))
542+
.otherwise(lit(0))
543+
.unwrap())
544+
.alias(f.name())
545+
})
538546
.collect::<Vec<_>>(),
539547
),
540548
// mean aggregation

datafusion/core/tests/dataframe/describe.rs

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ async fn describe() -> Result<()> {
3939
"| describe | id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col | year | month |",
4040
"+------------+-------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------------+-------------------------+--------------------+-------------------+",
4141
"| count | 7300.0 | 7300 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300 | 7300 | 7300 | 7300.0 | 7300.0 |",
42-
"| null_count | 7300.0 | 7300 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300.0 | 7300 | 7300 | 7300 | 7300.0 | 7300.0 |",
42+
"| null_count | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0.0 | 0.0 |",
4343
"| mean | 3649.5 | null | 4.5 | 4.5 | 4.5 | 45.0 | 4.949999964237213 | 45.45 | null | null | null | 2009.5 | 6.526027397260274 |",
4444
"| std | 2107.472815166704 | null | 2.8724780750809518 | 2.8724780750809518 | 2.8724780750809518 | 28.724780750809533 | 3.1597258182544645 | 29.012028558317645 | null | null | null | 0.5000342500942125 | 3.44808750051728 |",
4545
"| min | 0.0 | null | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 01/01/09 | 0 | 2008-12-31T23:00:00 | 2009.0 | 1.0 |",
@@ -69,7 +69,7 @@ async fn describe_boolean_binary() -> Result<()> {
6969
"| describe | a | b |",
7070
"+------------+------+------+",
7171
"| count | 1 | 1 |",
72-
"| null_count | 1 | 1 |",
72+
"| null_count | 0 | 0 |",
7373
"| mean | null | null |",
7474
"| std | null | null |",
7575
"| min | a | null |",
@@ -81,6 +81,36 @@ async fn describe_boolean_binary() -> Result<()> {
8181
Ok(())
8282
}
8383

84+
#[tokio::test]
85+
async fn describe_null() -> Result<()> {
86+
let ctx = parquet_context().await;
87+
88+
// add test case for a column that contains only null values
89+
let result = ctx
90+
.sql("select 'a' as a, null as b")
91+
.await?
92+
.describe()
93+
.await?
94+
.collect()
95+
.await?;
96+
#[rustfmt::skip]
97+
let expected = [
98+
"+------------+------+------+",
99+
"| describe | a | b |",
100+
"+------------+------+------+",
101+
"| count | 1 | 0 |",
102+
"| null_count | 0 | 1 |",
103+
"| mean | null | null |",
104+
"| std | null | null |",
105+
"| min | null | null |",
106+
"| max | null | null |",
107+
"| median | null | null |",
108+
"+------------+------+------+"
109+
];
110+
assert_batches_eq!(expected, &result);
111+
Ok(())
112+
}
113+
84114
/// Return a SessionContext with parquet file registered
85115
async fn parquet_context() -> SessionContext {
86116
let ctx = SessionContext::new();

0 commit comments

Comments
 (0)