Closed
Description
Describe the bug
Array::is_null
for dictionary arrays doesn't account for nulls in the values array. This causes hard-to-debug issues.
For example, the dictionary below contains only null values. When we run a distinct count on it, the result is sometimes 1 (incorrect) and sometimes 0.
To Reproduce
use arrow::array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringArray};
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::catalog::MemTable;
use datafusion::prelude::SessionContext;
use datafusion_common::ScalarValue;
use datafusion_execution::config::SessionConfig;
use std::sync::Arc;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let n: usize = 5;
let num = Arc::new(Int32Array::from((0..n as _).collect::<Vec<i32>>())) as ArrayRef;
let dict_values = StringArray::from(vec![None, Some("abc")]);
let dict_indices = Int32Array::from(vec![0; n]);
// all idx point to 0 - which means that all values in the dictionary are None
let dict = DictionaryArray::new(dict_indices, Arc::new(dict_values) as ArrayRef);
let schema = Arc::new(Schema::new(vec![
Field::new("num1", DataType::Int32, false),
Field::new("num2", DataType::Int32, false), // num2 to disable SingleDistinctToGroupBy optimisation
Field::new(
"dict",
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
true,
),
]));
let batch = RecordBatch::try_new(
schema.clone(),
vec![num.clone(), num.clone(), Arc::new(dict)],
)?;
let provider = MemTable::try_new(schema, vec![vec![batch]])?;
let mut session_config = SessionConfig::default();
session_config = session_config.set(
"datafusion.execution.target_partitions",
&ScalarValue::UInt64(Some(1u64)), // won't work with more than 1 partition
);
let ctx = SessionContext::new_with_config(session_config);
ctx.register_table("t", Arc::new(provider))?;
let df = ctx.sql("select count(distinct dict), count(num2) from t group by num1").await?;
// count(distinct ...) doesn't count None values, so the result should be 0
df.show().await?;
Ok(())
}
This will produce
+------------------------+---------------+
| count(DISTINCT t.dict) | count(t.num2) |
+------------------------+---------------+
| 1 | 1 |
| 1 | 1 |
| 1 | 1 |
| 1 | 1 |
| 1 | 1 |
+------------------------+---------------+
dict
only has null values so the correct result should be 0
If you change target partitions to 2
or remove num2
the results will be 0
for all columns which is correct
Expected behavior
We should either disallow nulls in dictionary values or handle them explicitly (the latter is preferable IMO).
Additional context
The fix will likely be in the Arrow repo, but created this issue for discussion and visibility.