Closed
Description
Describe the bug
Nulls are ignored when aggregating dictionary arrays.
To Reproduce
Create a dictionary array with nulls and then aggregate using that dictionary
Here is an example:
async fn dictionary_test() {
let key_builder = PrimitiveBuilder::<Int8Type>::new(100);
let value_builder = StringBuilder::new(100);
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
builder.append("b").unwrap();
builder.append_null().unwrap();
builder.append("b").unwrap();
builder.append("a").unwrap();
builder.append("c").unwrap();
builder.append_null().unwrap();
let array = builder.finish();
print_array(&array);
// Now, let's run a query showing the invalid values are being used as well:
let batch = RecordBatch::try_from_iter(vec![("dict", array)]).unwrap();
let table = MemTable::try_new(batch.schema(), vec![vec![batch]]).unwrap();
let table = Arc::new(table);
// +------+
// | dict |
// +------+
// | b |
// | |
// | b |
// | a |
// | c |
// | |
// +------+
run_query(table.clone(), "select * from t").await;
// now aggregate the query
// (wrong results -- there are only 2 values for b, but it shows up as 4)
// +------+-----------------+
// | dict | COUNT(UInt8(1)) |
// +------+-----------------+
// | a | 1 |
// | c | 1 |
// | b | 4 |
// +------+-----------------+
run_query(table.clone(), "select dict, count(*) from t group by dict").await;
}
Here is a simply reproducer in sql.rs:
diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs
index d9f7c6ea4..a323fc499 100644
--- a/datafusion/tests/sql.rs
+++ b/datafusion/tests/sql.rs
@@ -3019,6 +3019,15 @@ async fn query_on_string_dictionary() -> Result<()> {
let expected = vec![vec!["2"]];
assert_eq!(expected, actual);
+ // grouping
+ let sql = "SELECT d1, COUNT(*) FROM test group by d1";
+ let actual = execute(&mut ctx, sql).await;
+ let expected = vec![
+ vec!["one", "1"],
+ vec!["three", "1"],
+ ];
+ assert_eq!(expected, actual);
+
Ok(())
}
results in
---- query_on_string_dictionary stdout ----
thread 'query_on_string_dictionary' panicked at 'assertion failed: `(left == right)`
left: `[["one", "1"], ["three", "1"]]`,
right: `[["one", "2"], ["three", "1"]]`', datafusion/tests/sql.rs:3029:5
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
Expected behavior
Null entries should be ignored, but instead whatever value in the keys array is at that position is used to find a value.
Additional context
We found this in IOx (where @tustvold clevery used -1
for entries in keys
that were NULL rather than 0
): https://github.com/influxdata/influxdb_iox/issues/2112