Skip to content

Commit e54894c

Browse files
Bug-fix in Filter and Limit statistics (#8094)
* Bug fix and code simplification * Remove limit stats changes * Test added * Reduce code diff
1 parent 43cc870 commit e54894c

File tree

3 files changed

+45
-5
lines changed

3 files changed

+45
-5
lines changed

datafusion/common/src/stats.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,9 +279,11 @@ pub struct ColumnStatistics {
279279
impl ColumnStatistics {
280280
/// Column contains a single non null value (e.g constant).
281281
pub fn is_singleton(&self) -> bool {
282-
match (self.min_value.get_value(), self.max_value.get_value()) {
282+
match (&self.min_value, &self.max_value) {
283283
// Min and max values are the same and not infinity.
284-
(Some(min), Some(max)) => !min.is_null() && !max.is_null() && (min == max),
284+
(Precision::Exact(min), Precision::Exact(max)) => {
285+
!min.is_null() && !max.is_null() && (min == max)
286+
}
285287
(_, _) => false,
286288
}
287289
}

datafusion/core/src/datasource/statistics.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,11 @@ pub async fn get_statistics_with_limit(
7070
// files. This only applies when we know the number of rows. It also
7171
// currently ignores tables that have no statistics regarding the
7272
// number of rows.
73-
if num_rows.get_value().unwrap_or(&usize::MIN) <= &limit.unwrap_or(usize::MAX) {
73+
let conservative_num_rows = match num_rows {
74+
Precision::Exact(nr) => nr,
75+
_ => usize::MIN,
76+
};
77+
if conservative_num_rows <= limit.unwrap_or(usize::MAX) {
7478
while let Some(current) = all_files.next().await {
7579
let (file, file_stats) = current?;
7680
result_files.push(file);

datafusion/physical-plan/src/filter.rs

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,13 +252,25 @@ fn collect_new_statistics(
252252
},
253253
)| {
254254
let closed_interval = interval.close_bounds();
255+
let (min_value, max_value) =
256+
if closed_interval.lower.value.eq(&closed_interval.upper.value) {
257+
(
258+
Precision::Exact(closed_interval.lower.value),
259+
Precision::Exact(closed_interval.upper.value),
260+
)
261+
} else {
262+
(
263+
Precision::Inexact(closed_interval.lower.value),
264+
Precision::Inexact(closed_interval.upper.value),
265+
)
266+
};
255267
ColumnStatistics {
256268
null_count: match input_column_stats[idx].null_count.get_value() {
257269
Some(nc) => Precision::Inexact(*nc),
258270
None => Precision::Absent,
259271
},
260-
max_value: Precision::Inexact(closed_interval.upper.value),
261-
min_value: Precision::Inexact(closed_interval.lower.value),
272+
max_value,
273+
min_value,
262274
distinct_count: match distinct_count.get_value() {
263275
Some(dc) => Precision::Inexact(*dc),
264276
None => Precision::Absent,
@@ -963,4 +975,26 @@ mod tests {
963975

964976
Ok(())
965977
}
978+
979+
#[tokio::test]
980+
async fn test_statistics_with_constant_column() -> Result<()> {
981+
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
982+
let input = Arc::new(StatisticsExec::new(
983+
Statistics::new_unknown(&schema),
984+
schema,
985+
));
986+
// WHERE a = 10
987+
let predicate = Arc::new(BinaryExpr::new(
988+
Arc::new(Column::new("a", 0)),
989+
Operator::Eq,
990+
Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
991+
));
992+
let filter: Arc<dyn ExecutionPlan> =
993+
Arc::new(FilterExec::try_new(predicate, input)?);
994+
let filter_statistics = filter.statistics()?;
995+
// First column is "a", and it is a column with only one value after the filter.
996+
assert!(filter_statistics.column_statistics[0].is_singleton());
997+
998+
Ok(())
999+
}
9661000
}

0 commit comments

Comments
 (0)