MDEV-36761: Implement NULL-aware cardinality estimation for indexed columns

mariadb-OlegSmirnov · mariadb-OlegSmirnov · commit bb5d8c9caaa0 · 2025-10-01T21:25:17.000+03:00
When all values in an indexed column are NULL, engine-independent table
statistics (EITS) report avg_frequency == 0 for the index. This commit adds
logic to distinguish between the "no statistics available" and
"all values are NULL" scenarios.

For NULL-rejecting conditions (e.g., t1.col = t2.col), when statistics
confirm all indexed values are NULL, the optimizer can now return a
very low cardinality estimate (1.0) instead of unknown (0.0), since
NULL = NULL never matches.

For non-NULL-rejecting conditions (e.g., t1.col <=> t2.col),
normal cardinality estimation continues to apply since matches are possible.

Changes:
- Added KEY::rec_per_key_null_aware() to check nulls_ratio from column
  statistics when avg_frequency is 0
- Modified best_access_path() in sql_select.cc to use the new
  rec_per_key_null_aware() method for ref access cost estimation
- The optimization works with single-column and composite indexes,
  checking each key part's NULL-rejecting status via notnull_part bitmap
diff --git a/mysql-test/main/mdev-36761.result b/mysql-test/main/mdev-36761.result
@@ -0,0 +1,109 @@
+# Small driving table
+CREATE TABLE t1 (a INT, b INT);
+INSERT INTO t1 VALUES (1, 1), (2, 2000),(3,300);
+ANALYZE TABLE t1 PERSISTENT FOR ALL;
+Table	Op	Msg_type	Msg_text
+test.t1	analyze	status	Engine-independent statistics collected
+test.t1	analyze	status	OK
+# Table that will be accessed by an index lookup (`ref` access)
+CREATE TABLE t2 (a INT, b INT, KEY key_b(b));
+# All t11.b values are NULL
+INSERT INTO t2 SELECT seq/100,  NULL FROM seq_1_to_1000;
+ANALYZE TABLE t2 PERSISTENT FOR ALL;
+Table	Op	Msg_type	Msg_text
+test.t2	analyze	status	Engine-independent statistics collected
+test.t2	analyze	status	Table is already up to date
+# NULL-rejecting equality t1.b = t2.b will not return any matches
+# because all values of t2.b are NULL. So "rows" = 1 for t2 where 1 is
+# a special value meaning "very few" rows
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b = t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t2	ref	key_b	key_b	5	test.t1.b	1	100.00	Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`a` and `test`.`t2`.`b` = `test`.`t1`.`b`
+# However, rows estimation for not NULL-rejecting conditions
+# must not be affected ("rows" > 1 is expected)
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b <=> t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	
+1	SIMPLE	t2	ref	key_b	key_b	5	test.t1.b	11	100.00	Using index condition; Using where
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`a` and `test`.`t1`.`b` <=> `test`.`t2`.`b`
+ANALYZE SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b <=> t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	3.00	100.00	100.00	
+1	SIMPLE	t2	ref	key_b	key_b	5	test.t1.b	11	0.00	100.00	100.00	Using index condition; Using where
+# Test composite index for two columns. Key prefix is used for access
+CREATE TABLE t3 (a INT, b INT, KEY key_ab(a,b));
+# All t3.b values are NULL
+INSERT INTO t3 SELECT seq/100,  NULL FROM seq_1_to_1000;
+ANALYZE TABLE t3 PERSISTENT FOR ALL;
+Table	Op	Msg_type	Msg_text
+test.t3	analyze	status	Engine-independent statistics collected
+test.t3	analyze	status	Table is already up to date
+# NULL-rejecting equality t1.b = t3.b, same as above.
+# "rows" must be estimated to 1
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t1.b = t3.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t3	ref	key_ab	key_ab	10	test.t1.a,test.t1.b	1	100.00	Using index
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t3` where `test`.`t3`.`a` = `test`.`t1`.`a` and `test`.`t3`.`b` = `test`.`t1`.`b`
+# Rows estimation for not NULL-rejecting conditions are not affected
+# ("rows" > 1 is expected)
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t3	ref	key_ab	key_ab	5	test.t1.a	90	100.00	Using index
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t3` where `test`.`t3`.`a` = `test`.`t1`.`a`
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t1.b <=> t3.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t3	ref	key_ab	key_ab	10	test.t1.a,test.t1.b	11	100.00	Using where; Using index
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t3` where `test`.`t3`.`a` = `test`.`t1`.`a` and `test`.`t1`.`b` <=> `test`.`t3`.`b`
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t3.b is NULL;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t3	ref	key_ab	key_ab	10	test.t1.a,const	11	100.00	Using where; Using index
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t3` where `test`.`t3`.`a` = `test`.`t1`.`a` and `test`.`t3`.`b` is null
+OLEGS: update t2 so that there are not only NULLs, collect the stats and re-test
+# Test composite index for 3 columns. Key prefix is used for access
+CREATE TABLE t4 (a INT, b INT, c INT, KEY key_abc(a,b,c));
+# All t3.b values are NULL
+INSERT INTO t4 SELECT seq/10,  NULL, seq/10 FROM seq_1_to_1000;
+ANALYZE TABLE t4 PERSISTENT FOR ALL;
+Table	Op	Msg_type	Msg_text
+test.t4	analyze	status	Engine-independent statistics collected
+test.t4	analyze	status	Table is already up to date
+# NULL-rejecting equality t1.b = t3.b, same as above.
+# "rows" must be estimated to 1
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b = t4.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t4	ref	key_abc	key_abc	10	test.t1.a,test.t1.b	1	100.00	Using index
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t4`.`c` AS `c` from `test`.`t1` join `test`.`t4` where `test`.`t4`.`a` = `test`.`t1`.`a` and `test`.`t4`.`b` = `test`.`t1`.`b`
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b = t4.b and t1.b = t4.c;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t4	ref	key_abc	key_abc	15	test.t1.a,test.t1.b,test.t1.b	1	100.00	Using index
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t4`.`c` AS `c` from `test`.`t1` join `test`.`t4` where `test`.`t4`.`a` = `test`.`t1`.`a` and `test`.`t4`.`b` = `test`.`t1`.`b` and `test`.`t4`.`c` = `test`.`t1`.`b`
+# "rows" expected to be > 1
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t4	ref	key_abc	key_abc	5	test.t1.a	9	100.00	Using index
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t4`.`c` AS `c` from `test`.`t1` join `test`.`t4` where `test`.`t4`.`a` = `test`.`t1`.`a`
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b <=> t4.c;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	3	100.00	Using where
+1	SIMPLE	t4	ref	key_abc	key_abc	5	test.t1.a	9	100.00	Using where; Using index
+Warnings:
+Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t4`.`c` AS `c` from `test`.`t1` join `test`.`t4` where `test`.`t4`.`a` = `test`.`t1`.`a` and `test`.`t1`.`b` <=> `test`.`t4`.`c`
+DROP TABLE t1, t2, t3, t4;
diff --git a/mysql-test/main/mdev-36761.test b/mysql-test/main/mdev-36761.test
@@ -0,0 +1,68 @@
+# MDEV-36761: ref-access row estimates when an indexed column holds only NULLs.
+# The tables below are populated from seq_1_to_1000, hence the sequence check.
+--source include/have_sequence.inc
+
+--echo # Small driving table
+CREATE TABLE t1 (a INT, b INT);
+INSERT INTO t1 VALUES (1, 1), (2, 2000),(3,300);
+
+# PERSISTENT FOR ALL makes ANALYZE collect engine-independent statistics
+ANALYZE TABLE t1 PERSISTENT FOR ALL;
+
+--echo # Table that will be accessed by an index lookup (`ref` access)
+CREATE TABLE t2 (a INT, b INT, KEY key_b(b));
+# NOTE(review): "t11.b" in the echo below looks like a typo for t2.b. The
+# echoed text is recorded in the .result file, so fix both files together.
+--echo # All t11.b values are NULL
+INSERT INTO t2 SELECT seq/100,  NULL FROM seq_1_to_1000;
+
+ANALYZE TABLE t2 PERSISTENT FOR ALL;
+
+--echo # NULL-rejecting equality t1.b = t2.b will not return any matches
+--echo # because all values of t2.b are NULL. So "rows" = 1 for t2 where 1 is
+--echo # a special value meaning "very few" rows
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b = t2.b;
+
+--echo # However, rows estimation for not NULL-rejecting conditions
+--echo # must not be affected ("rows" > 1 is expected)
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b <=> t2.b;
+
+# Same null-safe join, but executed: the output adds r_rows / r_filtered
+ANALYZE SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b <=> t2.b;
+
+--echo # Test composite index for two columns. Key prefix is used for access
+CREATE TABLE t3 (a INT, b INT, KEY key_ab(a,b));
+--echo # All t3.b values are NULL
+INSERT INTO t3 SELECT seq/100,  NULL FROM seq_1_to_1000;
+
+ANALYZE TABLE t3 PERSISTENT FOR ALL;
+
+--echo # NULL-rejecting equality t1.b = t3.b, same as above.
+--echo # "rows" must be estimated to 1
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t1.b = t3.b;
+
+--echo # Rows estimation for not NULL-rejecting conditions are not affected
+--echo # ("rows" > 1 is expected)
+# Join on the first key part only; the all-NULL column b is not referenced
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a;
+
+# Null-safe equality on b: NULLs may match, so the estimate must stay > 1
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t1.b <=> t3.b;
+
+# IS NULL on b selects the NULL rows, so the estimate must stay > 1 as well
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t3.b is NULL;
+
+# NOTE(review): the echo below is an author's to-do that ends up in the
+# recorded .result; the mixed NULL / non-NULL case it asks for is not
+# covered anywhere in this test yet.
+--echo OLEGS: update t2 so that there are not only NULLs, collect the stats and re-test
+
+--echo # Test composite index for 3 columns. Key prefix is used for access
+CREATE TABLE t4 (a INT, b INT, c INT, KEY key_abc(a,b,c));
+
+# NOTE(review): the echoes in this section mention t3.b, but the table being
+# populated and queried is t4 (its column b is the all-NULL one). The echoed
+# text is recorded in the .result file, so fix both files together.
+--echo # All t3.b values are NULL
+INSERT INTO t4 SELECT seq/10,  NULL, seq/10 FROM seq_1_to_1000;
+
+ANALYZE TABLE t4 PERSISTENT FOR ALL;
+
+--echo # NULL-rejecting equality t1.b = t3.b, same as above.
+--echo # "rows" must be estimated to 1
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b = t4.b;
+
+# All three key parts are used; b is still compared with a NULL-rejecting "="
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b = t4.b and t1.b = t4.c;
+
+--echo # "rows" expected to be > 1
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a;
+
+# There is no condition on b here, so only key part a is used for the lookup
+# (the recorded plan shows key_len 5 and ref test.t1.a)
+EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b <=> t4.c;
+
+DROP TABLE t1, t2, t3, t4;
+
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
@@ -8950,7 +8950,8 @@ best_access_path(JOIN      *join,
             }
             else
             {
-              if (!(records= keyinfo->actual_rec_per_key(key_parts-1)))
+              if (!(records=
+                    keyinfo->rec_per_key_null_aware(key_parts-1, notnull_part)))
               {                                   /* Prefer longer keys */
                 trace_access_idx.add("rec_per_key_stats_missing", true);
                 records=
@@ -9082,7 +9083,9 @@ best_access_path(JOIN      *join,
             else
             {
               /* Check if we have statistic about the distribution */
-              if ((records= keyinfo->actual_rec_per_key(max_key_part-1)))
+              if ((records=
+                   keyinfo->rec_per_key_null_aware(max_key_part-1,
+                                                   notnull_part)))
               {
                 /* 
                   Fix for the case where the index statistics is too
diff --git a/sql/structs.h b/sql/structs.h
@@ -172,6 +172,8 @@ typedef struct st_key {
   ha_index_option_struct *option_struct;                  /* structure with parsed options */
 
   double actual_rec_per_key(uint i) const;
+  double rec_per_key_null_aware(uint max_key_part,
+                                key_part_map notnull_part) const;
 } KEY;
 
 
diff --git a/sql/table.cc b/sql/table.cc
@@ -10310,6 +10310,62 @@ double KEY::actual_rec_per_key(uint i) const
   return (double) rec_per_key[i];
 }
 
+
+/**
+  Estimate the number of records per key value for an index prefix, with
+  special handling of columns that contain nothing but NULLs.
+
+  @param max_key_part  0-based number of the last key part of the prefix
+  @param notnull_part  Bitmap of key parts compared with a NULL-rejecting
+                       condition (bit N corresponds to key part N)
+
+  @return
+    - the engine-provided rec_per_key[] value when EITS is not in use
+      (0 if the array is absent);
+    - the EITS avg_frequency of the prefix when it is non-zero;
+    - 1.0 when avg_frequency is 0 and some NULL-rejecting key part has
+      nulls_ratio == 1.0 in its column statistics;
+    - 0 otherwise, which callers treat as "no statistics".
+*/
+double KEY::rec_per_key_null_aware(uint max_key_part,
+                                   key_part_map notnull_part) const
+{
+  if (!is_statistics_from_stat_tables)
+  {
+    /* No EITS for this index: rely on what the storage engine reported */
+    return rec_per_key ? (double) rec_per_key[max_key_part] : 0;
+  }
+
+  const double avg_frequency= read_stats->get_avg_frequency(max_key_part);
+  if (avg_frequency != 0.0)
+    return avg_frequency;
+
+  /*
+    avg_frequency == 0: EITS has no usable figure for this prefix, which
+    typically happens when the indexed values are all NULL.
+
+    A single all-NULL column is enough to rule out matches, provided the
+    condition on it is NULL-rejecting (`key_col = expr` is never true for
+    NULL). A null-safe comparison (`key_col <=> expr`) can still match
+    NULLs, so such key parts must not trigger the shortcut.
+  */
+  for (int part= (int) max_key_part; part >= 0; part--)
+  {
+    const bool null_rejecting=
+      (notnull_part & ((key_part_map) 1 << part)) != 0;
+    if (!null_rejecting)
+      continue;
+
+    auto *col_stats= key_part[part].field->read_stats;
+    /* 1.0 is the conventional "very few rows" estimate, not a real count */
+    if (col_stats && col_stats->get_nulls_ratio() == 1.0)
+      return 1.0;
+  }
+
+  /* Nothing proved the prefix empty: report the (zero) EITS value */
+  return avg_frequency;
+}
+
 /*
    find total number of field in hash expr
 */

Original file line number	Diff line number	Diff line change
`@@ -8950,7 +8950,8 @@ best_access_path(JOIN *join,`
`8950`	`8950`	`}`
`8951`	`8951`	`else`
`8952`	`8952`	`{`
`8953`		`- if (!(records= keyinfo->actual_rec_per_key(key_parts-1)))`
	`8953`	`+ if (!(records=`
	`8954`	`+ keyinfo->rec_per_key_null_aware(key_parts-1, notnull_part)))`
`8954`	`8955`	`{ /* Prefer longer keys */`
`8955`	`8956`	`trace_access_idx.add("rec_per_key_stats_missing", true);`
`8956`	`8957`	`records=`
`@@ -9082,7 +9083,9 @@ best_access_path(JOIN *join,`
`9082`	`9083`	`else`
`9083`	`9084`	`{`
`9084`	`9085`	`/* Check if we have statistic about the distribution */`
`9085`		`- if ((records= keyinfo->actual_rec_per_key(max_key_part-1)))`
	`9086`	`+ if ((records=`
	`9087`	`+ keyinfo->rec_per_key_null_aware(max_key_part-1,`
	`9088`	`+ notnull_part)))`
`9086`	`9089`	`{`
`9087`	`9090`	`/*`
`9088`	`9091`	`Fix for the case where the index statistics is too`