Skip to content

Commit bb5d8c9

Browse files
MDEV-36761: Implement NULL-aware cardinality estimation for indexed columns
When all values in an indexed column are NULL, EITS statistics show avg_frequency == 0. This commit adds logic to distinguish between "no statistics available" and "all values are NULL" scenarios. For NULL-rejecting conditions (e.g., t1.col = t2.col), when statistics confirm all indexed values are NULL, the optimizer can now return a very low cardinality estimate (1.0) instead of unknown (0.0), since NULL = NULL never matches. For non-NULL-rejecting conditions (e.g., t1.col <=> t2.col), normal cardinality estimation continues to apply since matches are possible. Changes: - Added KEY::rec_per_key_null_aware() to check nulls_ratio from column statistics when avg_frequency is 0 - Modified best_access_path() in sql_select.cc to use the new rec_per_key_null_aware() method for ref access cost estimation - The optimization works with single-column and composite indexes, checking each key part's NULL-rejecting status via notnull_part bitmap
1 parent 44ed67d commit bb5d8c9

File tree

5 files changed

+240
-2
lines changed

5 files changed

+240
-2
lines changed

mysql-test/main/mdev-36761.result

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# Small driving table
2+
CREATE TABLE t1 (a INT, b INT);
3+
INSERT INTO t1 VALUES (1, 1), (2, 2000),(3,300);
4+
ANALYZE TABLE t1 PERSISTENT FOR ALL;
5+
Table Op Msg_type Msg_text
6+
test.t1 analyze status Engine-independent statistics collected
7+
test.t1 analyze status OK
8+
# Table that will be accessed by an index lookup (`ref` access)
9+
CREATE TABLE t2 (a INT, b INT, KEY key_b(b));
10+
# All t11.b values are NULL
11+
INSERT INTO t2 SELECT seq/100, NULL FROM seq_1_to_1000;
12+
ANALYZE TABLE t2 PERSISTENT FOR ALL;
13+
Table Op Msg_type Msg_text
14+
test.t2 analyze status Engine-independent statistics collected
15+
test.t2 analyze status Table is already up to date
16+
# NULL-rejecting equality t1.b = t2.b will not return any matches
17+
# because all values of t2.b are NULL. So "rows" = 1 for t2 where 1 is
18+
# a special value meaning "very few" rows
19+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b = t2.b;
20+
id select_type table type possible_keys key key_len ref rows filtered Extra
21+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00 Using where
22+
1 SIMPLE t2 ref key_b key_b 5 test.t1.b 1 100.00 Using where
23+
Warnings:
24+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`a` and `test`.`t2`.`b` = `test`.`t1`.`b`
25+
# However, rows estimation for not NULL-rejecting conditions
26+
# must not be affected ("rows" > 1 is expected)
27+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b <=> t2.b;
28+
id select_type table type possible_keys key key_len ref rows filtered Extra
29+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00
30+
1 SIMPLE t2 ref key_b key_b 5 test.t1.b 11 100.00 Using index condition; Using where
31+
Warnings:
32+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`a` and `test`.`t1`.`b` <=> `test`.`t2`.`b`
33+
ANALYZE SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b <=> t2.b;
34+
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
35+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 3.00 100.00 100.00
36+
1 SIMPLE t2 ref key_b key_b 5 test.t1.b 11 0.00 100.00 100.00 Using index condition; Using where
37+
# Test composite index for two columns. Key prefix is used for access
38+
CREATE TABLE t3 (a INT, b INT, KEY key_ab(a,b));
39+
# All t3.b values are NULL
40+
INSERT INTO t3 SELECT seq/100, NULL FROM seq_1_to_1000;
41+
ANALYZE TABLE t3 PERSISTENT FOR ALL;
42+
Table Op Msg_type Msg_text
43+
test.t3 analyze status Engine-independent statistics collected
44+
test.t3 analyze status Table is already up to date
45+
# NULL-rejecting equality t1.b = t3.b, same as above.
46+
# "rows" must be estimated to 1
47+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t1.b = t3.b;
48+
id select_type table type possible_keys key key_len ref rows filtered Extra
49+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00 Using where
50+
1 SIMPLE t3 ref key_ab key_ab 10 test.t1.a,test.t1.b 1 100.00 Using index
51+
Warnings:
52+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t3` where `test`.`t3`.`a` = `test`.`t1`.`a` and `test`.`t3`.`b` = `test`.`t1`.`b`
53+
# Rows estimation for not NULL-rejecting conditions are not affected
54+
# ("rows" > 1 is expected)
55+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a;
56+
id select_type table type possible_keys key key_len ref rows filtered Extra
57+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00 Using where
58+
1 SIMPLE t3 ref key_ab key_ab 5 test.t1.a 90 100.00 Using index
59+
Warnings:
60+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t3` where `test`.`t3`.`a` = `test`.`t1`.`a`
61+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t1.b <=> t3.b;
62+
id select_type table type possible_keys key key_len ref rows filtered Extra
63+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00 Using where
64+
1 SIMPLE t3 ref key_ab key_ab 10 test.t1.a,test.t1.b 11 100.00 Using where; Using index
65+
Warnings:
66+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t3` where `test`.`t3`.`a` = `test`.`t1`.`a` and `test`.`t1`.`b` <=> `test`.`t3`.`b`
67+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t3.b is NULL;
68+
id select_type table type possible_keys key key_len ref rows filtered Extra
69+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00 Using where
70+
1 SIMPLE t3 ref key_ab key_ab 10 test.t1.a,const 11 100.00 Using where; Using index
71+
Warnings:
72+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t3` where `test`.`t3`.`a` = `test`.`t1`.`a` and `test`.`t3`.`b` is null
73+
OLEGS: update t2 so that there are not only NULLs, collect the stats and re-test
74+
# Test composite index for 3 columns. Key prefix is used for access
75+
CREATE TABLE t4 (a INT, b INT, c INT, KEY key_abc(a,b,c));
76+
# All t3.b values are NULL
77+
INSERT INTO t4 SELECT seq/10, NULL, seq/10 FROM seq_1_to_1000;
78+
ANALYZE TABLE t4 PERSISTENT FOR ALL;
79+
Table Op Msg_type Msg_text
80+
test.t4 analyze status Engine-independent statistics collected
81+
test.t4 analyze status Table is already up to date
82+
# NULL-rejecting equality t1.b = t3.b, same as above.
83+
# "rows" must be estimated to 1
84+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b = t4.b;
85+
id select_type table type possible_keys key key_len ref rows filtered Extra
86+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00 Using where
87+
1 SIMPLE t4 ref key_abc key_abc 10 test.t1.a,test.t1.b 1 100.00 Using index
88+
Warnings:
89+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t4`.`c` AS `c` from `test`.`t1` join `test`.`t4` where `test`.`t4`.`a` = `test`.`t1`.`a` and `test`.`t4`.`b` = `test`.`t1`.`b`
90+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b = t4.b and t1.b = t4.c;
91+
id select_type table type possible_keys key key_len ref rows filtered Extra
92+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00 Using where
93+
1 SIMPLE t4 ref key_abc key_abc 15 test.t1.a,test.t1.b,test.t1.b 1 100.00 Using index
94+
Warnings:
95+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t4`.`c` AS `c` from `test`.`t1` join `test`.`t4` where `test`.`t4`.`a` = `test`.`t1`.`a` and `test`.`t4`.`b` = `test`.`t1`.`b` and `test`.`t4`.`c` = `test`.`t1`.`b`
96+
# "rows" expected to be > 1
97+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a;
98+
id select_type table type possible_keys key key_len ref rows filtered Extra
99+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00 Using where
100+
1 SIMPLE t4 ref key_abc key_abc 5 test.t1.a 9 100.00 Using index
101+
Warnings:
102+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t4`.`c` AS `c` from `test`.`t1` join `test`.`t4` where `test`.`t4`.`a` = `test`.`t1`.`a`
103+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b <=> t4.c;
104+
id select_type table type possible_keys key key_len ref rows filtered Extra
105+
1 SIMPLE t1 ALL NULL NULL NULL NULL 3 100.00 Using where
106+
1 SIMPLE t4 ref key_abc key_abc 5 test.t1.a 9 100.00 Using where; Using index
107+
Warnings:
108+
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t4`.`a` AS `a`,`test`.`t4`.`b` AS `b`,`test`.`t4`.`c` AS `c` from `test`.`t1` join `test`.`t4` where `test`.`t4`.`a` = `test`.`t1`.`a` and `test`.`t1`.`b` <=> `test`.`t4`.`c`
109+
DROP TABLE t1, t2, t3, t4;

mysql-test/main/mdev-36761.test

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
--source include/have_sequence.inc
2+
3+
--echo # Small driving table
4+
CREATE TABLE t1 (a INT, b INT);
5+
INSERT INTO t1 VALUES (1, 1), (2, 2000),(3,300);
6+
7+
ANALYZE TABLE t1 PERSISTENT FOR ALL;
8+
9+
--echo # Table that will be accessed by an index lookup (`ref` access)
10+
CREATE TABLE t2 (a INT, b INT, KEY key_b(b));
11+
--echo # All t11.b values are NULL
12+
INSERT INTO t2 SELECT seq/100, NULL FROM seq_1_to_1000;
13+
14+
ANALYZE TABLE t2 PERSISTENT FOR ALL;
15+
16+
--echo # NULL-rejecting equality t1.b = t2.b will not return any matches
17+
--echo # because all values of t2.b are NULL. So "rows" = 1 for t2 where 1 is
18+
--echo # a special value meaning "very few" rows
19+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b = t2.b;
20+
21+
--echo # However, rows estimation for not NULL-rejecting conditions
22+
--echo # must not be affected ("rows" > 1 is expected)
23+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b <=> t2.b;
24+
25+
ANALYZE SELECT * FROM t1 JOIN t2 ON t1.a = t2.a AND t1.b <=> t2.b;
26+
27+
--echo # Test composite index for two columns. Key prefix is used for access
28+
CREATE TABLE t3 (a INT, b INT, KEY key_ab(a,b));
29+
--echo # All t3.b values are NULL
30+
INSERT INTO t3 SELECT seq/100, NULL FROM seq_1_to_1000;
31+
32+
ANALYZE TABLE t3 PERSISTENT FOR ALL;
33+
34+
--echo # NULL-rejecting equality t1.b = t3.b, same as above.
35+
--echo # "rows" must be estimated to 1
36+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t1.b = t3.b;
37+
38+
--echo # Rows estimation for not NULL-rejecting conditions are not affected
39+
--echo # ("rows" > 1 is expected)
40+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a;
41+
42+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t1.b <=> t3.b;
43+
44+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t3 ON t1.a = t3.a AND t3.b is NULL;
45+
46+
--echo OLEGS: update t2 so that there are not only NULLs, collect the stats and re-test
47+
48+
--echo # Test composite index for 3 columns. Key prefix is used for access
49+
CREATE TABLE t4 (a INT, b INT, c INT, KEY key_abc(a,b,c));
50+
51+
--echo # All t3.b values are NULL
52+
INSERT INTO t4 SELECT seq/10, NULL, seq/10 FROM seq_1_to_1000;
53+
54+
ANALYZE TABLE t4 PERSISTENT FOR ALL;
55+
56+
--echo # NULL-rejecting equality t1.b = t3.b, same as above.
57+
--echo # "rows" must be estimated to 1
58+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b = t4.b;
59+
60+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b = t4.b and t1.b = t4.c;
61+
62+
--echo # "rows" expected to be > 1
63+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a;
64+
65+
EXPLAIN EXTENDED SELECT * FROM t1 JOIN t4 ON t1.a = t4.a AND t1.b <=> t4.c;
66+
67+
DROP TABLE t1, t2, t3, t4;
68+

sql/sql_select.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8950,7 +8950,8 @@ best_access_path(JOIN *join,
89508950
}
89518951
else
89528952
{
8953-
if (!(records= keyinfo->actual_rec_per_key(key_parts-1)))
8953+
if (!(records=
8954+
keyinfo->rec_per_key_null_aware(key_parts-1, notnull_part)))
89548955
{ /* Prefer longer keys */
89558956
trace_access_idx.add("rec_per_key_stats_missing", true);
89568957
records=
@@ -9082,7 +9083,9 @@ best_access_path(JOIN *join,
90829083
else
90839084
{
90849085
/* Check if we have statistic about the distribution */
9085-
if ((records= keyinfo->actual_rec_per_key(max_key_part-1)))
9086+
if ((records=
9087+
keyinfo->rec_per_key_null_aware(max_key_part-1,
9088+
notnull_part)))
90869089
{
90879090
/*
90889091
Fix for the case where the index statistics is too

sql/structs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ typedef struct st_key {
172172
ha_index_option_struct *option_struct; /* structure with parsed options */
173173

174174
double actual_rec_per_key(uint i) const;
175+
double rec_per_key_null_aware(uint max_key_part,
176+
key_part_map notnull_part) const;
175177
} KEY;
176178

177179

sql/table.cc

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10310,6 +10310,62 @@ double KEY::actual_rec_per_key(uint i) const
1031010310
return (double) rec_per_key[i];
1031110311
}
1031210312

10313+
10314+
double KEY::rec_per_key_null_aware(uint max_key_part,
10315+
key_part_map notnull_part) const
10316+
{
10317+
// Use engine-dependent statistics if EITS is not available
10318+
if (!is_statistics_from_stat_tables)
10319+
{
10320+
if (rec_per_key == nullptr)
10321+
return 0; // No statistics available
10322+
return (double) rec_per_key[max_key_part];
10323+
}
10324+
10325+
// Use engine-independent statistics (EITS)
10326+
double records= read_stats->get_avg_frequency(max_key_part);
10327+
if (records != 0.0)
10328+
return records;
10329+
10330+
/*
10331+
The index statistics show avg_frequency == 0 for this index prefix.
10332+
This typically means all values in the indexed columns are NULL.
10333+
10334+
For NULL-rejecting conditions like `t1.key_col = t2.col`, we know
10335+
there will be no matches (since NULL = NULL is never true).
10336+
However, for non-NULL-rejecting conditions like `t1.key_col <=> t2.col`,
10337+
matches are possible.
10338+
10339+
Check whether all key parts in the prefix have NULL-rejecting conditions
10340+
(indicated by bits set in `notnull_part`). If so, and if the statistics
10341+
confirm all values are NULL (nulls_ratio == 1.0), we can return a very
10342+
low cardinality estimate (1.0) instead of 0.0 (unknown), indicating
10343+
high selectivity with no expected matches.
10344+
*/
10345+
for (int bit= max_key_part; bit >= 0; bit--)
10346+
{
10347+
key_part_map mask = (key_part_map)1 << bit;
10348+
if ((notnull_part & mask) == 0 || !key_part[bit].field->read_stats)
10349+
{
10350+
// This key part has a non-NULL-rejecting condition, or no column statistics
10351+
continue;
10352+
}
10353+
10354+
// Check if all values in this column are NULL according to statistics
10355+
double nulls_ratio= key_part[bit].field->read_stats->get_nulls_ratio();
10356+
if (nulls_ratio == 1.0)
10357+
{
10358+
/*
10359+
All values are NULL and the condition is NULL-rejecting.
10360+
Return 1.0 (very low cardinality) instead of 0.0 (unknown),
10361+
indicating this index prefix is highly selective with no expected matches.
10362+
*/
10363+
return 1.0;
10364+
}
10365+
}
10366+
return records;
10367+
}
10368+
1031310369
/*
1031410370
find total number of field in hash expr
1031510371
*/

0 commit comments

Comments
 (0)