Skip to content

[libc++] Optimize ranges::minmax #87335

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 6, 2024
Merged

Conversation

philnik777
Copy link
Contributor

This allows Clang to vectorize the loop.

---------------------------------------------------------------------
Benchmark                                         old             new
---------------------------------------------------------------------
BM_std_minmax<char>/1                        0.659 ns         1.41 ns
BM_std_minmax<char>/2                         1.08 ns         2.16 ns
BM_std_minmax<char>/3                         2.16 ns         2.96 ns
BM_std_minmax<char>/4                         2.82 ns         3.81 ns
BM_std_minmax<char>/5                         3.43 ns         4.69 ns
BM_std_minmax<char>/6                         4.08 ns         5.63 ns
BM_std_minmax<char>/7                         4.75 ns         6.51 ns
BM_std_minmax<char>/8                         5.42 ns         7.41 ns
BM_std_minmax<char>/9                         6.05 ns         8.34 ns
BM_std_minmax<char>/10                        6.68 ns         9.29 ns
BM_std_minmax<char>/11                        7.47 ns         10.6 ns
BM_std_minmax<char>/12                        7.95 ns         11.4 ns
BM_std_minmax<char>/13                        8.64 ns         12.4 ns
BM_std_minmax<char>/14                        9.35 ns         13.4 ns
BM_std_minmax<char>/15                        10.1 ns         14.4 ns
BM_std_minmax<char>/16                        10.6 ns         2.25 ns
BM_std_minmax<char>/17                        11.3 ns         2.82 ns
BM_std_minmax<char>/18                        11.8 ns         3.71 ns
BM_std_minmax<char>/19                        12.6 ns         4.52 ns
BM_std_minmax<char>/20                        13.2 ns         5.47 ns
BM_std_minmax<char>/21                        14.1 ns         6.67 ns
BM_std_minmax<char>/22                        14.5 ns         7.78 ns
BM_std_minmax<char>/23                        15.1 ns         8.67 ns
BM_std_minmax<char>/24                        15.7 ns         9.68 ns
BM_std_minmax<char>/25                        16.4 ns         10.7 ns
BM_std_minmax<char>/26                        17.1 ns         11.7 ns
BM_std_minmax<char>/27                        17.8 ns         12.8 ns
BM_std_minmax<char>/28                        18.4 ns         14.1 ns
BM_std_minmax<char>/29                        19.0 ns         15.0 ns
BM_std_minmax<char>/30                        19.6 ns         16.0 ns
BM_std_minmax<char>/31                        20.2 ns         17.0 ns
BM_std_minmax<char>/32                        20.8 ns         2.46 ns
BM_std_minmax<char>/64                        41.5 ns         2.97 ns
BM_std_minmax<char>/512                        340 ns         6.05 ns
BM_std_minmax<char>/1024                       667 ns         8.83 ns
BM_std_minmax<char>/4000                      2571 ns         28.6 ns
BM_std_minmax<char>/4096                      2632 ns         25.8 ns
BM_std_minmax<char>/5500                      3554 ns         51.1 ns
BM_std_minmax<char>/64000                    41175 ns          480 ns
BM_std_minmax<char>/65536                    42039 ns          490 ns
BM_std_minmax<char>/70000                    44931 ns          528 ns
BM_std_minmax<short>/1                       0.708 ns         1.20 ns
BM_std_minmax<short>/2                        1.18 ns         1.78 ns
BM_std_minmax<short>/3                        1.98 ns         2.42 ns
BM_std_minmax<short>/4                        2.47 ns         3.05 ns
BM_std_minmax<short>/5                        3.09 ns         3.72 ns
BM_std_minmax<short>/6                        3.49 ns         4.37 ns
BM_std_minmax<short>/7                        4.24 ns         5.03 ns
BM_std_minmax<short>/8                        4.65 ns         2.12 ns
BM_std_minmax<short>/9                        5.34 ns         2.51 ns
BM_std_minmax<short>/10                       5.82 ns         3.18 ns
BM_std_minmax<short>/11                       6.36 ns         3.97 ns
BM_std_minmax<short>/12                       6.73 ns         4.68 ns
BM_std_minmax<short>/13                       7.59 ns         5.49 ns
BM_std_minmax<short>/14                       7.77 ns         6.45 ns
BM_std_minmax<short>/15                       8.54 ns         7.55 ns
BM_std_minmax<short>/16                       8.74 ns         2.38 ns
BM_std_minmax<short>/17                       9.59 ns         2.76 ns
BM_std_minmax<short>/18                       9.88 ns         3.37 ns
BM_std_minmax<short>/19                       10.7 ns         4.17 ns
BM_std_minmax<short>/20                       10.9 ns         4.88 ns
BM_std_minmax<short>/21                       12.1 ns         5.70 ns
BM_std_minmax<short>/22                       12.6 ns         6.64 ns
BM_std_minmax<short>/23                       13.5 ns         7.72 ns
BM_std_minmax<short>/24                       13.2 ns         2.87 ns
BM_std_minmax<short>/25                       14.2 ns         3.10 ns
BM_std_minmax<short>/26                       14.2 ns         3.59 ns
BM_std_minmax<short>/27                       15.4 ns         4.35 ns
BM_std_minmax<short>/28                       15.3 ns         5.10 ns
BM_std_minmax<short>/29                       16.2 ns         5.87 ns
BM_std_minmax<short>/30                       16.2 ns         6.88 ns
BM_std_minmax<short>/31                       17.0 ns         7.78 ns
BM_std_minmax<short>/32                       17.2 ns         3.45 ns
BM_std_minmax<short>/64                       34.1 ns         3.35 ns
BM_std_minmax<short>/512                       279 ns         8.37 ns
BM_std_minmax<short>/1024                      549 ns         14.2 ns
BM_std_minmax<short>/4000                     2111 ns         50.1 ns
BM_std_minmax<short>/4096                     2167 ns         47.9 ns
BM_std_minmax<short>/5500                     2895 ns         69.7 ns
BM_std_minmax<short>/64000                   33454 ns          953 ns
BM_std_minmax<short>/65536                   34474 ns          970 ns
BM_std_minmax<short>/70000                   36691 ns         1037 ns
BM_std_minmax<int>/1                         0.664 ns         1.17 ns
BM_std_minmax<int>/2                          1.11 ns         1.69 ns
BM_std_minmax<int>/3                          2.36 ns         2.29 ns
BM_std_minmax<int>/4                          2.53 ns         2.91 ns
BM_std_minmax<int>/5                          3.23 ns         3.56 ns
BM_std_minmax<int>/6                          3.56 ns         4.23 ns
BM_std_minmax<int>/7                          4.28 ns         4.91 ns
BM_std_minmax<int>/8                          4.60 ns         5.60 ns
BM_std_minmax<int>/9                          5.38 ns         6.31 ns
BM_std_minmax<int>/10                         5.69 ns         7.03 ns
BM_std_minmax<int>/11                         6.41 ns         7.70 ns
BM_std_minmax<int>/12                         6.73 ns         8.39 ns
BM_std_minmax<int>/13                         7.38 ns         9.07 ns
BM_std_minmax<int>/14                         7.74 ns         9.79 ns
BM_std_minmax<int>/15                         8.53 ns         10.5 ns
BM_std_minmax<int>/16                         8.79 ns         11.2 ns
BM_std_minmax<int>/17                         9.63 ns         12.0 ns
BM_std_minmax<int>/18                         9.84 ns         12.7 ns
BM_std_minmax<int>/19                         10.6 ns         13.5 ns
BM_std_minmax<int>/20                         11.0 ns         14.3 ns
BM_std_minmax<int>/21                         11.7 ns         15.0 ns
BM_std_minmax<int>/22                         12.0 ns         15.7 ns
BM_std_minmax<int>/23                         13.1 ns         16.5 ns
BM_std_minmax<int>/24                         13.0 ns         17.3 ns
BM_std_minmax<int>/25                         13.7 ns         17.9 ns
BM_std_minmax<int>/26                         14.0 ns         18.6 ns
BM_std_minmax<int>/27                         14.8 ns         19.4 ns
BM_std_minmax<int>/28                         15.1 ns         20.3 ns
BM_std_minmax<int>/29                         15.8 ns         20.9 ns
BM_std_minmax<int>/30                         16.1 ns         21.7 ns
BM_std_minmax<int>/31                         16.9 ns         22.5 ns
BM_std_minmax<int>/32                         17.2 ns         3.40 ns
BM_std_minmax<int>/64                         33.9 ns         4.04 ns
BM_std_minmax<int>/512                         275 ns         14.6 ns
BM_std_minmax<int>/1024                        541 ns         27.5 ns
BM_std_minmax<int>/4000                       2093 ns         96.3 ns
BM_std_minmax<int>/4096                       2146 ns         98.3 ns
BM_std_minmax<int>/5500                       2866 ns          157 ns
BM_std_minmax<int>/64000                     33619 ns         1954 ns
BM_std_minmax<int>/65536                     34252 ns         2009 ns
BM_std_minmax<int>/70000                     36618 ns         2125 ns
BM_std_minmax<long long>/1                   0.709 ns         1.19 ns
BM_std_minmax<long long>/2                    1.01 ns         1.65 ns
BM_std_minmax<long long>/3                    2.14 ns         2.21 ns
BM_std_minmax<long long>/4                    2.45 ns         2.83 ns
BM_std_minmax<long long>/5                    3.09 ns         3.47 ns
BM_std_minmax<long long>/6                    3.44 ns         4.11 ns
BM_std_minmax<long long>/7                    4.16 ns         4.79 ns
BM_std_minmax<long long>/8                    4.54 ns         5.47 ns
BM_std_minmax<long long>/9                    5.37 ns         6.20 ns
BM_std_minmax<long long>/10                   5.71 ns         6.93 ns
BM_std_minmax<long long>/11                   6.00 ns         7.60 ns
BM_std_minmax<long long>/12                   6.43 ns         8.27 ns
BM_std_minmax<long long>/13                   7.01 ns         8.94 ns
BM_std_minmax<long long>/14                   7.45 ns         9.65 ns
BM_std_minmax<long long>/15                   8.16 ns         10.4 ns
BM_std_minmax<long long>/16                   8.46 ns         5.22 ns
BM_std_minmax<long long>/17                   9.16 ns         5.22 ns
BM_std_minmax<long long>/18                   9.53 ns         5.52 ns
BM_std_minmax<long long>/19                   10.2 ns         6.02 ns
BM_std_minmax<long long>/20                   10.5 ns         6.89 ns
BM_std_minmax<long long>/21                   11.3 ns         7.83 ns
BM_std_minmax<long long>/22                   11.6 ns         8.59 ns
BM_std_minmax<long long>/23                   12.3 ns         9.91 ns
BM_std_minmax<long long>/24                   12.6 ns         10.1 ns
BM_std_minmax<long long>/25                   13.2 ns         12.0 ns
BM_std_minmax<long long>/26                   13.6 ns         13.5 ns
BM_std_minmax<long long>/27                   14.2 ns         14.8 ns
BM_std_minmax<long long>/28                   14.7 ns         15.9 ns
BM_std_minmax<long long>/29                   15.3 ns         16.6 ns
BM_std_minmax<long long>/30                   15.8 ns         17.3 ns
BM_std_minmax<long long>/31                   16.3 ns         18.2 ns
BM_std_minmax<long long>/32                   16.7 ns         7.18 ns
BM_std_minmax<long long>/64                   33.1 ns         11.5 ns
BM_std_minmax<long long>/512                   268 ns         71.0 ns
BM_std_minmax<long long>/1024                  532 ns          138 ns
BM_std_minmax<long long>/4000                 2056 ns          533 ns
BM_std_minmax<long long>/4096                 2112 ns          539 ns
BM_std_minmax<long long>/5500                 2823 ns          749 ns
BM_std_minmax<long long>/64000               32956 ns         8590 ns
BM_std_minmax<long long>/65536               33795 ns         8791 ns
BM_std_minmax<long long>/70000               36084 ns         9442 ns
BM_std_minmax<unsigned char>/1               0.714 ns         1.41 ns
BM_std_minmax<unsigned char>/2               0.955 ns         1.96 ns
BM_std_minmax<unsigned char>/3                1.90 ns         2.63 ns
BM_std_minmax<unsigned char>/4                2.40 ns         3.34 ns
BM_std_minmax<unsigned char>/5                2.87 ns         4.10 ns
BM_std_minmax<unsigned char>/6                3.47 ns         4.88 ns
BM_std_minmax<unsigned char>/7                4.04 ns         5.66 ns
BM_std_minmax<unsigned char>/8                4.65 ns         6.45 ns
BM_std_minmax<unsigned char>/9                5.18 ns         7.24 ns
BM_std_minmax<unsigned char>/10               5.80 ns         8.05 ns
BM_std_minmax<unsigned char>/11               6.24 ns         8.86 ns
BM_std_minmax<unsigned char>/12               6.78 ns         9.70 ns
BM_std_minmax<unsigned char>/13               7.30 ns         10.6 ns
BM_std_minmax<unsigned char>/14               7.86 ns         11.4 ns
BM_std_minmax<unsigned char>/15               8.46 ns         12.3 ns
BM_std_minmax<unsigned char>/16               9.00 ns         2.12 ns
BM_std_minmax<unsigned char>/17               9.58 ns         2.83 ns
BM_std_minmax<unsigned char>/18               10.1 ns         3.37 ns
BM_std_minmax<unsigned char>/19               10.7 ns         4.11 ns
BM_std_minmax<unsigned char>/20               11.2 ns         4.85 ns
BM_std_minmax<unsigned char>/21               11.9 ns         5.69 ns
BM_std_minmax<unsigned char>/22               12.3 ns         6.77 ns
BM_std_minmax<unsigned char>/23               13.1 ns         7.56 ns
BM_std_minmax<unsigned char>/24               13.5 ns         8.40 ns
BM_std_minmax<unsigned char>/25               14.2 ns         9.30 ns
BM_std_minmax<unsigned char>/26               14.4 ns         10.1 ns
BM_std_minmax<unsigned char>/27               15.0 ns         11.1 ns
BM_std_minmax<unsigned char>/28               15.3 ns         11.9 ns
BM_std_minmax<unsigned char>/29               16.2 ns         12.9 ns
BM_std_minmax<unsigned char>/30               16.5 ns         13.9 ns
BM_std_minmax<unsigned char>/31               17.2 ns         14.8 ns
BM_std_minmax<unsigned char>/32               17.6 ns         2.36 ns
BM_std_minmax<unsigned char>/64               35.6 ns         3.21 ns
BM_std_minmax<unsigned char>/512               288 ns         6.00 ns
BM_std_minmax<unsigned char>/1024              573 ns         8.80 ns
BM_std_minmax<unsigned char>/4000             2222 ns         28.6 ns
BM_std_minmax<unsigned char>/4096             2265 ns         25.9 ns
BM_std_minmax<unsigned char>/5500             3047 ns         48.8 ns
BM_std_minmax<unsigned char>/64000           35059 ns          480 ns
BM_std_minmax<unsigned char>/65536           35941 ns          491 ns
BM_std_minmax<unsigned char>/70000           38922 ns          525 ns
BM_std_minmax<unsigned short>/1              0.711 ns         1.18 ns
BM_std_minmax<unsigned short>/2              0.957 ns         1.65 ns
BM_std_minmax<unsigned short>/3               2.13 ns         2.21 ns
BM_std_minmax<unsigned short>/4               2.14 ns         2.78 ns
BM_std_minmax<unsigned short>/5               3.06 ns         3.29 ns
BM_std_minmax<unsigned short>/6               2.89 ns         3.87 ns
BM_std_minmax<unsigned short>/7               3.80 ns         4.55 ns
BM_std_minmax<unsigned short>/8               3.68 ns         2.02 ns
BM_std_minmax<unsigned short>/9               4.53 ns         2.40 ns
BM_std_minmax<unsigned short>/10              4.60 ns         2.94 ns
BM_std_minmax<unsigned short>/11              5.67 ns         3.67 ns
BM_std_minmax<unsigned short>/12              5.39 ns         4.22 ns
BM_std_minmax<unsigned short>/13              6.58 ns         4.78 ns
BM_std_minmax<unsigned short>/14              6.33 ns         5.54 ns
BM_std_minmax<unsigned short>/15              7.34 ns         6.30 ns
BM_std_minmax<unsigned short>/16              7.17 ns         2.25 ns
BM_std_minmax<unsigned short>/17              8.19 ns         2.61 ns
BM_std_minmax<unsigned short>/18              8.02 ns         3.19 ns
BM_std_minmax<unsigned short>/19              9.03 ns         3.72 ns
BM_std_minmax<unsigned short>/20              8.89 ns         4.36 ns
BM_std_minmax<unsigned short>/21              9.77 ns         5.10 ns
BM_std_minmax<unsigned short>/22              9.70 ns         5.55 ns
BM_std_minmax<unsigned short>/23              10.8 ns         6.29 ns
BM_std_minmax<unsigned short>/24              10.6 ns         2.41 ns
BM_std_minmax<unsigned short>/25              11.6 ns         2.75 ns
BM_std_minmax<unsigned short>/26              11.4 ns         3.26 ns
BM_std_minmax<unsigned short>/27              12.4 ns         3.86 ns
BM_std_minmax<unsigned short>/28              12.3 ns         4.45 ns
BM_std_minmax<unsigned short>/29              13.2 ns         5.07 ns
BM_std_minmax<unsigned short>/30              13.1 ns         5.77 ns
BM_std_minmax<unsigned short>/31              13.9 ns         6.65 ns
BM_std_minmax<unsigned short>/32              13.9 ns         2.72 ns
BM_std_minmax<unsigned short>/64              27.8 ns         3.25 ns
BM_std_minmax<unsigned short>/512              220 ns         8.30 ns
BM_std_minmax<unsigned short>/1024             435 ns         14.1 ns
BM_std_minmax<unsigned short>/4000            1703 ns         49.8 ns
BM_std_minmax<unsigned short>/4096            1746 ns         47.9 ns
BM_std_minmax<unsigned short>/5500            2350 ns         69.9 ns
BM_std_minmax<unsigned short>/64000          27388 ns          953 ns
BM_std_minmax<unsigned short>/65536          28040 ns          975 ns
BM_std_minmax<unsigned short>/70000          29967 ns         1040 ns
BM_std_minmax<unsigned int>/1                0.712 ns         1.18 ns
BM_std_minmax<unsigned int>/2                0.965 ns         1.65 ns
BM_std_minmax<unsigned int>/3                 2.13 ns         2.14 ns
BM_std_minmax<unsigned int>/4                 2.09 ns         2.64 ns
BM_std_minmax<unsigned int>/5                 3.02 ns         3.21 ns
BM_std_minmax<unsigned int>/6                 2.94 ns         3.81 ns
BM_std_minmax<unsigned int>/7                 3.91 ns         4.38 ns
BM_std_minmax<unsigned int>/8                 3.75 ns         4.93 ns
BM_std_minmax<unsigned int>/9                 4.71 ns         5.60 ns
BM_std_minmax<unsigned int>/10                4.59 ns         6.26 ns
BM_std_minmax<unsigned int>/11                5.57 ns         6.80 ns
BM_std_minmax<unsigned int>/12                5.43 ns         7.47 ns
BM_std_minmax<unsigned int>/13                6.45 ns         8.10 ns
BM_std_minmax<unsigned int>/14                6.32 ns         8.69 ns
BM_std_minmax<unsigned int>/15                7.29 ns         9.37 ns
BM_std_minmax<unsigned int>/16                7.12 ns         9.99 ns
BM_std_minmax<unsigned int>/17                8.24 ns         10.6 ns
BM_std_minmax<unsigned int>/18                8.00 ns         11.2 ns
BM_std_minmax<unsigned int>/19                8.94 ns         12.0 ns
BM_std_minmax<unsigned int>/20                8.91 ns         12.6 ns
BM_std_minmax<unsigned int>/21                9.73 ns         17.2 ns
BM_std_minmax<unsigned int>/22                9.75 ns         13.8 ns
BM_std_minmax<unsigned int>/23                10.6 ns         14.5 ns
BM_std_minmax<unsigned int>/24                10.6 ns         15.1 ns
BM_std_minmax<unsigned int>/25                11.5 ns         15.7 ns
BM_std_minmax<unsigned int>/26                11.4 ns         16.3 ns
BM_std_minmax<unsigned int>/27                12.3 ns         17.0 ns
BM_std_minmax<unsigned int>/28                12.3 ns         17.6 ns
BM_std_minmax<unsigned int>/29                13.2 ns         18.3 ns
BM_std_minmax<unsigned int>/30                13.2 ns         19.0 ns
BM_std_minmax<unsigned int>/31                14.0 ns         19.6 ns
BM_std_minmax<unsigned int>/32                14.0 ns         3.39 ns
BM_std_minmax<unsigned int>/64                27.6 ns         4.05 ns
BM_std_minmax<unsigned int>/512                221 ns         14.2 ns
BM_std_minmax<unsigned int>/1024               439 ns         25.5 ns
BM_std_minmax<unsigned int>/4000              1720 ns         96.3 ns
BM_std_minmax<unsigned int>/4096              1762 ns         97.8 ns
BM_std_minmax<unsigned int>/5500              2364 ns          146 ns
BM_std_minmax<unsigned int>/64000            27874 ns         1905 ns
BM_std_minmax<unsigned int>/65536            28012 ns         1961 ns
BM_std_minmax<unsigned int>/70000            29899 ns         2087 ns
BM_std_minmax<unsigned long long>/1          0.707 ns         1.18 ns
BM_std_minmax<unsigned long long>/2          0.909 ns         1.65 ns
BM_std_minmax<unsigned long long>/3           1.65 ns         2.70 ns
BM_std_minmax<unsigned long long>/4           1.93 ns         2.69 ns
BM_std_minmax<unsigned long long>/5           2.45 ns         3.34 ns
BM_std_minmax<unsigned long long>/6           2.78 ns         3.81 ns
BM_std_minmax<unsigned long long>/7           3.28 ns         4.43 ns
BM_std_minmax<unsigned long long>/8           3.70 ns         4.92 ns
BM_std_minmax<unsigned long long>/9           4.12 ns         5.64 ns
BM_std_minmax<unsigned long long>/10          4.44 ns         6.15 ns
BM_std_minmax<unsigned long long>/11          4.91 ns         6.81 ns
BM_std_minmax<unsigned long long>/12          5.31 ns         7.41 ns
BM_std_minmax<unsigned long long>/13          5.72 ns         7.96 ns
BM_std_minmax<unsigned long long>/14          6.05 ns         8.66 ns
BM_std_minmax<unsigned long long>/15          6.55 ns         9.37 ns
BM_std_minmax<unsigned long long>/16          6.89 ns         7.98 ns
BM_std_minmax<unsigned long long>/17          7.34 ns         8.13 ns
BM_std_minmax<unsigned long long>/18          7.73 ns         8.42 ns
BM_std_minmax<unsigned long long>/19          8.26 ns         8.63 ns
BM_std_minmax<unsigned long long>/20          8.54 ns         8.96 ns
BM_std_minmax<unsigned long long>/21          9.14 ns         9.37 ns
BM_std_minmax<unsigned long long>/22          9.39 ns         9.67 ns
BM_std_minmax<unsigned long long>/23          10.1 ns         10.1 ns
BM_std_minmax<unsigned long long>/24          10.4 ns         10.6 ns
BM_std_minmax<unsigned long long>/25          11.0 ns         11.3 ns
BM_std_minmax<unsigned long long>/26          11.3 ns         12.1 ns
BM_std_minmax<unsigned long long>/27          11.8 ns         14.2 ns
BM_std_minmax<unsigned long long>/28          12.1 ns         15.8 ns
BM_std_minmax<unsigned long long>/29          12.6 ns         17.4 ns
BM_std_minmax<unsigned long long>/30          13.1 ns         18.1 ns
BM_std_minmax<unsigned long long>/31          13.4 ns         18.8 ns
BM_std_minmax<unsigned long long>/32          13.8 ns         10.4 ns
BM_std_minmax<unsigned long long>/64          27.3 ns         15.5 ns
BM_std_minmax<unsigned long long>/512          222 ns         80.6 ns
BM_std_minmax<unsigned long long>/1024         443 ns          156 ns
BM_std_minmax<unsigned long long>/4000        1731 ns          591 ns
BM_std_minmax<unsigned long long>/4096        1752 ns          609 ns
BM_std_minmax<unsigned long long>/5500        2340 ns          819 ns
BM_std_minmax<unsigned long long>/64000      27166 ns         9652 ns
BM_std_minmax<unsigned long long>/65536      27869 ns         9876 ns
BM_std_minmax<unsigned long long>/70000      29920 ns        10680 ns

@philnik777 philnik777 marked this pull request as ready for review April 2, 2024 17:34
@philnik777 philnik777 requested a review from a team as a code owner April 2, 2024 17:34
@llvmbot llvmbot added the "libc++" label (libc++ C++ Standard Library. Not GNU libstdc++. Not libc++abi.) Apr 4, 2024
@llvmbot
Copy link
Member

llvmbot commented Apr 4, 2024

@llvm/pr-subscribers-libcxx

Author: Nikolas Klauser (philnik777)

Changes

This allows Clang to vectorize the loop.

---------------------------------------------------------------------
Benchmark                                         old             new
---------------------------------------------------------------------
BM_std_minmax<char>/1                        0.659 ns         1.41 ns
BM_std_minmax<char>/2                         1.08 ns         2.16 ns
BM_std_minmax<char>/3                         2.16 ns         2.96 ns
BM_std_minmax<char>/4                         2.82 ns         3.81 ns
BM_std_minmax<char>/5                         3.43 ns         4.69 ns
BM_std_minmax<char>/6                         4.08 ns         5.63 ns
BM_std_minmax<char>/7                         4.75 ns         6.51 ns
BM_std_minmax<char>/8                         5.42 ns         7.41 ns
BM_std_minmax<char>/9                         6.05 ns         8.34 ns
BM_std_minmax<char>/10                        6.68 ns         9.29 ns
BM_std_minmax<char>/11                        7.47 ns         10.6 ns
BM_std_minmax<char>/12                        7.95 ns         11.4 ns
BM_std_minmax<char>/13                        8.64 ns         12.4 ns
BM_std_minmax<char>/14                        9.35 ns         13.4 ns
BM_std_minmax<char>/15                        10.1 ns         14.4 ns
BM_std_minmax<char>/16                        10.6 ns         2.25 ns
BM_std_minmax<char>/17                        11.3 ns         2.82 ns
BM_std_minmax<char>/18                        11.8 ns         3.71 ns
BM_std_minmax<char>/19                        12.6 ns         4.52 ns
BM_std_minmax<char>/20                        13.2 ns         5.47 ns
BM_std_minmax<char>/21                        14.1 ns         6.67 ns
BM_std_minmax<char>/22                        14.5 ns         7.78 ns
BM_std_minmax<char>/23                        15.1 ns         8.67 ns
BM_std_minmax<char>/24                        15.7 ns         9.68 ns
BM_std_minmax<char>/25                        16.4 ns         10.7 ns
BM_std_minmax<char>/26                        17.1 ns         11.7 ns
BM_std_minmax<char>/27                        17.8 ns         12.8 ns
BM_std_minmax<char>/28                        18.4 ns         14.1 ns
BM_std_minmax<char>/29                        19.0 ns         15.0 ns
BM_std_minmax<char>/30                        19.6 ns         16.0 ns
BM_std_minmax<char>/31                        20.2 ns         17.0 ns
BM_std_minmax<char>/32                        20.8 ns         2.46 ns
BM_std_minmax<char>/64                        41.5 ns         2.97 ns
BM_std_minmax<char>/512                        340 ns         6.05 ns
BM_std_minmax<char>/1024                       667 ns         8.83 ns
BM_std_minmax<char>/4000                      2571 ns         28.6 ns
BM_std_minmax<char>/4096                      2632 ns         25.8 ns
BM_std_minmax<char>/5500                      3554 ns         51.1 ns
BM_std_minmax<char>/64000                    41175 ns          480 ns
BM_std_minmax<char>/65536                    42039 ns          490 ns
BM_std_minmax<char>/70000                    44931 ns          528 ns
BM_std_minmax<short>/1                       0.708 ns         1.20 ns
BM_std_minmax<short>/2                        1.18 ns         1.78 ns
BM_std_minmax<short>/3                        1.98 ns         2.42 ns
BM_std_minmax<short>/4                        2.47 ns         3.05 ns
BM_std_minmax<short>/5                        3.09 ns         3.72 ns
BM_std_minmax<short>/6                        3.49 ns         4.37 ns
BM_std_minmax<short>/7                        4.24 ns         5.03 ns
BM_std_minmax<short>/8                        4.65 ns         2.12 ns
BM_std_minmax<short>/9                        5.34 ns         2.51 ns
BM_std_minmax<short>/10                       5.82 ns         3.18 ns
BM_std_minmax<short>/11                       6.36 ns         3.97 ns
BM_std_minmax<short>/12                       6.73 ns         4.68 ns
BM_std_minmax<short>/13                       7.59 ns         5.49 ns
BM_std_minmax<short>/14                       7.77 ns         6.45 ns
BM_std_minmax<short>/15                       8.54 ns         7.55 ns
BM_std_minmax<short>/16                       8.74 ns         2.38 ns
BM_std_minmax<short>/17                       9.59 ns         2.76 ns
BM_std_minmax<short>/18                       9.88 ns         3.37 ns
BM_std_minmax<short>/19                       10.7 ns         4.17 ns
BM_std_minmax<short>/20                       10.9 ns         4.88 ns
BM_std_minmax<short>/21                       12.1 ns         5.70 ns
BM_std_minmax<short>/22                       12.6 ns         6.64 ns
BM_std_minmax<short>/23                       13.5 ns         7.72 ns
BM_std_minmax<short>/24                       13.2 ns         2.87 ns
BM_std_minmax<short>/25                       14.2 ns         3.10 ns
BM_std_minmax<short>/26                       14.2 ns         3.59 ns
BM_std_minmax<short>/27                       15.4 ns         4.35 ns
BM_std_minmax<short>/28                       15.3 ns         5.10 ns
BM_std_minmax<short>/29                       16.2 ns         5.87 ns
BM_std_minmax<short>/30                       16.2 ns         6.88 ns
BM_std_minmax<short>/31                       17.0 ns         7.78 ns
BM_std_minmax<short>/32                       17.2 ns         3.45 ns
BM_std_minmax<short>/64                       34.1 ns         3.35 ns
BM_std_minmax<short>/512                       279 ns         8.37 ns
BM_std_minmax<short>/1024                      549 ns         14.2 ns
BM_std_minmax<short>/4000                     2111 ns         50.1 ns
BM_std_minmax<short>/4096                     2167 ns         47.9 ns
BM_std_minmax<short>/5500                     2895 ns         69.7 ns
BM_std_minmax<short>/64000                   33454 ns          953 ns
BM_std_minmax<short>/65536                   34474 ns          970 ns
BM_std_minmax<short>/70000                   36691 ns         1037 ns
BM_std_minmax<int>/1                         0.664 ns         1.17 ns
BM_std_minmax<int>/2                          1.11 ns         1.69 ns
BM_std_minmax<int>/3                          2.36 ns         2.29 ns
BM_std_minmax<int>/4                          2.53 ns         2.91 ns
BM_std_minmax<int>/5                          3.23 ns         3.56 ns
BM_std_minmax<int>/6                          3.56 ns         4.23 ns
BM_std_minmax<int>/7                          4.28 ns         4.91 ns
BM_std_minmax<int>/8                          4.60 ns         5.60 ns
BM_std_minmax<int>/9                          5.38 ns         6.31 ns
BM_std_minmax<int>/10                         5.69 ns         7.03 ns
BM_std_minmax<int>/11                         6.41 ns         7.70 ns
BM_std_minmax<int>/12                         6.73 ns         8.39 ns
BM_std_minmax<int>/13                         7.38 ns         9.07 ns
BM_std_minmax<int>/14                         7.74 ns         9.79 ns
BM_std_minmax<int>/15                         8.53 ns         10.5 ns
BM_std_minmax<int>/16                         8.79 ns         11.2 ns
BM_std_minmax<int>/17                         9.63 ns         12.0 ns
BM_std_minmax<int>/18                         9.84 ns         12.7 ns
BM_std_minmax<int>/19                         10.6 ns         13.5 ns
BM_std_minmax<int>/20                         11.0 ns         14.3 ns
BM_std_minmax<int>/21                         11.7 ns         15.0 ns
BM_std_minmax<int>/22                         12.0 ns         15.7 ns
BM_std_minmax<int>/23                         13.1 ns         16.5 ns
BM_std_minmax<int>/24                         13.0 ns         17.3 ns
BM_std_minmax<int>/25                         13.7 ns         17.9 ns
BM_std_minmax<int>/26                         14.0 ns         18.6 ns
BM_std_minmax<int>/27                         14.8 ns         19.4 ns
BM_std_minmax<int>/28                         15.1 ns         20.3 ns
BM_std_minmax<int>/29                         15.8 ns         20.9 ns
BM_std_minmax<int>/30                         16.1 ns         21.7 ns
BM_std_minmax<int>/31                         16.9 ns         22.5 ns
BM_std_minmax<int>/32                         17.2 ns         3.40 ns
BM_std_minmax<int>/64                         33.9 ns         4.04 ns
BM_std_minmax<int>/512                         275 ns         14.6 ns
BM_std_minmax<int>/1024                        541 ns         27.5 ns
BM_std_minmax<int>/4000                       2093 ns         96.3 ns
BM_std_minmax<int>/4096                       2146 ns         98.3 ns
BM_std_minmax<int>/5500                       2866 ns          157 ns
BM_std_minmax<int>/64000                     33619 ns         1954 ns
BM_std_minmax<int>/65536                     34252 ns         2009 ns
BM_std_minmax<int>/70000                     36618 ns         2125 ns
BM_std_minmax<long long>/1                   0.709 ns         1.19 ns
BM_std_minmax<long long>/2                    1.01 ns         1.65 ns
BM_std_minmax<long long>/3                    2.14 ns         2.21 ns
BM_std_minmax<long long>/4                    2.45 ns         2.83 ns
BM_std_minmax<long long>/5                    3.09 ns         3.47 ns
BM_std_minmax<long long>/6                    3.44 ns         4.11 ns
BM_std_minmax<long long>/7                    4.16 ns         4.79 ns
BM_std_minmax<long long>/8                    4.54 ns         5.47 ns
BM_std_minmax<long long>/9                    5.37 ns         6.20 ns
BM_std_minmax<long long>/10                   5.71 ns         6.93 ns
BM_std_minmax<long long>/11                   6.00 ns         7.60 ns
BM_std_minmax<long long>/12                   6.43 ns         8.27 ns
BM_std_minmax<long long>/13                   7.01 ns         8.94 ns
BM_std_minmax<long long>/14                   7.45 ns         9.65 ns
BM_std_minmax<long long>/15                   8.16 ns         10.4 ns
BM_std_minmax<long long>/16                   8.46 ns         5.22 ns
BM_std_minmax<long long>/17                   9.16 ns         5.22 ns
BM_std_minmax<long long>/18                   9.53 ns         5.52 ns
BM_std_minmax<long long>/19                   10.2 ns         6.02 ns
BM_std_minmax<long long>/20                   10.5 ns         6.89 ns
BM_std_minmax<long long>/21                   11.3 ns         7.83 ns
BM_std_minmax<long long>/22                   11.6 ns         8.59 ns
BM_std_minmax<long long>/23                   12.3 ns         9.91 ns
BM_std_minmax<long long>/24                   12.6 ns         10.1 ns
BM_std_minmax<long long>/25                   13.2 ns         12.0 ns
BM_std_minmax<long long>/26                   13.6 ns         13.5 ns
BM_std_minmax<long long>/27                   14.2 ns         14.8 ns
BM_std_minmax<long long>/28                   14.7 ns         15.9 ns
BM_std_minmax<long long>/29                   15.3 ns         16.6 ns
BM_std_minmax<long long>/30                   15.8 ns         17.3 ns
BM_std_minmax<long long>/31                   16.3 ns         18.2 ns
BM_std_minmax<long long>/32                   16.7 ns         7.18 ns
BM_std_minmax<long long>/64                   33.1 ns         11.5 ns
BM_std_minmax<long long>/512                   268 ns         71.0 ns
BM_std_minmax<long long>/1024                  532 ns          138 ns
BM_std_minmax<long long>/4000                 2056 ns          533 ns
BM_std_minmax<long long>/4096                 2112 ns          539 ns
BM_std_minmax<long long>/5500                 2823 ns          749 ns
BM_std_minmax<long long>/64000               32956 ns         8590 ns
BM_std_minmax<long long>/65536               33795 ns         8791 ns
BM_std_minmax<long long>/70000               36084 ns         9442 ns
BM_std_minmax<unsigned char>/1               0.714 ns         1.41 ns
BM_std_minmax<unsigned char>/2               0.955 ns         1.96 ns
BM_std_minmax<unsigned char>/3                1.90 ns         2.63 ns
BM_std_minmax<unsigned char>/4                2.40 ns         3.34 ns
BM_std_minmax<unsigned char>/5                2.87 ns         4.10 ns
BM_std_minmax<unsigned char>/6                3.47 ns         4.88 ns
BM_std_minmax<unsigned char>/7                4.04 ns         5.66 ns
BM_std_minmax<unsigned char>/8                4.65 ns         6.45 ns
BM_std_minmax<unsigned char>/9                5.18 ns         7.24 ns
BM_std_minmax<unsigned char>/10               5.80 ns         8.05 ns
BM_std_minmax<unsigned char>/11               6.24 ns         8.86 ns
BM_std_minmax<unsigned char>/12               6.78 ns         9.70 ns
BM_std_minmax<unsigned char>/13               7.30 ns         10.6 ns
BM_std_minmax<unsigned char>/14               7.86 ns         11.4 ns
BM_std_minmax<unsigned char>/15               8.46 ns         12.3 ns
BM_std_minmax<unsigned char>/16               9.00 ns         2.12 ns
BM_std_minmax<unsigned char>/17               9.58 ns         2.83 ns
BM_std_minmax<unsigned char>/18               10.1 ns         3.37 ns
BM_std_minmax<unsigned char>/19               10.7 ns         4.11 ns
BM_std_minmax<unsigned char>/20               11.2 ns         4.85 ns
BM_std_minmax<unsigned char>/21               11.9 ns         5.69 ns
BM_std_minmax<unsigned char>/22               12.3 ns         6.77 ns
BM_std_minmax<unsigned char>/23               13.1 ns         7.56 ns
BM_std_minmax<unsigned char>/24               13.5 ns         8.40 ns
BM_std_minmax<unsigned char>/25               14.2 ns         9.30 ns
BM_std_minmax<unsigned char>/26               14.4 ns         10.1 ns
BM_std_minmax<unsigned char>/27               15.0 ns         11.1 ns
BM_std_minmax<unsigned char>/28               15.3 ns         11.9 ns
BM_std_minmax<unsigned char>/29               16.2 ns         12.9 ns
BM_std_minmax<unsigned char>/30               16.5 ns         13.9 ns
BM_std_minmax<unsigned char>/31               17.2 ns         14.8 ns
BM_std_minmax<unsigned char>/32               17.6 ns         2.36 ns
BM_std_minmax<unsigned char>/64               35.6 ns         3.21 ns
BM_std_minmax<unsigned char>/512               288 ns         6.00 ns
BM_std_minmax<unsigned char>/1024              573 ns         8.80 ns
BM_std_minmax<unsigned char>/4000             2222 ns         28.6 ns
BM_std_minmax<unsigned char>/4096             2265 ns         25.9 ns
BM_std_minmax<unsigned char>/5500             3047 ns         48.8 ns
BM_std_minmax<unsigned char>/64000           35059 ns          480 ns
BM_std_minmax<unsigned char>/65536           35941 ns          491 ns
BM_std_minmax<unsigned char>/70000           38922 ns          525 ns
BM_std_minmax<unsigned short>/1              0.711 ns         1.18 ns
BM_std_minmax<unsigned short>/2              0.957 ns         1.65 ns
BM_std_minmax<unsigned short>/3               2.13 ns         2.21 ns
BM_std_minmax<unsigned short>/4               2.14 ns         2.78 ns
BM_std_minmax<unsigned short>/5               3.06 ns         3.29 ns
BM_std_minmax<unsigned short>/6               2.89 ns         3.87 ns
BM_std_minmax<unsigned short>/7               3.80 ns         4.55 ns
BM_std_minmax<unsigned short>/8               3.68 ns         2.02 ns
BM_std_minmax<unsigned short>/9               4.53 ns         2.40 ns
BM_std_minmax<unsigned short>/10              4.60 ns         2.94 ns
BM_std_minmax<unsigned short>/11              5.67 ns         3.67 ns
BM_std_minmax<unsigned short>/12              5.39 ns         4.22 ns
BM_std_minmax<unsigned short>/13              6.58 ns         4.78 ns
BM_std_minmax<unsigned short>/14              6.33 ns         5.54 ns
BM_std_minmax<unsigned short>/15              7.34 ns         6.30 ns
BM_std_minmax<unsigned short>/16              7.17 ns         2.25 ns
BM_std_minmax<unsigned short>/17              8.19 ns         2.61 ns
BM_std_minmax<unsigned short>/18              8.02 ns         3.19 ns
BM_std_minmax<unsigned short>/19              9.03 ns         3.72 ns
BM_std_minmax<unsigned short>/20              8.89 ns         4.36 ns
BM_std_minmax<unsigned short>/21              9.77 ns         5.10 ns
BM_std_minmax<unsigned short>/22              9.70 ns         5.55 ns
BM_std_minmax<unsigned short>/23              10.8 ns         6.29 ns
BM_std_minmax<unsigned short>/24              10.6 ns         2.41 ns
BM_std_minmax<unsigned short>/25              11.6 ns         2.75 ns
BM_std_minmax<unsigned short>/26              11.4 ns         3.26 ns
BM_std_minmax<unsigned short>/27              12.4 ns         3.86 ns
BM_std_minmax<unsigned short>/28              12.3 ns         4.45 ns
BM_std_minmax<unsigned short>/29              13.2 ns         5.07 ns
BM_std_minmax<unsigned short>/30              13.1 ns         5.77 ns
BM_std_minmax<unsigned short>/31              13.9 ns         6.65 ns
BM_std_minmax<unsigned short>/32              13.9 ns         2.72 ns
BM_std_minmax<unsigned short>/64              27.8 ns         3.25 ns
BM_std_minmax<unsigned short>/512              220 ns         8.30 ns
BM_std_minmax<unsigned short>/1024             435 ns         14.1 ns
BM_std_minmax<unsigned short>/4000            1703 ns         49.8 ns
BM_std_minmax<unsigned short>/4096            1746 ns         47.9 ns
BM_std_minmax<unsigned short>/5500            2350 ns         69.9 ns
BM_std_minmax<unsigned short>/64000          27388 ns          953 ns
BM_std_minmax<unsigned short>/65536          28040 ns          975 ns
BM_std_minmax<unsigned short>/70000          29967 ns         1040 ns
BM_std_minmax<unsigned int>/1                0.712 ns         1.18 ns
BM_std_minmax<unsigned int>/2                0.965 ns         1.65 ns
BM_std_minmax<unsigned int>/3                 2.13 ns         2.14 ns
BM_std_minmax<unsigned int>/4                 2.09 ns         2.64 ns
BM_std_minmax<unsigned int>/5                 3.02 ns         3.21 ns
BM_std_minmax<unsigned int>/6                 2.94 ns         3.81 ns
BM_std_minmax<unsigned int>/7                 3.91 ns         4.38 ns
BM_std_minmax<unsigned int>/8                 3.75 ns         4.93 ns
BM_std_minmax<unsigned int>/9                 4.71 ns         5.60 ns
BM_std_minmax<unsigned int>/10                4.59 ns         6.26 ns
BM_std_minmax<unsigned int>/11                5.57 ns         6.80 ns
BM_std_minmax<unsigned int>/12                5.43 ns         7.47 ns
BM_std_minmax<unsigned int>/13                6.45 ns         8.10 ns
BM_std_minmax<unsigned int>/14                6.32 ns         8.69 ns
BM_std_minmax<unsigned int>/15                7.29 ns         9.37 ns
BM_std_minmax<unsigned int>/16                7.12 ns         9.99 ns
BM_std_minmax<unsigned int>/17                8.24 ns         10.6 ns
BM_std_minmax<unsigned int>/18                8.00 ns         11.2 ns
BM_std_minmax<unsigned int>/19                8.94 ns         12.0 ns
BM_std_minmax<unsigned int>/20                8.91 ns         12.6 ns
BM_std_minmax<unsigned int>/21                9.73 ns         17.2 ns
BM_std_minmax<unsigned int>/22                9.75 ns         13.8 ns
BM_std_minmax<unsigned int>/23                10.6 ns         14.5 ns
BM_std_minmax<unsigned int>/24                10.6 ns         15.1 ns
BM_std_minmax<unsigned int>/25                11.5 ns         15.7 ns
BM_std_minmax<unsigned int>/26                11.4 ns         16.3 ns
BM_std_minmax<unsigned int>/27                12.3 ns         17.0 ns
BM_std_minmax<unsigned int>/28                12.3 ns         17.6 ns
BM_std_minmax<unsigned int>/29                13.2 ns         18.3 ns
BM_std_minmax<unsigned int>/30                13.2 ns         19.0 ns
BM_std_minmax<unsigned int>/31                14.0 ns         19.6 ns
BM_std_minmax<unsigned int>/32                14.0 ns         3.39 ns
BM_std_minmax<unsigned int>/64                27.6 ns         4.05 ns
BM_std_minmax<unsigned int>/512                221 ns         14.2 ns
BM_std_minmax<unsigned int>/1024               439 ns         25.5 ns
BM_std_minmax<unsigned int>/4000              1720 ns         96.3 ns
BM_std_minmax<unsigned int>/4096              1762 ns         97.8 ns
BM_std_minmax<unsigned int>/5500              2364 ns          146 ns
BM_std_minmax<unsigned int>/64000            27874 ns         1905 ns
BM_std_minmax<unsigned int>/65536            28012 ns         1961 ns
BM_std_minmax<unsigned int>/70000            29899 ns         2087 ns
BM_std_minmax<unsigned long long>/1          0.707 ns         1.18 ns
BM_std_minmax<unsigned long long>/2          0.909 ns         1.65 ns
BM_std_minmax<unsigned long long>/3           1.65 ns         2.70 ns
BM_std_minmax<unsigned long long>/4           1.93 ns         2.69 ns
BM_std_minmax<unsigned long long>/5           2.45 ns         3.34 ns
BM_std_minmax<unsigned long long>/6           2.78 ns         3.81 ns
BM_std_minmax<unsigned long long>/7           3.28 ns         4.43 ns
BM_std_minmax<unsigned long long>/8           3.70 ns         4.92 ns
BM_std_minmax<unsigned long long>/9           4.12 ns         5.64 ns
BM_std_minmax<unsigned long long>/10          4.44 ns         6.15 ns
BM_std_minmax<unsigned long long>/11          4.91 ns         6.81 ns
BM_std_minmax<unsigned long long>/12          5.31 ns         7.41 ns
BM_std_minmax<unsigned long long>/13          5.72 ns         7.96 ns
BM_std_minmax<unsigned long long>/14          6.05 ns         8.66 ns
BM_std_minmax<unsigned long long>/15          6.55 ns         9.37 ns
BM_std_minmax<unsigned long long>/16          6.89 ns         7.98 ns
BM_std_minmax<unsigned long long>/17          7.34 ns         8.13 ns
BM_std_minmax<unsigned long long>/18          7.73 ns         8.42 ns
BM_std_minmax<unsigned long long>/19          8.26 ns         8.63 ns
BM_std_minmax<unsigned long long>/20          8.54 ns         8.96 ns
BM_std_minmax<unsigned long long>/21          9.14 ns         9.37 ns
BM_std_minmax<unsigned long long>/22          9.39 ns         9.67 ns
BM_std_minmax<unsigned long long>/23          10.1 ns         10.1 ns
BM_std_minmax<unsigned long long>/24          10.4 ns         10.6 ns
BM_std_minmax<unsigned long long>/25          11.0 ns         11.3 ns
BM_std_minmax<unsigned long long>/26          11.3 ns         12.1 ns
BM_std_minmax<unsigned long long>/27          11.8 ns         14.2 ns
BM_std_minmax<unsigned long long>/28          12.1 ns         15.8 ns
BM_std_minmax<unsigned long long>/29          12.6 ns         17.4 ns
BM_std_minmax<unsigned long long>/30          13.1 ns         18.1 ns
BM_std_minmax<unsigned long long>/31          13.4 ns         18.8 ns
BM_std_minmax<unsigned long long>/32          13.8 ns         10.4 ns
BM_std_minmax<unsigned long long>/64          27.3 ns         15.5 ns
BM_std_minmax<unsigned long long>/512          222 ns         80.6 ns
BM_std_minmax<unsigned long long>/1024         443 ns          156 ns
BM_std_minmax<unsigned long long>/4000        1731 ns          591 ns
BM_std_minmax<unsigned long long>/4096        1752 ns          609 ns
BM_std_minmax<unsigned long long>/5500        2340 ns          819 ns
BM_std_minmax<unsigned long long>/64000      27166 ns         9652 ns
BM_std_minmax<unsigned long long>/65536      27869 ns         9876 ns
BM_std_minmax<unsigned long long>/70000      29920 ns        10680 ns

Full diff: https://github.com/llvm/llvm-project/pull/87335.diff

8 Files Affected:

  • (modified) libcxx/benchmarks/CMakeLists.txt (+1)
  • (added) libcxx/benchmarks/algorithms/minmax.bench.cpp (+68)
  • (modified) libcxx/docs/ReleaseNotes/19.rst (+2)
  • (modified) libcxx/include/__algorithm/comp.h (+3)
  • (modified) libcxx/include/__algorithm/ranges_minmax.h (+16-1)
  • (modified) libcxx/include/__functional/operations.h (+6)
  • (modified) libcxx/include/__functional/ranges_operations.h (+3)
  • (modified) libcxx/include/__type_traits/desugars_to.h (+1)
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 387e013afeb6c4..928238c1ac69ba 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -182,6 +182,7 @@ set(BENCHMARK_TESTS
     algorithms/make_heap.bench.cpp
     algorithms/make_heap_then_sort_heap.bench.cpp
     algorithms/min.bench.cpp
+    algorithms/minmax.bench.cpp
     algorithms/min_max_element.bench.cpp
     algorithms/mismatch.bench.cpp
     algorithms/pop_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/minmax.bench.cpp b/libcxx/benchmarks/algorithms/minmax.bench.cpp
new file mode 100644
index 00000000000000..b0ff7f91c19939
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/minmax.bench.cpp
@@ -0,0 +1,68 @@
+#include <algorithm>
+#include <cassert>
+
+#include <benchmark/benchmark.h>
+
+void run_sizes(auto benchmark) {
+  benchmark->Arg(1)
+      ->Arg(2)
+      ->Arg(3)
+      ->Arg(4)
+      ->Arg(5)
+      ->Arg(6)
+      ->Arg(7)
+      ->Arg(8)
+      ->Arg(9)
+      ->Arg(10)
+      ->Arg(11)
+      ->Arg(12)
+      ->Arg(13)
+      ->Arg(14)
+      ->Arg(15)
+      ->Arg(16)
+      ->Arg(17)
+      ->Arg(18)
+      ->Arg(19)
+      ->Arg(20)
+      ->Arg(21)
+      ->Arg(22)
+      ->Arg(23)
+      ->Arg(24)
+      ->Arg(25)
+      ->Arg(26)
+      ->Arg(27)
+      ->Arg(28)
+      ->Arg(29)
+      ->Arg(30)
+      ->Arg(31)
+      ->Arg(32)
+      ->Arg(64)
+      ->Arg(512)
+      ->Arg(1024)
+      ->Arg(4000)
+      ->Arg(4096)
+      ->Arg(5500)
+      ->Arg(64000)
+      ->Arg(65536)
+      ->Arg(70000);
+}
+
+template <class T>
+static void BM_std_minmax(benchmark::State& state) {
+  std::vector<T> vec(state.range(), 3);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec);
+    benchmark::DoNotOptimize(std::ranges::minmax(vec));
+  }
+}
+BENCHMARK(BM_std_minmax<char>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<short>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<int>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<long long>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned char>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned short>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned int>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned long long>)->Apply(run_sizes);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 2da9df54a53198..a420b599cd597e 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -54,6 +54,8 @@ Improvements and New Features
   resulting in a performance increase of up to 1400x.
 - The ``std::mismatch`` algorithm has been optimized for integral types, which can lead up to 40x performance
   improvements.
+- The ``std::ranges::minmax`` algorithm has been optimized for integral types, resulting in a performance increase of
+  up to 100x.
 
 - The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM`` macro has been added to make the declarations in ``<strstream>`` available.
 
diff --git a/libcxx/include/__algorithm/comp.h b/libcxx/include/__algorithm/comp.h
index a089375e3da139..a0fa88d6d2acd3 100644
--- a/libcxx/include/__algorithm/comp.h
+++ b/libcxx/include/__algorithm/comp.h
@@ -41,6 +41,9 @@ struct __less<void, void> {
   }
 };
 
+template <class _Tp>
+inline const bool __desugars_to_v<__less_tag, __less<>, _Tp, _Tp> = true;
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___ALGORITHM_COMP_H
diff --git a/libcxx/include/__algorithm/ranges_minmax.h b/libcxx/include/__algorithm/ranges_minmax.h
index 22a62b620c936f..049263b615fc0e 100644
--- a/libcxx/include/__algorithm/ranges_minmax.h
+++ b/libcxx/include/__algorithm/ranges_minmax.h
@@ -24,6 +24,8 @@
 #include <__ranges/access.h>
 #include <__ranges/concepts.h>
 #include <__type_traits/is_reference.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/operation_traits.h>
 #include <__type_traits/remove_cvref.h>
 #include <__utility/forward.h>
 #include <__utility/move.h>
@@ -83,7 +85,20 @@ struct __fn {
 
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__first != __last, "range has to contain at least one element");
 
-    if constexpr (forward_range<_Range>) {
+    // This optimization is not in minmax_element because clang doesn't see through the pointers and as a result doesn't
+    // vectorize the code.
+    if constexpr (contiguous_range<_Range> && is_integral_v<_ValueT> &&
+                  __is_cheap_to_copy<_ValueT> & __is_identity<_Proj>::value &&
+                  __desugars_to_v<__less_tag, _Comp, _ValueT, _ValueT>) {
+      minmax_result<_ValueT> __result = {__r[0], __r[0]};
+      for (auto __e : __r) {
+        if (__e < __result.min)
+          __result.min = __e;
+        if (__result.max < __e)
+          __result.max = __e;
+      }
+      return __result;
+    } else if constexpr (forward_range<_Range>) {
       // Special-case the one element case. Avoid repeatedly initializing objects from the result of an iterator
       // dereference when doing so might not be idempotent. The `if constexpr` avoids the extra branch in cases where
       // it's not needed.
diff --git a/libcxx/include/__functional/operations.h b/libcxx/include/__functional/operations.h
index 9aa28e4925069c..240f127e542553 100644
--- a/libcxx/include/__functional/operations.h
+++ b/libcxx/include/__functional/operations.h
@@ -359,6 +359,9 @@ struct _LIBCPP_TEMPLATE_VIS less : __binary_function<_Tp, _Tp, bool> {
 };
 _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(less);
 
+template <class _Tp>
+inline const bool __desugars_to_v<__less_tag, less<_Tp>, _Tp, _Tp> = true;
+
 #if _LIBCPP_STD_VER >= 14
 template <>
 struct _LIBCPP_TEMPLATE_VIS less<void> {
@@ -370,6 +373,9 @@ struct _LIBCPP_TEMPLATE_VIS less<void> {
   }
   typedef void is_transparent;
 };
+
+template <class _Tp>
+inline const bool __desugars_to_v<__less_tag, less<>, _Tp, _Tp> = true;
 #endif
 
 #if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/__functional/ranges_operations.h b/libcxx/include/__functional/ranges_operations.h
index a9dffaf6962585..27f06eadd0eb11 100644
--- a/libcxx/include/__functional/ranges_operations.h
+++ b/libcxx/include/__functional/ranges_operations.h
@@ -99,6 +99,9 @@ struct greater_equal {
 template <class _Tp, class _Up>
 inline const bool __desugars_to_v<__equal_tag, ranges::equal_to, _Tp, _Up> = true;
 
+template <class _Tp, class _Up>
+inline const bool __desugars_to_v<__less_tag, ranges::less, _Tp, _Up> = true;
+
 #endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__type_traits/desugars_to.h b/libcxx/include/__type_traits/desugars_to.h
index a8f69c28dfc520..97a2ee5448f203 100644
--- a/libcxx/include/__type_traits/desugars_to.h
+++ b/libcxx/include/__type_traits/desugars_to.h
@@ -20,6 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // Tags to represent the canonical operations
 struct __equal_tag {};
 struct __plus_tag {};
+struct __less_tag {};
 
 // This class template is used to determine whether an operation "desugars"
 // (or boils down) to a given canonical operation.

Copy link
Member

@ldionne ldionne left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, thanks!

@philnik777 philnik777 merged commit 935e699 into llvm:main Apr 6, 2024
@philnik777 philnik777 deleted the optimize_minmax branch April 6, 2024 15:22
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
libc++ libc++ C++ Standard Library. Not GNU libstdc++. Not libc++abi.
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants