Skip to content

performance regression of binary_search #115271

Closed
@kekeimiku

Description

This affects my real project. [usize] seems to cause a huge performance regression.

After investigation, it is caused by the new binary_search introduced by this pr #74024 .

Why is the new binary_search not binary_search_unstable ?

macOS Pro M2

test new_binary_search_l1            ... bench:          38 ns/iter (+/- 0)
test new_binary_search_l1_with_dups  ... bench:          22 ns/iter (+/- 0)
test new_binary_search_l1_worst_case ... bench:           7 ns/iter (+/- 1)
test new_binary_search_l2            ... bench:          56 ns/iter (+/- 6)
test new_binary_search_l2_with_dups  ... bench:          35 ns/iter (+/- 0)
test new_binary_search_l2_worst_case ... bench:          11 ns/iter (+/- 0)
test new_binary_search_l3            ... bench:         104 ns/iter (+/- 1)
test new_binary_search_l3_with_dups  ... bench:          85 ns/iter (+/- 1)
test new_binary_search_l3_worst_case ... bench:          19 ns/iter (+/- 0)
test old_binary_search_l1            ... bench:           5 ns/iter (+/- 0)
test old_binary_search_l1_with_dups  ... bench:           5 ns/iter (+/- 0)
test old_binary_search_l1_worst_case ... bench:           4 ns/iter (+/- 0)
test old_binary_search_l2            ... bench:           8 ns/iter (+/- 0)
test old_binary_search_l2_with_dups  ... bench:           8 ns/iter (+/- 0)
test old_binary_search_l2_worst_case ... bench:           7 ns/iter (+/- 0)
test old_binary_search_l3            ... bench:          24 ns/iter (+/- 0)
test old_binary_search_l3_with_dups  ... bench:          25 ns/iter (+/- 6)
test old_binary_search_l3_worst_case ... bench:          12 ns/iter (+/- 0)
lib.rs

#![feature(core_intrinsics)]

use std::cmp::Ord;
use std::cmp::Ordering::{self, Equal, Greater, Less};

pub fn old_binary_search<T>(s: &[T], x: &T) -> Result<usize, usize>
where
    T: Ord,
{
    old_binary_search_by(s, |p| p.cmp(x))
}

#[inline]
pub fn old_binary_search_by<'a, T, F>(s: &'a [T], mut f: F) -> Result<usize, usize>
where
    F: FnMut(&'a T) -> Ordering,
{
    let mut size = s.len();
    if size == 0 {
        return Err(0);
    }
    let mut base = 0usize;
    while size > 1 {
        let half = size / 2;
        let mid = base + half;
        // mid is always in [0, size), that means mid is >= 0 and < size.
        // mid >= 0: by definition
        // mid < size: mid = size / 2 + size / 4 + size / 8 ...
        let cmp = f(unsafe { s.get_unchecked(mid) });
        base = if cmp == Greater { base } else { mid };
        size -= half;
    }
    // base is always in [0, size) because base <= mid.
    let cmp = f(unsafe { s.get_unchecked(base) });
    if cmp == Equal {
        Ok(base)
    } else {
        Err(base + (cmp == Less) as usize)
    }
}

pub fn new_binary_search<T>(s: &[T], x: &T) -> Result<usize, usize>
where
    T: Ord,
{
    new_binary_search_by(s, |p| p.cmp(x))
}

#[inline]
pub fn new_binary_search_by<'a, T, F>(s: &'a [T], mut f: F) -> Result<usize, usize>
where
    F: FnMut(&'a T) -> Ordering,
{
    // INVARIANTS:
    // - 0 <= left <= left + size = right <= self.len()
    // - f returns Less for everything in self[..left]
    // - f returns Greater for everything in self[right..]
    let mut size = s.len();
    let mut left = 0;
    let mut right = size;
    while left < right {
        let mid = left + size / 2;

        // SAFETY: the while condition means `size` is strictly positive, so
        // `size/2 < size`. Thus `left + size/2 < left + size`, which
        // coupled with the `left + size <= self.len()` invariant means
        // we have `left + size/2 < self.len()`, and this is in-bounds.
        let cmp = f(unsafe { s.get_unchecked(mid) });

        // The reason why we use if/else control flow rather than match
        // is because match reorders comparison operations, which is perf sensitive.
        // This is x86 asm for u8: https://rust.godbolt.org/z/8Y8Pra.
        if cmp == Less {
            left = mid + 1;
        } else if cmp == Greater {
            right = mid;
        } else {
            // SAFETY: same as the `get_unchecked` above
            unsafe { core::intrinsics::assume(mid < s.len()) };
            return Ok(mid);
        }

        size = right - left;
    }

    // SAFETY: directly true from the overall invariant.
    // Note that this is `<=`, unlike the assume in the `Ok` path.
    unsafe { core::intrinsics::assume(left <= s.len()) };
    Err(left)
}

bench.rs

  #![feature(test)]
extern crate test;

use test::black_box;
use test::Bencher;

use binary_search_bench::*;

enum Cache {
    L1,
    L2,
    L3,
}

fn old_bench_binary_search<F>(b: &mut Bencher, cache: Cache, mapper: F)
where
    F: Fn(usize) -> usize,
{
    let size = match cache {
        Cache::L1 => 1000,      // 8kb
        Cache::L2 => 10_000,    // 80kb
        Cache::L3 => 1_000_000, // 8Mb
    };
    let v = (0..size).map(&mapper).collect::<Vec<_>>();
    let mut r = 0usize;
    b.iter(move || {
        // LCG constants from https://en.wikipedia.org/wiki/Numerical_Recipes.
        r = r.wrapping_mul(1664525).wrapping_add(1013904223);
        // Lookup the whole range to get 50% hits and 50% misses.
        let i = mapper(r % size);
        black_box(old_binary_search(&v, &i).is_ok());
    })
}

fn old_bench_binary_search_worst_case(b: &mut Bencher, cache: Cache) {
    let size = match cache {
        Cache::L1 => 1000,      // 8kb
        Cache::L2 => 10_000,    // 80kb
        Cache::L3 => 1_000_000, // 8Mb
    };
    let mut v = vec![0; size];
    let i = 1;
    v[size - 1] = i;
    b.iter(move || {
        black_box(old_binary_search(&v, &i).is_ok());
    })
}

#[bench]
fn old_binary_search_l1(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L1, |i| i * 2);
}

#[bench]
fn old_binary_search_l2(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L2, |i| i * 2);
}

#[bench]
fn old_binary_search_l3(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L3, |i| i * 2);
}

#[bench]
fn old_binary_search_l1_with_dups(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L1, |i| i / 16 * 16);
}

#[bench]
fn old_binary_search_l2_with_dups(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L2, |i| i / 16 * 16);
}

#[bench]
fn old_binary_search_l3_with_dups(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L3, |i| i / 16 * 16);
}

#[bench]
fn old_binary_search_l1_worst_case(b: &mut Bencher) {
    old_bench_binary_search_worst_case(b, Cache::L1);
}

#[bench]
fn old_binary_search_l2_worst_case(b: &mut Bencher) {
    old_bench_binary_search_worst_case(b, Cache::L2);
}

#[bench]
fn old_binary_search_l3_worst_case(b: &mut Bencher) {
    old_bench_binary_search_worst_case(b, Cache::L3);
}

fn new_bench_binary_search<F>(b: &mut Bencher, cache: Cache, mapper: F)
where
    F: Fn(usize) -> usize,
{
    let size = match cache {
        Cache::L1 => 1000,      // 8kb
        Cache::L2 => 10_000,    // 80kb
        Cache::L3 => 1_000_000, // 8Mb
    };
    let v = (0..size).map(&mapper).collect::<Vec<_>>();
    let mut r = 0usize;
    b.iter(move || {
        // LCG constants from https://en.wikipedia.org/wiki/Numerical_Recipes.
        r = r.wrapping_mul(1664525).wrapping_add(1013904223);
        // Lookup the whole range to get 50% hits and 50% misses.
        let i = mapper(r % size);
        black_box(new_binary_search(&v, &i).is_ok());
    })
}

fn new_bench_binary_search_worst_case(b: &mut Bencher, cache: Cache) {
    let size = match cache {
        Cache::L1 => 1000,      // 8kb
        Cache::L2 => 10_000,    // 80kb
        Cache::L3 => 1_000_000, // 8Mb
    };
    let mut v = vec![0; size];
    let i = 1;
    v[size - 1] = i;
    b.iter(move || {
        black_box(new_binary_search(&v, &i).is_ok());
    })
}

#[bench]
fn new_binary_search_l1(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L1, |i| i * 2);
}

#[bench]
fn new_binary_search_l2(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L2, |i| i * 2);
}

#[bench]
fn new_binary_search_l3(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L3, |i| i * 2);
}

#[bench]
fn new_binary_search_l1_with_dups(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L1, |i| i / 16 * 16);
}

#[bench]
fn new_binary_search_l2_with_dups(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L2, |i| i / 16 * 16);
}

#[bench]
fn new_binary_search_l3_with_dups(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L3, |i| i / 16 * 16);
}

#[bench]
fn new_binary_search_l1_worst_case(b: &mut Bencher) {
    new_bench_binary_search_worst_case(b, Cache::L1);
}

#[bench]
fn new_binary_search_l2_worst_case(b: &mut Bencher) {
    new_bench_binary_search_worst_case(b, Cache::L2);
}

#[bench]
fn new_binary_search_l3_worst_case(b: &mut Bencher) {
    new_bench_binary_search_worst_case(b, Cache::L3);
}

```[tasklist] ### Tasks ```

Activity

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Assignees

No one assigned

    Labels

    I-slowIssue: Problems and improvements with respect to performance of generated code.T-libsRelevant to the library team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions