opened on Aug 27, 2023
This affects my real project: binary_search on a [usize] slice seems to cause a huge performance regression. After investigation, it is caused by the new binary_search implementation introduced by PR #74024. Why is the new binary_search not binary_search_unstable?
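For context, the kind of workload that regressed can be sketched as follows. This is a hypothetical stand-in for the real project, not code from it: it only uses std's slice::binary_search on a large sorted Vec<usize>, with the same constants as the benchmarks further down.

    // Hypothetical reproduction sketch, not code from the affected project.
    fn main() {
        // ~8 MB of sorted usize keys, i.e. well past L1/L2 cache.
        let v: Vec<usize> = (0..1_000_000usize).map(|i| i * 2).collect();
        // Same LCG as in the benchmarks below, giving roughly 50% hits / 50% misses.
        let mut r = 0usize;
        let mut hits = 0usize;
        for _ in 0..1_000_000 {
            r = r.wrapping_mul(1664525).wrapping_add(1013904223);
            let needle = (r % 1_000_000) * 2;
            if v.binary_search(&needle).is_ok() {
                hits += 1;
            }
        }
        println!("hits: {hits}");
    }
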
Benchmark results on macOS (MacBook Pro, M2):
test new_binary_search_l1            ... bench:  38 ns/iter (+/- 0)
test new_binary_search_l1_with_dups  ... bench:  22 ns/iter (+/- 0)
test new_binary_search_l1_worst_case ... bench:   7 ns/iter (+/- 1)
test new_binary_search_l2            ... bench:  56 ns/iter (+/- 6)
test new_binary_search_l2_with_dups  ... bench:  35 ns/iter (+/- 0)
test new_binary_search_l2_worst_case ... bench:  11 ns/iter (+/- 0)
test new_binary_search_l3            ... bench: 104 ns/iter (+/- 1)
test new_binary_search_l3_with_dups  ... bench:  85 ns/iter (+/- 1)
test new_binary_search_l3_worst_case ... bench:  19 ns/iter (+/- 0)
test old_binary_search_l1            ... bench:   5 ns/iter (+/- 0)
test old_binary_search_l1_with_dups  ... bench:   5 ns/iter (+/- 0)
test old_binary_search_l1_worst_case ... bench:   4 ns/iter (+/- 0)
test old_binary_search_l2            ... bench:   8 ns/iter (+/- 0)
test old_binary_search_l2_with_dups  ... bench:   8 ns/iter (+/- 0)
test old_binary_search_l2_worst_case ... bench:   7 ns/iter (+/- 0)
test old_binary_search_l3            ... bench:  24 ns/iter (+/- 0)
test old_binary_search_l3_with_dups  ... bench:  25 ns/iter (+/- 6)
test old_binary_search_l3_worst_case ... bench:  12 ns/iter (+/- 0)
lib.rs
#![feature(core_intrinsics)]

use std::cmp::Ord;
use std::cmp::Ordering::{self, Equal, Greater, Less};

pub fn old_binary_search<T>(s: &[T], x: &T) -> Result<usize, usize>
where
    T: Ord,
{
    old_binary_search_by(s, |p| p.cmp(x))
}

#[inline]
pub fn old_binary_search_by<'a, T, F>(s: &'a [T], mut f: F) -> Result<usize, usize>
where
    F: FnMut(&'a T) -> Ordering,
{
    let mut size = s.len();
    if size == 0 {
        return Err(0);
    }
    let mut base = 0usize;
    while size > 1 {
        let half = size / 2;
        let mid = base + half;
        // mid is always in [0, size), that means mid is >= 0 and < size.
        // mid >= 0: by definition
        // mid < size: mid = size / 2 + size / 4 + size / 8 ...
        let cmp = f(unsafe { s.get_unchecked(mid) });
        base = if cmp == Greater { base } else { mid };
        size -= half;
    }
    // base is always in [0, size) because base <= mid.
    let cmp = f(unsafe { s.get_unchecked(base) });
    if cmp == Equal {
        Ok(base)
    } else {
        Err(base + (cmp == Less) as usize)
    }
}
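
// Note: the loop in `old_binary_search_by` always performs about log2(len)
// probes, regardless of where (or whether) the needle occurs; only the final
// comparison decides between Ok and Err, and the `base = if cmp == Greater`
// select can typically be compiled down to a branchless conditional move.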

pub fn new_binary_search<T>(s: &[T], x: &T) -> Result<usize, usize>
where
    T: Ord,
{
    new_binary_search_by(s, |p| p.cmp(x))
}

#[inline]
pub fn new_binary_search_by<'a, T, F>(s: &'a [T], mut f: F) -> Result<usize, usize>
where
    F: FnMut(&'a T) -> Ordering,
{
    // INVARIANTS:
    // - 0 <= left <= left + size = right <= s.len()
    // - f returns Less for everything in s[..left]
    // - f returns Greater for everything in s[right..]
    let mut size = s.len();
    let mut left = 0;
    let mut right = size;
    while left < right {
        let mid = left + size / 2;
        // SAFETY: the while condition means `size` is strictly positive, so
        // `size/2 < size`. Thus `left + size/2 < left + size`, which
        // coupled with the `left + size <= s.len()` invariant means
        // we have `left + size/2 < s.len()`, and this is in-bounds.
        let cmp = f(unsafe { s.get_unchecked(mid) });
        // The reason why we use if/else control flow rather than match
        // is because match reorders comparison operations, which is perf sensitive.
        // This is x86 asm for u8: https://rust.godbolt.org/z/8Y8Pra.
        if cmp == Less {
            left = mid + 1;
        } else if cmp == Greater {
            right = mid;
        } else {
            // SAFETY: same as the `get_unchecked` above
            unsafe { core::intrinsics::assume(mid < s.len()) };
            return Ok(mid);
        }
        size = right - left;
    }
    // SAFETY: directly true from the overall invariant.
    // Note that this is `<=`, unlike the assume in the `Ok` path.
    unsafe { core::intrinsics::assume(left <= s.len()) };
    Err(left)
}
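
As a side note on behavior (not just speed): with the code above, the old loop always narrows down to a single element, so when the slice contains duplicates it ends up at the last matching index, while the new loop returns at the first Equal it probes. A quick check, assuming the two functions are exported from a crate named binary_search_bench, as the benchmark below expects:

    use binary_search_bench::{new_binary_search, old_binary_search};

    fn main() {
        let v = [0usize, 1, 1, 1, 1, 1, 2];
        // The old search keeps halving until one element is left: the last 1.
        assert_eq!(old_binary_search(&v, &1), Ok(5));
        // The new search stops at the first Equal it probes: the middle 1.
        assert_eq!(new_binary_search(&v, &1), Ok(3));
        // On a miss, both report the insertion point.
        assert_eq!(old_binary_search(&v, &3), Err(7));
        assert_eq!(new_binary_search(&v, &3), Err(7));
    }
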
bench.rs
#![feature(test)]
extern crate test;

use test::black_box;
use test::Bencher;

use binary_search_bench::*;

enum Cache {
    L1,
    L2,
    L3,
}

fn old_bench_binary_search<F>(b: &mut Bencher, cache: Cache, mapper: F)
where
    F: Fn(usize) -> usize,
{
    let size = match cache {
        Cache::L1 => 1000,      // 8kb
        Cache::L2 => 10_000,    // 80kb
        Cache::L3 => 1_000_000, // 8Mb
    };
    let v = (0..size).map(&mapper).collect::<Vec<_>>();
    let mut r = 0usize;
    b.iter(move || {
        // LCG constants from https://en.wikipedia.org/wiki/Numerical_Recipes.
        r = r.wrapping_mul(1664525).wrapping_add(1013904223);
        // Lookup the whole range to get 50% hits and 50% misses.
        let i = mapper(r % size);
        black_box(old_binary_search(&v, &i).is_ok());
    })
}

fn old_bench_binary_search_worst_case(b: &mut Bencher, cache: Cache) {
    let size = match cache {
        Cache::L1 => 1000,      // 8kb
        Cache::L2 => 10_000,    // 80kb
        Cache::L3 => 1_000_000, // 8Mb
    };
    let mut v = vec![0; size];
    let i = 1;
    v[size - 1] = i;
    b.iter(move || {
        black_box(old_binary_search(&v, &i).is_ok());
    })
}

#[bench]
fn old_binary_search_l1(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L1, |i| i * 2);
}

#[bench]
fn old_binary_search_l2(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L2, |i| i * 2);
}

#[bench]
fn old_binary_search_l3(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L3, |i| i * 2);
}

#[bench]
fn old_binary_search_l1_with_dups(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L1, |i| i / 16 * 16);
}

#[bench]
fn old_binary_search_l2_with_dups(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L2, |i| i / 16 * 16);
}

#[bench]
fn old_binary_search_l3_with_dups(b: &mut Bencher) {
    old_bench_binary_search(b, Cache::L3, |i| i / 16 * 16);
}

#[bench]
fn old_binary_search_l1_worst_case(b: &mut Bencher) {
    old_bench_binary_search_worst_case(b, Cache::L1);
}

#[bench]
fn old_binary_search_l2_worst_case(b: &mut Bencher) {
    old_bench_binary_search_worst_case(b, Cache::L2);
}

#[bench]
fn old_binary_search_l3_worst_case(b: &mut Bencher) {
    old_bench_binary_search_worst_case(b, Cache::L3);
}

fn new_bench_binary_search<F>(b: &mut Bencher, cache: Cache, mapper: F)
where
    F: Fn(usize) -> usize,
{
    let size = match cache {
        Cache::L1 => 1000,      // 8kb
        Cache::L2 => 10_000,    // 80kb
        Cache::L3 => 1_000_000, // 8Mb
    };
    let v = (0..size).map(&mapper).collect::<Vec<_>>();
    let mut r = 0usize;
    b.iter(move || {
        // LCG constants from https://en.wikipedia.org/wiki/Numerical_Recipes.
        r = r.wrapping_mul(1664525).wrapping_add(1013904223);
        // Lookup the whole range to get 50% hits and 50% misses.
        let i = mapper(r % size);
        black_box(new_binary_search(&v, &i).is_ok());
    })
}

fn new_bench_binary_search_worst_case(b: &mut Bencher, cache: Cache) {
    let size = match cache {
        Cache::L1 => 1000,      // 8kb
        Cache::L2 => 10_000,    // 80kb
        Cache::L3 => 1_000_000, // 8Mb
    };
    let mut v = vec![0; size];
    let i = 1;
    v[size - 1] = i;
    b.iter(move || {
        black_box(new_binary_search(&v, &i).is_ok());
    })
}

#[bench]
fn new_binary_search_l1(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L1, |i| i * 2);
}

#[bench]
fn new_binary_search_l2(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L2, |i| i * 2);
}

#[bench]
fn new_binary_search_l3(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L3, |i| i * 2);
}

#[bench]
fn new_binary_search_l1_with_dups(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L1, |i| i / 16 * 16);
}

#[bench]
fn new_binary_search_l2_with_dups(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L2, |i| i / 16 * 16);
}

#[bench]
fn new_binary_search_l3_with_dups(b: &mut Bencher) {
    new_bench_binary_search(b, Cache::L3, |i| i / 16 * 16);
}

#[bench]
fn new_binary_search_l1_worst_case(b: &mut Bencher) {
    new_bench_binary_search_worst_case(b, Cache::L1);
}

#[bench]
fn new_binary_search_l2_worst_case(b: &mut Bencher) {
    new_bench_binary_search_worst_case(b, Cache::L2);
}

#[bench]
fn new_binary_search_l3_worst_case(b: &mut Bencher) {
    new_bench_binary_search_worst_case(b, Cache::L3);
}
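
For completeness: these two files presumably live in one crate, with lib.rs under src/ and bench.rs under benches/ (the crate name binary_search_bench comes from the use line in bench.rs; the exact layout is an assumption, since the report does not include a Cargo.toml). Both files need a nightly toolchain because of the core_intrinsics and test features, so the numbers above should be reproducible with something like:

    cargo +nightly bench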