Skip to content

Commit f974aff

Browse files
committed
Optimize core::ptr::align_offset
- Instead of squaring the modulus until it is larger than the required one, we double its log2. This means a shift instead of a multiplication on each iteration.
1 parent 22b263a commit f974aff

File tree

1 file changed

+29
-22
lines changed

1 file changed

+29
-22
lines changed

src/libcore/ptr/mod.rs

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1043,49 +1043,55 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
10431043
/// Any questions go to @nagisa.
10441044
#[lang = "align_offset"]
10451045
pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
1046-
/// Calculate multiplicative modular inverse of `x` modulo `m`.
1046+
/// Calculate multiplicative modular inverse of `x` modulo `m = 2^mpow`.
10471047
///
10481048
/// This implementation is tailored for align_offset and has following preconditions:
10491049
///
1050-
/// * `m` is a power-of-two;
1051-
/// * `x < m`; (if `x ≥ m`, pass in `x % m` instead)
1050+
/// * The requested modulo `m` is a power-of-two, so `mpow` can be an argument;
1051+
/// * `x < m`; (if `x >= m`, pass in `x % m` instead)
10521052
///
10531053
/// Implementation of this function shall not panic. Ever.
10541054
#[inline]
1055-
fn mod_inv(x: usize, m: usize) -> usize {
1056-
/// Multiplicative modular inverse table modulo 2⁴ = 16.
1055+
fn mod_pow_2_inv(x: usize, mpow: usize) -> usize {
1056+
/// Multiplicative modular inverse table modulo 2^4 = 16.
10571057
///
10581058
/// Note, that this table does not contain values where inverse does not exist (i.e., for
1059-
/// `0⁻¹ mod 16`, `2⁻¹ mod 16`, etc.)
1059+
/// `0^-1 mod 16`, `2^-1 mod 16`, etc.)
10601060
const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
1061-
/// Modulo for which the `INV_TABLE_MOD_16` is intended.
1062-
const INV_TABLE_MOD: usize = 16;
1063-
/// INV_TABLE_MOD²
1064-
const INV_TABLE_MOD_SQUARED: usize = INV_TABLE_MOD * INV_TABLE_MOD;
1061+
/// `t` such that `2^t` is the modulus for which the `INV_TABLE_MOD_16` is intended.
1062+
const INV_TABLE_MOD_POW: usize = 4;
1063+
const INV_TABLE_MOD_POW_TIMES_2: usize = INV_TABLE_MOD_POW << 1;
1064+
const INV_TABLE_MOD: usize = 1 << INV_TABLE_MOD_POW;
10651065

10661066
let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
1067-
if m <= INV_TABLE_MOD {
1068-
table_inverse & (m - 1)
1067+
let mask = (1usize << mpow) - 1;
1068+
1069+
if mpow <= INV_TABLE_MOD_POW {
1070+
table_inverse & mask
10691071
} else {
10701072
// We iterate "up" using the following formula:
10711073
//
1072-
// $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
1074+
// ` xy = 1 (mod 2^n) -> xy (2 - xy) = 1 (mod 2^(2n)) `
1075+
//
1076+
// until 2^(2n) ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
10731077
//
1074-
// until 2²ⁿ ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
1078+
// Running `k` iterations starting with a solution valid mod `2^t` will get us a
1079+
// solution valid mod `2^((2^k) * t)`, so we need to calculate for which `k`,
1080+
// `2^k * t ≥ log2(m)`.
10751081
let mut inverse = table_inverse;
1076-
let mut going_mod = INV_TABLE_MOD_SQUARED;
1082+
let mut going_modpow = INV_TABLE_MOD_POW_TIMES_2;
10771083
loop {
1078-
// y = y * (2 - xy) mod n
1084+
// y = y * (2 - xy)
10791085
//
1080-
// Note, that we use wrapping operations here intentionally – the original formula
1086+
// Note, that we use wrapping operations here intentionally - the original formula
10811087
// uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
10821088
// usize::max_value()` instead, because we take the result `mod n` at the end
10831089
// anyway.
10841090
inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
1085-
if going_mod >= m {
1086-
return inverse & (m - 1);
1091+
if going_modpow >= mpow {
1092+
return inverse & mask;
10871093
}
1088-
going_mod = going_mod.wrapping_mul(going_mod);
1094+
going_modpow <<= 1;
10891095
}
10901096
}
10911097
}
@@ -1111,7 +1117,8 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
11111117

11121118
let smoda = stride & a_minus_one;
11131119
// a is power-of-two so cannot be 0. stride = 0 is handled above.
1114-
let gcdpow = intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a));
1120+
let apow = intrinsics::cttz_nonzero(a);
1121+
let gcdpow = intrinsics::cttz_nonzero(stride).min(apow);
11151122
let gcd = 1usize << gcdpow;
11161123

11171124
if p as usize & (gcd.wrapping_sub(1)) == 0 {
@@ -1140,7 +1147,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
11401147
let a2minus1 = a2.wrapping_sub(1);
11411148
let s2 = smoda >> gcdpow;
11421149
let minusp2 = a2.wrapping_sub(pmoda >> gcdpow);
1143-
return (minusp2.wrapping_mul(mod_inv(s2, a2))) & a2minus1;
1150+
return (minusp2.wrapping_mul(mod_pow_2_inv(s2, apow.wrapping_sub(gcdpow)))) & a2minus1;
11441151
}
11451152

11461153
// Cannot be aligned at all.

0 commit comments

Comments
 (0)