@@ -1043,49 +1043,55 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
1043
1043
/// Any questions go to @nagisa.
1044
1044
#[ lang = "align_offset" ]
1045
1045
pub ( crate ) unsafe fn align_offset < T : Sized > ( p : * const T , a : usize ) -> usize {
1046
/// Calculate multiplicative modular inverse of `x` modulo `m = 2^mpow`.
///
/// This implementation is tailored for align_offset and has following preconditions:
///
/// * The requested modulus `m` is a power-of-two, which is why only its exponent `mpow` is
///   passed in;
/// * `x < m`; (if `x >= m`, pass in `x % m` instead)
///
/// The table lookup below discards the lowest bit of `x`, so an even `x` (for which no
/// inverse exists) produces a meaningless value rather than an error.
///
/// An `mpow` at or above the bit width of `usize` is treated as the bit width itself: the
/// result is then the inverse modulo `2^(bit width)`, i.e. no bits are masked off.
///
/// Implementation of this function shall not panic. Ever.
#[inline]
fn mod_pow_2_inv(x: usize, mpow: usize) -> usize {
    /// Multiplicative modular inverse table modulo 2^4 = 16.
    ///
    /// Note, that this table does not contain values where inverse does not exist (i.e., for
    /// `0^-1 mod 16`, `2^-1 mod 16`, etc.)
    const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
    /// `t` such that `2^t` is the modulus for which the `INV_TABLE_MOD_16` is intended.
    const INV_TABLE_MOD_POW: usize = 4;
    /// Exponent of the modulus for which the inverse is valid after the first iteration.
    const INV_TABLE_MOD_POW_TIMES_2: usize = INV_TABLE_MOD_POW << 1;
    /// The modulus `2^INV_TABLE_MOD_POW` itself.
    const INV_TABLE_MOD: usize = 1 << INV_TABLE_MOD_POW;

    // Number of bits in `usize`, computed without relying on newer library constants.
    let usize_bits = usize::max_value().count_ones() as usize;
    // Clamp the exponent: shifting by the full bit width (or more) would overflow, and an
    // unbounded `mpow` could make `going_modpow` wrap around to 0 and never terminate.
    let mpow = mpow.min(usize_bits);
    // `mask == m - 1`, so `value & mask == value mod m`.
    let mask = if mpow == usize_bits {
        usize::max_value()
    } else {
        (1usize << mpow) - 1
    };

    // Index is at most `15 >> 1 == 7`, so this lookup is always in bounds.
    let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;

    if mpow <= INV_TABLE_MOD_POW {
        table_inverse & mask
    } else {
        // We iterate "up" using the following formula:
        //
        // ` xy = 1 (mod 2^n) -> xy (2 - xy) = 1 (mod 2^(2n)) `
        //
        // until 2^(2n) >= m. Then we can reduce to our desired `m` by taking the result `mod m`.
        //
        // Running `k` iterations starting with a solution valid mod `2^t` will get us a
        // solution valid mod `2^((2^k) * t)`, so we stop at the first `k` for which
        // `2^k * t >= mpow`.
        let mut inverse = table_inverse;
        let mut going_modpow = INV_TABLE_MOD_POW_TIMES_2;
        loop {
            // y = y * (2 - xy)
            //
            // Note, that we use wrapping operations here intentionally - the original formula
            // uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
            // usize::max_value()` instead, because we take the result `mod n` at the end
            // anyway.
            inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
            if going_modpow >= mpow {
                return inverse & mask;
            }
            // Cannot overflow: `mpow` is clamped to the bit width, so the loop returns before
            // `going_modpow` could exceed it.
            going_modpow <<= 1;
        }
    }
}
@@ -1111,7 +1117,8 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
1111
1117
1112
1118
let smoda = stride & a_minus_one;
1113
1119
// a is power-of-two so cannot be 0. stride = 0 is handled above.
1114
- let gcdpow = intrinsics:: cttz_nonzero ( stride) . min ( intrinsics:: cttz_nonzero ( a) ) ;
1120
+ let apow = intrinsics:: cttz_nonzero ( a) ;
1121
+ let gcdpow = intrinsics:: cttz_nonzero ( stride) . min ( apow) ;
1115
1122
let gcd = 1usize << gcdpow;
1116
1123
1117
1124
if p as usize & ( gcd. wrapping_sub ( 1 ) ) == 0 {
@@ -1140,7 +1147,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
1140
1147
let a2minus1 = a2. wrapping_sub ( 1 ) ;
1141
1148
let s2 = smoda >> gcdpow;
1142
1149
let minusp2 = a2. wrapping_sub ( pmoda >> gcdpow) ;
1143
- return ( minusp2. wrapping_mul ( mod_inv ( s2, a2 ) ) ) & a2minus1;
1150
+ return ( minusp2. wrapping_mul ( mod_pow_2_inv ( s2, apow . wrapping_sub ( gcdpow ) ) ) ) & a2minus1;
1144
1151
}
1145
1152
1146
1153
// Cannot be aligned at all.
0 commit comments