Skip to content

Greatly improve division performance for u128 and other cases #332

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Sep 3, 2020
Merged
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ panic-handler = { path = 'crates/panic-handler' }
[features]
default = ["compiler-builtins"]

# Some algorithms benefit from inline assembly, but some compiler backends do
# not support it, so inline assembly is only enabled when this flag is set.
asm = []

# Enable compilation of C code in compiler-rt, filling in some more optimized
# implementations and also filling in unimplemented intrinsics
c = ["cc"]
Expand Down
12 changes: 1 addition & 11 deletions src/int/mod.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
use core::ops;

/// Expands to the `HighHalf` associated type that `$ty` declares through its `LargeInt`
/// implementation, so callers can write `hty!(T)` instead of the fully qualified path.
macro_rules! hty {
($ty:ty) => {
<$ty as LargeInt>::HighHalf
};
}

/// Expands to the `OtherSign` associated type that `$ty` declares through its `Int`
/// implementation, so callers can write `os_ty!(T)` instead of the fully qualified path.
macro_rules! os_ty {
($ty:ty) => {
<$ty as Int>::OtherSign
};
}
mod specialized_div_rem;

pub mod addsub;
pub mod leading_zeros;
Expand Down
114 changes: 39 additions & 75 deletions src/int/sdiv.rs
Original file line number Diff line number Diff line change
@@ -1,101 +1,65 @@
use int::Int;

trait Div: Int {
    /// Returns `a / b`
    ///
    /// The division is carried out on the magnitudes of both operands using the unsigned
    /// `aborting_div`, and the sign of the quotient is re-applied afterwards. All sign
    /// manipulation uses wrapping arithmetic so that `MIN` (whose magnitude is not
    /// representable as a signed value) is handled without tripping overflow checks.
    fn div(self, other: Self) -> Self {
        // Arithmetic shift of the sign bit: all ones for negative inputs, zero otherwise.
        let s_a = self >> (Self::BITS - 1);
        let s_b = other >> (Self::BITS - 1);
        // NOTE it's OK to overflow here because of the `.unsigned()` below.
        // This whole operation is computing the absolute value of the inputs
        // So some overflow will happen when dealing with e.g. `i64::MIN`
        // where the absolute value is `(-i64::MIN) as u64`
        let a = (self ^ s_a).wrapping_sub(s_a);
        let b = (other ^ s_b).wrapping_sub(s_b);
        // The quotient is negative exactly when the operand signs differ.
        let s = s_a ^ s_b;

        let r = a.unsigned().aborting_div(b.unsigned());
        // Conditional negation via `(x ^ s) - s`. This must wrap: for `MIN / 1` the
        // unsigned quotient reinterprets as `MIN`, `MIN ^ -1` is `MAX`, and
        // `MAX - (-1)` would overflow a checked subtraction even though the wrapped
        // result (`MIN`) is the correct answer.
        (Self::from_unsigned(r) ^ s).wrapping_sub(s)
    }
}

impl Div for i32 {}
impl Div for i64 {}
impl Div for i128 {}

trait Mod: Int {
    /// Returns `a % b`
    ///
    /// The remainder is computed on the magnitudes of both operands and then given the
    /// sign of the dividend (`self`).
    fn mod_(self, other: Self) -> Self {
        let sign_shift = Self::BITS - 1;

        // All ones when the divisor is negative, zero otherwise.
        let divisor_sign = other >> sign_shift;
        // Wrapping on purpose: the magnitude of `MIN` only fits once reinterpreted as
        // unsigned below.
        let divisor_abs = (other ^ divisor_sign).wrapping_sub(divisor_sign);

        let dividend_sign = self >> sign_shift;
        let dividend_abs = (self ^ dividend_sign).wrapping_sub(dividend_sign);

        let magnitude = dividend_abs
            .unsigned()
            .aborting_rem(divisor_abs.unsigned());

        // Conditionally negate so the result carries the dividend's sign.
        let flipped = Self::from_unsigned(magnitude) ^ dividend_sign;
        flipped - dividend_sign
    }
}

impl Mod for i32 {}
impl Mod for i64 {}
impl Mod for i128 {}

trait Divmod: Int {
    /// Returns `a / b` and sets `*rem = n % d`
    ///
    /// `div` is the division routine used to obtain the quotient; the remainder is then
    /// derived from that quotient instead of performing a second division.
    fn divmod<F>(self, other: Self, rem: &mut Self, div: F) -> Self
    where
        F: Fn(Self, Self) -> Self,
    {
        let quotient = div(self, other);
        // NOTE the subtraction won't overflow because the product is built from the
        // quotient of the division that was just performed.
        let product = quotient.wrapping_mul(other);
        *rem = self - product;
        quotient
    }
}

impl Divmod for i32 {}
impl Divmod for i64 {}
use int::specialized_div_rem::*;

intrinsics! {
#[maybe_use_optimized_c_shim]
#[arm_aeabi_alias = __aeabi_idiv]
/// Returns `n / d`
pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
a.div(b)
i32_div_rem(a, b).0
}

#[maybe_use_optimized_c_shim]
pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
a.div(b)
/// Returns `n % d`
pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
i32_div_rem(a, b).1
}

#[win64_128bit_abi_hack]
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
a.div(b)
#[maybe_use_optimized_c_shim]
/// Returns `n / d` and sets `*rem = n % d`
pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
let quo_rem = i32_div_rem(a, b);
*rem = quo_rem.1;
quo_rem.0
}

#[maybe_use_optimized_c_shim]
pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
a.mod_(b)
/// Returns `n / d`
pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
i64_div_rem(a, b).0
}

#[maybe_use_optimized_c_shim]
/// Returns `n % d`
pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 {
a.mod_(b)
i64_div_rem(a, b).1
}

#[maybe_use_optimized_c_shim]
/// Returns `n / d` and sets `*rem = n % d`
pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
let quo_rem = i64_div_rem(a, b);
*rem = quo_rem.1;
quo_rem.0
}

#[win64_128bit_abi_hack]
pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
a.mod_(b)
/// Returns `n / d`
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
i128_div_rem(a, b).0
}

#[maybe_use_optimized_c_shim]
pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
a.divmod(b, rem, |a, b| __divsi3(a, b))
#[win64_128bit_abi_hack]
/// Returns `n % d`
pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
i128_div_rem(a, b).1
}

#[aapcs_on_arm]
pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
a.divmod(b, rem, |a, b| __divdi3(a, b))
// LLVM does not currently have a `__divmodti4` function, but GCC does
#[maybe_use_optimized_c_shim]
/// Returns `n / d` and sets `*rem = n % d`
pub extern "C" fn __divmodti4(a: i128, b: i128, rem: &mut i128) -> i128 {
let quo_rem = i128_div_rem(a, b);
*rem = quo_rem.1;
quo_rem.0
}
}
169 changes: 169 additions & 0 deletions src/int/specialized_div_rem/asymmetric.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/// Creates unsigned and signed division functions optimized for dividing integers with the same
/// bitwidth as the largest operand in an asymmetrically sized division. For example, x86-64 has an
/// assembly instruction that can divide a 128 bit integer by a 64 bit integer if the quotient fits
/// in 64 bits. The 128 bit version of this algorithm would use that fast hardware division to
/// construct a full 128 bit by 128 bit division.
#[macro_export]
macro_rules! impl_asymmetric {
(
$unsigned_name:ident, // name of the unsigned division function
$signed_name:ident, // name of the signed division function
$zero_div_fn:ident, // function called when division by zero is attempted
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a general rule we don't need to check for zero here. The API contract for the division functions is that a zero check will have already been done beforehand. It is not necessary to check for zero again here and it's fine if we just call unreachable_unchecked or trigger a CPU trap in that case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should keep $zero_div_fn so that we can define it to be ::abort or unreachable_unchecked in one place

$half_division:ident, // function for division of a $uX by a $uX
$asymmetric_division:ident, // function for division of a $uD by a $uX
$n_h:expr, // the number of bits in a $iH or $uH
$uH:ident, // unsigned integer with half the bit width of $uX
$uX:ident, // unsigned integer with half the bit width of $uD
$uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
$iD:ident, // signed integer type for the inputs and outputs of `$signed_name`
$($unsigned_attr:meta),*; // attributes for the unsigned function
$($signed_attr:meta),* // attributes for the signed function
) => {
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
$(
#[$unsigned_attr]
)*
pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) {
/// Multiplies two `$uX` values in `$uD` width and returns the product split as
/// `(low half, high half)`.
fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
    let product = (lhs as $uD).wrapping_mul(rhs as $uD);
    let low = product as $uX;
    let high = (product >> ($n_h * 2)) as $uX;
    (low, high)
}
/// Computes `lhs * mul + add` in `$uD` width and returns the result split as
/// `(low half, high half)`.
fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
    let product = (lhs as $uD).wrapping_mul(mul as $uD);
    let total = product.wrapping_add(add as $uD);
    let low = total as $uX;
    let high = (total >> ($n_h * 2)) as $uX;
    (low, high)
}

let n: u32 = $n_h * 2;

// Many of these subalgorithms are taken from trifecta.rs, see that for better
// documentation.

let duo_lo = duo as $uX;
let duo_hi = (duo >> n) as $uX;
let div_lo = div as $uX;
let div_hi = (div >> n) as $uX;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using the methods from LargeInt for splitting.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realize that many lines of code are spent just separating out parts of integers. After your suggestion I think that using a trait would help, but I would also like a more terse formulation like div.lo() and div.hi(). What if I defined a trait SplitInt in both the compiler-builtins specialized_div_rem/mod.rs and in my crates.io version? It would be consistent with my existing documentation, cut down many lines of code, and be almost as terse inline.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m more concerned about the duplication of implementation to split integers, not the fact that these lines are not very terse. If the newly added trait delegates to LargeInt methods, that’s fine with me.

// Case: the divisor fits entirely in its low `$uX` half. Every path through this
// `if` ends in a `return` (after the zero check).
if div_hi == 0 {
if div_lo == 0 {
// Both halves are zero, i.e. division by zero.
// NOTE(review): `$zero_div_fn` presumably diverges (abort / unreachable) — confirm
// against the function passed by the macro caller.
$zero_div_fn()
}
if duo_hi < div_lo {
// `$uD` by `$uX` division with a quotient that will fit into a `$uX`
// SAFETY (hedged): `duo_hi < div_lo` implies `duo < div_lo << n`, so the quotient is
// below `1 << n`. This is presumably the precondition that makes
// `$asymmetric_division` unsafe — TODO confirm against its definition.
let (quo, rem) = unsafe { $asymmetric_division(duo, div_lo) };
return (quo as $uD, rem as $uD)
} else if (div_lo >> $n_h) == 0 {
// Short division of $uD by a $uH.

// Some x86_64 CPUs have bad division implementations that make specializing
// this case faster.
// The divisor fits in a `$uH`; the truncating round-trip cast keeps only those bits.
let div_0 = div_lo as $uH as $uX;
// Step 1: divide the high `$uX` half of `duo`.
let (quo_hi, rem_3) = $half_division(duo_hi, div_0);

// Step 2: carry the previous remainder above the next `$uH`-sized digit of `duo`
// (bits `$n_h..n`) and divide again.
let duo_mid =
((duo >> $n_h) as $uH as $uX)
| (rem_3 << $n_h);
let (quo_1, rem_2) = $half_division(duo_mid, div_0);

// Step 3: same for the lowest `$uH`-sized digit. This `duo_lo` shadows the outer
// `duo_lo` for the rest of this branch.
let duo_lo =
(duo as $uH as $uX)
| (rem_2 << $n_h);
let (quo_0, rem_1) = $half_division(duo_lo, div_0);

// Reassemble the quotient from its three pieces; the last remainder is the result.
return (
(quo_0 as $uD)
| ((quo_1 as $uD) << $n_h)
| ((quo_hi as $uD) << n),
rem_1 as $uD
)
} else {
// Short division using the $uD by $uX division
// First divide the high half, then divide `rem_hi:duo_lo` by the same divisor.
let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
// SAFETY (hedged): `rem_hi` is a remainder modulo `div_lo`, so the high half of the
// dividend passed here is smaller than the divisor and the quotient fits in a
// `$uX` — presumably the requirement of `$asymmetric_division`; TODO confirm.
let tmp = unsafe {
$asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
};
return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD)
}
}

// Leading zero counts of the high halves only.
let duo_lz = duo_hi.leading_zeros();
let div_lz = div_hi.leading_zeros();
// Difference in leading zeros between divisor and dividend. The subtraction wraps to a
// large value when `duo_hi` has more leading zeros than `div_hi`, which routes that
// case to the `else` branch below.
let rel_leading_sb = div_lz.wrapping_sub(duo_lz);
if rel_leading_sb < $n_h {
// Some x86_64 CPUs have bad hardware division implementations that make putting
// a two possibility algorithm here beneficial. We also avoid a full `$uD`
// multiplication.
// Shift both operands right by the same amount so that the significant bits of
// `duo` fit in a `$uX`, then divide those truncated values.
let shift = n - duo_lz;
let duo_sig_n = (duo >> shift) as $uX;
let div_sig_n = (div >> shift) as $uX;
let quo = $half_division(duo_sig_n, div_sig_n).0;
// Compute `quo * div` from two `$uX`-wide multiplications; `overflow` holds whatever
// does not fit in a `$uD`.
let div_lo = div as $uX;
let div_hi = (div >> n) as $uX;
let (tmp_lo, carry) = carrying_mul(quo, div_lo);
let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry);
let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
// The code treats `quo` as either exact or one too large ("two possibilities"):
// if the product exceeds `duo`, step the quotient down by one and add `div` back
// into the remainder.
if (overflow != 0) || (duo < tmp) {
return (
(quo - 1) as $uD,
duo.wrapping_add(div).wrapping_sub(tmp)
)
} else {
return (
quo as $uD,
duo - tmp
)
}
} else {
// This has been adapted from
// https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
// adapted from Hacker's Delight. This is similar to the two possibility algorithm
// in that it uses only more significant parts of `duo` and `div` to divide a large
// integer with a smaller division instruction.

// Keep only the most significant bits of `div`, shifted down into a `$uX`.
let div_extra = n - div_lz;
let div_sig_n = (div >> div_extra) as $uX;
// SAFETY (hedged): NOTE(review) — `duo` is halved before the call, presumably so that
// the quotient is guaranteed to fit in a `$uX` as `$asymmetric_division` appears to
// require; confirm against its definition.
let tmp = unsafe {
$asymmetric_division(duo >> 1, div_sig_n)
};

// Undo the scaling introduced by halving `duo` and truncating `div`.
let mut quo = tmp.0 >> ((n - 1) - div_lz);
// Bias the estimate downwards; the correction step below only ever adds one.
// NOTE(review): relies on the referenced algorithm's bound that the estimate is
// off by at most one — not derivable from this block alone.
if quo != 0 {
quo -= 1;
}

// Note that this is a full `$uD` multiplication being used here
let mut rem = duo - (quo as $uD).wrapping_mul(div);
// Single correction step: if a whole divisor still fits in the remainder, the
// estimate was one too small.
if div <= rem {
quo += 1;
rem -= div;
}
return (quo as $uD, rem)
}
}

/// Signed counterpart of the unsigned routine: divides `duo` by `div` and returns the
/// `(quotient, remainder)` pair. The work is done on wrapped magnitudes, after which the
/// signs are restored.
$(
    #[$signed_attr]
)*
pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) {
    let duo_negative = duo < 0;
    let div_negative = div < 0;

    // Wrapping negation so that the minimum value maps onto its unsigned magnitude.
    let duo_mag = if duo_negative {
        duo.wrapping_neg() as $uD
    } else {
        duo as $uD
    };
    let div_mag = if div_negative {
        div.wrapping_neg() as $uD
    } else {
        div as $uD
    };

    let (quo_mag, rem_mag) = $unsigned_name(duo_mag, div_mag);
    let quo = quo_mag as $iD;
    let rem = rem_mag as $iD;

    // The quotient is negated when the operand signs differ; the remainder is negated
    // when the dividend is negative.
    (
        if duo_negative != div_negative {
            quo.wrapping_neg()
        } else {
            quo
        },
        if duo_negative {
            rem.wrapping_neg()
        } else {
            rem
        },
    )
}
}
}
Loading