Bug report by Stefan Kanthak:
The bug needs a divisor 'v' with the most significant bit set, i.e. a value from
0x8000000000000000ULL to 0xFFFFFFFFFFFFFFFFULL.
It should be reproducible with the dividend pair 'u1' = 0ULL and 'u0' = ~0ULL:
this combination makes 'un64' become ~0ULL instead of 0ULL.
Here is a link to the full blog post:
I have written a standalone test program for libdivide's libdivide_128_div_64_to_64()
function that can be compiled using GCC or Clang on a 64-bit CPU architecture. The test program calculates many combinations of x1/x2, where x1 is of type __uint128_t
and x2 is of type uint64_t. The test program always calculates x1/x2 twice, first using the built-in integer division and then using libdivide's libdivide_128_div_64_to_64()
function, and compares both results for equality.
The test program is able to reproduce the reported bug and I can also confirm that the proposed fix below fixes the bug:
// Error
} else {
// Avoid undefined behavior
un64 = u1 | u0;
un10 = u0;
// Fix
} else {
// Avoid undefined behavior
un64 = u1;
un10 = u0;
And here is my test program:
// This is a test program for the function libdivide_128_div_64_to_64
// which divides a 128-bit number by a 64-bit number where
// the result must fit into a 64-bit word.
// The test program tries to cover a wide variety of test cases
// e.g. small dividend / small divisor,
// large dividend / small divisor,
// small dividend / large divisor,
// large dividend / large divisor,
// ...
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <ostream>
#include <random>
#include <string>
using namespace std;
// Code taken from Hacker's Delight:
// License permits inclusion here per:
// Divides the 128-bit dividend {u1 (high word), u0 (low word)} by the 64-bit
// divisor v using schoolbook long division on 32-bit digits.
//
// Parameters:
//   u1, u0  high and low 64-bit words of the dividend
//   v       divisor
//   r       optional out-parameter; when non-null it receives the remainder
//
// Returns the 64-bit quotient. When u1 >= v the quotient would not fit into
// 64 bits (this also covers v == 0); in that case the function returns
// (uint64_t)-1 and, if r is non-null, stores (uint64_t)-1 in *r.
//
// NOTE: this is the version under test. The s == 0 branch still contains the
// defect described in the accompanying report so that the program reproduces
// the failure; see the comment on that branch.
static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    const uint64_t b = (1ULL << 32); // Number base: 2^32, i.e. 32-bit digits
    uint64_t un1, un0;               // Norm. dividend LSD's
    uint64_t vn1, vn0;               // Norm. divisor digits
    uint64_t q1, q0;                 // Quotient digits
    uint64_t un64, un21, un10;       // Dividend digit pairs
    uint64_t rhat;                   // A remainder
    int32_t s;                       // Shift amount for norm

    // If overflow, set rem. to an impossible value,
    // and return the largest possible quotient
    if (u1 >= v) {
        if (r != nullptr)
            *r = static_cast<uint64_t>(-1);
        return static_cast<uint64_t>(-1);
    }

    // count leading zeros (v != 0 here, because u1 >= 0 == v returned above)
    s = __builtin_clzll(v);
    if (s > 0) {
        // Normalize divisor so that its most significant bit is set
        v = v << s;
        // Inside this branch s > 0, so (-s >> 31) is all ones wherever signed
        // right shifts are arithmetic (GCC/Clang); the mask is kept verbatim.
        un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
        un10 = u0 << s; // Shift dividend left
    } else {
        // Avoid undefined behavior (a shift by 64 bits) when s == 0.
        // This is the defect the report is about: OR-ing u0 into the high
        // word corrupts un64 whenever u0 != 0. It is kept on purpose so the
        // test program reproduces the failure; the reported fix is
        // `un64 = u1;`.
        un64 = u1 | u0;
        un10 = u0;
    }

    // Break divisor up into two 32-bit digits
    vn1 = v >> 32;
    vn0 = v & 0xFFFFFFFF;

    // Break right half of dividend into two digits
    un1 = un10 >> 32;
    un0 = un10 & 0xFFFFFFFF;

    // Compute the first quotient digit, q1
    q1 = un64 / vn1;
    rhat = un64 - q1 * vn1;

    // Correct the estimate while it is too large
    while (q1 >= b || q1 * vn0 > b * rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat >= b)
            break;
    }

    // Multiply and subtract
    un21 = un64 * b + un1 - q1 * v;

    // Compute the second quotient digit
    q0 = un21 / vn1;
    rhat = un21 - q0 * vn1;

    // Correct the estimate while it is too large
    while (q0 >= b || q0 * vn0 > b * rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat >= b)
            break;
    }

    // If remainder is wanted, return it (undo the normalization shift)
    if (r != nullptr)
        *r = (un21 * b + un0 - q0 * v) >> s;

    return q1 * b + q0;
}
// Writes the decimal representation of a 128-bit unsigned integer to `stream`
// and returns the stream. The value 0 is written as "0".
inline std::ostream& operator<<(std::ostream& stream, __uint128_t n)
{
    // Digits are collected least-significant first and reversed on output.
    std::string str;

    while (n > 0)
    {
        str += static_cast<char>('0' + static_cast<int>(n % 10));
        n /= 10;
    }

    if (str.empty())
        str = "0";

    stream << std::string(str.rbegin(), str.rend());
    return stream;
}
int main(int argc, char** argv)
random_device rd16;
mt19937 gen16(rd16());
uniform_int_distribution<uint16_t> dist16(1, std::numeric_limits<uint16_t>::max());
int iters = 100000;
if (argc > 1)
iters = atoi(argv[1]);
// Test: random dividends < 2^64 / small divisors
for (int i = 0; i < iters; i++)
uint64_t x1 = 1;
for (int i = 0; i < 4; i++)
uint16_t a = dist16(gen16);
uint64_t x2 = 1;
x1 *= a;
x1 += 1;
for (int j = 1; j < 17; j++)
x2 = j;
auto res1 = x1 / x2;
auto res2 = libdivide_128_div_64_to_64(0, x1, x2, nullptr);
if ((uint64_t) res1 != (uint64_t) res2)
std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
return 1;
// Test: 2^64-1 / random divisors < 2^64
for (int i = 0; i < iters; i++)
uint64_t x1 = ~0ull;
uint64_t x2 = 1;
for (int i = 0; i < 4; i++)
uint16_t a = dist16(gen16);
x2 *= a;
x2 += 1;
auto res1 = x1 / x2;
auto res2 = libdivide_128_div_64_to_64(0, x1, x2, nullptr);
if ((uint64_t) res1 != (uint64_t) res2)
std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
return 1;
// Test: 2^90 / random divisors > 2^46 && < 2^64
for (int i = 0; i < iters; i++)
__uint128_t x1 = ((__uint128_t) 1) << 90;
uint64_t x2 = 1;
while (x2 <= (1ull << 46))
uint16_t a = dist16(gen16);
x2 *= a;
x2 += 1;
uint64_t res1 = x1 / x2;
uint64_t res2 = libdivide_128_div_64_to_64((uint64_t)(x1 >> 64), (uint64_t)(x1 & ~0ull), x2, nullptr);
if ((uint64_t) res1 != (uint64_t) res2)
std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
return 1;
// Test: random dividends > 2^100 / 2^64-1
for (int i = 0; i < iters; i++)
__uint128_t x1 = 1;
uint64_t x2 = ~0ull;
while (x1 <= (((__uint128_t) 1) << 100))
uint16_t a = dist16(gen16);
x1 *= a;
x1 += 1;
uint64_t res1 = x1 / x2;
uint64_t res2 = libdivide_128_div_64_to_64((uint64_t)(x1 >> 64), (uint64_t)(x1 & ~0ull), x2, nullptr);
if ((uint64_t) res1 != (uint64_t) res2)
std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
return 1;
// Test: 2^(0..127) / 2^64-1
for (int i = 0; i < 127; i++)
__uint128_t x1 = ((__uint128_t) 1) << i;
uint64_t x2 = ~0ull;
uint64_t res1 = x1 / x2;
uint64_t res2 = libdivide_128_div_64_to_64((uint64_t)(x1 >> 64), (uint64_t)(x1 & ~0ull), x2, nullptr);
if ((uint64_t) res1 != (uint64_t) res2)
std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
return 1;
// Test: 2^(0..127) / 2^63
for (int i = 0; i < 127; i++)
__uint128_t x1 = ((__uint128_t) 1) << i;
uint64_t x2 = 1ull << 63;
uint64_t res1 = x1 / x2;
uint64_t res2 = libdivide_128_div_64_to_64((uint64_t)(x1 >> 64), (uint64_t)(x1 & ~0ull), x2, nullptr);
if ((uint64_t) res1 != (uint64_t) res2)
std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
return 1;
// Test: 2^64-1 / 2^64-1
uint64_t x1 = ~0ull;
uint64_t x2 = ~0ull;
uint64_t res1 = x1 / x2;
uint64_t res2 = libdivide_128_div_64_to_64(0, x1, x2, nullptr);
if ((uint64_t) res1 != (uint64_t) res2)
std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
return 1;
// Test: 2^64-1 / 2^63
uint64_t x1 = ~0ull;
uint64_t x2 = 0x8000000000000000ULL;
uint64_t res1 = x1 / x2;
uint64_t res2 = libdivide_128_div_64_to_64(0, x1, x2, nullptr);
if ((uint64_t) res1 != (uint64_t) res2)
std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
return 1;
return 0;
The above test program can be compiled and run using:
g++ -O2 test.cpp -o test
18446744073709551615 / 10012018566958790731 = 1 correct
18446744073709551615 / 10012018566958790731 = 15540644647674054952 error