Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
71 commits
Select commit Hold shift + click to select a range
85df268
Typo
duncancmt Sep 17, 2025
0e0ee7c
WIP: golf
duncancmt Sep 18, 2025
f276662
Comments
duncancmt Sep 18, 2025
1a87b9f
Golf
duncancmt Sep 18, 2025
aa4e4bc
Comment
duncancmt Sep 18, 2025
64a3717
Comment
duncancmt Sep 18, 2025
d33269b
Golf
duncancmt Sep 18, 2025
302e32e
Let the optimizer decide whether `not(0x00)` or `0xffffffffffffffffff…
duncancmt Sep 18, 2025
efa9668
Golf
duncancmt Sep 18, 2025
d1ae4f7
Comment
duncancmt Sep 18, 2025
449afc2
Comment
duncancmt Sep 18, 2025
fbb85fa
WIP: Golf
duncancmt Sep 18, 2025
153f9f8
Golf
duncancmt Sep 18, 2025
f5dc315
WIP: Golf
duncancmt Sep 18, 2025
eae9c9f
WIP: Golf
duncancmt Sep 18, 2025
e63a500
Golf
duncancmt Sep 18, 2025
c51e4d9
WIP: Golf
duncancmt Sep 18, 2025
4bac536
Golf
duncancmt Sep 18, 2025
eb954eb
WIP: Golf
duncancmt Sep 18, 2025
8b61eb6
WIP: Golf
duncancmt Sep 19, 2025
7b85425
WIP: Golf
duncancmt Sep 19, 2025
c88101c
WIP: Golf
duncancmt Sep 19, 2025
972e0e3
WIP: Golf
duncancmt Sep 19, 2025
bb21270
Finish: Golf
duncancmt Sep 19, 2025
2fa3a02
Cleanup `sqrt` unit test
duncancmt Sep 19, 2025
2af5cd0
Comments
duncancmt Sep 19, 2025
2596c70
Comment
duncancmt Sep 19, 2025
5c828f1
Comment
duncancmt Sep 19, 2025
8a5fb44
WIP: Golf
duncancmt Sep 20, 2025
734d0d2
Golf
duncancmt Sep 20, 2025
410a490
Cleanup
duncancmt Sep 20, 2025
7b7a6f9
Comment about convergence region
duncancmt Sep 20, 2025
3b519bc
Recompute seed tables
duncancmt Sep 20, 2025
07dec3c
Recomputing the seed tables caused the fuzz test to fail
duncancmt Sep 20, 2025
01c589e
Add sqrt seed optimization infrastructure
duncancmt Sep 21, 2025
8267ffc
Simplify sqrt seed optimization script
duncancmt Sep 21, 2025
d430ee7
Add invEThreshold parameter to sqrt and test scaffolding
duncancmt Sep 21, 2025
0eb57da
Cleanup, test only high/low/random
duncancmt Sep 21, 2025
1fa5759
Optimize sqrt seed finder with spiral search algorithm
duncancmt Sep 21, 2025
4620017
Optimize
duncancmt Sep 21, 2025
39cf292
Optimize
duncancmt Sep 21, 2025
4d555c5
Optimize
duncancmt Sep 21, 2025
faf8f1e
Add threshold optimization tools and fix seed validation
duncancmt Sep 21, 2025
591a5de
Work on optimizing the 512-bit `sqrt` seed table
duncancmt Sep 22, 2025
b9f06a1
Remove suspect data from `threshold_optimization_results.json`
duncancmt Sep 22, 2025
675aa92
Add bucket 32 threshold 59
duncancmt Sep 22, 2025
c6c68d2
Fix timeout handling in sqrt seed optimization scripts
duncancmt Sep 23, 2025
e524b77
Run scripts
duncancmt Sep 23, 2025
8e9eefc
Update threshold optimization results and improve summary output
duncancmt Sep 23, 2025
281b77d
Add threshold optimization results for buckets 37-39
duncancmt Sep 23, 2025
2db56a3
Reduce minimum threshold
duncancmt Sep 23, 2025
94de3a1
Doing some more fuzzing to discover thresholds
duncancmt Sep 24, 2025
161f503
Collect more thresholds
duncancmt Sep 25, 2025
7d2d711
Collect more thresholds
duncancmt Sep 25, 2025
f501c28
Collect the last batch of thresholds
duncancmt Sep 27, 2025
7dbb364
Refresh sqrt seed lookup tables
duncancmt Sep 27, 2025
17bd8b6
Fix swapped sqrt seed lookup tables
duncancmt Sep 27, 2025
49d77ae
Vibe code a better `invE` threshold given the optimized lookup table …
duncancmt Sep 27, 2025
29cb1fe
Vibe code a better threshold
duncancmt Sep 27, 2025
a08ad02
Optimize invE threshold quadratic fit
duncancmt Sep 27, 2025
85e68cc
Use Horner form for invE threshold
duncancmt Sep 27, 2025
b758e85
Inline bucket-aware invE threshold and clean up tooling
duncancmt Sep 27, 2025
3a2327a
Document quadratic invE threshold helper
duncancmt Sep 27, 2025
1a8e374
Lookup invE threshold from tables
duncancmt Sep 27, 2025
8f428ef
Use LUT seeds with polynomial invE threshold
duncancmt Sep 27, 2025
238c139
Cleanup
duncancmt Sep 27, 2025
dc24fe9
Cleanup
duncancmt Sep 27, 2025
c0227b4
Golf
duncancmt Sep 27, 2025
fa2a227
Golf
duncancmt Sep 27, 2025
d0b013d
Comments
duncancmt Sep 27, 2025
db2fb2b
Merge branch 'dcmt/sqrt512-golf-golf' into dcmt/sqrt512-golf
duncancmt Sep 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 96 additions & 53 deletions src/utils/512Math.sol
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ library Lib512MathArithmetic {

function _mul(uint256 x, uint256 y) private pure returns (uint256 r_hi, uint256 r_lo) {
assembly ("memory-safe") {
let mm := mulmod(x, y, 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff)
let mm := mulmod(x, y, not(0x00))
r_lo := mul(x, y)
r_hi := sub(sub(mm, r_lo), lt(mm, r_lo))
}
Expand All @@ -468,7 +468,7 @@ library Lib512MathArithmetic {

function _mul(uint256 x_hi, uint256 x_lo, uint256 y) private pure returns (uint256 r_hi, uint256 r_lo) {
assembly ("memory-safe") {
let mm := mulmod(x_lo, y, 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff)
let mm := mulmod(x_lo, y, not(0x00))
r_lo := mul(x_lo, y)
r_hi := add(mul(x_hi, y), sub(sub(mm, r_lo), lt(mm, r_lo)))
}
Expand All @@ -490,7 +490,7 @@ library Lib512MathArithmetic {
returns (uint256 r_hi, uint256 r_lo)
{
assembly ("memory-safe") {
let mm := mulmod(x_lo, y_lo, 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff)
let mm := mulmod(x_lo, y_lo, not(0x00))
r_lo := mul(x_lo, y_lo)
r_hi := add(add(mul(x_hi, y_lo), mul(x_lo, y_hi)), sub(sub(mm, r_lo), lt(mm, r_lo)))
}
Expand Down Expand Up @@ -563,9 +563,9 @@ library Lib512MathArithmetic {
returns (uint256 r_ex, uint256 r_hi, uint256 r_lo)
{
assembly ("memory-safe") {
let mm0 := mulmod(x_lo, y, 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff)
let mm0 := mulmod(x_lo, y, not(0x00))
r_lo := mul(x_lo, y)
let mm1 := mulmod(x_hi, y, 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff)
let mm1 := mulmod(x_hi, y, not(0x00))
let r_partial := mul(x_hi, y)
r_ex := sub(sub(mm1, r_partial), lt(mm1, r_partial))

Expand Down Expand Up @@ -746,7 +746,7 @@ library Lib512MathArithmetic {

assembly ("memory-safe") {
// inv_hi = inv_lo * tmp / 2**256 % 2**256
let mm := mulmod(inv_lo, tmp_lo, 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff)
let mm := mulmod(inv_lo, tmp_lo, not(0x00))
inv_hi := add(mul(inv_lo, tmp_hi), sub(sub(mm, inv_lo), lt(mm, inv_lo)))
}
}
Expand Down Expand Up @@ -1432,21 +1432,14 @@ library Lib512MathArithmetic {
return omodAlt(r, y, r);
}

/// A single Newton-Raphson step for computing the inverse square root
/// Y_next = ⌊Y · (1.5·2²⁵⁵ - U) ) / 2²⁵⁵⌋
/// U = ⌊M · ⌊Y² / 2²⁵⁵⌋ / 2²⁵⁶⌋
function _iSqrtNrStep(uint256 Y, uint256 M) private pure returns (uint256 Y_next) {
unchecked {
(uint256 Y2_hi, uint256 Y2_lo) = _mul(Y, Y); // [hi lo] = Y·Y
(, uint256 Y2) = _shr256(Y2_hi, Y2_lo, 255); // ⌊/ 2²⁵⁵⌋
(uint256 MY2,) = _mul(M, Y2); // ⌊M·Y2 / 2²⁵⁶⌋
uint256 T = 1.5 * 2 ** 255 - MY2;
(uint256 Y_next_hi, uint256 Y_next_lo) = _mul(Y, T); // [hi lo] = Y·T
(, Y_next) = _shr256(Y_next_hi, Y_next_lo, 255); // ⌊/ 2²⁵⁵⌋
// hi ≈ x · y / 2²⁵⁶ (±1)
function _inaccurateMulHi(uint256 x, uint256 y) private pure returns (uint256 hi) {
assembly ("memory-safe") {
hi := sub(mulmod(x, y, not(0x00)), mul(x, y))
}
}

// gas benchmark 14/09/2025: ~2315 gas
// gas benchmark 2025/09/20: ~1425 gas
function sqrt(uint512 x) internal pure returns (uint256 r) {
(uint256 x_hi, uint256 x_lo) = x.into();

Expand All @@ -1467,66 +1460,116 @@ library Lib512MathArithmetic {
// `e` is half the exponent of `x`
// e = ⌊bitlength(x)/2⌋
// invE = 256 - e
uint256 invE = (x_hi.clz() + 1) >> 1; // range: [0 128]
uint256 invE = (x_hi.clz() + 1) >> 1; // invE ∈ [0, 128]

// Extract mantissa M by shifting x right by 2·e - 255 bits
// `M` is the mantissa of `x`; M ∈ [½, 2)
(, uint256 M) = _shr(x_hi, x_lo, 257 - (invE << 1)); // scale: 255 - 2*e
// `M` is the mantissa of `x` as a Q1.255; M ∈ [½, 2)
(, uint256 M) = _shr(x_hi, x_lo, 257 - (invE << 1)); // scale: 2⁽²⁵⁵⁻²ᵉ⁾

/// Pick an initial estimate (seed) for Y using a lookup table. Even-exponent
/// normalization means our mantissa is geometrically symmetric around 1, leading to 16
/// buckets on the low side and 32 buckets on the high side.
// `Y` approximates the inverse square root of integer `M` as a Q1.255
uint256 Y; // scale: 255 + e (scale relative to M: 382.5)
// `Y` _ultimately_ approximates the inverse square root of fixnum `M` as a
// Q3.253. However, as a gas optimization, the number of fractional bits in `Y` rises
// through the steps, giving an inhomogeneous fixed-point representation. Y ≈∈ [√½, √2]
uint256 Y; // scale: 2⁽²⁵³⁺ᵉ⁾
uint256 Mbucket;
assembly ("memory-safe") {
// Extract the upper 6 bits of `M` to be used as a table index. `M >> 250 < 16` is
// invalid (that would imply M<½), so our lookup table only needs to handle only 16
// through 63.
let i := shr(0xfa, M)
Mbucket := shr(0xfa, M)
// We can't fit 48 seeds into a single word, so we split the table in 2 and use `c`
// to select which table we index.
let c := lt(0x27, i)
let c := lt(0x27, Mbucket)

// Each entry is 10 bits and the entries are ordered from lowest `i` to
// highest. Each seed is 10 significant bits on the MSB end followed by 246 padding
// zero bits. The seed is the value for `Y` for the midpoint of the bucket, rounded
// highest. The seed is the value for `Y` for the midpoint of the bucket, rounded
// to 10 significant bits.
// Each seed is less than ⌊2²⁵⁵·√2⌋. This ensures overflow safety (Y² / 2²⁵⁵ < 2²⁵⁶)
// in the first (and subsequent) N-R step(s).
let table_hi := 0xb26b4a8690a027198e559263e8ce2887e15832047f1f47b5e677dd974dcd
let table_lo := 0x71dc26f1b76c9ad6a5a46819c661946418c621856057e5ed775d1715b96b
let table := xor(table_hi, mul(xor(table_lo, table_hi), c))

// Index the table to obtain the initial seed of `Y`
let shift := add(0x186, mul(0x0a, sub(mul(0x18, c), i)))
Y := shl(0xf6, shr(shift, table))
// Index the table to obtain the initial seed of `Y`.
let shift := add(0x186, mul(0x0a, sub(mul(0x18, c), Mbucket)))
// We begin the Newton-Raphson iterations with `Y` in Q247.9 format.
Y := and(0x3ff, shr(shift, table))

// The worst-case seed for `Y` occurs when `Mbucket = 16`. For monotone quadratic
// convergence, we desire that 1/√3 < Y·√M < √(5/3). At the boundaries (worst case)
// of the `Mbucket = 16` range, we are 0.407351 (41.3680%) from the lower bound and
// 0.275987 (27.1906%) from the higher bound.
}

// Perform 5 Newton-Raphson iterations. 5 is enough iterations for sufficient
// convergence that our final fixup step produces an exact result.
Y = _iSqrtNrStep(Y, M);
Y = _iSqrtNrStep(Y, M);
Y = _iSqrtNrStep(Y, M);
Y = _iSqrtNrStep(Y, M);
if (invE < 79) {
// For small `e` (lower values of `x`), we can skip the 5th, final N-R
// iteration. The correct bits that this iteration would obtain are shifted away
// during the denormalization step. This branch is net gas-optimizing.
Y = _iSqrtNrStep(Y, M);
/// Perform 5 Newton-Raphson iterations. 5 is enough iterations for sufficient
/// convergence that our final fixup step produces an exact result.
// The Newton-Raphson iteration for 1/√M is:
// Y ≈ Y · (3 - M · Y²) / 2
// The implementation of this iteration is deliberately imprecise. No matter how many
// times you run it, you won't converge `Y` on the closest Q3.253 to √M. However, this
// is acceptable because the cleanup step applied after the final call is very tolerant
// of error in the low bits of `Y`.

// `M` is Q1.255
// `Y` is Q247.9
{
uint256 Y2 = Y * Y; // scale: 2¹⁸
// Because `M` is Q1.255, multiplying `Y2` by `M` and taking the high word
// implicitly divides `MY2` by 2. We move the division by 2 inside the subtraction
// from 3 by adjusting the minuend.
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2¹⁸
uint256 T = 1.5 * 2 ** 18 - MY2; // scale: 2¹⁸
Y *= T; // scale: 2²⁷
}
// `Y` is Q229.27
{
uint256 Y2 = Y * Y; // scale: 2⁵⁴
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2⁵⁴
uint256 T = 1.5 * 2 ** 54 - MY2; // scale: 2⁵⁴
Y *= T; // scale: 2⁸¹
}
// `Y` is Q175.81
{
uint256 Y2 = Y * Y; // scale: 2¹⁶²
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2¹⁶²
uint256 T = 1.5 * 2 ** 162 - MY2; // scale: 2¹⁶²
Y = Y * T >> 116; // scale: 2¹²⁷
}
// `Y` is Q129.127
if (invE < 95 - Mbucket) {
// Generally speaking, for relatively smaller `e` (lower values of `x`) and for
// relatively larger `M`, we can skip the 5th N-R iteration. The constant `95` is
// derived by extensive fuzzing. Attempting a higher-order approximation of the
// relationship between `M` and `invE` consumes, on average, more gas. The correct
// bits that this iteration would obtain are shifted away during the denormalization
// step. This branch is net gas-optimizing.
uint256 Y2 = Y * Y; // scale: 2²⁵⁴
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2²⁵⁴
uint256 T = 1.5 * 2 ** 254 - MY2; // scale: 2²⁵⁴
Y = _inaccurateMulHi(Y << 2, T); // scale: 2¹²⁷
}
// `Y` is Q129.127
{
uint256 Y2 = Y * Y; // scale: 2²⁵⁴
uint256 MY2 = _inaccurateMulHi(M, Y2); // scale: 2²⁵⁴
uint256 T = 1.5 * 2 ** 254 - MY2; // scale: 2²⁵⁴
Y = _inaccurateMulHi(Y << 128, T); // scale: 2²⁵³
}
// `Y` is Q3.253

/// When we combine `Y` with `M` to form our approximation of the square root, we have
/// to un-normalize by the half-scale value. This is where even-exponent normalization
/// comes in because the half-scale is integral.
/// Y ≈ 2³⁸³ / √(2·M)
/// M = ⌊x · 2⁽²⁵⁵⁻²ᵉ⁾⌋
/// M·Y ≈ 2³⁸³ · √(M/2)
/// M·Y ≈ 2⁽⁵¹⁰⁻ᵉ⁾ · √x
/// r0 = ⌊M·Y / 2⁽⁵¹⁰⁻ᵉ⁾⌋ ≈ ⌊√x⌋
// We shift right by `510 - e` to account for both the Q1.255 scaling and
// denormalization
(uint256 r0_hi, uint256 r0_lo) = _mul(M, Y);
(, uint256 r0) = _shr(r0_hi, r0_lo, 254 + invE);
/// Y ≈ 2²⁵³ / √(M / 2²⁵⁵)
/// Y ≈ 2³⁸¹ / √(2·M)
/// M·Y ≈ 2³⁸¹ · √(M/2)
/// M·Y ≈ 2⁽⁵⁰⁸⁻ᵉ⁾ · √x
/// r0 ≈ M·Y / 2⁽⁵⁰⁸⁻ᵉ⁾ ≈ ⌊√x⌋
// We shift right by `508 - e` to account for both the Q3.253 scaling and
// denormalization. We don't care about accuracy in the low bits of `r0`, so we can cut
// some corners.
(, uint256 r0) = _shr(_inaccurateMulHi(M, Y), 0, 252 + invE);

/// `r0` is only an approximation of √x, so we perform a single Babylonian step to fully
/// converge on ⌊√x⌋ or ⌈√x⌉. The Babylonian step is:
Expand All @@ -1535,7 +1578,7 @@ library Lib512MathArithmetic {
// because the value the upper word of the quotient can take is highly constrained, we
// can compute the quotient mod 2²⁵⁶ and recover the high word separately. Although
// `_div` does an expensive Newton-Raphson-Hensel modular inversion:
// ⌊x/r0⌋ ≡ x·r0⁻¹ mod 2²⁵⁶ (for odd r0)
// ⌊x/r0⌋ ≡ ⌊x/2ⁿ⌋·⌊r0/2ⁿ⌋⁻¹ mod 2²⁵⁶ (for r0 % 2ⁿ = 0 ∧ r % 2⁽ⁿ⁺¹⁾ = 2ⁿ)
// and we already have a pretty good estimate for r0⁻¹, namely `Y`, refining `Y` into
// the appropriate inverse requires a series of 768-bit multiplications that take more
// gas.
Expand All @@ -1545,7 +1588,7 @@ library Lib512MathArithmetic {
// `oflo` here is either 0 or 1. When `oflo == 1`, `r1 == 0`, and the correct value for
// `r1` is `type(uint256).max`.
(uint256 oflo, uint256 r1) = _shr256(s_hi, s_lo, 1);
r1 -= oflo;
r1 -= oflo; // underflow is desired

/// Because the Babylonian step can give ⌈√x⌉ if x+1 is a perfect square, we have to
/// check whether we've overstepped by 1 and clamp as appropriate. ref:
Expand Down
6 changes: 3 additions & 3 deletions src/vendor/FullMath.sol
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ library FullMath {
// Remainder Theorem to reconstruct the 512 bit result. The result is stored
// in two 256 variables such that product = prod1 * 2**256 + prod0
assembly ("memory-safe") {
let mm := mulmod(a, b, 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff)
let mm := mulmod(a, b, not(0x00))
lo := mul(a, b)
hi := sub(sub(mm, lo), lt(mm, lo))
}
Expand Down Expand Up @@ -229,7 +229,7 @@ library FullMath {

function unsafeMulShift(uint256 a, uint256 b, uint256 s) internal pure returns (uint256 result) {
assembly ("memory-safe") {
let mm := mulmod(a, b, 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff)
let mm := mulmod(a, b, not(0x00))
let prod0 := mul(a, b)
let prod1 := sub(sub(mm, prod0), lt(mm, prod0))
result := or(shr(s, prod0), shl(sub(0x100, s), prod1))
Expand All @@ -238,7 +238,7 @@ library FullMath {

function unsafeMulShiftUp(uint256 a, uint256 b, uint256 s) internal pure returns (uint256 result) {
assembly ("memory-safe") {
let mm := mulmod(a, b, 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff)
let mm := mulmod(a, b, not(0x00))
let prod0 := mul(a, b)
let prod1 := sub(sub(mm, prod0), lt(mm, prod0))
let s_ := sub(0x100, s)
Expand Down
24 changes: 1 addition & 23 deletions test/0.8.25/512Math.t.sol
Original file line number Diff line number Diff line change
Expand Up @@ -219,11 +219,9 @@ contract Lib512MathTest is Test {
assertTrue(r == e);
}

function test512Math_sqrt(uint256 x_hi, uint256 x_lo) public {
function test512Math_sqrt(uint256 x_hi, uint256 x_lo) external pure {
uint512 x = alloc().from(x_hi, x_lo);
vm.startSnapshotGas("sqrt512");
uint256 r = x.sqrt();
vm.stopSnapshotGas();

(uint256 r2_lo, uint256 r2_hi) = SlowMath.fullMul(r, r);
assertTrue((r2_hi < x_hi) || (r2_hi == x_hi && r2_lo <= x_lo), "sqrt too high");
Expand All @@ -240,24 +238,4 @@ contract Lib512MathTest is Test {
assertTrue((r2_hi > x_hi) || (r2_hi == x_hi && r2_lo > x_lo), "sqrt too low");
}
}

function test512Math_sqrt_historicFailure01() external {
return test512Math_sqrt(
114805576419587020236757229903001819680135317278270614039818801407925860184587, 154395211239568
);
}

function test512Math_sqrt_historicFailure02() external {
return test512Math_sqrt(
111622007524949111044014908009097472374689074738034287125887346957447537418677,
1234515505982255149698729514620346575
);
}

function test512Math_sqrt_historicFailure03() external {
return test512Math_sqrt(
0xfdd1a74de33135e737432b6ce327f31ace09b8be4d708a8fb483fad552fc320b,
0xc624b66cc0138b8fabc209247f72d758e1cf3343756d543badbf24212bed8c16
);
}
}