Skip to content

Commit f54583c

Browse files
committed
Switch back to an earlier SSE version
1 parent 487e366 commit f54583c

File tree

1 file changed

+17
-22
lines changed

1 file changed

+17
-22
lines changed

zmij.cc

Lines changed: 17 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -598,16 +598,12 @@ auto write_significand(char* buffer, uint64_t value, bool extra_digit,
598598
buffer += 16 - ((zeroes != 0 ? clz(zeroes) : 64) >> 2);
599599
return buffer;
600600
#elif ZMIJ_USE_SSE
601-
uint32_t last_digit = value - value_div10 * 10;
601+
uint32_t abbccddee = uint32_t(value / 100'000'000);
602+
uint32_t ffgghhii = uint32_t(value % 100'000'000);
603+
uint32_t a = abbccddee / 100'000'000;
604+
uint32_t bbccddee = abbccddee % 100'000'000;
602605

603-
// We always write 17 digits into the buffer, but the first one can be zero.
604-
// buffer points to the second place in the output buffer to allow for the
605-
// insertion of the decimal point, so we can use the first place as scratch.
606-
buffer += extra_digit - 1;
607-
buffer[16] = char(last_digit + '0');
608-
609-
uint32_t abcdefgh = value_div10 / uint64_t(1e8);
610-
uint32_t ijklmnop = value_div10 % uint64_t(1e8);
606+
buffer = write_if(buffer, a, extra_digit);
611607

612608
alignas(64) static constexpr struct {
613609
static constexpr auto splat64(uint64_t x) -> uint128 { return {x, x}; }
@@ -658,20 +654,20 @@ auto write_significand(char* buffer, uint64_t value, bool extra_digit,
658654
# endif
659655
const __m128i zeros = _mm_load_si128(ptr(&c->zeros));
660656

661-
// The BCD sequences are based on the ones provided by Xiang JunBo.
662-
__m128i x = _mm_set_epi64x(abcdefgh, ijklmnop);
657+
// The BCD sequences are based on ones provided by Xiang JunBo.
658+
__m128i x = _mm_set_epi64x(bbccddee, ffgghhii);
663659
__m128i y = _mm_add_epi64(
664660
x, _mm_mul_epu32(neg10k,
665661
_mm_srli_epi64(_mm_mul_epu32(x, div10k), div10k_exp)));
666662
# if ZMIJ_USE_SSE4_1
667663
// _mm_mullo_epi32 is SSE 4.1
668664
__m128i z = _mm_add_epi64(
669-
y,
670-
_mm_mullo_epi32(neg100, _mm_srli_epi32(_mm_mulhi_epu16(y, div100), 3)));
665+
y, _mm_mullo_epi32(neg100,
666+
_mm_srli_epi32(_mm_mulhi_epu16(y, div100), 3)));
671667
__m128i big_endian_bcd =
672668
_mm_add_epi16(z, _mm_mullo_epi16(neg10, _mm_mulhi_epu16(z, div10)));
673669
__m128i bcd = _mm_shuffle_epi8(big_endian_bcd, bswap); // SSSE3
674-
# else
670+
# else // !ZMIJ_USE_SSE4_1
675671
__m128i y_div_100 = _mm_srli_epi16(_mm_mulhi_epu16(y, div100), 3);
676672
__m128i y_mod_100 = _mm_sub_epi16(y, _mm_mullo_epi16(y_div_100, hundred));
677673
__m128i z = _mm_or_si128(_mm_slli_epi32(y_mod_100, 16), y_div_100);
@@ -685,16 +681,15 @@ auto write_significand(char* buffer, uint64_t value, bool extra_digit,
685681

686682
// Count leading zeros.
687683
__m128i mask128 = _mm_cmpgt_epi8(bcd, _mm_setzero_si128());
688-
uint32_t mask = _mm_movemask_epi8(mask128);
689-
// We don't need a zero-check here: if the mask were zero, either the
690-
// significand is zero which is handled elsewhere or the only non-zero digit
691-
// is the last digit which we factored off. But in that case the number would
692-
// be printed with a different exponent that shifts the last digit into the
693-
// first position.
694-
auto len = size_t(64) - clz(mask); // size_t for native arithmetic
684+
uint64_t mask = _mm_movemask_epi8(mask128);
685+
# if defined(__LZCNT__) && !defined(ZMIJ_NO_BUILTINS)
686+
auto len = 32 - _lzcnt_u32(mask);
687+
# else
688+
auto len = 63 - clz((mask << 1) | 1);
689+
# endif
695690

696691
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), digits);
697-
return buffer + (last_digit != 0 ? 17 : len);
692+
return buffer + len;
698693
#endif // ZMIJ_USE_SSE
699694
}
700695

0 commit comments

Comments
 (0)