@@ -598,16 +598,12 @@ auto write_significand(char* buffer, uint64_t value, bool extra_digit,
598598 buffer += 16 - ((zeroes != 0 ? clz (zeroes) : 64 ) >> 2 );
599599 return buffer;
600600#elif ZMIJ_USE_SSE
601- uint32_t last_digit = value - value_div10 * 10 ;
601+ uint32_t abbccddee = uint32_t (value / 100'000'000 );
602+ uint32_t ffgghhii = uint32_t (value % 100'000'000 );
603+ uint32_t a = abbccddee / 100'000'000 ;
604+ uint32_t bbccddee = abbccddee % 100'000'000 ;
602605
603- // We always write 17 digits into the buffer, but the first one can be zero.
604- // buffer points to the second place in the output buffer to allow for the
605- // insertion of the decimal point, so we can use the first place as scratch.
606- buffer += extra_digit - 1 ;
607- buffer[16 ] = char (last_digit + ' 0' );
608-
609- uint32_t abcdefgh = value_div10 / uint64_t (1e8 );
610- uint32_t ijklmnop = value_div10 % uint64_t (1e8 );
606+ buffer = write_if (buffer, a, extra_digit);
611607
612608 alignas (64 ) static constexpr struct {
613609 static constexpr auto splat64 (uint64_t x) -> uint128 { return {x, x}; }
@@ -658,20 +654,20 @@ auto write_significand(char* buffer, uint64_t value, bool extra_digit,
658654# endif
659655 const __m128i zeros = _mm_load_si128 (ptr (&c->zeros ));
660656
661- // The BCD sequences are based on the ones provided by Xiang JunBo.
662- __m128i x = _mm_set_epi64x (abcdefgh, ijklmnop );
657+ // The BCD sequences are based on ones provided by Xiang JunBo.
658+ __m128i x = _mm_set_epi64x (bbccddee, ffgghhii );
663659 __m128i y = _mm_add_epi64 (
664660 x, _mm_mul_epu32 (neg10k,
665661 _mm_srli_epi64 (_mm_mul_epu32 (x, div10k), div10k_exp)));
666662# if ZMIJ_USE_SSE4_1
667663 // _mm_mullo_epi32 is SSE 4.1
668664 __m128i z = _mm_add_epi64 (
669- y,
670- _mm_mullo_epi32 (neg100, _mm_srli_epi32 (_mm_mulhi_epu16 (y, div100), 3 )));
665+ y, _mm_mullo_epi32 (neg100,
666+ _mm_srli_epi32 (_mm_mulhi_epu16 (y, div100), 3 )));
671667 __m128i big_endian_bcd =
672668 _mm_add_epi16 (z, _mm_mullo_epi16 (neg10, _mm_mulhi_epu16 (z, div10)));
673669 __m128i bcd = _mm_shuffle_epi8 (big_endian_bcd, bswap); // SSSE3
674- # else
670+ # else // !ZMIJ_USE_SSE4_1
675671 __m128i y_div_100 = _mm_srli_epi16 (_mm_mulhi_epu16 (y, div100), 3 );
676672 __m128i y_mod_100 = _mm_sub_epi16 (y, _mm_mullo_epi16 (y_div_100, hundred));
677673 __m128i z = _mm_or_si128 (_mm_slli_epi32 (y_mod_100, 16 ), y_div_100);
@@ -685,16 +681,15 @@ auto write_significand(char* buffer, uint64_t value, bool extra_digit,
685681
686682 // Count leading zeros.
687683 __m128i mask128 = _mm_cmpgt_epi8 (bcd, _mm_setzero_si128 ());
688- uint32_t mask = _mm_movemask_epi8 (mask128);
689- // We don't need a zero-check here: if the mask were zero, either the
690- // significand is zero which is handled elsewhere or the only non-zero digit
691- // is the last digit which we factored off. But in that case the number would
692- // be printed with a different exponent that shifts the last digit into the
693- // first position.
694- auto len = size_t (64 ) - clz (mask); // size_t for native arithmetic
684+ uint64_t mask = _mm_movemask_epi8 (mask128);
685+ # if defined(__LZCNT__) && !defined(ZMIJ_NO_BUILTINS)
686+ auto len = 32 - _lzcnt_u32 (mask);
687+ # else
688+ auto len = 63 - clz ((mask << 1 ) | 1 );
689+ # endif
695690
696691 _mm_storeu_si128 (reinterpret_cast <__m128i*>(buffer), digits);
697- return buffer + (last_digit != 0 ? 17 : len) ;
692+ return buffer + len;
698693#endif // ZMIJ_USE_SSE
699694}
700695
0 commit comments