// __msvc_bit_utils.hpp internal header (core)
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef __MSVC_BIT_UTILS_HPP
#define __MSVC_BIT_UTILS_HPP
#include <yvals_core.h>
#if _STL_COMPILER_PREPROCESSOR
#include <climits>
#include <xtr1common>
#include _STL_INTRIN_HEADER
// TRANSITION, GH-2129, move down to _Arm64_popcount
#if (defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
    && !defined(__INTEL_COMPILER) && !defined(__clang__) // TRANSITION, LLVM-51488
#define _HAS_NEON_INTRINSICS 1
#else // ^^^ intrinsics available / intrinsics unavailable vvv
#define _HAS_NEON_INTRINSICS 0
#endif // ^^^ intrinsics unavailable ^^^
#if _HAS_NEON_INTRINSICS
#include <arm64_neon.h> // TRANSITION, GH-2129
#endif // _HAS_NEON_INTRINSICS
#pragma pack(push, _CRT_PACKING)
#pragma warning(push, _STL_WARNING_LEVEL)
#pragma warning(disable : _STL_DISABLED_WARNINGS)
_STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new
_STD_BEGIN
extern "C" {
extern int __isa_available;
}
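
// Note: __isa_available is initialized by the CRT at startup to the highest ISA level the
// running CPU supports; the constants below mirror the corresponding __ISA_AVAILABLE_* values.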
_INLINE_VAR constexpr int _Stl_isa_available_sse42 = 2; // equal to __ISA_AVAILABLE_SSE42
_INLINE_VAR constexpr int _Stl_isa_available_avx2 = 5; // equal to __ISA_AVAILABLE_AVX2
template <class _UInt>
_INLINE_VAR constexpr int _Unsigned_integer_digits = sizeof(_UInt) * CHAR_BIT;
// Implementation of countl_zero without using specialized CPU instructions.
// Used at compile time and when said instructions are not supported.
// see "Hacker's Delight" section 5-3
template <class _Ty>
_NODISCARD constexpr int _Countl_zero_fallback(_Ty _Val) noexcept {
    _Ty _Yx = 0;

    unsigned int _Nx = _Unsigned_integer_digits<_Ty>;
    unsigned int _Cx = _Unsigned_integer_digits<_Ty> / 2;
    do {
        _Yx = static_cast<_Ty>(_Val >> _Cx);
        if (_Yx != 0) {
            _Nx -= _Cx;
            _Val = _Yx;
        }
        _Cx >>= 1;
    } while (_Cx != 0);
    return static_cast<int>(_Nx) - static_cast<int>(_Val);
}
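
// Example trace for _Countl_zero_fallback: for unsigned char 0x10 == 0001'0000, the first probe
// (_Cx == 4) finds a nonzero upper half, so _Nx drops from 8 to 4 and _Val becomes 1; the probes
// at _Cx == 2 and _Cx == 1 see zero, and the result is 4 - 1 == 3 leading zeros.
static_assert(_Countl_zero_fallback(static_cast<unsigned char>(0x10)) == 3, "see the trace above");
static_assert(_Countl_zero_fallback(static_cast<unsigned char>(0x00)) == 8, "a zero input yields all digits");
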
#if !defined(_M_CEE_PURE) && !defined(__CUDACC__) && !defined(__INTEL_COMPILER)
#define _HAS_COUNTL_ZERO_INTRINSICS 1
#else // ^^^ intrinsics available / intrinsics unavailable vvv
#define _HAS_COUNTL_ZERO_INTRINSICS 0
#endif // ^^^ intrinsics unavailable ^^^
#if _HAS_COUNTL_ZERO_INTRINSICS
#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
template <class _Ty>
_NODISCARD int _Countl_zero_lzcnt(const _Ty _Val) noexcept {
    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
    if constexpr (_Digits <= 16) {
        return static_cast<int>(__lzcnt16(_Val) - (16 - _Digits));
    } else if constexpr (_Digits == 32) {
        return static_cast<int>(__lzcnt(_Val));
    } else {
#ifdef _M_IX86
        const unsigned int _High = _Val >> 32;
        const auto _Low          = static_cast<unsigned int>(_Val);
        if (_High == 0) {
            return 32 + _Countl_zero_lzcnt(_Low);
        } else {
            return _Countl_zero_lzcnt(_High);
        }
#else // ^^^ defined(_M_IX86) / !defined(_M_IX86) vvv
        return static_cast<int>(__lzcnt64(_Val));
#endif // ^^^ !defined(_M_IX86) ^^^
    }
}
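
// Example for _Countl_zero_lzcnt: for unsigned char 0x10, __lzcnt16 operates on 0x0010 and
// reports 11 leading zeros; subtracting the (16 - 8) bits that the narrow type doesn't have
// yields the expected 3.
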
template <class _Ty>
_NODISCARD int _Countl_zero_bsr(const _Ty _Val) noexcept {
    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;

    unsigned long _Result;
    if constexpr (_Digits <= 32) {
        if (!_BitScanReverse(&_Result, _Val)) {
            return _Digits;
        }
    } else {
#ifdef _M_IX86
        const unsigned int _High = _Val >> 32;
        if (_BitScanReverse(&_Result, _High)) {
            return static_cast<int>(31 - _Result);
        }

        const auto _Low = static_cast<unsigned int>(_Val);
        if (!_BitScanReverse(&_Result, _Low)) {
            return _Digits;
        }
#else // ^^^ defined(_M_IX86) / !defined(_M_IX86) vvv
        if (!_BitScanReverse64(&_Result, _Val)) {
            return _Digits;
        }
#endif // ^^^ !defined(_M_IX86) ^^^
    }

    return static_cast<int>(_Digits - 1 - _Result);
}
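
// Example for _Countl_zero_bsr: for unsigned int 0x10, _BitScanReverse stores 4 (the index of
// the highest set bit), and 32 - 1 - 4 == 27 leading zeros; a zero input fails the scan and
// returns _Digits directly, a case lzcnt would have handled without a branch.
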
template <class _Ty>
_NODISCARD int _Checked_x86_x64_countl_zero(const _Ty _Val) noexcept {
#ifdef __AVX2__
    return _Countl_zero_lzcnt(_Val);
#else // ^^^ defined(__AVX2__) / !defined(__AVX2__) vvv
    const bool _Definitely_have_lzcnt = __isa_available >= _Stl_isa_available_avx2;
    if (_Definitely_have_lzcnt) {
        return _Countl_zero_lzcnt(_Val);
    } else {
        return _Countl_zero_bsr(_Val);
    }
#endif // ^^^ !defined(__AVX2__) ^^^
}
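
// (The AVX2 ISA level is used as the gate on the assumption that CPUs reporting it also
// support lzcnt, while bsr is available on every x86/x64 CPU.)
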
#endif // defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
#if defined(_M_ARM) || defined(_M_ARM64)
#ifdef __clang__ // TRANSITION, GH-1586
_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned short _Val) {
    return __builtin_clzs(_Val);
}

_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned int _Val) {
    return __builtin_clz(_Val);
}

_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned long _Val) {
    return __builtin_clzl(_Val);
}

_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned long long _Val) {
    return __builtin_clzll(_Val);
}
#endif // TRANSITION, GH-1586
template <class _Ty>
_NODISCARD int _Checked_arm_arm64_countl_zero(const _Ty _Val) noexcept {
    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
    if (_Val == 0) {
        return _Digits;
    }

#ifdef __clang__ // TRANSITION, GH-1586
    if constexpr (is_same_v<remove_cv_t<_Ty>, unsigned char>) {
        return _Clang_arm_arm64_countl_zero(static_cast<unsigned short>(_Val))
             - (_Unsigned_integer_digits<unsigned short> - _Digits);
    } else {
        return _Clang_arm_arm64_countl_zero(_Val);
    }
#else // ^^^ workaround / no workaround vvv
    if constexpr (_Digits <= 32) {
        return static_cast<int>(_CountLeadingZeros(_Val)) - (_Unsigned_integer_digits<unsigned long> - _Digits);
    } else {
        return static_cast<int>(_CountLeadingZeros64(_Val));
    }
#endif // TRANSITION, GH-1586
}
#endif // defined(_M_ARM) || defined(_M_ARM64)
#endif // _HAS_COUNTL_ZERO_INTRINSICS
// Implementation of countr_zero without using specialized CPU instructions.
// Used at compile time and when said instructions are not supported.
// see "Hacker's Delight" section 5-4
template <class _Ty>
_NODISCARD constexpr int _Countr_zero_fallback(const _Ty _Val) noexcept {
    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
    return _Digits - _Countl_zero_fallback(static_cast<_Ty>(static_cast<_Ty>(~_Val) & static_cast<_Ty>(_Val - 1)));
}
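
// Example trace for _Countr_zero_fallback: for unsigned char 0x18 == 0001'1000,
// ~_Val & (_Val - 1) == 1110'0111 & 0001'0111 == 0000'0111, whose 5 leading zeros give
// 8 - 5 == 3 trailing zeros.
static_assert(_Countr_zero_fallback(static_cast<unsigned char>(0x18)) == 3, "see the trace above");
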
// Implementation of popcount without using specialized CPU instructions.
// Used at compile time and when said instructions are not supported.
template <class _Ty>
_NODISCARD constexpr int _Popcount_fallback(_Ty _Val) noexcept {
    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
#if defined(_M_IX86) || defined(_M_ARM)
    if constexpr (_Digits == 64) {
        // 64-bit bit operations on architectures without 64-bit registers are less efficient,
        // hence we split the value so that it fits in 32-bit registers
        return _Popcount_fallback(static_cast<unsigned long>(_Val))
             + _Popcount_fallback(static_cast<unsigned long>(_Val >> 32));
    }
#endif // defined(_M_IX86) || defined(_M_ARM)
    // we static_cast these bit patterns in order to truncate them to the correct size
    _Val = static_cast<_Ty>(_Val - ((_Val >> 1) & static_cast<_Ty>(0x5555'5555'5555'5555ull)));
    _Val = static_cast<_Ty>((_Val & static_cast<_Ty>(0x3333'3333'3333'3333ull))
                            + ((_Val >> 2) & static_cast<_Ty>(0x3333'3333'3333'3333ull)));
    _Val = static_cast<_Ty>((_Val + (_Val >> 4)) & static_cast<_Ty>(0x0F0F'0F0F'0F0F'0F0Full));
    // Multiply by one in each byte, so that it will have the sum of all source bytes in the highest byte
    _Val = static_cast<_Ty>(_Val * static_cast<_Ty>(0x0101'0101'0101'0101ull));
    // Extract highest byte
    return static_cast<int>(_Val >> (_Digits - 8));
}
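
// Example trace for _Popcount_fallback: for unsigned char 0xF0, the three reduction steps
// produce 0xA0 (per-pair counts), 0x40 (per-nibble counts), and 0x04 (per-byte count); the
// multiply is a no-op for 8-bit types and _Digits - 8 == 0, so the answer 4 falls out directly.
static_assert(_Popcount_fallback(static_cast<unsigned char>(0xF0)) == 4, "see the trace above");
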
#if (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
    && !defined(__INTEL_COMPILER)
#define _HAS_TZCNT_BSF_INTRINSICS 1
#else // ^^^ intrinsics available / intrinsics unavailable vvv
#define _HAS_TZCNT_BSF_INTRINSICS 0
#endif // ^^^ intrinsics unavailable ^^^
#if _HAS_TZCNT_BSF_INTRINSICS
#ifdef __clang__
#define _TZCNT_U32 __builtin_ia32_tzcnt_u32
#define _TZCNT_U64 __builtin_ia32_tzcnt_u64
#else // ^^^ __clang__ / !__clang__ vvv
#define _TZCNT_U32 _tzcnt_u32
#define _TZCNT_U64 _tzcnt_u64
#endif // __clang__
template <class _Ty>
_NODISCARD int _Countr_zero_tzcnt(const _Ty _Val) noexcept {
    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
    constexpr _Ty _Max    = static_cast<_Ty>(-1); // equal to (numeric_limits<_Ty>::max)()

    if constexpr (_Digits <= 32) {
        // Intended widening to int. This operation means that a narrow 0 will widen
        // to 0xFFFF....FFFF0... instead of 0. We need this to avoid counting all the zeros
        // of the wider type.
        return static_cast<int>(_TZCNT_U32(static_cast<unsigned int>(~_Max | _Val)));
    } else {
#ifdef _M_IX86
        const auto _Low = static_cast<unsigned int>(_Val);
        if (_Low == 0) {
            const unsigned int _High = _Val >> 32;
            return static_cast<int>(32 + _TZCNT_U32(_High));
        } else {
            return static_cast<int>(_TZCNT_U32(_Low));
        }
#else // ^^^ defined(_M_IX86) / !defined(_M_IX86) vvv
        return static_cast<int>(_TZCNT_U64(_Val));
#endif // ^^^ !defined(_M_IX86) ^^^
    }
}
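
// Example of the widening trick in _Countr_zero_tzcnt: for unsigned char 0, ~_Max | _Val
// promotes to the int value 0xFFFF'FF00, so _TZCNT_U32 counts exactly the 8 zeros belonging
// to the narrow type rather than all 32 of the wider register.
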
#undef _TZCNT_U32
#undef _TZCNT_U64
template <class _Ty>
_NODISCARD int _Countr_zero_bsf(const _Ty _Val) noexcept {
    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
    constexpr _Ty _Max    = static_cast<_Ty>(-1); // equal to (numeric_limits<_Ty>::max)()

    unsigned long _Result;
    if constexpr (_Digits <= 32) {
        // Intended widening to int. This operation means that a narrow 0 will widen
        // to 0xFFFF....FFFF0... instead of 0. We need this to avoid counting all the zeros
        // of the wider type.
        if (!_BitScanForward(&_Result, static_cast<unsigned int>(~_Max | _Val))) {
            return _Digits;
        }
    } else {
#ifdef _M_IX86
        const auto _Low = static_cast<unsigned int>(_Val);
        if (_BitScanForward(&_Result, _Low)) {
            return static_cast<int>(_Result);
        }

        const unsigned int _High = _Val >> 32;
        if (!_BitScanForward(&_Result, _High)) {
            return _Digits;
        } else {
            return static_cast<int>(_Result + 32);
        }
#else // ^^^ defined(_M_IX86) / !defined(_M_IX86) vvv
        if (!_BitScanForward64(&_Result, _Val)) {
            return _Digits;
        }
#endif // ^^^ !defined(_M_IX86) ^^^
    }

    return static_cast<int>(_Result);
}
template <class _Ty>
_NODISCARD int _Checked_x86_x64_countr_zero(const _Ty _Val) noexcept {
#ifdef __AVX2__
    return _Countr_zero_tzcnt(_Val);
#else // ^^^ defined(__AVX2__) / !defined(__AVX2__) vvv
    const bool _Definitely_have_tzcnt = __isa_available >= _Stl_isa_available_avx2;
    if (_Definitely_have_tzcnt) {
        return _Countr_zero_tzcnt(_Val);
    } else {
        return _Countr_zero_bsf(_Val);
    }
#endif // ^^^ !defined(__AVX2__) ^^^
}
#endif // _HAS_TZCNT_BSF_INTRINSICS
#if (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
    && !defined(__INTEL_COMPILER)
#define _HAS_POPCNT_INTRINSICS 1
#else // ^^^ intrinsics available / intrinsics unavailable vvv
#define _HAS_POPCNT_INTRINSICS 0
#endif // ^^^ intrinsics unavailable ^^^
#if _HAS_POPCNT_INTRINSICS
template <class _Ty>
_NODISCARD int _Unchecked_x86_x64_popcount(const _Ty _Val) noexcept {
    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
    if constexpr (_Digits <= 16) {
        return static_cast<int>(__popcnt16(_Val));
    } else if constexpr (_Digits == 32) {
        return static_cast<int>(__popcnt(_Val));
    } else {
#ifdef _M_IX86
        return static_cast<int>(__popcnt(_Val >> 32) + __popcnt(static_cast<unsigned int>(_Val)));
#else // ^^^ defined(_M_IX86) / !defined(_M_IX86) vvv
        return static_cast<int>(__popcnt64(_Val));
#endif // ^^^ !defined(_M_IX86) ^^^
    }
}
template <class _Ty>
_NODISCARD int _Checked_x86_x64_popcount(const _Ty _Val) noexcept {
#ifndef __AVX__
    const bool _Definitely_have_popcnt = __isa_available >= _Stl_isa_available_sse42;
    if (!_Definitely_have_popcnt) {
        return _Popcount_fallback(_Val);
    }
#endif // !defined(__AVX__)

    return _Unchecked_x86_x64_popcount(_Val);
}
#endif // _HAS_POPCNT_INTRINSICS
#if _HAS_NEON_INTRINSICS
_NODISCARD inline int _Arm64_popcount(const unsigned long long _Val) noexcept {
    const __n64 _Temp = neon_cnt(__uint64ToN64_v(_Val));
    return neon_addv8(_Temp).n8_i8[0];
}
#endif // _HAS_NEON_INTRINSICS
template <class _Ty>
_INLINE_VAR constexpr bool _Is_standard_unsigned_integer =
    _Is_any_of_v<remove_cv_t<_Ty>, unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long>;
template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> = 0>
_NODISCARD _CONSTEXPR20 int _Countr_zero(const _Ty _Val) noexcept {
#if _HAS_TZCNT_BSF_INTRINSICS
#if _HAS_CXX20
    if (!_STD is_constant_evaluated())
#endif // _HAS_CXX20
    {
        return _Checked_x86_x64_countr_zero(_Val);
    }
#endif // _HAS_TZCNT_BSF_INTRINSICS

    return _Countr_zero_fallback(_Val);
}
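
// Usage note for _Countr_zero: _Countr_zero(0x18u) == 3. The intrinsic path serves runtime
// calls where available; under C++20, is_constant_evaluated() steers constant evaluation to
// the constexpr fallback.
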
template <class _Ty, class _Fn>
constexpr decltype(auto) _Select_countr_zero_impl(_Fn _Callback) {
    // TRANSITION, DevCom-1527995: Lambdas in this function ensure inlining
#if _HAS_TZCNT_BSF_INTRINSICS && _HAS_CXX20
    if (!_STD is_constant_evaluated()) {
#ifdef __AVX2__
        return _Callback([](_Ty _Val) { return _Countr_zero_tzcnt(_Val); });
#else // ^^^ AVX2 / not AVX2 vvv
        const bool _Definitely_have_tzcnt = __isa_available >= _Stl_isa_available_avx2;
        if (_Definitely_have_tzcnt) {
            return _Callback([](_Ty _Val) { return _Countr_zero_tzcnt(_Val); });
        } else {
            return _Callback([](_Ty _Val) { return _Countr_zero_bsf(_Val); });
        }
#endif // ^^^ not AVX2 ^^^
    }
#endif // ^^^ _HAS_TZCNT_BSF_INTRINSICS && _HAS_CXX20 ^^^
    // C++17 constexpr gcd() calls this function, so it should be constexpr unless we detect runtime evaluation.
    return _Callback([](_Ty _Val) { return _Countr_zero_fallback(_Val); });
}
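
// Sketch of the intended call shape for _Select_countr_zero_impl (the caller below is
// hypothetical, for illustration): the ISA dispatch runs once, and the chosen strategy is
// invoked repeatedly through the callback, where it can inline:
//   _Select_countr_zero_impl<unsigned int>([&](auto _Countr_zero_fn) {
//       for (const unsigned int _Elem : _Values) { // _Values: some range of unsigned ints
//           _Process(_Countr_zero_fn(_Elem)); // _Process: whatever consumes the counts
//       }
//   });
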
template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> = 0>
_NODISCARD _CONSTEXPR20 int _Popcount(const _Ty _Val) noexcept {
#if _HAS_POPCNT_INTRINSICS || _HAS_NEON_INTRINSICS
#if _HAS_CXX20
    if (!_STD is_constant_evaluated())
#endif // _HAS_CXX20
    {
#if _HAS_POPCNT_INTRINSICS
        return _Checked_x86_x64_popcount(_Val);
#elif _HAS_NEON_INTRINSICS // ^^^ x86/x64 intrinsics available / ARM64 intrinsics available vvv
        return _Arm64_popcount(_Val);
#endif // ^^^ ARM64 intrinsics available ^^^
    }
#endif // ^^^ any intrinsics available ^^^

    return _Popcount_fallback(_Val);
}
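
// Usage note for _Popcount: _Popcount(0xF0u) == 4. Dispatch mirrors _Countr_zero: x86/x64
// POPCNT or ARM64 NEON at runtime when available, the portable fallback otherwise and in
// constant evaluation.
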
template <class _Ty, class _Fn>
_CONSTEXPR20 decltype(auto) _Select_popcount_impl(_Fn _Callback) {
    // TRANSITION, DevCom-1527995: Lambdas in this function ensure inlining
#if _HAS_POPCNT_INTRINSICS || _HAS_NEON_INTRINSICS
#if _HAS_CXX20
    if (!_STD is_constant_evaluated())
#endif // _HAS_CXX20
    {
#if _HAS_POPCNT_INTRINSICS
#ifndef __AVX__
        const bool _Definitely_have_popcnt = __isa_available >= _Stl_isa_available_sse42;
        if (!_Definitely_have_popcnt) {
            return _Callback([](_Ty _Val) { return _Popcount_fallback(_Val); });
        }
#endif // !defined(__AVX__)

        return _Callback([](_Ty _Val) { return _Unchecked_x86_x64_popcount(_Val); });
#elif _HAS_NEON_INTRINSICS // ^^^ x86/x64 intrinsics available / ARM64 intrinsics available vvv
        return _Callback([](_Ty _Val) { return _Arm64_popcount(_Val); });
#endif // ^^^ ARM64 intrinsics available ^^^
    }
#endif // ^^^ any intrinsics available ^^^

    return _Callback([](_Ty _Val) { return _Popcount_fallback(_Val); });
}
#undef _HAS_POPCNT_INTRINSICS
#undef _HAS_TZCNT_BSF_INTRINSICS
_STD_END
#undef _HAS_NEON_INTRINSICS
#pragma pop_macro("new")
_STL_RESTORE_CLANG_WARNINGS
#pragma warning(pop)
#pragma pack(pop)
#endif // _STL_COMPILER_PREPROCESSOR
#endif // __MSVC_BIT_UTILS_HPP