Skip to content

bpo-29882: Add _Py_popcount32() function #20518

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
/* Bytes swap functions, reverse order of bytes:
/* Bit and bytes utilities.

Bytes swap functions, reverse order of bytes:

- _Py_bswap16(uint16_t)
- _Py_bswap32(uint32_t)
Expand Down Expand Up @@ -82,6 +84,53 @@ _Py_bswap64(uint64_t word)
}


// Population count: count the number of 1's in 'x'
// (number of bits set to 1), also known as the hamming weight.
//
// Implementation note. CPUID is not used, to test if x86 POPCNT instruction
// can be used, to keep the implementation simple. For example, Visual Studio
// __popcnt() is not used this reason. The clang and GCC builtin function can
// use the x86 POPCNT instruction if the target architecture has SSE4a or
// newer.
static inline int
_Py_popcount32(uint32_t x)
{
#if (defined(__clang__) || defined(__GNUC__))

#if SIZEOF_INT >= 4
Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));
return __builtin_popcount(x);
#else
// The C standard guarantees that unsigned long will always be big enough
// to hold a uint32_t value without losing information.
Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));
return __builtin_popcountl(x);
#endif

#else
// 32-bit SWAR (SIMD Within A Register) popcount

// Binary: 0 1 0 1 ...
const uint32_t M1 = 0x55555555;
// Binary: 00 11 00 11. ..
const uint32_t M2 = 0x33333333;
// Binary: 0000 1111 0000 1111 ...
const uint32_t M4 = 0x0F0F0F0F;
// 256**4 + 256**3 + 256**2 + 256**1
const uint32_t SUM = 0x01010101;

// Put count of each 2 bits into those 2 bits
x = x - ((x >> 1) & M1);
// Put count of each 4 bits into those 4 bits
x = (x & M2) + ((x >> 2) & M2);
// Put count of each 8 bits into those 8 bits
x = (x + (x >> 4)) & M4;
// Sum of the 4 byte counts
return (uint32_t)((uint64_t)x * (uint64_t)SUM) >> 24;
#endif
}


#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 1 addition & 1 deletion Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -1121,7 +1121,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_abstract.h \
$(srcdir)/Include/internal/pycore_accu.h \
$(srcdir)/Include/internal/pycore_atomic.h \
$(srcdir)/Include/internal/pycore_byteswap.h \
$(srcdir)/Include/internal/pycore_bitutils.h \
$(srcdir)/Include/internal/pycore_bytes_methods.h \
$(srcdir)/Include/internal/pycore_call.h \
$(srcdir)/Include/internal/pycore_ceval.h \
Expand Down
2 changes: 1 addition & 1 deletion Modules/_ctypes/cfield.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "Python.h"
#include "pycore_byteswap.h" // _Py_bswap32()
#include "pycore_bitutils.h" // _Py_bswap32()

#include <ffi.h>
#ifdef MS_WIN32
Expand Down
42 changes: 41 additions & 1 deletion Modules/_testinternalcapi.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#define PY_SSIZE_T_CLEAN

#include "Python.h"
#include "pycore_byteswap.h" // _Py_bswap32()
#include "pycore_bitutils.h" // _Py_bswap32()
#include "pycore_initconfig.h" // _Py_GetConfigsAsDict()
#include "pycore_hashtable.h" // _Py_hashtable_new()
#include "pycore_gc.h" // PyGC_Head
Expand Down Expand Up @@ -63,6 +63,45 @@ test_bswap(PyObject *self, PyObject *Py_UNUSED(args))
}


static int
check_popcount(uint32_t x, int expected)
{
// Use volatile to prevent the compiler to optimize out the whole test
volatile uint32_t u = x;
int bits = _Py_popcount32(u);
if (bits != expected) {
PyErr_Format(PyExc_AssertionError,
"_Py_popcount32(%lu) returns %i, expected %i",
(unsigned long)x, bits, expected);
return -1;
}
return 0;
}


static PyObject*
test_popcount(PyObject *self, PyObject *Py_UNUSED(args))
{
#define CHECK(X, RESULT) \
do { \
if (check_popcount(X, RESULT) < 0) { \
return NULL; \
} \
} while (0)

CHECK(0, 0);
CHECK(1, 1);
CHECK(0x08080808, 4);
CHECK(0x10101010, 4);
CHECK(0x10204080, 4);
CHECK(0xDEADCAFE, 22);
CHECK(0xFFFFFFFF, 32);
Py_RETURN_NONE;

#undef CHECK
}


#define TO_PTR(ch) ((void*)(uintptr_t)ch)
#define FROM_PTR(ptr) ((uintptr_t)ptr)
#define VALUE(key) (1 + ((int)(key) - 'a'))
Expand Down Expand Up @@ -157,6 +196,7 @@ static PyMethodDef TestMethods[] = {
{"get_configs", get_configs, METH_NOARGS},
{"get_recursion_depth", get_recursion_depth, METH_NOARGS},
{"test_bswap", test_bswap, METH_NOARGS},
{"test_popcount", test_popcount, METH_NOARGS},
{"test_hashtable", test_hashtable, METH_NOARGS},
{NULL, NULL} /* sentinel */
};
Expand Down
2 changes: 1 addition & 1 deletion Modules/sha256module.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
/* SHA objects */

#include "Python.h"
#include "pycore_byteswap.h" // _Py_bswap32()
#include "pycore_bitutils.h" // _Py_bswap32()
#include "structmember.h" // PyMemberDef
#include "hashlib.h"
#include "pystrhex.h"
Expand Down
2 changes: 1 addition & 1 deletion Modules/sha512module.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
/* SHA objects */

#include "Python.h"
#include "pycore_byteswap.h" // _Py_bswap32()
#include "pycore_bitutils.h" // _Py_bswap32()
#include "structmember.h" // PyMemberDef
#include "hashlib.h"
#include "pystrhex.h"
Expand Down
15 changes: 7 additions & 8 deletions Objects/longobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
/* XXX The functional organization of this file is terrible */

#include "Python.h"
#include "pycore_interp.h" // _PY_NSMALLPOSINTS
#include "pycore_pystate.h" // _Py_IsMainInterpreter()
#include "pycore_bitutils.h" // _Py_popcount32()
#include "pycore_interp.h" // _PY_NSMALLPOSINTS
#include "pycore_pystate.h" // _Py_IsMainInterpreter()
#include "longintrepr.h"

#include <float.h>
Expand Down Expand Up @@ -5307,12 +5308,10 @@ int_bit_length_impl(PyObject *self)
static int
popcount_digit(digit d)
{
/* 32bit SWAR popcount. */
uint32_t u = d;
u -= (u >> 1) & 0x55555555U;
u = (u & 0x33333333U) + ((u >> 2) & 0x33333333U);
u = (u + (u >> 4)) & 0x0f0f0f0fU;
return (uint32_t)(u * 0x01010101U) >> 24;
// digit can be larger than uint32_t, but only PyLong_SHIFT bits
// of it will be ever used.
Py_BUILD_ASSERT(PyLong_SHIFT <= 32);
return _Py_popcount32((uint32_t)d);
}

/*[clinic input]
Expand Down
2 changes: 1 addition & 1 deletion Objects/stringlib/codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# error "codecs.h is specific to Unicode"
#endif

#include "pycore_byteswap.h" // _Py_bswap32()
#include "pycore_bitutils.h" // _Py_bswap32()

/* Mask to quickly check whether a C 'long' contains a
non-ASCII, UTF8-encoded char. */
Expand Down
2 changes: 1 addition & 1 deletion PCbuild/pythoncore.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@
<ClInclude Include="..\Include\internal\pycore_accu.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h" />
<ClInclude Include="..\Include\internal\pycore_byteswap.h" />
<ClInclude Include="..\Include\internal\pycore_bitutils.h" />
<ClInclude Include="..\Include\internal\pycore_call.h" />
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
<ClInclude Include="..\Include\internal\pycore_code.h" />
Expand Down
2 changes: 1 addition & 1 deletion PCbuild/pythoncore.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@
<ClInclude Include="..\Include\internal\pycore_atomic.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_byteswap.h">
<ClInclude Include="..\Include\internal\pycore_bitutils.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_bytes_methods.h">
Expand Down
25 changes: 3 additions & 22 deletions Python/hamt.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "Python.h"

#include "pycore_bitutils.h" // _Py_popcount32
#include "pycore_hamt.h"
#include "pycore_object.h" // _PyObject_GC_TRACK()
#include <stddef.h> // offsetof()
Expand Down Expand Up @@ -433,30 +434,10 @@ hamt_bitpos(int32_t hash, uint32_t shift)
return (uint32_t)1 << hamt_mask(hash, shift);
}

static inline uint32_t
hamt_bitcount(uint32_t i)
{
/* We could use native popcount instruction but that would
require to either add configure flags to enable SSE4.2
support or to detect it dynamically. Otherwise, we have
a risk of CPython not working properly on older hardware.

In practice, there's no observable difference in
performance between using a popcount instruction or the
following fallback code.

The algorithm is copied from:
https://graphics.stanford.edu/~seander/bithacks.html
*/
i = i - ((i >> 1) & 0x55555555);
i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
return (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
}

static inline uint32_t
hamt_bitindex(uint32_t bitmap, uint32_t bit)
{
return hamt_bitcount(bitmap & (bit - 1));
return (uint32_t)_Py_popcount32(bitmap & (bit - 1));
}


Expand Down Expand Up @@ -820,7 +801,7 @@ hamt_node_bitmap_assoc(PyHamtNode_Bitmap *self,
else {
/* There was no key before with the same (shift,hash). */

uint32_t n = hamt_bitcount(self->b_bitmap);
uint32_t n = (uint32_t)_Py_popcount32(self->b_bitmap);

if (n >= 16) {
/* When we have a situation where we want to store more
Expand Down