Description
Currently, the cardinality implementation of BitArray walks the buffer in byte chunks, looks up the popcount for each chunk and sums them up to get the cardinality.
There's a BitSet implementation hidden in the ANTLR4 runtime implementation:
It calculates the cardinality by what appears to be a software simulated popcount that has been specialized for lists:
BitArray would probably benefit from taking a similar approach.
(#17 / dart-lang/sdk#52673 would be nice here, too.)
For future reference, here's an implementation of popcount:
int count_bits_64_popcount(
final int value_64_bits,
) {
// TODO try this:
// /// http://resnet.uoregon.edu/~gurney_j/jmpc/bitwise.html
// #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
// #define BX_(x) ((x) - (((x)>>1)&0x77777777) - (((x)>>2)&0x33333333) - (((x)>>3)&0x11111111))
// /// https://graphics.stanford.edu/~seander/bithacks.html
// int _count_bits_parallel_32_bits(
// final int value,
// ) {
// // In binary: 01010101010101010101010101010101
// const alternating_1 = 0x55555555;
// // In binary: 00110011001100110011001100110011
// const alternating_2 = 0x33333333;
// // In binary: 00001111000011110000111100001111
// const alternating_3 = 0x0F0F0F0F;
// // In binary: 00000000111111110000000011111111
// const alternating_4 = 0x00FF00FF;
// // In binary: 00000000000000001111111111111111
// const alternating_5 = 0x0000FFFF;
// final a = value >> 1;
// final b = value - (a & alternating_1);
// final c = ((b >> 2) & alternating_2) + (b & alternating_2);
// final d = ((c >> 4) + c) & alternating_3;
// final e = ((d >> 8) + d) & alternating_4;
// final f = ((e >> 16) + e) & alternating_5;
// return f;
// }
//
// int _manual_count_bits_64(
// final int value_64_bits,
// ) {
// // Shift the high 32 bits down.
// final hi = _count_bits_parallel_32_bits(value_64_bits >> 32);
// // The subroutine is assumed to ignore the bits beyond 32.
// final lo = _count_bits_parallel_32_bits(value_64_bits);
// return hi + lo;
// }
// According to wikipedia, this should be the fastest way to do popcount?
// const m1 = 0x5555555555555555; //binary: 0101...
// const m2 = 0x3333333333333333; //binary: 00110011..
// const m4 = 0x0f0f0f0f0f0f0f0f; //binary: 4 zeros, 4 ones ...
// const m8 = 0x00ff00ff00ff00ff; //binary: 8 zeros, 8 ones ...
// const m16 = 0x0000ffff0000ffff; //binary: 16 zeros, 16 ones ...
// const m32 = 0x00000000ffffffff; //binary: 32 zeros, 32 ones
// const h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3...
//
// int x = value_64_bits;
// x -= (x >> 1) & m1; //put count of each 2 bits into those 2 bits
// x = (x & m2) + ((x >> 2) & m2); //put count of each 4 bits into those 4 bits
// x = (x + (x >> 4)) & m4; //put count of each 8 bits into those 8 bits
// return (x * h01) >> 56; //returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
const String lookup_table = "\x00\x01\x01\x02\x01\x02\x02\x03\x01\x02\x02\x03"
"\x02\x03\x03\x04\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04"
"\x04\x05\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05"
"\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06\x01\x02"
"\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05\x02\x03\x03\x04"
"\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06\x02\x03\x03\x04\x03\x04"
"\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06\x03\x04\x04\x05\x04\x05\x05\x06"
"\x04\x05\x05\x06\x05\x06\x06\x07\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03"
"\x03\x04\x03\x04\x04\x05\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05"
"\x04\x05\x05\x06\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05"
"\x05\x06\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07"
"\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06\x03\x04"
"\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07\x03\x04\x04\x05"
"\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07\x04\x05\x05\x06\x05\x06"
"\x06\x07\x05\x06\x06\x07\x06\x07\x07\x08";
return lookup_table.codeUnitAt(value_64_bits & 0xFF) +
lookup_table.codeUnitAt((value_64_bits >>> 8) & 0xFF) +
lookup_table.codeUnitAt((value_64_bits >>> 16) & 0xFF) +
lookup_table.codeUnitAt((value_64_bits >>> 24) & 0xFF) +
lookup_table.codeUnitAt((value_64_bits >>> 32) & 0xFF) +
lookup_table.codeUnitAt((value_64_bits >>> 40) & 0xFF) +
lookup_table.codeUnitAt((value_64_bits >>> 48) & 0xFF) +
lookup_table.codeUnitAt((value_64_bits >>> 56) & 0xFF);
}
(Some while ago I've found that the String-lookuptable-version appears to be the fastest. The comments contain some other versions. Maybe we could use that to walk a Uint64List view in 64bit chunks? Many possible approaches here.)