Skip to content

Implement count with SSE 4.2 and AVX2 #202

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 15, 2021
2 changes: 2 additions & 0 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import Foreign
import System.Random

import BenchBoundsCheckFusion
import BenchCount
import BenchCSV
import BenchIndices

Expand Down Expand Up @@ -437,6 +438,7 @@ main = do
, bench "map (+1) small" $ nf (S.map (+ 1)) smallTraversalInput
]
, benchBoundsCheckFusion
, benchCount
, benchCSV
, benchIndices
]
29 changes: 29 additions & 0 deletions bench/BenchCount.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- |
-- Copyright : (c) 2021 Georg Rudoy
-- License : BSD3-style (see LICENSE)
--
-- Maintainer : Georg Rudoy <0xd34df00d+github@gmail.com>
--
-- Benchmark count

module BenchCount (benchCount) where

import Test.Tasty.Bench
import qualified Data.ByteString.Char8 as B

-- | Benchmarks for 'B.count' across inputs of varying length and match
-- density, exercising both the scalar path (short strings) and the
-- SIMD path (strings above the 1024-byte cutoff).
benchCount :: Benchmark
benchCount = bgroup "Count"
    [ bgroup "no matches, same char" $ benchesOver (1 : standardSizes) (\n -> B.replicate n 'b')
    , bgroup "no matches, different chars" $ benchesOver standardSizes (cyclic 10 'b')
    , bgroup "some matches, alternating" $ benchesOver standardSizes (cyclic 2 'a')
    , bgroup "some matches, short cycle" $ benchesOver standardSizes (cyclic 5 'a')
    , bgroup "some matches, long cycle" $ benchesOver standardSizes (cyclic 10 'a')
    , bgroup "all matches" $ benchesOver (1 : standardSizes) (\n -> B.replicate n 'a')
    ]
  where
    -- Something above the SIMD switch threshold of 1024 that is
    -- divisible by every cycle length used above.
    justAboveSimdCutoff = 1030
    standardSizes = [10, 100, 1000, justAboveSimdCutoff, 10000, 100000, 1000000]
    -- One benchmark per requested size; the counted needle is always 'a'.
    benchesOver sizes mkInput =
      map (\n -> bench (show n ++ " chars long") $ nf (B.count 'a') (mkInput n)) sizes
    -- A string of (roughly) the given size built by repeating a cycle of
    -- @cycleLen@ consecutive characters starting at @from@.
    cyclic cycleLen from n =
      B.concat $ replicate (n `div` cycleLen) $ B.pack (take cycleLen [from ..])
2 changes: 2 additions & 0 deletions bytestring.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ library

c-sources: cbits/fpstring.c
cbits/itoa.c
cc-options: -std=c11
include-dirs: include
includes: fpstring.h
install-includes: fpstring.h
Expand Down Expand Up @@ -167,6 +168,7 @@ test-suite test-builder
benchmark bytestring-bench
main-is: BenchAll.hs
other-modules: BenchBoundsCheckFusion
BenchCount
BenchCSV
BenchIndices
type: exitcode-stdio-1.0
Expand Down
196 changes: 187 additions & 9 deletions cbits/fpstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,15 @@

#include "fpstring.h"
#if defined(__x86_64__)
#include <emmintrin.h>
#include <xmmintrin.h>
#include <x86intrin.h>
#include <cpuid.h>
#endif

#include <stdint.h>
#include <stdbool.h>

#ifndef __STDC_NO_ATOMICS__
#include <stdatomic.h>
#endif

/* copy a string in reverse */
Expand Down Expand Up @@ -90,19 +97,190 @@ unsigned char fps_minimum(unsigned char *p, size_t len) {
return c;
}

/* qsort(3) comparator: orders bytes by their unsigned value.
   The subtraction cannot overflow since both operands are in [0, 255]. */
int fps_compare(const void *a, const void *b) {
    return (int)*(unsigned char*)a - (int)*(unsigned char*)b;
}

/* sort the bytes of p[0..len) in ascending order, in place */
void fps_sort(unsigned char *p, size_t len) {
    /* Previously written as `return qsort(...)`: a return statement with
       an expression in a void function is a C11 constraint violation
       (6.8.6.4), and this file is built with -std=c11. Call it plainly. */
    qsort(p, len, 1, fps_compare);
}

/* count the number of occurrences of a char in a string */
size_t fps_count_naive(unsigned char *str, size_t len, unsigned char w) {
    size_t matches = 0;
    for (size_t i = 0; i < len; ++i) {
        if (str[i] == w)
            ++matches;
    }
    return matches;
}

int fps_compare(const void *a, const void *b) {
return (int)*(unsigned char*)a - (int)*(unsigned char*)b;
#if defined(__x86_64__) && (__GNUC__ >= 6 || defined(__clang_major__)) && !defined(__STDC_NO_ATOMICS__)
#define USE_SIMD_COUNT
#endif

#ifdef USE_SIMD_COUNT
__attribute__((target("sse4.2")))
/* count occurrences of w in str[0..len) using SSE 4.2 string compares,
   processing 128 aligned bytes per iteration of the main loop */
size_t fps_count_cmpestrm(unsigned char *str, size_t len, unsigned char w) {
    const __m128i pat = _mm_set1_epi8(w);

    size_t res = 0;

    size_t i = 0;

    /* Scalar prologue: advance i to a 64-byte boundary so that the
       aligned loads below are valid. */
    for (; i < len && (intptr_t)(str + i) % 64; ++i) {
        res += str[i] == w;
    }

    /* FIX: `len - 128` underflows (size_t wraps to a huge value) when
       len < 128, which would send the loop far past the buffer. The
       dispatcher only calls us for len > 1024, but keep the function
       memory-safe on its own. */
    if (len >= 128) {
        for (size_t end = len - 128; i < end; i += 128) {
            __m128i p0 = _mm_load_si128((const __m128i*)(str + i + 16 * 0));
            __m128i p1 = _mm_load_si128((const __m128i*)(str + i + 16 * 1));
            __m128i p2 = _mm_load_si128((const __m128i*)(str + i + 16 * 2));
            __m128i p3 = _mm_load_si128((const __m128i*)(str + i + 16 * 3));
            __m128i p4 = _mm_load_si128((const __m128i*)(str + i + 16 * 4));
            __m128i p5 = _mm_load_si128((const __m128i*)(str + i + 16 * 5));
            __m128i p6 = _mm_load_si128((const __m128i*)(str + i + 16 * 6));
            __m128i p7 = _mm_load_si128((const __m128i*)(str + i + 16 * 7));
            // Here, cmpestrm compares two strings in the following mode:
            // * _SIDD_SBYTE_OPS: interprets the strings as consisting of 8-bit chars,
            // * _SIDD_CMP_EQUAL_EACH: computes a bitmask of the `i`s
            //   for which `p[i]`, a part of `str`, is equal to `pat[i]`
            //   (the latter being always equal to `w`).
            //
            // q.v. https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrm&expand=835
#define MODE _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_EACH
            __m128i r0 = _mm_cmpestrm(p0, 16, pat, 16, MODE);
            __m128i r1 = _mm_cmpestrm(p1, 16, pat, 16, MODE);
            __m128i r2 = _mm_cmpestrm(p2, 16, pat, 16, MODE);
            __m128i r3 = _mm_cmpestrm(p3, 16, pat, 16, MODE);
            __m128i r4 = _mm_cmpestrm(p4, 16, pat, 16, MODE);
            __m128i r5 = _mm_cmpestrm(p5, 16, pat, 16, MODE);
            __m128i r6 = _mm_cmpestrm(p6, 16, pat, 16, MODE);
            __m128i r7 = _mm_cmpestrm(p7, 16, pat, 16, MODE);
#undef MODE
            /* Each result holds a 16-bit match mask in its low lane;
               popcnt of that mask is the number of matching bytes. */
            res += _popcnt64(_mm_extract_epi64(r0, 0));
            res += _popcnt64(_mm_extract_epi64(r1, 0));
            res += _popcnt64(_mm_extract_epi64(r2, 0));
            res += _popcnt64(_mm_extract_epi64(r3, 0));
            res += _popcnt64(_mm_extract_epi64(r4, 0));
            res += _popcnt64(_mm_extract_epi64(r5, 0));
            res += _popcnt64(_mm_extract_epi64(r6, 0));
            res += _popcnt64(_mm_extract_epi64(r7, 0));
        }
    }

    /* Scalar epilogue for the remaining tail bytes. */
    for (; i < len; ++i) {
        res += str[i] == w;
    }

    return res;
}

void fps_sort(unsigned char *p, size_t len) {
return qsort(p, len, 1, fps_compare);
__attribute__((target("avx2")))
/* count occurrences of w in str[0..len) using AVX2 byte compares,
   processing 128 aligned bytes per iteration of the main loop */
size_t fps_count_avx2(unsigned char *str, size_t len, unsigned char w) {
    __m256i pat = _mm256_set1_epi8(w);

    /* The scalar prefix count is kept separate from `res` because `res`
       is divided by 8 below; only SIMD-derived counts belong in it. */
    size_t prefix = 0, res = 0;

    size_t i = 0;

    /* Scalar prologue: advance i to a 64-byte boundary so that the
       aligned loads below are valid. */
    for (; i < len && (intptr_t)(str + i) % 64; ++i) {
        prefix += str[i] == w;
    }

    /* FIX: `len - 128` underflows (size_t wraps to a huge value) when
       len < 128, which would send the loop far past the buffer. The
       dispatcher only calls us for len > 1024, but keep the function
       memory-safe on its own. */
    if (len >= 128) {
        for (size_t end = len - 128; i < end; i += 128) {
            __m256i p0 = _mm256_load_si256((const __m256i*)(str + i + 32 * 0));
            __m256i p1 = _mm256_load_si256((const __m256i*)(str + i + 32 * 1));
            __m256i p2 = _mm256_load_si256((const __m256i*)(str + i + 32 * 2));
            __m256i p3 = _mm256_load_si256((const __m256i*)(str + i + 32 * 3));
            __m256i r0 = _mm256_cmpeq_epi8(p0, pat);
            __m256i r1 = _mm256_cmpeq_epi8(p1, pat);
            __m256i r2 = _mm256_cmpeq_epi8(p2, pat);
            __m256i r3 = _mm256_cmpeq_epi8(p3, pat);
            res += _popcnt64(_mm256_extract_epi64(r0, 0));
            res += _popcnt64(_mm256_extract_epi64(r0, 1));
            res += _popcnt64(_mm256_extract_epi64(r0, 2));
            res += _popcnt64(_mm256_extract_epi64(r0, 3));
            res += _popcnt64(_mm256_extract_epi64(r1, 0));
            res += _popcnt64(_mm256_extract_epi64(r1, 1));
            res += _popcnt64(_mm256_extract_epi64(r1, 2));
            res += _popcnt64(_mm256_extract_epi64(r1, 3));
            res += _popcnt64(_mm256_extract_epi64(r2, 0));
            res += _popcnt64(_mm256_extract_epi64(r2, 1));
            res += _popcnt64(_mm256_extract_epi64(r2, 2));
            res += _popcnt64(_mm256_extract_epi64(r2, 3));
            res += _popcnt64(_mm256_extract_epi64(r3, 0));
            res += _popcnt64(_mm256_extract_epi64(r3, 1));
            res += _popcnt64(_mm256_extract_epi64(r3, 2));
            res += _popcnt64(_mm256_extract_epi64(r3, 3));
        }
    }

    // _mm256_cmpeq_epi8(p, pat) returns a SIMD vector
    // with `i`th byte consisting of eight `1`s if `p[i] == pat[i]`,
    // and of eight `0`s otherwise,
    // hence each matching byte is counted 8 times by popcnt.
    // Dividing by 8 corrects for that.
    res /= 8;
    res += prefix;

    /* Scalar epilogue for the remaining tail bytes. */
    for (; i < len; ++i) {
        res += str[i] == w;
    }

    return res;
}

/* Signature shared by all fps_count implementations. */
typedef size_t (*fps_impl_t) (unsigned char*, size_t, unsigned char);

/* Pick the fastest available fps_count implementation via CPUID.
   Returns a function pointer; never NULL. */
fps_impl_t select_fps_simd_impl() {
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;

/* ECX of CPUID leaf 1 carries the XSAVE/POPCNT/SSE4.2 feature bits. */
uint32_t ecx1 = 0;
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
ecx1 = ecx;
}

/* Leaf 1 ECX bit 26 = XSAVE, bit 23 = POPCNT.
   NOTE(review): this tests the CPU's XSAVE capability, not that the OS
   has actually enabled YMM state saving (leaf 1 ECX bit 27 OSXSAVE plus
   an xgetbv check of XCR0) — confirm whether that stricter check is
   needed before returning the AVX2 implementation. */
const bool has_xsave = ecx1 & (1 << 26);
const bool has_popcnt = ecx1 & (1 << 23);

/* Leaf 7 subleaf 0, EBX bit 5 = AVX2. */
if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
const bool has_avx2 = has_xsave && (ebx & (1 << 5));
if (has_avx2 && has_popcnt) {
return &fps_count_avx2;
}
}

/* Leaf 1 ECX bit 19 = SSE 4.2 (needed for cmpestrm). */
const bool has_sse42 = ecx1 & (1 << 19);
if (has_sse42 && has_popcnt) {
return &fps_count_cmpestrm;
}

/* Portable fallback when no suitable SIMD support is present. */
return &fps_count_naive;
}
#endif



/* count the number of occurrences of a char in a string, dispatching
   to the fastest implementation available on the running CPU */
size_t fps_count(unsigned char *str, size_t len, unsigned char w) {
#ifdef USE_SIMD_COUNT
    // 1024 is a rough guesstimate of the string length
    // for which the extra performance of the main SIMD loop
    // starts to compensate the extra work and extra branching outside the SIMD loop.
    // The real optimal number depends on the specific μarch
    // and isn't worth optimizing for in this context,
    // since counting characters in shorter strings is unlikely to be a hot spot.
    if (len > 1024) {
        /* Cache the CPUID-based selection across calls. Relaxed
           ordering suffices: selection is idempotent, so concurrent
           first calls may race but all store the same pointer. */
        static _Atomic fps_impl_t cached_impl = (fps_impl_t)NULL;

        fps_impl_t counter = atomic_load_explicit(&cached_impl, memory_order_relaxed);
        if (counter == NULL) {
            counter = select_fps_simd_impl();
            atomic_store_explicit(&cached_impl, counter, memory_order_relaxed);
        }

        return counter(str, len, w);
    }
#endif
    /* Short strings — or builds without SIMD support — take the
       portable scalar path. */
    return fps_count_naive(str, len, w);
}
5 changes: 5 additions & 0 deletions tests/Properties/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,11 @@ tests =
\x -> B.length x === fromIntegral (length (B.unpack x))
, testProperty "count" $
\(toElem -> c) x -> B.count c x === fromIntegral (length (elemIndices c (B.unpack x)))
-- for long strings, the multiplier is non-round (and not power of 2)
-- to ensure non-trivial prefix or suffix of the string is handled outside any possible SIMD-based loop,
-- which typically handles chunks of 16 or 32 or 64 etc bytes.
, testProperty "count (long strings)" $
\(toElem -> c) x (Positive n) -> B.count c x * fromIntegral n === B.count c (B.concat $ replicate n x)
, testProperty "filter" $
\f x -> B.unpack (B.filter f x) === filter f (B.unpack x)
, testProperty "filter compose" $
Expand Down