Merge pull request #120 from r-devulap/kv-32bit

Improve key-value sort performance
intel · Jan 8, 2024 · commit 5c133e7 (2 parents: 9b978ec + 845bc36)

Showing 7 changed files with 421 additions and 41 deletions.
diff --git a/benchmarks/bench-objsort.hpp b/benchmarks/bench-objsort.hpp
@@ -5,19 +5,19 @@ static constexpr char euclidean[] = "euclidean";
 static constexpr char taxicab[] = "taxicab";
 static constexpr char chebyshev[] = "chebyshev";
 
-template <const char* val>
+template <typename T, const char* val>
 struct Point3D {
-    double x;
-    double y;
-    double z;
+    T x;
+    T y;
+    T z;
     static constexpr std::string_view name {val};
     Point3D()
     {
-        x = (double)rand() / RAND_MAX;
-        y = (double)rand() / RAND_MAX;
-        z = (double)rand() / RAND_MAX;
+        x = (T)rand() / RAND_MAX;
+        y = (T)rand() / RAND_MAX;
+        z = (T)rand() / RAND_MAX;
     }
-    double distance()
+    T distance()
     {
         if constexpr (name == "x") {
             return x;
@@ -77,7 +77,7 @@ static void simdobjsort(benchmark::State &state)
     std::vector<T> arr_bkp = arr;
     // benchmark
     for (auto _ : state) {
-        x86simdsort::object_qsort(arr.data(), arr.size(), [](T p) -> double {
+        x86simdsort::object_qsort(arr.data(), arr.size(), [](T p) {
             return p.distance();
         });
         state.PauseTiming();
@@ -89,20 +89,22 @@ static void simdobjsort(benchmark::State &state)
     }
 }
 
-#define BENCHMARK_OBJSORT(func, T) \
-    BENCHMARK_TEMPLATE(func, T) \
+#define BENCHMARK_OBJSORT(func, T, type, dist) \
+    BENCHMARK_TEMPLATE(func, T<type,dist>) \
             ->Arg(10e1) \
             ->Arg(10e2) \
             ->Arg(10e3) \
             ->Arg(10e4) \
             ->Arg(10e5) \
             ->Arg(10e6);
 
-BENCHMARK_OBJSORT(simdobjsort, Point3D<x>)
-BENCHMARK_OBJSORT(scalarobjsort, Point3D<x>)
-BENCHMARK_OBJSORT(simdobjsort, Point3D<taxicab>)
-BENCHMARK_OBJSORT(scalarobjsort, Point3D<taxicab>)
-BENCHMARK_OBJSORT(simdobjsort, Point3D<euclidean>)
-BENCHMARK_OBJSORT(scalarobjsort, Point3D<euclidean>)
-BENCHMARK_OBJSORT(simdobjsort, Point3D<chebyshev>)
-BENCHMARK_OBJSORT(scalarobjsort, Point3D<chebyshev>)
+BENCHMARK_OBJSORT(simdobjsort, Point3D, double, x)
+BENCHMARK_OBJSORT(scalarobjsort, Point3D, double, x)
+BENCHMARK_OBJSORT(simdobjsort, Point3D, float, x)
+BENCHMARK_OBJSORT(scalarobjsort, Point3D, float, x)
+BENCHMARK_OBJSORT(simdobjsort, Point3D, double, taxicab )
+BENCHMARK_OBJSORT(scalarobjsort, Point3D, double, taxicab)
+BENCHMARK_OBJSORT(simdobjsort, Point3D, double, euclidean)
+BENCHMARK_OBJSORT(scalarobjsort, Point3D, double, euclidean)
+BENCHMARK_OBJSORT(simdobjsort, Point3D, double, chebyshev)
+BENCHMARK_OBJSORT(scalarobjsort, Point3D, double, chebyshev)
diff --git a/run-bench.py b/run-bench.py
@@ -7,6 +7,7 @@
 parser.add_argument("-b", '--branch', type=str, default="main", required=False)
 parser.add_argument('--benchcompare', type=str, help='Compare simd bench with stdsort methods. Requires one of qsort, qselect, partialsort, argsort or argselect')
 parser.add_argument("-f", '--filter', type=str, required=False)
+parser.add_argument("-r", '--repeat', type=int, required=False)
 args = parser.parse_args()
 
 if len(sys.argv) == 1:
@@ -15,6 +16,9 @@
 filterb = ""
 if args.filter is not None:
     filterb = args.filter
+repeatnum = 1
+if args.repeat is not None:
+    repeatnum = args.repeat
 
 if args.benchcompare:
     baseline = ""
@@ -43,11 +47,11 @@
     else:
         parser.print_help(sys.stderr)
         parser.error("ERROR: Unknown argument '%s'" % args.benchcompare)
-    rc = subprocess.check_call("./scripts/bench-compare.sh '%s' '%s'" % (baseline, contender), shell=True)
+    rc = subprocess.check_call("./scripts/bench-compare.sh '%s' '%s' '%d'" % (baseline, contender, repeatnum), shell=True)
 
 if args.branchcompare:
     branch = args.branch
     if args.filter is None:
-        rc = subprocess.check_call("./scripts/branch-compare.sh '%s'" % (branch), shell=True)
+        rc = subprocess.check_call("./scripts/branch-compare.sh '%s' '%d'" % (branch, repeatnum), shell=True)
     else:
-        rc = subprocess.check_call("./scripts/branch-compare.sh '%s' '%s'" % (branch, args.filter), shell=True)
+        rc = subprocess.check_call("./scripts/branch-compare.sh '%s' '%s' '%d'" % (branch, args.filter, repeatnum), shell=True)
diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh
@@ -14,4 +14,4 @@ compare=$(realpath .bench/google-benchmark/tools/compare.py)
 meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir-${branch}
 cd builddir-${branch}
 ninja
-$compare filters ./benchexe $1 $2
+$compare filters ./benchexe $1 $2 --benchmark_repetitions=$3
diff --git a/scripts/branch-compare.sh b/scripts/branch-compare.sh
@@ -44,10 +44,10 @@ build_branch $basebranch
 contender=$(realpath ${branch}/builddir/benchexe)
 baseline=$(realpath ${basebranch}/builddir/benchexe)
 
-if [ -z "$2" ]; then
+if [ -z "$3" ]; then
     echo "Comparing all benchmarks .."
-    $compare benchmarks $baseline $contender
+    $compare benchmarks $baseline $contender --benchmark_repetitions=$2
 else
     echo "Comparing benchmark $2 .."
-    $compare benchmarksfiltered $baseline $2 $contender $2
+    $compare benchmarksfiltered $baseline $2 $contender $2 --benchmark_repetitions=$3
 fi
diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp
@@ -32,6 +32,7 @@ template <>
 struct zmm_vector<int32_t> {
     using type_t = int32_t;
     using reg_t = __m512i;
+    using regi_t = __m512i;
     using halfreg_t = __m256i;
     using opmask_t = __mmask16;
     static const uint8_t numlanes = 16;
@@ -65,6 +66,10 @@ struct zmm_vector<int32_t> {
     {
         return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t eq(reg_t x, reg_t y)
+    {
+        return _mm512_cmpeq_epi32_mask(x, y);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
@@ -123,6 +128,40 @@ struct zmm_vector<int32_t> {
     {
         return _mm512_set1_epi32(v);
     }
+    static regi_t seti(int v1,
+                       int v2,
+                       int v3,
+                       int v4,
+                       int v5,
+                       int v6,
+                       int v7,
+                       int v8,
+                       int v9,
+                       int v10,
+                       int v11,
+                       int v12,
+                       int v13,
+                       int v14,
+                       int v15,
+                       int v16)
+    {
+        return _mm512_set_epi32(v1,
+                                v2,
+                                v3,
+                                v4,
+                                v5,
+                                v6,
+                                v7,
+                                v8,
+                                v9,
+                                v10,
+                                v11,
+                                v12,
+                                v13,
+                                v14,
+                                v15,
+                                v16);
+    }
     template <uint8_t mask>
     static reg_t shuffle(reg_t zmm)
     {
@@ -171,6 +210,7 @@ template <>
 struct zmm_vector<uint32_t> {
     using type_t = uint32_t;
     using reg_t = __m512i;
+    using regi_t = __m512i;
     using halfreg_t = __m256i;
     using opmask_t = __mmask16;
     static const uint8_t numlanes = 16;
@@ -214,6 +254,10 @@ struct zmm_vector<uint32_t> {
     {
         return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t eq(reg_t x, reg_t y)
+    {
+        return _mm512_cmpeq_epu32_mask(x, y);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
@@ -262,6 +306,40 @@ struct zmm_vector<uint32_t> {
     {
         return _mm512_set1_epi32(v);
     }
+    static regi_t seti(int v1,
+                       int v2,
+                       int v3,
+                       int v4,
+                       int v5,
+                       int v6,
+                       int v7,
+                       int v8,
+                       int v9,
+                       int v10,
+                       int v11,
+                       int v12,
+                       int v13,
+                       int v14,
+                       int v15,
+                       int v16)
+    {
+        return _mm512_set_epi32(v1,
+                                v2,
+                                v3,
+                                v4,
+                                v5,
+                                v6,
+                                v7,
+                                v8,
+                                v9,
+                                v10,
+                                v11,
+                                v12,
+                                v13,
+                                v14,
+                                v15,
+                                v16);
+    }
     template <uint8_t mask>
     static reg_t shuffle(reg_t zmm)
     {
@@ -310,6 +388,7 @@ template <>
 struct zmm_vector<float> {
     using type_t = float;
     using reg_t = __m512;
+    using regi_t = __m512i;
     using halfreg_t = __m256;
     using opmask_t = __mmask16;
     static const uint8_t numlanes = 16;
@@ -343,6 +422,10 @@ struct zmm_vector<float> {
     {
         return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
     }
+    static opmask_t eq(reg_t x, reg_t y)
+    {
+        return _mm512_cmpeq_ps_mask(x, y);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
@@ -415,6 +498,40 @@ struct zmm_vector<float> {
     {
         return _mm512_set1_ps(v);
     }
+    static regi_t seti(int v1,
+                       int v2,
+                       int v3,
+                       int v4,
+                       int v5,
+                       int v6,
+                       int v7,
+                       int v8,
+                       int v9,
+                       int v10,
+                       int v11,
+                       int v12,
+                       int v13,
+                       int v14,
+                       int v15,
+                       int v16)
+    {
+        return _mm512_set_epi32(v1,
+                                v2,
+                                v3,
+                                v4,
+                                v5,
+                                v6,
+                                v7,
+                                v8,
+                                v9,
+                                v10,
+                                v11,
+                                v12,
+                                v13,
+                                v14,
+                                v15,
+                                v16);
+    }
     template <uint8_t mask>
     static reg_t shuffle(reg_t zmm)
     {