nmslib · yurymalkov · Oct 3, 2021 · Sep 1, 2021 · Sep 10, 2021 · Sep 16, 2021
diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h
@@ -4,6 +4,9 @@
 #define USE_SSE
 #ifdef __AVX__
 #define USE_AVX
+#ifdef __AVX512F__
+#define USE_AVX512
+#endif
 #endif
 #endif
 #endif
@@ -16,10 +19,16 @@
 #include <x86intrin.h>
 #endif
 
+#if defined(USE_AVX512)
+#include <immintrin.h>  // declares the __m512 type and the _mm512_* intrinsics used by the AVX-512 kernels
+#endif
+// Alignment qualifiers for local buffers that receive aligned SIMD stores.
 #if defined(__GNUC__)
 #define PORTABLE_ALIGN32 __attribute__((aligned(32)))
+#define PORTABLE_ALIGN64 __attribute__((aligned(64)))  // 64 bytes = width of one 512-bit register
 #else
 #define PORTABLE_ALIGN32 __declspec(align(32))
+#define PORTABLE_ALIGN64 __declspec(align(64))  // same 64-byte alignment, __declspec spelling
 #endif
 #endif
 

diff --git a/hnswlib/space_ip.h b/hnswlib/space_ip.h
@@ -124,7 +124,40 @@ namespace hnswlib {
 
 #endif
 
-#if defined(USE_AVX)
+
+#if defined(USE_AVX512)
+    // AVX-512 kernel: returns 1.0f minus the dot product of two float arrays, 16 lanes per step.
+    static float
+    InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float PORTABLE_ALIGN64 TmpRes[16];  // 64-byte aligned, as the aligned _mm512_store_ps below requires
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
+        size_t qty = *((size_t *) qty_ptr);  // qty_ptr is read as a pointer to a size_t element count
+
+        size_t qty16 = qty / 16;  // number of whole 16-float chunks
+        // Only the first 16 * qty16 elements are read; any remainder (qty % 16) is not
+        // handled in this function.
+        const float *pEnd1 = pVect1 + 16 * qty16;
+
+        __m512 sum512 = _mm512_set1_ps(0);  // 16 running partial sums, all starting at 0
+
+        while (pVect1 < pEnd1) {
+            // Left commented out: _mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
+            // Unaligned loads: the input pointers are not assumed to be 64-byte aligned.
+            __m512 v1 = _mm512_loadu_ps(pVect1);
+            pVect1 += 16;
+            __m512 v2 = _mm512_loadu_ps(pVect2);
+            pVect2 += 16;
+            sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));  // lane-wise multiply, then accumulate
+        }
+        // Spill the 16 lanes to memory and add them left to right in scalar code.
+        _mm512_store_ps(TmpRes, sum512);
+        float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
+        // The value returned is 1 - <v1, v2>, not the raw inner product.
+        return 1.0f - sum;
+    }
+
+#elif defined(USE_AVX)
 
     static float
     InnerProductSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
@@ -211,7 +244,7 @@ namespace hnswlib {
 
 #endif
 
-#if defined(USE_SSE) || defined(USE_AVX)
+#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
     static float
     InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         size_t qty = *((size_t *) qty_ptr);
@@ -249,7 +282,7 @@ namespace hnswlib {
     public:
         InnerProductSpace(size_t dim) {
             fstdistfunc_ = InnerProduct;
-    #if defined(USE_AVX) || defined(USE_SSE)
+    #if defined(USE_AVX) || defined(USE_SSE) || defined(USE_AVX512)
             if (dim % 16 == 0)
                 fstdistfunc_ = InnerProductSIMD16Ext;
             else if (dim % 4 == 0)

diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h
@@ -19,7 +19,41 @@ namespace hnswlib {
         return (res);
     }
 
-#if defined(USE_AVX)
+#if defined(USE_AVX512)
+
+    // Favor using AVX512 if available: returns the sum of squared differences of two float arrays, 16 lanes per step.
+    static float
+    L2SqrSIMD16Ext(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
+        size_t qty = *((size_t *) qty_ptr);  // qty_ptr is read as a pointer to a size_t element count
+        float PORTABLE_ALIGN64 TmpRes[16];  // 64-byte aligned, as the aligned _mm512_store_ps below requires
+        size_t qty16 = qty >> 4;  // qty / 16: number of whole 16-float chunks
+        // Only the first 16 * qty16 elements are read; a remainder (qty % 16) is not handled here.
+        const float *pEnd1 = pVect1 + (qty16 << 4);
+
+        __m512 diff, v1, v2;
+        __m512 sum = _mm512_set1_ps(0);  // 16 running partial sums, all starting at 0
+
+        while (pVect1 < pEnd1) {
+            v1 = _mm512_loadu_ps(pVect1);
+            pVect1 += 16;
+            v2 = _mm512_loadu_ps(pVect2);
+            pVect2 += 16;
+            diff = _mm512_sub_ps(v1, v2);
+            // A fused variant, _mm512_fmadd_ps(diff, diff, sum), is left commented out; the separate mul + add below runs.
+            sum = _mm512_add_ps(sum, _mm512_mul_ps(diff, diff));
+        }
+        // Spill the 16 lanes to memory and add them left to right in scalar code.
+        _mm512_store_ps(TmpRes, sum);
+        float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] +
+                TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] +
+                TmpRes[13] + TmpRes[14] + TmpRes[15];
+
+        return (res);
+}
+
+#elif defined(USE_AVX)
 
     // Favor using AVX if available.
     static float
@@ -106,7 +140,7 @@ namespace hnswlib {
     }
 #endif
 
-#if defined(USE_SSE) || defined(USE_AVX)
+#if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
     static float
     L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
         size_t qty = *((size_t *) qty_ptr);
@@ -174,7 +208,7 @@ namespace hnswlib {
     public:
         L2Space(size_t dim) {
             fstdistfunc_ = L2Sqr;
-        #if defined(USE_SSE) || defined(USE_AVX)
+        #if defined(USE_SSE) || defined(USE_AVX) || defined(USE_AVX512)
             if (dim % 16 == 0)
                 fstdistfunc_ = L2SqrSIMD16Ext;
             else if (dim % 4 == 0)
@@ -278,4 +312,4 @@ namespace hnswlib {
     };
 
 
-}
+}