Add optimized crc32 for POWER8 and later processors

This commit adds an optimized version of the crc32 function based on crc32-vpmsum from https://github.com/antonblanchard/crc32-vpmsum/. The code has been relicensed to the zlib license. This is the C implementation created by Rogerio Alves <rogealve@br.ibm.com>. It makes use of vector instructions to speed up the CRC32 algorithm. Decompression times were improved by +30% in tests. Based on Daniel Black's work for the original zlib (madler/zlib#478).
mscastanho · Jun 16, 2021 · ea2541c · ea2541c
1 parent d87e6d9
commit ea2541c
Show file tree

Hide file tree

Showing 7 changed files with 1,996 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -591,6 +591,10 @@ if(WITH_OPTIM)
             list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
             set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/slide_power8.c)
+            if("${ARCH}" MATCHES "powerpc64(le)?")  # unanchored regex: any ARCH containing "powerpc64" matches, so "(le)?" does not narrow it
+              add_definitions(-DPOWER8_VSX_CRC32)  # directory-scoped define; presumably gates the crc32 code path in the C sources - confirm
+              list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c)  # joins POWER8_SRCS so it receives the COMPILE_FLAGS set on that list below
+            endif()
             list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
             set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
         endif()

diff --git a/arch/power/Makefile.in b/arch/power/Makefile.in
@@ -19,6 +19,8 @@ all: power.o \
      power.lo \
      adler32_power8.o \
      adler32_power8.lo \
+     crc32_power8.o \
+     crc32_power8.lo \
      slide_power8.o \
      slide_power8.lo
 
@@ -34,6 +36,12 @@ adler32_power8.o:
 adler32_power8.lo:
 	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
 
+crc32_power8.o:  # builds crc32_power8.c with CFLAGS plus the POWER8 and no-LTO flags; same shape as the adler32/slide rules
+	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
+crc32_power8.lo:  # same source built with SFLAGS instead of CFLAGS; presumably the shared-library object - confirm in the top-level Makefile
+	$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c
+
 slide_power8.o:
 	$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_power8.c
 

diff --git a/arch/power/clang_workaround.h b/arch/power/clang_workaround.h
@@ -0,0 +1,98 @@
+/* Helper functions to work around issues with clang builtins
+ * Copyright (C) 2021 IBM Corporation
+ *
+ * Authors:
+ *   Daniel Black <daniel@linux.vnet.ibm.com>
+ *   Rogerio Alves <rogealve@br.ibm.com>
+ *   Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CLANG_WORKAROUNDS_H
+#define CLANG_WORKAROUNDS_H
+
+/*
+ * These stubs fix clang incompatibilities with GCC builtins.
+ */
+
+#ifndef __builtin_crypto_vpmsumw	/* only detects a macro of this name, not a compiler builtin */
+#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb	/* word variant routed to the byte-named builtin; NOTE(review): presumably relies on Clang overloading it by vector type - confirm */
+#endif
+#ifndef __builtin_crypto_vpmsumd	/* same macro-only test for the doubleword variant */
+#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb	/* doubleword variant routed to the same byte-named builtin */
+#endif
+
+static inline	/* adds a vec_ld overload whose pointer argument is a vector of unsigned long long */
+__vector unsigned long long __attribute__((overloadable))
+vec_ld(int __a, const __vector unsigned long long* __b)
+{	/* both arguments are forwarded unchanged to lvx; only the result type is cast */
+	return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
+}
+
+/*
+ * GCC __builtin_pack_vector_int128 returns a vector __int128_t but Clang
+ * does not recognize this type. On GCC this builtin is translated to an
+ * xxpermdi instruction that only moves the registers __a and __b, whereas
+ * the stub below instead generates a load.
+ *
+ * Clang has the vec_xxpermdi intrinsic. It was implemented in 4.0.0.
+ */
+static inline	/* builds a two-element vector of unsigned long long from the scalars __a and __b */
+__vector unsigned long long  __builtin_pack_vector (unsigned long __a,
+						    unsigned long __b)
+{
+	#if defined(__BIG_ENDIAN__)
+	__vector unsigned long long __v = {__a, __b};	/* big endian: __a is element 0, __b is element 1 */
+	#else
+	__vector unsigned long long __v = {__b, __a};	/* otherwise the element order is swapped */
+	#endif
+	return __v;
+}
+
+/*
+ * Clang 7 changed the behavior of vec_xxpermdi in order to provide the same
+ * behavior as GCC. That means code adapted to Clang >= 7 does not work on
+ * Clang <= 6.  So, fall back to __builtin_unpack_vector() on Clang <= 6.
+ */
+#if !defined vec_xxpermdi || __clang_major__ <= 6
+
+static inline	/* fallback extractor: returns element __o of __v via plain subscripting */
+unsigned long __builtin_unpack_vector (__vector unsigned long long __v,
+				       int __o)
+{
+	return __v[__o];	/* __o is not range-checked; the wrapper macros in this header pass only 0 or 1 */
+}
+
+#if defined(__BIG_ENDIAN__)
+#define __builtin_unpack_vector_0(a) __builtin_unpack_vector ((a), 0)	/* big endian: _0 reads element 0 */
+#define __builtin_unpack_vector_1(a) __builtin_unpack_vector ((a), 1)	/* big endian: _1 reads element 1 */
+#else
+#define __builtin_unpack_vector_0(a) __builtin_unpack_vector ((a), 1)	/* otherwise the indices are swapped, mirroring the element order used by __builtin_pack_vector */
+#define __builtin_unpack_vector_1(a) __builtin_unpack_vector ((a), 0)	/* otherwise _1 reads element 0 */
+#endif
+
+#else
+
+static inline	/* variant compiled when vec_xxpermdi is defined and __clang_major__ is above 6 */
+unsigned long __builtin_unpack_vector_0 (__vector unsigned long long __v)
+{
+	#if defined(__BIG_ENDIAN__)
+	return vec_xxpermdi(__v, __v, 0x0)[0];	/* big endian: permute __v with itself using selector 0x0, then read element 0 */
+	#else
+	return vec_xxpermdi(__v, __v, 0x3)[0];	/* otherwise selector 0x3 is used before reading element 0 */
+	#endif
+}
+
+static inline	/* counterpart of __builtin_unpack_vector_0 with the two selectors exchanged */
+unsigned long __builtin_unpack_vector_1 (__vector unsigned long long __v)
+{
+	#if defined(__BIG_ENDIAN__)
+	return vec_xxpermdi(__v, __v, 0x3)[0];	/* big endian: selector 0x3, then read element 0 */
+	#else
+	return vec_xxpermdi(__v, __v, 0x0)[0];	/* otherwise selector 0x0, then read element 0 */
+	#endif
+}
+#endif /* vec_xxpermdi */
+
+#endif