Skip to content

Commit

Permalink
Add optimized crc32 for POWER8 and later processors
Browse files Browse the repository at this point in the history
This commit adds an optimized version of the crc32 function based
on crc32-vpmsum from https://github.com/antonblanchard/crc32-vpmsum/ .
The code has been relicensed to the zlib license.

This is the C implementation created by Rogerio Alves <rogealve@br.ibm.com>

It uses vector instructions to speed up the CRC32 algorithm. In tests,
decompression times improved by 30%.

Based on Daniel Black's work for the original zlib (madler/zlib#478).
  • Loading branch information
mscastanho committed Jun 16, 2021
1 parent d87e6d9 commit ea2541c
Show file tree
Hide file tree
Showing 7 changed files with 1,996 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,10 @@ if(WITH_OPTIM)
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/slide_power8.c)
# Build the POWER8 CRC32 source and define POWER8_VSX_CRC32 only when ARCH
# matches powerpc64 or powerpc64le. MATCHES is an unanchored regex test, so
# any ARCH value containing "powerpc64" satisfies it.
if("${ARCH}" MATCHES "powerpc64(le)?")
add_definitions(-DPOWER8_VSX_CRC32)
list(APPEND POWER8_SRCS ${ARCHDIR}/crc32_power8.c)
endif()
list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
endif()
Expand Down
8 changes: 8 additions & 0 deletions arch/power/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ all: power.o \
power.lo \
adler32_power8.o \
adler32_power8.lo \
crc32_power8.o \
crc32_power8.lo \
slide_power8.o \
slide_power8.lo

Expand All @@ -34,6 +36,12 @@ adler32_power8.o:
adler32_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c

# crc32_power8.c is compiled twice with the same $(P8FLAGS) and $(NOLTOFLAG):
# the .o rule uses $(CFLAGS) and the .lo rule uses $(SFLAGS).
crc32_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c

crc32_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_power8.c

slide_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_power8.c

Expand Down
98 changes: 98 additions & 0 deletions arch/power/clang_workaround.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/* Helper functions to work around issues with clang builtins
* Copyright (C) 2021 IBM Corporation
*
* Authors:
* Daniel Black <daniel@linux.vnet.ibm.com>
* Rogerio Alves <rogealve@br.ibm.com>
* Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#ifndef CLANG_WORKAROUNDS_H
#define CLANG_WORKAROUNDS_H

/*
* These stubs fix clang incompatibilities with GCC builtins.
*/

/*
* Redirect the word and doubleword vpmsum builtin names to
* __builtin_crypto_vpmsumb unless a macro of that name already exists.
* NOTE(review): #ifndef only detects preprocessor macros, not compiler
* builtins, so these defines presumably take effect whenever this header is
* included -- confirm this is the intent.
*/
#ifndef __builtin_crypto_vpmsumw
#define __builtin_crypto_vpmsumw __builtin_crypto_vpmsumb
#endif
#ifndef __builtin_crypto_vpmsumd
#define __builtin_crypto_vpmsumd __builtin_crypto_vpmsumb
#endif

/*
* Overload of vec_ld for pointers to vector unsigned long long.
* Forwards the offset __a and the pointer __b to __builtin_altivec_lvx and
* casts the result to vector unsigned long long.
*/
static inline
__vector unsigned long long __attribute__((overloadable))
vec_ld(int __a, const __vector unsigned long long* __b)
{
return (__vector unsigned long long)__builtin_altivec_lvx(__a, __b);
}

/*
* GCC __builtin_pack_vector_int128 returns a vector __int128_t but Clang
* does not recognize this type. On GCC this builtin is translated to a
* xxpermdi instruction that only moves the registers __a, __b instead generates
* a load.
*
* Clang has vec_xxpermdi intrinsics. It was implemented in 4.0.0.
*/
static inline
__vector unsigned long long __builtin_pack_vector (unsigned long __a,
unsigned long __b)
{
#if defined(__BIG_ENDIAN__)
__vector unsigned long long __v = {__a, __b};
#else
__vector unsigned long long __v = {__b, __a};
#endif
return __v;
}

/*
* Clang 7 changed the behavior of vec_xxpermdi to match GCC's. As a result,
* code adapted to Clang >= 7 does not work on Clang <= 6, so fall back to
* __builtin_unpack_vector() on Clang <= 6 (or when vec_xxpermdi is not
* defined as a macro).
*/
#if !defined vec_xxpermdi || __clang_major__ <= 6

/*
* Return element __o of __v, narrowed to unsigned long.
* __o is used directly as the subscript; no range check is performed.
*/
static inline
unsigned long __builtin_unpack_vector (__vector unsigned long long __v,
                                       int __o)
{
    unsigned long long __element = __v[__o];

    return (unsigned long)__element;
}

/*
* Element selectors built on __builtin_unpack_vector().
* On big-endian targets _0 reads index 0 and _1 reads index 1; on other
* targets the indices are swapped. This mirrors the element order used by
* __builtin_pack_vector(), so _0 yields its first argument and _1 its second.
*/
#if defined(__BIG_ENDIAN__)
#define __builtin_unpack_vector_0(a) __builtin_unpack_vector ((a), 0)
#define __builtin_unpack_vector_1(a) __builtin_unpack_vector ((a), 1)
#else
#define __builtin_unpack_vector_0(a) __builtin_unpack_vector ((a), 1)
#define __builtin_unpack_vector_1(a) __builtin_unpack_vector ((a), 0)
#endif

#else

/*
* Extract one doubleword of __v as an unsigned long.
* vec_xxpermdi is applied with __v as both inputs and element 0 of its result
* is returned; the selector is 0x0 on big-endian targets and 0x3 otherwise.
*/
static inline
unsigned long __builtin_unpack_vector_0 (__vector unsigned long long __v)
{
#ifdef __BIG_ENDIAN__
    __vector unsigned long long __permuted = vec_xxpermdi(__v, __v, 0x0);
#else
    __vector unsigned long long __permuted = vec_xxpermdi(__v, __v, 0x3);
#endif

    return (unsigned long)__permuted[0];
}

/*
* Extract the other doubleword of __v as an unsigned long.
* vec_xxpermdi is applied with __v as both inputs and element 0 of its result
* is returned; the selector is 0x3 on big-endian targets and 0x0 otherwise.
*/
static inline
unsigned long __builtin_unpack_vector_1 (__vector unsigned long long __v)
{
#ifdef __BIG_ENDIAN__
    __vector unsigned long long __permuted = vec_xxpermdi(__v, __v, 0x3);
#else
    __vector unsigned long long __permuted = vec_xxpermdi(__v, __v, 0x0);
#endif

    return (unsigned long)__permuted[0];
}
#endif /* vec_xxpermdi */

#endif
Loading

0 comments on commit ea2541c

Please sign in to comment.