|
16 | 16 | * limitations under the License. |
17 | 17 | */ |
18 | 18 |
|
| 19 | +#include <assert.h> |
| 20 | +#include <stddef.h> // for size_t |
| 21 | +#include <stdio.h> |
| 22 | +#include <string.h> |
| 23 | + |
| 24 | +#include "bulk_crc32.h" |
| 25 | +#include "gcc_optimizations.h" |
| 26 | + |
| 27 | +/** |
| 28 | + * Hardware-accelerated CRC32 calculation using RISC-V Zbc extension. |
| 29 | + * Uses carry-less multiply instructions (clmul/clmulh) for CRC32 (zlib |
| 30 | + * polynomial). |
| 31 | + */ |
| 32 | + |
/**
 * Signature of a pipelined CRC32 routine: three in/out checksum pointers,
 * the base of the data buffer, the size of each block in bytes, and the
 * number of blocks to process.
 */
typedef void (*crc_pipelined_func_t)(uint32_t *, uint32_t *, uint32_t *,
                                     const uint8_t *, size_t, int);
/* Dispatch pointer for the zlib-polynomial variant. It is defined outside this
 * file; the load-time constructor at the bottom of this file overwrites it
 * when Zbc support is detected. */
extern crc_pipelined_func_t pipelined_crc32_zlib_func;
| 36 | + |
| 37 | +#if defined(__riscv) && (__riscv_xlen == 64) |
| 38 | + |
/*
 * Constants consumed by rv_crc32_zlib_clmul(). Roles, as used there:
 *   R3, R4            : multipliers for the low and high 64-bit halves of the
 *                       128-bit accumulator in the 16-byte loop; R4 is also
 *                       the multiplier of the 128 -> 64 bit step.
 *   R5                : multiplier applied to the low 32 bits in the next
 *                       narrowing step.
 *   RU,
 *   POLY_TRUE_LE_FULL : multipliers of the last two carry-less multiplies
 *                       that produce the final 32-bit value.
 *   MASK32            : keeps the low 32 bits of a 64-bit word.
 * NOTE(review): the values look like the commonly published precomputed
 * folding / Barrett constants for the reflected polynomial 0xEDB88320 --
 * confirm against the reference derivation before changing any of them.
 */
#define RV_CRC32_CONST_R3 0x01751997d0ULL
#define RV_CRC32_CONST_R4 0x00ccaa009eULL
#define RV_CRC32_CONST_R5 0x0163cd6124ULL
#define RV_CRC32_MASK32 0x00000000FFFFFFFFULL
#define RV_CRC32_POLY_TRUE_LE_FULL 0x01DB710641ULL
#define RV_CRC32_CONST_RU 0x01F7011641ULL
| 45 | + |
| 46 | +static inline uint64_t rv_clmul(uint64_t a, uint64_t b) { |
| 47 | + uint64_t r; |
| 48 | + __asm__ volatile( |
| 49 | + ".option push\n\t" |
| 50 | + ".option arch, +zbc\n\t" |
| 51 | + "clmul %0, %1, %2\n\t" |
| 52 | + ".option pop\n\t" |
| 53 | + : "=r"(r) |
| 54 | + : "r"(a), "r"(b)); |
| 55 | + return r; |
| 56 | +} |
| 57 | + |
| 58 | +static inline uint64_t rv_clmulh(uint64_t a, uint64_t b) { |
| 59 | + uint64_t r; |
| 60 | + __asm__ volatile( |
| 61 | + ".option push\n\t" |
| 62 | + ".option arch, +zbc\n\t" |
| 63 | + "clmulh %0, %1, %2\n\t" |
| 64 | + ".option pop\n\t" |
| 65 | + : "=r"(r) |
| 66 | + : "r"(a), "r"(b)); |
| 67 | + return r; |
| 68 | +} |
| 69 | + |
/**
 * Bit-at-a-time CRC32 update for the zlib polynomial (reflected form
 * 0xEDB88320).
 *
 * crc is the running register value and is used as-is: no inversion is
 * applied on entry or exit. Returns the register after absorbing len bytes
 * from buf; with len == 0 the input value is returned and buf is not read.
 */
static inline uint32_t rv_crc32_zlib_bitwise(uint32_t crc, const uint8_t *buf,
                                             size_t len) {
  const uint32_t reflected_poly = 0xEDB88320U;
  uint32_t state = crc;

  while (len-- > 0) {
    state ^= *buf++;
    for (int bit = 8; bit > 0; --bit) {
      // Shift one bit out; if it was set, fold the polynomial back in.
      const uint32_t lsb = state & 1U;
      state >>= 1;
      if (lsb) {
        state ^= reflected_poly;
      }
    }
  }
  return state;
}
| 82 | + |
| 83 | +static uint32_t rv_crc32_zlib_clmul(uint32_t crc, const uint8_t *buf, |
| 84 | + size_t len) { |
| 85 | + const uint8_t *p = buf; |
| 86 | + size_t n = len; |
| 87 | + |
| 88 | + if (n < 32) { |
| 89 | + return rv_crc32_zlib_bitwise(crc, p, n); |
| 90 | + } |
| 91 | + |
| 92 | + uintptr_t mis = (uintptr_t)p & 0xF; |
| 93 | + if (unlikely(mis)) { |
| 94 | + size_t pre = 16 - mis; |
| 95 | + if (pre > n) pre = n; |
| 96 | + crc = rv_crc32_zlib_bitwise(crc, p, pre); |
| 97 | + p += pre; |
| 98 | + n -= pre; |
| 99 | + } |
| 100 | + |
| 101 | + uint64_t x0 = *(const uint64_t *)(const void *)(p + 0); |
| 102 | + uint64_t x1 = *(const uint64_t *)(const void *)(p + 8); |
| 103 | + x0 ^= (uint64_t)crc; |
| 104 | + p += 16; |
| 105 | + n -= 16; |
| 106 | + |
| 107 | + const uint64_t C1 = RV_CRC32_CONST_R3; |
| 108 | + const uint64_t C2 = RV_CRC32_CONST_R4; |
| 109 | + |
| 110 | + while (likely(n >= 16)) { |
| 111 | + uint64_t tL = rv_clmul(C2, x1); |
| 112 | + uint64_t tH = rv_clmulh(C2, x1); |
| 113 | + uint64_t yL = rv_clmul(C1, x0); |
| 114 | + uint64_t yH = rv_clmulh(C1, x0); |
| 115 | + x0 = yL ^ tL; |
| 116 | + x1 = yH ^ tH; |
| 117 | + |
| 118 | + uint64_t d0 = *(const uint64_t *)(const void *)(p + 0); |
| 119 | + uint64_t d1 = *(const uint64_t *)(const void *)(p + 8); |
| 120 | + x0 ^= d0; |
| 121 | + x1 ^= d1; |
| 122 | + p += 16; |
| 123 | + n -= 16; |
| 124 | + } |
| 125 | + |
| 126 | + { |
| 127 | + uint64_t tH = rv_clmulh(x0, C2); |
| 128 | + uint64_t tL = rv_clmul(x0, C2); |
| 129 | + x0 = x1 ^ tL; |
| 130 | + x1 = tH; |
| 131 | + } |
| 132 | + |
| 133 | + uint64_t hi = x1; |
| 134 | + uint64_t lo = x0; |
| 135 | + uint64_t t2 = (lo >> 32) | (hi << 32); |
| 136 | + lo &= RV_CRC32_MASK32; |
| 137 | + |
| 138 | + lo = rv_clmul(RV_CRC32_CONST_R5, lo) ^ t2; |
| 139 | + uint64_t tmp = lo; |
| 140 | + lo &= RV_CRC32_MASK32; |
| 141 | + lo = rv_clmul(lo, RV_CRC32_CONST_RU); |
| 142 | + lo &= RV_CRC32_MASK32; |
| 143 | + lo = rv_clmul(lo, RV_CRC32_POLY_TRUE_LE_FULL) ^ tmp; |
| 144 | + |
| 145 | + uint32_t c = (uint32_t)(lo >> 32); |
| 146 | + |
| 147 | + if (n) { |
| 148 | + c = rv_crc32_zlib_bitwise(c, p, n); |
| 149 | + } |
| 150 | + return c; |
| 151 | +} |
| 152 | + |
/**
 * Pipelined version of hardware-accelerated CRC32 calculation using
 * RISC-V Zbc carry-less multiply instructions.
 *
 *   crc1, crc2, crc3 : Store initial checksum for each block before
 *           calling. When it returns, updated checksums are stored.
 *           Only the first num_blocks of them are read or written.
 *   p_buf : The base address of the data buffer. The buffer should be
 *           at least as big as block_size * num_blocks.
 *   block_size : The size of each block in bytes.
 *   num_blocks : The number of blocks to work on. Min = 1, Max = 3.
 *           0 is accepted and does nothing; any other value trips the
 *           assert below.
 *
 * The blocks are processed one after another (last block first); each
 * checksum depends only on its own block.
 */
static void pipelined_crc32_zlib(uint32_t *crc1, uint32_t *crc2, uint32_t *crc3,
                                 const uint8_t *p_buf, size_t block_size,
                                 int num_blocks) {
  // Each block address is formed only in the case that consumes it. Computing
  // p_buf + block_size or p_buf + 2 * block_size up front would create
  // pointers beyond the caller's buffer whenever fewer than three blocks are
  // supplied, which is undefined even if they are never dereferenced.
  switch (num_blocks) {
    case 3:
      *crc3 = rv_crc32_zlib_clmul(*crc3, p_buf + 2 * block_size, block_size);
      // fall through
    case 2:
      *crc2 = rv_crc32_zlib_clmul(*crc2, p_buf + block_size, block_size);
      // fall through
    case 1:
      *crc1 = rv_crc32_zlib_clmul(*crc1, p_buf, block_size);
      break;
    case 0:
      return;
    default:
      assert(0 && "BUG: Invalid number of checksum blocks");
  }
}
30 | 187 |
|
31 | | -#include "bulk_crc32.h" |
32 | | -#include "gcc_optimizations.h" |
| 188 | +#endif // __riscv && __riscv_xlen==64 |
33 | 189 |
|
34 | | -/* Constructor hook reserved for future HW capability detection and |
35 | | - * function-pointer dispatch. Intentionally a no-op for the initial phase. */ |
36 | | -void __attribute__((constructor)) init_riscv_crc_support(void) |
37 | | -{ |
38 | | - /* No-op: keep using the default software implementations. */ |
/**
 * On library load, determine what sort of crc we are going to do
 * and set crc function pointers appropriately.
 *
 * On 64-bit RISC-V this scans /proc/cpuinfo for a line that mentions "isa"
 * or "extensions" and also contains "zbc". If one is found,
 * pipelined_crc32_zlib_func is pointed at the Zbc implementation. If the
 * file cannot be opened or no such line exists, the pointer is left
 * untouched. On every other target this function does nothing.
 *
 * The ISA line can be longer than the read buffer, so a line is processed
 * in chunks: the "isa"/"extensions" classification made on the first chunk
 * is remembered for the rest of the line, and the last two bytes of each
 * chunk are carried into the next so a "zbc" straddling a chunk boundary is
 * still seen.
 */
void __attribute__((constructor)) init_cpu_support_flag(void) {
#if defined(__riscv) && (__riscv_xlen == 64)
  FILE *f = fopen("/proc/cpuinfo", "r");
  if (!f) {
    return;  // cannot probe: keep whatever implementation is already set
  }

  enum { OVERLAP = 2 };  // strlen("zbc") - 1
  char buf[256];
  size_t carried = 0;  // bytes at the start of buf kept from the last chunk
  int mid_line = 0;    // previous chunk did not end its line
  int isa_line = 0;    // current line mentions "isa" or "extensions"
  int has_zbc = 0;

  while (fgets(buf + carried, (int)(sizeof(buf) - carried), f)) {
    if (!mid_line) {
      // First chunk of a line: the key sits at the start, so classify here.
      isa_line = strstr(buf, "isa") != NULL ||
                 strstr(buf, "extensions") != NULL;
    }
    if (isa_line && strstr(buf, "zbc") != NULL) {
      has_zbc = 1;
      break;
    }

    const size_t len = strlen(buf);
    mid_line = len > 0 && buf[len - 1] != '\n';
    carried = 0;
    if (mid_line && len >= OVERLAP) {
      // Keep the chunk's tail so a token split across reads stays intact.
      memmove(buf, buf + len - OVERLAP, OVERLAP);
      carried = OVERLAP;
    }
  }
  fclose(f);

  if (has_zbc) {
    pipelined_crc32_zlib_func = pipelined_crc32_zlib;
  }
#endif
}
0 commit comments