@@ -2,6 +2,10 @@
 #include <stdint.h>
 #include <limits.h>
 
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#endif
+
 #define SS (sizeof(size_t))
 #define ALIGN (sizeof(size_t)-1)
 #define ONES ((size_t)-1/UCHAR_MAX)
@@ -10,6 +14,64 @@
 
 void *memchr(const void *src, int c, size_t n)
 {
+#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
+	// Skip Clang 19 and Clang 20, which have a bug (llvm/llvm-project#146574)
+	// that results in an ICE when inline assembly is used with a vector result.
+#if __clang_major__ != 19 && __clang_major__ != 20
+	// When n is zero, a function that locates a character finds no occurrence.
+	// Otherwise, decrement n to ensure sub_overflow overflows
+	// when n would drop to or below zero.
+	if (!n--) {
+		return NULL;
+	}
+
+	// Note that reading before/after the allocation of a pointer is UB in
+	// C, so inline assembly is used to generate the exact machine
+	// instruction we want with opaque semantics to the compiler to avoid
+	// the UB.
+	uintptr_t align = (uintptr_t)src % sizeof(v128_t);
+	uintptr_t addr = (uintptr_t)src - align;
+	v128_t vc = wasm_i8x16_splat(c);
+
+	for (;;) {
+		v128_t v;
+		__asm__ (
+			"local.get %1\n"
+			"v128.load 0\n"
+			"local.set %0\n"
+			: "=r"(v)
+			: "r"(addr)
+			: "memory");
+		v128_t cmp = wasm_i8x16_eq(v, vc);
+		// Bitmask is slow on AArch64; any_true is much faster.
+		if (wasm_v128_any_true(cmp)) {
+			// Clear the bits corresponding to align (little-endian)
+			// so we can count trailing zeros.
+			int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+			// At least one bit will be set, unless align cleared them.
+			// Knowing this helps the compiler if it unrolls the loop.
+			__builtin_assume(mask || align);
+			// If the mask became zero because of align,
+			// it's as if we didn't find anything.
+			if (mask) {
+				// Find the offset of the first one bit (little-endian).
+				// That's a match, unless it is beyond the end of the object.
+				// Recall that we decremented n, so less-than-or-equal-to is correct.
+				size_t ctz = __builtin_ctz(mask);
+				return ctz - align <= n ? (char *)src + (addr + ctz - (uintptr_t)src)
+				                        : NULL;
+			}
+		}
+		// Decrement n; if it overflows we're done.
+		if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
+			return NULL;
+		}
+		align = 0;
+		addr += sizeof(v128_t);
+	}
+#endif
+#endif
+
 	const unsigned char *s = src;
 	c = (unsigned char)c;
 #ifdef __GNUC__
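
As a review aid (not part of the patch), here is a minimal, hypothetical test sketch exercising the edge cases the comments above call out: a zero-length search, a match just past the searched range, and an unaligned start whose first 16-byte load covers bytes before `src`. It only uses standard C (`assert`, `memset`, `_Alignas`) and assumes a hosted toolchain linked against this `memchr`.

```c
#include <assert.h>
#include <string.h>

int main(void)
{
	_Alignas(16) char buf[64];
	memset(buf, 'a', sizeof buf);
	buf[40] = 'x';

	// n == 0: a search over zero bytes finds no occurrence.
	assert(memchr(buf, 'x', 0) == NULL);
	// A match inside the searched range is reported at the right offset.
	assert(memchr(buf, 'x', sizeof buf) == buf + 40);
	// A match just past the searched range must not be reported, even
	// though the last 16-byte load may have read that byte.
	assert(memchr(buf, 'x', 40) == NULL);
	// Unaligned start: a matching byte before src that lands in the
	// first (rounded-down) 16-byte load must be ignored.
	buf[1] = 'x';
	assert(memchr(buf + 3, 'x', 38) == buf + 40);
	return 0;
}
```

The last case is the interesting one: with `src` three bytes into an aligned chunk, the rounded-down first load sees `buf[1]`, and the `>> align << align` shift of the bitmask is what keeps that byte from being reported.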