@@ -2,6 +2,10 @@
 #include <stdint.h>
 #include <limits.h>
 
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#endif
+
 #define SS (sizeof(size_t))
 #define ALIGN (sizeof(size_t)-1)
 #define ONES ((size_t)-1/UCHAR_MAX)
@@ -10,6 +14,64 @@
 
 void *memchr(const void *src, int c, size_t n)
 {
+#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
+	// Skip Clang 19 and Clang 20, which have a bug (llvm/llvm-project#146574)
+	// that results in an ICE when inline assembly is used with a vector result.
+#if __clang_major__ != 19 && __clang_major__ != 20
+	// When n is zero, a function that locates a character finds no occurrence.
+	// Otherwise, decrement n to ensure sub_overflow overflows
+	// when n would drop to or below zero.
+	if (!n--) {
+		return NULL;
+	}
+
+	// Note that reading before/after the allocation of a pointer is UB in
+	// C, so inline assembly is used to generate the exact machine
+	// instruction we want with opaque semantics to the compiler to avoid
+	// the UB.
+	uintptr_t align = (uintptr_t)src % sizeof(v128_t);
+	uintptr_t addr = (uintptr_t)src - align;
+	v128_t vc = wasm_i8x16_splat(c);
+
+	for (;;) {
+		v128_t v;
+		__asm__ (
+			"local.get %1\n"
+			"v128.load 0\n"
+			"local.set %0\n"
+			: "=r"(v)
+			: "r"(addr)
+			: "memory");
+		v128_t cmp = wasm_i8x16_eq(v, vc);
+		// Bitmask is slow on AArch64; any_true is much faster.
+		if (wasm_v128_any_true(cmp)) {
+			// Clear the bits corresponding to align (little-endian)
+			// so we can count trailing zeros.
+			int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+			// At least one bit will be set, unless align cleared them.
+			// Knowing this helps the compiler if it unrolls the loop.
+			__builtin_assume(mask || align);
+			// If the mask became zero because of align,
+			// it's as if we didn't find anything.
+			if (mask) {
+				// Find the offset of the first one bit (little-endian).
+				// That's a match, unless it is beyond the end of the object.
+				// Recall that we decremented n, so less-than-or-equal-to is correct.
+				size_t ctz = __builtin_ctz(mask);
+				return ctz - align <= n ? (char *)src + (addr + ctz - (uintptr_t)src)
+				                        : NULL;
+			}
+		}
+		// Decrement n; if it overflows we're done.
+		if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
+			return NULL;
+		}
+		align = 0;
+		addr += sizeof(v128_t);
+	}
+#endif
+#endif
+
 	const unsigned char *s = src;
 	c = (unsigned char)c;
 #ifdef __GNUC__
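
As a review aid (not part of the patch), here is a minimal, hypothetical test sketch exercising the edge cases the comments above call out: a zero-length search, a match just past the searched range, and an unaligned start whose first 16-byte load covers bytes before `src`. It only uses standard C (`assert`, `memset`, `_Alignas`) and assumes a hosted toolchain linked against this `memchr`.

```c
#include <assert.h>
#include <string.h>

int main(void)
{
	_Alignas(16) char buf[64];
	memset(buf, 'a', sizeof buf);
	buf[40] = 'x';

	// n == 0: a search over zero bytes finds no occurrence.
	assert(memchr(buf, 'x', 0) == NULL);
	// A match inside the searched range is reported at the right offset.
	assert(memchr(buf, 'x', sizeof buf) == buf + 40);
	// A match just past the searched range must not be reported, even
	// though the last 16-byte load may have read that byte.
	assert(memchr(buf, 'x', 40) == NULL);
	// Unaligned start: a matching byte before src that lands in the
	// first (rounded-down) 16-byte load must be ignored.
	buf[1] = 'x';
	assert(memchr(buf + 3, 'x', 38) == buf + 40);
	return 0;
}
```

The last case is the interesting one: with `src` three bytes into an aligned chunk, the rounded-down first load sees `buf[1]`, and the `>> align << align` shift of the bitmask is what keeps that byte from being reported.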