Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/stringzilla/find.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr
// techniques and process eight characters at a time.
sz_u64_vec_t h_vec, n_vec, match_vec;
match_vec.u64 = 0;
n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull;
n_vec.u64 = (sz_u64_t) * (sz_u8_t const *)n * 0x0101010101010101ull;
for (; h + 8 <= h_end; h += 8) {
h_vec.u64 = *(sz_u64_t const *)h;
match_vec = sz_u64_each_byte_equal_(h_vec, n_vec);
Expand Down Expand Up @@ -427,7 +427,7 @@ sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) {
// Broadcast the n into every byte of a 64-bit integer to use SWAR
// techniques and process eight characters at a time.
sz_u64_vec_t h_vec, n_vec, match_vec;
n_vec.u64 = (sz_u64_t)n[0] * 0x0101010101010101ull;
n_vec.u64 = (sz_u64_t) * (sz_u8_t const *)n * 0x0101010101010101ull;
for (; h >= h_start + 7; h -= 8) {
h_vec.u64 = *(sz_u64_t const *)(h - 7);
match_vec = sz_u64_each_byte_equal_(h_vec, n_vec);
Expand Down
75 changes: 75 additions & 0 deletions scripts/test_stringzilla.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2256,6 +2256,79 @@ void test_comparisons() {
assert("a\0"_sv == "a\0"_sv);
}

/**
* @brief Regression test for signed-char bug in `sz_find_byte_serial` and `sz_rfind_byte_serial`.
* When compiled with `-fsigned-char`, bytes > 0x7F would be sign-extended during the SWAR
* broadcast multiplication, producing incorrect results. This test verifies that single-byte
* search works correctly for all byte values 0x00-0xFF across various haystack lengths.
* @see https://github.com/ashvardanian/StringZilla/issues/306
*/
void test_find_byte_serial_high_bytes() {

// Test every byte value in a haystack long enough to exercise the SWAR loop (>=8 bytes)
// and the scalar tail. We place the target byte at different positions to cover both paths.
sz_u8_t haystack_bytes[64];

// Fill the haystack with a neutral byte that won't match our needle
std::memset(haystack_bytes, 0x00, sizeof(haystack_bytes));

for (unsigned needle_byte = 0x80; needle_byte <= 0xFF; ++needle_byte) {
sz_u8_t needle_u8 = (sz_u8_t)needle_byte;
char const *needle = (char const *)&needle_u8;

// Test 1: needle in the SWAR-processed region (position 5, within first 8-byte block)
haystack_bytes[5] = needle_u8;
{
sz_cptr_t result = sz_find_byte_serial((sz_cptr_t)haystack_bytes, sizeof(haystack_bytes), needle);
assert(result != SZ_NULL_CHAR && "sz_find_byte_serial must find high bytes in SWAR region");
assert((sz_size_t)(result - (sz_cptr_t)haystack_bytes) == 5);
}
{
sz_cptr_t result = sz_rfind_byte_serial((sz_cptr_t)haystack_bytes, sizeof(haystack_bytes), needle);
assert(result != SZ_NULL_CHAR && "sz_rfind_byte_serial must find high bytes in SWAR region");
assert((sz_size_t)(result - (sz_cptr_t)haystack_bytes) == 5);
}
haystack_bytes[5] = 0x00;

// Test 2: needle in the scalar tail region (position 61, within last few bytes)
haystack_bytes[61] = needle_u8;
{
sz_cptr_t result = sz_find_byte_serial((sz_cptr_t)haystack_bytes, sizeof(haystack_bytes), needle);
assert(result != SZ_NULL_CHAR && "sz_find_byte_serial must find high bytes in scalar tail");
assert((sz_size_t)(result - (sz_cptr_t)haystack_bytes) == 61);
}
{
sz_cptr_t result = sz_rfind_byte_serial((sz_cptr_t)haystack_bytes, sizeof(haystack_bytes), needle);
assert(result != SZ_NULL_CHAR && "sz_rfind_byte_serial must find high bytes in scalar tail");
assert((sz_size_t)(result - (sz_cptr_t)haystack_bytes) == 61);
}
haystack_bytes[61] = 0x00;

// Test 3: needle not present - must return NULL
{
sz_cptr_t result = sz_find_byte_serial((sz_cptr_t)haystack_bytes, sizeof(haystack_bytes), needle);
assert(result == SZ_NULL_CHAR && "sz_find_byte_serial must return NULL when byte is absent");
}
{
sz_cptr_t result = sz_rfind_byte_serial((sz_cptr_t)haystack_bytes, sizeof(haystack_bytes), needle);
assert(result == SZ_NULL_CHAR && "sz_rfind_byte_serial must return NULL when byte is absent");
}
}

// Test 4: multiple occurrences - find returns first, rfind returns last
std::memset(haystack_bytes, 0x00, sizeof(haystack_bytes));
haystack_bytes[3] = 0xBE;
haystack_bytes[40] = 0xBE;
{
sz_u8_t n = 0xBE;
char const *needle = (char const *)&n;
sz_cptr_t first = sz_find_byte_serial((sz_cptr_t)haystack_bytes, sizeof(haystack_bytes), needle);
sz_cptr_t last = sz_rfind_byte_serial((sz_cptr_t)haystack_bytes, sizeof(haystack_bytes), needle);
assert(first != SZ_NULL_CHAR && (sz_size_t)(first - (sz_cptr_t)haystack_bytes) == 3);
assert(last != SZ_NULL_CHAR && (sz_size_t)(last - (sz_cptr_t)haystack_bytes) == 40);
}
}

/**
* @brief Tests the correctness of the string class search methods, such as `find` and `find_first_of`.
* This covers haystacks and needles of different lengths, as well as character-sets.
Expand Down Expand Up @@ -4556,6 +4629,8 @@ int main(int argc, char const **argv) {
test_updates();

std::printf("\n=== Search and Comparison ===\n");
std::printf("- test_find_byte_serial_high_bytes...\n");
test_find_byte_serial_high_bytes();
std::printf("- test_comparisons...\n");
test_comparisons();
std::printf("- test_search...\n");
Expand Down