@@ -117,25 +117,26 @@ inline std::int32_t l1_distance_int8_neon(std::span<const std::int8_t> a,
117117 std::size_t i = 0 ;
118118
119119 // Process 64 elements at a time (4 × 16)
120+ // Note: vabdq_s8 returns int8x16_t, but each lane holds |a - b|, which can be as large as 255; reinterpret as uint8x16_t so the bits are read as an unsigned magnitude before widening
120121 while (i + 63 < size) {
121122 int8x16_t v1 = vld1q_s8 (&a[i]);
122123 int8x16_t v2 = vld1q_s8 (&b[i]);
123- uint8x16_t diff1 = vabdq_s8 (v1, v2);
124+ uint8x16_t diff1 = vreinterpretq_u8_s8 ( vabdq_s8 (v1, v2) );
124125 acc1 = vaddq_s32 (acc1, vreinterpretq_s32_u32 (vpaddlq_u16 (vpaddlq_u8 (diff1))));
125126
126127 v1 = vld1q_s8 (&a[i + 16 ]);
127128 v2 = vld1q_s8 (&b[i + 16 ]);
128- uint8x16_t diff2 = vabdq_s8 (v1, v2);
129+ uint8x16_t diff2 = vreinterpretq_u8_s8 ( vabdq_s8 (v1, v2) );
129130 acc2 = vaddq_s32 (acc2, vreinterpretq_s32_u32 (vpaddlq_u16 (vpaddlq_u8 (diff2))));
130131
131132 v1 = vld1q_s8 (&a[i + 32 ]);
132133 v2 = vld1q_s8 (&b[i + 32 ]);
133- uint8x16_t diff3 = vabdq_s8 (v1, v2);
134+ uint8x16_t diff3 = vreinterpretq_u8_s8 ( vabdq_s8 (v1, v2) );
134135 acc3 = vaddq_s32 (acc3, vreinterpretq_s32_u32 (vpaddlq_u16 (vpaddlq_u8 (diff3))));
135136
136137 v1 = vld1q_s8 (&a[i + 48 ]);
137138 v2 = vld1q_s8 (&b[i + 48 ]);
138- uint8x16_t diff4 = vabdq_s8 (v1, v2);
139+ uint8x16_t diff4 = vreinterpretq_u8_s8 ( vabdq_s8 (v1, v2) );
139140 acc4 = vaddq_s32 (acc4, vreinterpretq_s32_u32 (vpaddlq_u16 (vpaddlq_u8 (diff4))));
140141
141142 i += 64 ;
@@ -145,7 +146,7 @@ inline std::int32_t l1_distance_int8_neon(std::span<const std::int8_t> a,
145146 while (i + 15 < size) {
146147 int8x16_t v1 = vld1q_s8 (&a[i]);
147148 int8x16_t v2 = vld1q_s8 (&b[i]);
148- uint8x16_t diff = vabdq_s8 (v1, v2);
149+ uint8x16_t diff = vreinterpretq_u8_s8 ( vabdq_s8 (v1, v2) );
149150 acc1 = vaddq_s32 (acc1, vreinterpretq_s32_u32 (vpaddlq_u16 (vpaddlq_u8 (diff))));
150151 i += 16 ;
151152 }
0 commit comments