Skip to content

Commit 674c28a

Browse files
extract more splats
1 parent 12601cb commit 674c28a

File tree

3 files changed

+18
-12
lines changed

3 files changed

+18
-12
lines changed

src/collect.rs

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -169,8 +169,10 @@ impl<I: ChunkIt<u32x8>> CollectAndDedup for PaddedIt<I> {
169169
break;
170170
}
171171

172+
// FIXME: Is this one slow?
172173
let mut m = [u32x8::ZERO; 8];
173174
let mut i = 0;
175+
let eight = S::splat(8);
174176
it.for_each(
175177
#[inline(always)]
176178
|x| {
@@ -218,7 +220,7 @@ impl<I: ChunkIt<u32x8>> CollectAndDedup for PaddedIt<I> {
218220
}
219221
old[j] = lane;
220222
}
221-
offsets += u32x8::splat(8);
223+
offsets += eight;
222224
}
223225
i += 1;
224226
},

src/intrinsics/dedup.rs

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,12 @@ use crate::minimizers::SKIPPED;
33
use core::mem::transmute;
44
use packed_seq::L;
55

6+
const SIMD_SKIPPED: S = unsafe { std::mem::transmute([SKIPPED; 8]) };
7+
#[cfg(target_feature = "neon")]
8+
const OFFSET: S = unsafe { std::mem::transmute([0x03_02_01_00; 8]) };
9+
#[cfg(target_feature = "neon")]
10+
const MASK: S = unsafe { std::mem::transmute([0x04_04_04_04; 8]) };
11+
612
/// Dedup adjacent `new` values (starting with the last element of `old`).
713
/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`.
814
#[inline(always)]
@@ -87,7 +93,7 @@ pub unsafe fn append_unique_vals<const SKIP_MAX: bool>(
8793
let mut m = vec_tmp.cmp_eq(new);
8894
if SKIP_MAX {
8995
// skip everything equal to prev, or equal to MAX.
90-
m |= new.cmp_eq(S::splat(SKIPPED));
96+
m |= new.cmp_eq(SIMD_SKIPPED);
9197
}
9298
let m = _mm256_movemask_ps(transmute(m)) as usize;
9399
let numberofnewvalues = L - m.count_ones() as usize;
@@ -189,7 +195,7 @@ pub unsafe fn append_unique_vals<const SKIP_MAX: bool>(
189195

190196
let mut dup = prec.cmp_eq(new);
191197
if SKIP_MAX {
192-
dup |= new.cmp_eq(S::splat(SKIPPED));
198+
dup |= new.cmp_eq(SIMD_SKIPPED);
193199
}
194200
// emulate movemask
195201
let (d1, d2): (u32x4, u32x4) = transmute(dup);
@@ -201,7 +207,7 @@ pub unsafe fn append_unique_vals<const SKIP_MAX: bool>(
201207

202208
let numberofnewvalues = L - m.count_ones() as usize;
203209
let key = UNIQSHUF[m];
204-
let idx = key * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
210+
let idx = key * MASK + OFFSET;
205211
let (i1, i2) = transmute(idx);
206212
let t = transmute(vals);
207213
let r1 = vqtbl2q_u8(t, i1);
@@ -273,7 +279,7 @@ pub unsafe fn append_unique_vals_2(
273279

274280
let numberofnewvalues = L - m.count_ones() as usize;
275281
let key = UNIQSHUF[m];
276-
let idx = key * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
282+
let idx = key * MASK + OFFSET;
277283
let (i1, i2) = transmute(idx);
278284
let t = transmute(vals);
279285
let r1 = vqtbl2q_u8(t, i1);

src/sliding_min.rs

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -235,24 +235,24 @@ pub fn sliding_min_mapper_simd<const LEFT: bool>(
235235
let pos_mask = S::splat(0x0000_ffff);
236236
let max_pos = S::splat((1 << 16) - 1);
237237
let mut pos = S::splat(0);
238-
let one = S::splat(1);
239238
// Sliding min is over w+k-1 characters, so chunks overlap w+k-2.
240239
// Thus, the true length of each lane is len-(k+w-2).
241240
//
242241
// The k-mer starting at position 0 is done after processing the char at
243242
// position k-1, so we compensate for that as well.
244243
let mut pos_offset: S = from_fn(|l| (l * len.saturating_sub(w - 1)) as u32).into();
244+
let delta = S::splat((1 << 16) - 2 - w as u32);
245245

246246
#[inline(always)]
247247
move |val| {
248248
// Make sure the position does not interfere with the hash value.
249249
if pos == max_pos {
250250
// Slow case extracted to a function to have better inlining here.
251-
reset_positions_offsets(w, &mut pos, &mut prefix_min, &mut pos_offset, ring_buf);
251+
reset_positions_offsets(delta, &mut pos, &mut prefix_min, &mut pos_offset, ring_buf);
252252
}
253253
// slightly faster than assigning S::splat(u32::MAX)
254254
let elem = (if LEFT { val } else { !val } & val_mask) | pos;
255-
pos += one;
255+
pos += S::ONE;
256256
ring_buf.push(elem);
257257
prefix_min = simd_min::<LEFT>(prefix_min, elem);
258258
// After a chunk has been filled, compute suffix minima.
@@ -284,13 +284,12 @@ fn suffix_minima<const LEFT: bool>(
284284
}
285285

286286
fn reset_positions_offsets(
287-
w: usize,
287+
delta: S,
288288
pos: &mut S,
289289
prefix_min: &mut S,
290290
pos_offset: &mut S,
291291
ring_buf: &mut RingBuf<S>,
292292
) {
293-
let delta = S::splat((1 << 16) - 2 - w as u32);
294293
*pos -= delta;
295294
*prefix_min -= delta;
296295
*pos_offset += delta;
@@ -318,7 +317,6 @@ pub fn sliding_lr_min_mapper_simd(
318317
let max_pos = S::splat((1 << 16) - 1);
319318
let mut pos = S::splat(0);
320319
let mut pos_offset: S = from_fn(|l| (l * len.saturating_sub(w - 1)) as u32).into();
321-
let one = S::splat(1);
322320
let delta = S::splat((1 << 16) - 2 - w as u32);
323321

324322
#[inline(always)]
@@ -338,7 +336,7 @@ pub fn sliding_lr_min_mapper_simd(
338336
let lelem = (val & val_mask) | pos;
339337
let relem = (!val & val_mask) | pos;
340338
let elem = (lelem, relem);
341-
pos += one;
339+
pos += S::ONE;
342340
ring_buf.push(elem);
343341
prefix_lr_min = simd_lr_min(prefix_lr_min, elem);
344342
// After a chunk has been filled, compute suffix minima.

0 commit comments

Comments
 (0)