Skip to content

Commit 0d55265

Browse files
extract more splats
1 parent 12601cb commit 0d55265

File tree

4 files changed

+20
-14
lines changed

4 files changed

+20
-14
lines changed

src/collect.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,10 @@ impl<I: ChunkIt<u32x8>> CollectAndDedup for PaddedIt<I> {
169169
break;
170170
}
171171

172+
// FIXME: Is this one slow?
172173
let mut m = [u32x8::ZERO; 8];
173174
let mut i = 0;
175+
let eight = S::splat(8);
174176
it.for_each(
175177
#[inline(always)]
176178
|x| {
@@ -218,7 +220,7 @@ impl<I: ChunkIt<u32x8>> CollectAndDedup for PaddedIt<I> {
218220
}
219221
old[j] = lane;
220222
}
221-
offsets += u32x8::splat(8);
223+
offsets += eight;
222224
}
223225
i += 1;
224226
},

src/intrinsics/dedup.rs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
use crate::S;
2-
use crate::minimizers::SKIPPED;
2+
use crate::minimizers::SIMD_SKIPPED;
33
use core::mem::transmute;
44
use packed_seq::L;
55

6+
#[cfg(target_feature = "neon")]
7+
// Additive term of the byte-shuffle index `idx = key * MASK + OFFSET`:
// each 32-bit lane is the literal 0x03_02_01_00 (bytes 0x03, 0x02, 0x01, 0x00).
// Kept as a constant so the value is not rebuilt with `S::splat` at every use.
// SAFETY: `transmute` only compiles when source and destination have the same
// size. This assumes `S` is a plain vector of eight 32-bit lanes in which every
// bit pattern is valid — TODO confirm against the definition of `crate::S`.
const OFFSET: S = unsafe { std::mem::transmute([0x03_02_01_00; 8]) };
8+
#[cfg(target_feature = "neon")]
9+
// Multiplicative term of the byte-shuffle index `idx = key * MASK + OFFSET`:
// every byte of every 32-bit lane is 0x04.
// NOTE(review): despite the name, this is used as a factor, not as a bit mask.
// SAFETY: `transmute` only compiles when source and destination have the same
// size. This assumes `S` is a plain vector of eight 32-bit lanes in which every
// bit pattern is valid — TODO confirm against the definition of `crate::S`.
const MASK: S = unsafe { std::mem::transmute([0x04_04_04_04; 8]) };
10+
611
/// Dedup adjacent `new` values (starting with the last element of `old`).
712
/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`.
813
#[inline(always)]
@@ -87,7 +92,7 @@ pub unsafe fn append_unique_vals<const SKIP_MAX: bool>(
8792
let mut m = vec_tmp.cmp_eq(new);
8893
if SKIP_MAX {
8994
// Skip everything equal to prev, or equal to the SKIPPED sentinel (u32::MAX - 1).
90-
m |= new.cmp_eq(S::splat(SKIPPED));
95+
m |= new.cmp_eq(SIMD_SKIPPED);
9196
}
9297
let m = _mm256_movemask_ps(transmute(m)) as usize;
9398
let numberofnewvalues = L - m.count_ones() as usize;
@@ -189,7 +194,7 @@ pub unsafe fn append_unique_vals<const SKIP_MAX: bool>(
189194

190195
let mut dup = prec.cmp_eq(new);
191196
if SKIP_MAX {
192-
dup |= new.cmp_eq(S::splat(SKIPPED));
197+
dup |= new.cmp_eq(SIMD_SKIPPED);
193198
}
194199
// emulate movemask
195200
let (d1, d2): (u32x4, u32x4) = transmute(dup);
@@ -201,7 +206,7 @@ pub unsafe fn append_unique_vals<const SKIP_MAX: bool>(
201206

202207
let numberofnewvalues = L - m.count_ones() as usize;
203208
let key = UNIQSHUF[m];
204-
let idx = key * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
209+
let idx = key * MASK + OFFSET;
205210
let (i1, i2) = transmute(idx);
206211
let t = transmute(vals);
207212
let r1 = vqtbl2q_u8(t, i1);
@@ -273,7 +278,7 @@ pub unsafe fn append_unique_vals_2(
273278

274279
let numberofnewvalues = L - m.count_ones() as usize;
275280
let key = UNIQSHUF[m];
276-
let idx = key * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
281+
let idx = key * MASK + OFFSET;
277282
let (i1, i2) = transmute(idx);
278283
let t = transmute(vals);
279284
let r1 = vqtbl2q_u8(t, i1);

src/minimizers.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use seq_hash::KmerHasher;
1616
use wide::u32x8;
1717

1818
/// Sentinel lane value, one below `u32::MAX`.
///
/// NOTE(review): in the code visible here it is blended into the lanes selected
/// by `ambi`, and lanes equal to it are dropped by the dedup routines when
/// `SKIP_MAX` is set — confirm the full contract against the callers.
pub const SKIPPED: u32 = u32::MAX - 1;
19+
/// [`SKIPPED`] repeated in all eight lanes, so SIMD code can compare or blend
/// against it without calling `splat` at each use.
// SAFETY: `transmute` only compiles when `[u32; 8]` and `u32x8` have the same
// size. This assumes `u32x8` is a plain wrapper over eight `u32` lanes in which
// every bit pattern is valid — TODO confirm against the `wide` crate.
pub(crate) const SIMD_SKIPPED: u32x8 = unsafe { std::mem::transmute([SKIPPED; 8]) };
1920

2021
/// Minimizer position of a single window.
2122
pub fn one_minimizer<'s>(seq: impl Seq<'s>, hasher: &impl KmerHasher) -> usize {
@@ -202,6 +203,6 @@ pub fn canonical_minimizers_skip_ambiguous_windows<'s>(
202203
let hash = hash_mapper((a, rh));
203204
let canonical = canonical_mapper((a, rc));
204205
let (lmin, rmin) = sliding_min_mapper(hash);
205-
ambi.blend(u32x8::splat(SKIPPED), canonical.blend(lmin, rmin))
206+
ambi.blend(SIMD_SKIPPED, canonical.blend(lmin, rmin))
206207
})
207208
}

src/sliding_min.rs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -235,24 +235,24 @@ pub fn sliding_min_mapper_simd<const LEFT: bool>(
235235
let pos_mask = S::splat(0x0000_ffff);
236236
let max_pos = S::splat((1 << 16) - 1);
237237
let mut pos = S::splat(0);
238-
let one = S::splat(1);
239238
// Sliding min is over w+k-1 characters, so chunks overlap w+k-2.
240239
// Thus, the true length of each lane is len-(k+w-2).
241240
//
242241
// The k-mer starting at position 0 is done after processing the char at
243242
// position k-1, so we compensate for that as well.
244243
let mut pos_offset: S = from_fn(|l| (l * len.saturating_sub(w - 1)) as u32).into();
244+
let delta = S::splat((1 << 16) - 2 - w as u32);
245245

246246
#[inline(always)]
247247
move |val| {
248248
// Make sure the position does not interfere with the hash value.
249249
if pos == max_pos {
250250
// Slow case extracted to a function to have better inlining here.
251-
reset_positions_offsets(w, &mut pos, &mut prefix_min, &mut pos_offset, ring_buf);
251+
reset_positions_offsets(delta, &mut pos, &mut prefix_min, &mut pos_offset, ring_buf);
252252
}
253253
// slightly faster than assigning S::splat(u32::MAX)
254254
let elem = (if LEFT { val } else { !val } & val_mask) | pos;
255-
pos += one;
255+
pos += S::ONE;
256256
ring_buf.push(elem);
257257
prefix_min = simd_min::<LEFT>(prefix_min, elem);
258258
// After a chunk has been filled, compute suffix minima.
@@ -284,13 +284,12 @@ fn suffix_minima<const LEFT: bool>(
284284
}
285285

286286
fn reset_positions_offsets(
287-
w: usize,
287+
delta: S,
288288
pos: &mut S,
289289
prefix_min: &mut S,
290290
pos_offset: &mut S,
291291
ring_buf: &mut RingBuf<S>,
292292
) {
293-
let delta = S::splat((1 << 16) - 2 - w as u32);
294293
*pos -= delta;
295294
*prefix_min -= delta;
296295
*pos_offset += delta;
@@ -318,7 +317,6 @@ pub fn sliding_lr_min_mapper_simd(
318317
let max_pos = S::splat((1 << 16) - 1);
319318
let mut pos = S::splat(0);
320319
let mut pos_offset: S = from_fn(|l| (l * len.saturating_sub(w - 1)) as u32).into();
321-
let one = S::splat(1);
322320
let delta = S::splat((1 << 16) - 2 - w as u32);
323321

324322
#[inline(always)]
@@ -338,7 +336,7 @@ pub fn sliding_lr_min_mapper_simd(
338336
let lelem = (val & val_mask) | pos;
339337
let relem = (!val & val_mask) | pos;
340338
let elem = (lelem, relem);
341-
pos += one;
339+
pos += S::ONE;
342340
ring_buf.push(elem);
343341
prefix_lr_min = simd_lr_min(prefix_lr_min, elem);
344342
// After a chunk has been filled, compute suffix minima.

0 commit comments

Comments
 (0)