Skip to content

Commit 12601cb

Browse files
move splat out of loop
1 parent c54ec59 commit 12601cb

File tree

2 files changed

+15
-6
lines changed

2 files changed

+15
-6
lines changed

src/canonical.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,15 @@ pub fn canonical_mapper_simd(l: usize) -> (Delay, impl FnMut((S, S)) -> u32x8) {
4646

4747
// Twice the count of odd characters (each one adds `two`), offset by -l, so >0 is canonical and <0 is not.
4848
let mut cnt = i32x8::splat(-(l as i32));
49+
let zero = i32x8::splat(0);
4950
let two = i32x8::splat(2);
5051

5152
(
5253
Delay(l - 1),
5354
#[inline(always)]
5455
move |(a, r)| {
5556
cnt += unsafe { transmute::<_, i32x8>(a) } & two;
56-
let out = unsafe { transmute::<_, u32x8>(cnt.cmp_gt(i32x8::splat(0))) };
57+
let out = unsafe { transmute::<_, u32x8>(cnt.cmp_gt(zero)) };
5758
cnt -= unsafe { transmute::<_, i32x8>(r) } & two;
5859
out
5960
},

src/sliding_min.rs

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ pub fn sliding_min_mapper_simd<const LEFT: bool>(
235235
let pos_mask = S::splat(0x0000_ffff);
236236
let max_pos = S::splat((1 << 16) - 1);
237237
let mut pos = S::splat(0);
238+
let one = S::splat(1);
238239
// Sliding min is over w+k-1 characters, so chunks overlap w+k-2.
239240
// Thus, the true length of each lane is len-(k+w-2).
240241
//
@@ -251,7 +252,7 @@ pub fn sliding_min_mapper_simd<const LEFT: bool>(
251252
}
252253
// slightly faster than assigning S::splat(u32::MAX)
253254
let elem = (if LEFT { val } else { !val } & val_mask) | pos;
254-
pos += S::splat(1);
255+
pos += one;
255256
ring_buf.push(elem);
256257
prefix_min = simd_min::<LEFT>(prefix_min, elem);
257258
// After a chunk has been filled, compute suffix minima.
@@ -317,19 +318,27 @@ pub fn sliding_lr_min_mapper_simd(
317318
let max_pos = S::splat((1 << 16) - 1);
318319
let mut pos = S::splat(0);
319320
let mut pos_offset: S = from_fn(|l| (l * len.saturating_sub(w - 1)) as u32).into();
321+
let one = S::splat(1);
322+
let delta = S::splat((1 << 16) - 2 - w as u32);
320323

321324
#[inline(always)]
322325
move |val| {
323326
// Make sure the position does not interfere with the hash value.
324327
if pos == max_pos {
325328
// Slow case extracted to a function to have better inlining here.
326-
reset_positions_offsets_lr(w, &mut pos, &mut prefix_lr_min, &mut pos_offset, ring_buf);
329+
reset_positions_offsets_lr(
330+
delta,
331+
&mut pos,
332+
&mut prefix_lr_min,
333+
&mut pos_offset,
334+
ring_buf,
335+
);
327336
}
328337
// slightly faster than assigning S::splat(u32::MAX)
329338
let lelem = (val & val_mask) | pos;
330339
let relem = (!val & val_mask) | pos;
331340
let elem = (lelem, relem);
332-
pos += S::splat(1);
341+
pos += one;
333342
ring_buf.push(elem);
334343
prefix_lr_min = simd_lr_min(prefix_lr_min, elem);
335344
// After a chunk has been filled, compute suffix minima.
@@ -372,13 +381,12 @@ fn suffix_lr_minima(
372381

373382
#[inline(always)]
374383
fn reset_positions_offsets_lr(
375-
w: usize,
384+
delta: S,
376385
pos: &mut S,
377386
prefix_min: &mut (S, S),
378387
pos_offset: &mut S,
379388
ring_buf: &mut RingBuf<(S, S)>,
380389
) {
381-
let delta = S::splat((1 << 16) - 2 - w as u32);
382390
*pos -= delta;
383391
*pos_offset += delta;
384392
prefix_min.0 -= delta;

0 commit comments

Comments
 (0)