Skip to content

Commit

Permalink
Use rotate45/135 for neon butterfly16
Browse files Browse the repository at this point in the history
  • Loading branch information
ejmahler committed Feb 27, 2024
1 parent 9498352 commit a83161f
Showing 1 changed file with 4 additions and 8 deletions.
12 changes: 4 additions & 8 deletions src/neon/neon_butterflies.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2307,9 +2307,7 @@ pub struct NeonF32Butterfly16<T> {
  bf4: NeonF32Butterfly4<T>,
  twiddles_packed: [float32x4_t; 6],
  twiddle1: float32x4_t,
- twiddle2: float32x4_t,
  twiddle3: float32x4_t,
- twiddle6: float32x4_t,
  twiddle9: float32x4_t,
  }

Expand Down Expand Up @@ -2342,9 +2340,7 @@ impl<T: FftNum> NeonF32Butterfly16<T> {
  pack_32(tw6, tw9),
  ],
  twiddle1: pack_32(tw1, tw1),
- twiddle2: pack_32(tw2, tw2),
  twiddle3: pack_32(tw3, tw3),
- twiddle6: pack_32(tw6, tw6),
  twiddle9: pack_32(tw9, tw9),
  }
  }
Expand Down Expand Up @@ -2429,19 +2425,19 @@ impl<T: FftNum> NeonF32Butterfly16<T> {
  let [in2, in3] = load(2);
  let mut tmp2 = self.bf4.perform_parallel_fft_direct(in2);
  let mut tmp3 = self.bf4.perform_parallel_fft_direct(in3);
- tmp2[1] = NeonVector::mul_complex(tmp2[1], self.twiddle2);
+ tmp2[1] = self.bf4.rotate.rotate_both_45(tmp2[1]);
  tmp2[2] = self.bf4.rotate.rotate_both(tmp2[2]);
- tmp2[3] = NeonVector::mul_complex(tmp2[3], self.twiddle6);
+ tmp2[3] = self.bf4.rotate.rotate_both_135(tmp2[3]);
  tmp3[1] = NeonVector::mul_complex(tmp3[1], self.twiddle3);
- tmp3[2] = NeonVector::mul_complex(tmp3[2], self.twiddle6);
+ tmp3[2] = self.bf4.rotate.rotate_both_135(tmp3[2]);
  tmp3[3] = NeonVector::mul_complex(tmp3[3], self.twiddle9);

  // Do these last, because fewer twiddles means fewer temporaries forcing the above data to spill
  let [in0, in1] = load(0);
  let tmp0 = self.bf4.perform_parallel_fft_direct(in0);
  let mut tmp1 = self.bf4.perform_parallel_fft_direct(in1);
  tmp1[1] = NeonVector::mul_complex(tmp1[1], self.twiddle1);
- tmp1[2] = NeonVector::mul_complex(tmp1[2], self.twiddle2);
+ tmp1[2] = self.bf4.rotate.rotate_both_45(tmp1[2]);
  tmp1[3] = NeonVector::mul_complex(tmp1[3], self.twiddle3);

  ////////////////////////////////////////////////////////////
Expand Down

0 comments on commit a83161f

Please sign in to comment.