From a83161f511920f709dcf6d3bccce104f5ca580f0 Mon Sep 17 00:00:00 2001 From: Elliott Mahler Date: Mon, 26 Feb 2024 21:00:14 -0800 Subject: [PATCH] Use rotate45/135 for neon butterfly16 --- src/neon/neon_butterflies.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/neon/neon_butterflies.rs b/src/neon/neon_butterflies.rs index 64ed699..2e76b63 100644 --- a/src/neon/neon_butterflies.rs +++ b/src/neon/neon_butterflies.rs @@ -2307,9 +2307,7 @@ pub struct NeonF32Butterfly16 { bf4: NeonF32Butterfly4, twiddles_packed: [float32x4_t; 6], twiddle1: float32x4_t, - twiddle2: float32x4_t, twiddle3: float32x4_t, - twiddle6: float32x4_t, twiddle9: float32x4_t, } @@ -2342,9 +2340,7 @@ impl NeonF32Butterfly16 { pack_32(tw6, tw9), ], twiddle1: pack_32(tw1, tw1), - twiddle2: pack_32(tw2, tw2), twiddle3: pack_32(tw3, tw3), - twiddle6: pack_32(tw6, tw6), twiddle9: pack_32(tw9, tw9), } } @@ -2429,11 +2425,11 @@ impl NeonF32Butterfly16 { let [in2, in3] = load(2); let mut tmp2 = self.bf4.perform_parallel_fft_direct(in2); let mut tmp3 = self.bf4.perform_parallel_fft_direct(in3); - tmp2[1] = NeonVector::mul_complex(tmp2[1], self.twiddle2); + tmp2[1] = self.bf4.rotate.rotate_both_45(tmp2[1]); tmp2[2] = self.bf4.rotate.rotate_both(tmp2[2]); - tmp2[3] = NeonVector::mul_complex(tmp2[3], self.twiddle6); + tmp2[3] = self.bf4.rotate.rotate_both_135(tmp2[3]); tmp3[1] = NeonVector::mul_complex(tmp3[1], self.twiddle3); - tmp3[2] = NeonVector::mul_complex(tmp3[2], self.twiddle6); + tmp3[2] = self.bf4.rotate.rotate_both_135(tmp3[2]); tmp3[3] = NeonVector::mul_complex(tmp3[3], self.twiddle9); // Do these last, because fewer twiddles means fewer temporaries forcing the above data to spill @@ -2441,7 +2437,7 @@ impl NeonF32Butterfly16 { let tmp0 = self.bf4.perform_parallel_fft_direct(in0); let mut tmp1 = self.bf4.perform_parallel_fft_direct(in1); tmp1[1] = NeonVector::mul_complex(tmp1[1], self.twiddle1); - tmp1[2] = NeonVector::mul_complex(tmp1[2], self.twiddle2); + tmp1[2] = self.bf4.rotate.rotate_both_45(tmp1[2]); tmp1[3] = NeonVector::mul_complex(tmp1[3], self.twiddle3); ////////////////////////////////////////////////////////////