From 22209139d355142e20053c9e2de0596e141662ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Magnenat?= Date: Wed, 14 Jun 2023 10:50:18 +0200 Subject: [PATCH] Added SIMD versions of filters, for 25%-400% performance improvement depending on kernel size. --- akaze/Cargo.toml | 1 + akaze/benches/criterion.rs | 37 ++++++++++++++++- akaze/src/image.rs | 85 +++++++++++++++++++++++++++++--------- akaze/src/lib.rs | 2 +- 4 files changed, 102 insertions(+), 23 deletions(-) diff --git a/akaze/Cargo.toml b/akaze/Cargo.toml index 8411696e..15b77dd8 100644 --- a/akaze/Cargo.toml +++ b/akaze/Cargo.toml @@ -28,6 +28,7 @@ bitarray = "0.9.0" thiserror = { version = "1.0.40", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } rayon = { version = "1.7.0", optional = true } +wide = "0.7" [dev-dependencies] eight-point = { version = "0.8.0", path = "../eight-point" } diff --git a/akaze/benches/criterion.rs b/akaze/benches/criterion.rs index 523b25d4..88b7607b 100644 --- a/akaze/benches/criterion.rs +++ b/akaze/benches/criterion.rs @@ -14,8 +14,41 @@ fn extract(c: &mut Criterion) { } criterion_group!( - name = benches; + name = akaze; config = Criterion::default().sample_size(10); targets = extract ); -criterion_main!(benches); + +fn bench_horizontal_filter(c: &mut Criterion) { + let image = + akaze::image::GrayFloatImage::from_dynamic(&image::open("../res/0000000000.png").unwrap()); + let small_kernel = akaze::image::gaussian_kernel(1.0, 7); + c.bench_function("horizontal_filter_small_kernel", |b| { + b.iter(|| akaze::image::horizontal_filter(&image.0, &small_kernel)) + }); + let large_kernel = akaze::image::gaussian_kernel(10.0, 71); + c.bench_function("horizontal_filter_large_kernel", |b| { + b.iter(|| akaze::image::horizontal_filter(&image.0, &large_kernel)) + }); +} + +fn bench_vertical_filter(c: &mut Criterion) { + let image = + akaze::image::GrayFloatImage::from_dynamic(&image::open("../res/0000000000.png").unwrap()); + let small_kernel = akaze::image::gaussian_kernel(1.0, 7); + c.bench_function("vertical_filter_small_kernel", |b| { + b.iter(|| akaze::image::vertical_filter(&image.0, &small_kernel)) + }); + let large_kernel = akaze::image::gaussian_kernel(10.0, 71); + c.bench_function("vertical_filter_large_kernel", |b| { + b.iter(|| akaze::image::vertical_filter(&image.0, &large_kernel)) + }); +} + +criterion_group!( + name = akaze_image; + config = Criterion::default().sample_size(10); + targets = bench_horizontal_filter, bench_vertical_filter +); + +criterion_main!(akaze, akaze_image); diff --git a/akaze/src/image.rs b/akaze/src/image.rs index 9aac3a00..7151749e 100644 --- a/akaze/src/image.rs +++ b/akaze/src/image.rs @@ -4,6 +4,7 @@ use log::*; use ndarray::{azip, s, Array2, ArrayView2, ArrayViewMut2}; use nshare::{MutNdarray2, RefNdarray2}; use std::f32; +use wide::f32x4; type GrayImageBuffer = ImageBuffer, Vec>; @@ -178,47 +179,85 @@ impl GrayFloatImage { } } -fn horizontal_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer { +pub fn horizontal_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer { + // Validate kernel size. let kernel_size = kernel.len(); debug_assert!(kernel_size % 2 == 1); let kernel_half_size = kernel_size / 2; + // Prepare output. let width = image.width() as usize; let height = image.height() as usize; let mut output = vec![0.0; width * height]; + // Create SIMD kernel, padded with 0. + let kernel_simd = kernel + .chunks(4) + .map(|chunk| { + let data = [ + #[allow(clippy::get_first)] + chunk.get(0).copied().unwrap_or(0.0), + chunk.get(1).copied().unwrap_or(0.0), + chunk.get(2).copied().unwrap_or(0.0), + chunk.get(3).copied().unwrap_or(0.0), + ]; + f32x4::new(data) + }) + .collect::>(); + let kernel_simd_size = 4 * (kernel_size + 3) / 4; + let kernel_simd_extra_elements = kernel_simd_size - kernel_size; // Process each row independently. let row_in_it = image.as_raw().chunks_exact(width); let row_out_it = output.chunks_exact_mut(width); - let mut scratch = vec![0f32; width + kernel_half_size * 2]; + let mut scratch = vec![0f32; width + kernel_half_size * 2 + kernel_simd_extra_elements]; for (row_in, row_out) in row_in_it.zip(row_out_it) { // Prefill extended buffer with center and edge values. scratch[0..kernel_half_size].fill(row_in[0]); scratch[kernel_half_size..kernel_half_size + width].copy_from_slice(row_in); - scratch[kernel_half_size + width..].fill(row_in[width - 1]); + scratch[kernel_half_size + width..2 * kernel_half_size + width].fill(row_in[width - 1]); + scratch[2 * kernel_half_size + width..].fill(0.); // Apply kernel. scratch - .windows(kernel_size) + .windows(kernel_simd_size) .zip(row_out) .for_each(|(window, output)| { *output = window - .iter() - .zip(kernel.iter()) - .map(|(a, b)| a * b) - .sum::() + .chunks_exact(4) + .map(|chunk| f32x4::new([chunk[0], chunk[1], chunk[2], chunk[3]])) + .zip(kernel_simd.iter()) + .fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(*b, acc)) + .reduce_add() }); } GrayImageBuffer::from_raw(width as u32, height as u32, output).unwrap() } -fn vertical_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer { +pub fn vertical_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer { + // Validate kernel size. let kernel_size = kernel.len(); debug_assert!(kernel_size % 2 == 1); let kernel_half_size = kernel_size / 2; + // Prepare output. let width = image.width() as usize; let height = image.height() as usize; let mut output = vec![0.0; width * height]; + // Create SIMD kernel, padded with 0. + let kernel_simd = kernel + .chunks(4) + .map(|chunk| { + let data = [ + #[allow(clippy::get_first)] + chunk.get(0).copied().unwrap_or(0.0), + chunk.get(1).copied().unwrap_or(0.0), + chunk.get(2).copied().unwrap_or(0.0), + chunk.get(3).copied().unwrap_or(0.0), + ]; + f32x4::new(data) + }) + .collect::>(); + let kernel_simd_size = 4 * (kernel_size + 3) / 4; + let kernel_simd_extra_elements = kernel_simd_size - kernel_size; // We use a scratch buffer of L1 cache width (64 bytes) to optimize memory access. const SCRATCH_WIDTH: usize = 16; - let scratch_height = height + kernel_half_size * 2; + let scratch_height = height + kernel_half_size * 2 + kernel_simd_extra_elements; let mut scratch = vec![0f32; SCRATCH_WIDTH * scratch_height]; let image = image.as_raw(); for x_s in (0..width).step_by(SCRATCH_WIDTH) { @@ -235,6 +274,9 @@ fn vertical_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer { for i in 0..kernel_half_size { scratch[scratch_end_col_start + i] = image[image_last_row_start + x]; } + for i in 0..kernel_simd_extra_elements { + scratch[scratch_end_col_start + kernel_half_size + i] = 0.; + } } // Then main content. for y in 0..height { @@ -252,14 +294,15 @@ fn vertical_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer { .enumerate() .for_each(|(dx, col)| { let x = x_s + dx; - col.windows(kernel_size) + col.windows(kernel_simd_size) .enumerate() .for_each(|(y, window)| { let value = window - .iter() - .zip(kernel.iter()) - .map(|(a, b)| a * b) - .sum::(); + .chunks_exact(4) + .map(|chunk| f32x4::new([chunk[0], chunk[1], chunk[2], chunk[3]])) + .zip(kernel_simd.iter()) + .fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(*b, acc)) + .reduce_add(); output[y * width + x] = value; }); }); @@ -267,7 +310,7 @@ fn vertical_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer { GrayImageBuffer::from_raw(width as u32, height as u32, output).unwrap() } -pub(crate) fn separable_filter( +pub fn separable_filter( image: &GrayImageBuffer, h_kernel: &[f32], v_kernel: &[f32], @@ -287,14 +330,15 @@ fn gaussian(x: f32, r: f32) -> f32 { ((2.0 * f32::consts::PI).sqrt() * r).recip() * (-x.powi(2) / (2.0 * r.powi(2))).exp() } -/// Generate a Gaussina kernel. +/// Generate a Gaussian kernel. /// /// # Arguments /// * `r` - sigma. /// * `kernel_size` - The size of the kernel. /// # Return value /// The kernel (a vector). -fn gaussian_kernel(r: f32, kernel_size: usize) -> Vec { +pub fn gaussian_kernel(r: f32, kernel_size: usize) -> Vec { + assert!(kernel_size % 2 == 1, "kernel_size must be odd"); let mut kernel = vec![0f32; kernel_size]; let half_width = (kernel_size / 2) as i32; let mut sum = 0f32; @@ -327,6 +371,7 @@ pub fn gaussian_blur(image: &GrayFloatImage, r: f32) -> GrayFloatImage { #[cfg(test)] mod tests { use super::gaussian_kernel; + #[test] fn gaussian_kernel_correct() { // test against known correct kernel @@ -353,7 +398,7 @@ mod tests { let kernel = gaussian_kernel(3.0, 7); let filtered_ours = super::horizontal_filter(&image.0, &kernel); let filtered_imageproc = imageproc::filter::horizontal_filter(&image.0, &kernel); - imageproc::assert_pixels_eq!(filtered_ours, filtered_imageproc); + imageproc::assert_pixels_eq_within!(filtered_ours, filtered_imageproc, 0.0001); } #[test] @@ -363,6 +408,6 @@ mod tests { let kernel = gaussian_kernel(3.0, 7); let filtered_ours = super::vertical_filter(&image.0, &kernel); let filtered_imageproc = imageproc::filter::vertical_filter(&image.0, &kernel); - imageproc::assert_pixels_eq!(filtered_ours, filtered_imageproc); + imageproc::assert_pixels_eq_within!(filtered_ours, filtered_imageproc, 0.0001); } } diff --git a/akaze/src/lib.rs b/akaze/src/lib.rs index 6e015b9a..dd696368 100644 --- a/akaze/src/lib.rs +++ b/akaze/src/lib.rs @@ -4,7 +4,7 @@ mod descriptors; mod detector_response; mod evolution; mod fed_tau; -mod image; +pub mod image; mod nonlinear_diffusion; mod scale_space_extrema;