Skip to content

Commit

Permalink
Added SIMD versions of filters, for 25%-400% performance improvement …
Browse files Browse the repository at this point in the history
…depending on kernel size.
  • Loading branch information
stephanemagnenat committed Jun 17, 2023
1 parent 43dbc1e commit 2220913
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 23 deletions.
1 change: 1 addition & 0 deletions akaze/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ bitarray = "0.9.0"
thiserror = { version = "1.0.40", default-features = false }
serde = { version = "1.0", default-features = false, features = ["derive"], optional = true }
rayon = { version = "1.7.0", optional = true }
wide = "0.7"

[dev-dependencies]
eight-point = { version = "0.8.0", path = "../eight-point" }
Expand Down
37 changes: 35 additions & 2 deletions akaze/benches/criterion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,41 @@ fn extract(c: &mut Criterion) {
}

criterion_group!(
name = benches;
name = akaze;
config = Criterion::default().sample_size(10);
targets = extract
);
criterion_main!(benches);

fn bench_horizontal_filter(c: &mut Criterion) {
let image =
akaze::image::GrayFloatImage::from_dynamic(&image::open("../res/0000000000.png").unwrap());
let small_kernel = akaze::image::gaussian_kernel(1.0, 7);
c.bench_function("horizontal_filter_small_kernel", |b| {
b.iter(|| akaze::image::horizontal_filter(&image.0, &small_kernel))
});
let large_kernel = akaze::image::gaussian_kernel(10.0, 71);
c.bench_function("horizontal_filter_large_kernel", |b| {
b.iter(|| akaze::image::horizontal_filter(&image.0, &large_kernel))
});
}

fn bench_vertical_filter(c: &mut Criterion) {
let image =
akaze::image::GrayFloatImage::from_dynamic(&image::open("../res/0000000000.png").unwrap());
let small_kernel = akaze::image::gaussian_kernel(1.0, 7);
c.bench_function("vertical_filter_small_kernel", |b| {
b.iter(|| akaze::image::vertical_filter(&image.0, &small_kernel))
});
let large_kernel = akaze::image::gaussian_kernel(10.0, 71);
c.bench_function("vertical_filter_large_kernel", |b| {
b.iter(|| akaze::image::vertical_filter(&image.0, &large_kernel))
});
}

criterion_group!(
name = akaze_image;
config = Criterion::default().sample_size(10);
targets = bench_horizontal_filter, bench_vertical_filter
);

criterion_main!(akaze, akaze_image);
85 changes: 65 additions & 20 deletions akaze/src/image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use log::*;
use ndarray::{azip, s, Array2, ArrayView2, ArrayViewMut2};
use nshare::{MutNdarray2, RefNdarray2};
use std::f32;
use wide::f32x4;

type GrayImageBuffer = ImageBuffer<Luma<f32>, Vec<f32>>;

Expand Down Expand Up @@ -178,47 +179,85 @@ impl GrayFloatImage {
}
}

fn horizontal_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer {
pub fn horizontal_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer {
// Validate kernel size.
let kernel_size = kernel.len();
debug_assert!(kernel_size % 2 == 1);
let kernel_half_size = kernel_size / 2;
// Prepare output.
let width = image.width() as usize;
let height = image.height() as usize;
let mut output = vec![0.0; width * height];
// Create SIMD kernel, padded with 0.
let kernel_simd = kernel
.chunks(4)
.map(|chunk| {
let data = [
#[allow(clippy::get_first)]
chunk.get(0).copied().unwrap_or(0.0),
chunk.get(1).copied().unwrap_or(0.0),
chunk.get(2).copied().unwrap_or(0.0),
chunk.get(3).copied().unwrap_or(0.0),
];
f32x4::new(data)
})
.collect::<Vec<_>>();
let kernel_simd_size = 4 * (kernel_size + 3) / 4;
let kernel_simd_extra_elements = kernel_simd_size - kernel_size;
// Process each row independently.
let row_in_it = image.as_raw().chunks_exact(width);
let row_out_it = output.chunks_exact_mut(width);
let mut scratch = vec![0f32; width + kernel_half_size * 2];
let mut scratch = vec![0f32; width + kernel_half_size * 2 + kernel_simd_extra_elements];
for (row_in, row_out) in row_in_it.zip(row_out_it) {
// Prefill extended buffer with center and edge values.
scratch[0..kernel_half_size].fill(row_in[0]);
scratch[kernel_half_size..kernel_half_size + width].copy_from_slice(row_in);
scratch[kernel_half_size + width..].fill(row_in[width - 1]);
scratch[kernel_half_size + width..2 * kernel_half_size + width].fill(row_in[width - 1]);
scratch[2 * kernel_half_size + width..].fill(0.);
// Apply kernel.
scratch
.windows(kernel_size)
.windows(kernel_simd_size)
.zip(row_out)
.for_each(|(window, output)| {
*output = window
.iter()
.zip(kernel.iter())
.map(|(a, b)| a * b)
.sum::<f32>()
.chunks_exact(4)
.map(|chunk| f32x4::new([chunk[0], chunk[1], chunk[2], chunk[3]]))
.zip(kernel_simd.iter())
.fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(*b, acc))
.reduce_add()
});
}
GrayImageBuffer::from_raw(width as u32, height as u32, output).unwrap()
}

fn vertical_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer {
pub fn vertical_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer {
// Validate kernel size.
let kernel_size = kernel.len();
debug_assert!(kernel_size % 2 == 1);
let kernel_half_size = kernel_size / 2;
// Prepare output.
let width = image.width() as usize;
let height = image.height() as usize;
let mut output = vec![0.0; width * height];
// Create SIMD kernel, padded with 0.
let kernel_simd = kernel
.chunks(4)
.map(|chunk| {
let data = [
#[allow(clippy::get_first)]
chunk.get(0).copied().unwrap_or(0.0),
chunk.get(1).copied().unwrap_or(0.0),
chunk.get(2).copied().unwrap_or(0.0),
chunk.get(3).copied().unwrap_or(0.0),
];
f32x4::new(data)
})
.collect::<Vec<_>>();
let kernel_simd_size = 4 * (kernel_size + 3) / 4;
let kernel_simd_extra_elements = kernel_simd_size - kernel_size;
// We use a scratch buffer of L1 cache width (64 bytes) to optimize memory access.
const SCRATCH_WIDTH: usize = 16;
let scratch_height = height + kernel_half_size * 2;
let scratch_height = height + kernel_half_size * 2 + kernel_simd_extra_elements;
let mut scratch = vec![0f32; SCRATCH_WIDTH * scratch_height];
let image = image.as_raw();
for x_s in (0..width).step_by(SCRATCH_WIDTH) {
Expand All @@ -235,6 +274,9 @@ fn vertical_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer {
for i in 0..kernel_half_size {
scratch[scratch_end_col_start + i] = image[image_last_row_start + x];
}
for i in 0..kernel_simd_extra_elements {
scratch[scratch_end_col_start + kernel_half_size + i] = 0.;
}
}
// Then main content.
for y in 0..height {
Expand All @@ -252,22 +294,23 @@ fn vertical_filter(image: &GrayImageBuffer, kernel: &[f32]) -> GrayImageBuffer {
.enumerate()
.for_each(|(dx, col)| {
let x = x_s + dx;
col.windows(kernel_size)
col.windows(kernel_simd_size)
.enumerate()
.for_each(|(y, window)| {
let value = window
.iter()
.zip(kernel.iter())
.map(|(a, b)| a * b)
.sum::<f32>();
.chunks_exact(4)
.map(|chunk| f32x4::new([chunk[0], chunk[1], chunk[2], chunk[3]]))
.zip(kernel_simd.iter())
.fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(*b, acc))
.reduce_add();
output[y * width + x] = value;
});
});
}
GrayImageBuffer::from_raw(width as u32, height as u32, output).unwrap()
}

pub(crate) fn separable_filter(
pub fn separable_filter(
image: &GrayImageBuffer,
h_kernel: &[f32],
v_kernel: &[f32],
Expand All @@ -287,14 +330,15 @@ fn gaussian(x: f32, r: f32) -> f32 {
((2.0 * f32::consts::PI).sqrt() * r).recip() * (-x.powi(2) / (2.0 * r.powi(2))).exp()
}

/// Generate a Gaussina kernel.
/// Generate a Gaussian kernel.
///
/// # Arguments
/// * `r` - sigma.
/// * `kernel_size` - The size of the kernel.
/// # Return value
/// The kernel (a vector).
fn gaussian_kernel(r: f32, kernel_size: usize) -> Vec<f32> {
pub fn gaussian_kernel(r: f32, kernel_size: usize) -> Vec<f32> {
assert!(kernel_size % 2 == 1, "kernel_size must be odd");
let mut kernel = vec![0f32; kernel_size];
let half_width = (kernel_size / 2) as i32;
let mut sum = 0f32;
Expand Down Expand Up @@ -327,6 +371,7 @@ pub fn gaussian_blur(image: &GrayFloatImage, r: f32) -> GrayFloatImage {
#[cfg(test)]
mod tests {
use super::gaussian_kernel;

#[test]
fn gaussian_kernel_correct() {
// test against known correct kernel
Expand All @@ -353,7 +398,7 @@ mod tests {
let kernel = gaussian_kernel(3.0, 7);
let filtered_ours = super::horizontal_filter(&image.0, &kernel);
let filtered_imageproc = imageproc::filter::horizontal_filter(&image.0, &kernel);
imageproc::assert_pixels_eq!(filtered_ours, filtered_imageproc);
imageproc::assert_pixels_eq_within!(filtered_ours, filtered_imageproc, 0.0001);
}

#[test]
Expand All @@ -363,6 +408,6 @@ mod tests {
let kernel = gaussian_kernel(3.0, 7);
let filtered_ours = super::vertical_filter(&image.0, &kernel);
let filtered_imageproc = imageproc::filter::vertical_filter(&image.0, &kernel);
imageproc::assert_pixels_eq!(filtered_ours, filtered_imageproc);
imageproc::assert_pixels_eq_within!(filtered_ours, filtered_imageproc, 0.0001);
}
}
2 changes: 1 addition & 1 deletion akaze/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ mod descriptors;
mod detector_response;
mod evolution;
mod fed_tau;
mod image;
pub mod image;
mod nonlinear_diffusion;
mod scale_space_extrema;

Expand Down

0 comments on commit 2220913

Please sign in to comment.