Commit 2ee37fd

optimize the copying done in the ringbuffer

1 parent: 29a6566

1 file changed: src/decoding/ringbuffer.rs (88 additions, 19 deletions)

@@ -21,6 +21,7 @@ pub struct RingBuffer {
 
 // SAFETY: RingBuffer does not hold any thread specific values -> it can be sent to another thread -> RingBuffer is Send
 unsafe impl Send for RingBuffer {}
+
 // SAFETY: Ringbuffer does not provide unsyncronized interior mutability which makes &RingBuffer Send -> RingBuffer is Sync
 unsafe impl Sync for RingBuffer {}
 
@@ -277,39 +278,45 @@ impl RingBuffer {
             // continous data slice |____HDDDDDDDT_____|
             let after_tail = usize::min(len, self.cap - self.tail);
             unsafe {
-                self.buf
-                    .as_ptr()
-                    .add(self.tail)
-                    .copy_from_nonoverlapping(self.buf.as_ptr().add(self.head + start), after_tail);
+                let src = (
+                    self.buf.as_ptr().cast_const().add(self.head + start),
+                    self.tail - self.head,
+                );
+                let dst = (self.buf.as_ptr().add(self.tail), self.cap - self.tail);
+                copy_bytes_overshooting(src, dst, after_tail);
+
                 if after_tail < len {
-                    self.buf.as_ptr().copy_from_nonoverlapping(
-                        self.buf.as_ptr().add(self.head + start + after_tail),
-                        len - after_tail,
-                    );
+                    let src = (src.0.add(after_tail), src.1 - after_tail);
+                    let dst = (self.buf.as_ptr(), self.head);
+                    copy_bytes_overshooting(src, dst, len - after_tail);
                 }
             }
         } else {
             // continous free slice |DDDT_________HDDDD|
             if self.head + start > self.cap {
                 let start = (self.head + start) % self.cap;
                 unsafe {
-                    self.buf
-                        .as_ptr()
-                        .add(self.tail)
-                        .copy_from_nonoverlapping(self.buf.as_ptr().add(start), len)
+                    let src = (
+                        self.buf.as_ptr().add(start).cast_const(),
+                        self.cap - self.head,
+                    );
+                    let dst = (self.buf.as_ptr().add(self.tail), self.head - self.tail);
+                    copy_bytes_overshooting(src, dst, len);
                 }
             } else {
                 let after_start = usize::min(len, self.cap - self.head - start);
                 unsafe {
-                    self.buf.as_ptr().add(self.tail).copy_from_nonoverlapping(
-                        self.buf.as_ptr().add(self.head + start),
-                        after_start,
+                    let src = (
+                        self.buf.as_ptr().add(self.head + start).cast_const(),
+                        self.cap - self.head,
                     );
+                    let dst = (self.buf.as_ptr().add(self.tail), self.head - self.tail);
+                    copy_bytes_overshooting(src, dst, after_start);
+
                     if after_start < len {
-                        self.buf
-                            .as_ptr()
-                            .add(self.tail + after_start)
-                            .copy_from_nonoverlapping(self.buf.as_ptr(), len - after_start);
+                        let src = (self.buf.as_ptr().cast_const(), self.tail);
+                        let dst = (dst.0.add(after_start), dst.1 - after_start);
+                        copy_bytes_overshooting(src, dst, len - after_start);
                     }
                 }
             }
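
Aside, not part of the commit: each call site above passes src and dst as a (pointer, available bytes) pair, where the second field is the total span that may legally be read or written, not just the bytes this copy needs; that slack is what licenses the helper added in the next hunk to overshoot. As a rough illustration of the first branch's wrap-around arithmetic, here is a hypothetical standalone sketch (split_spans and its test numbers are invented for illustration, they are not crate code):

/// Hypothetical sketch: split the first branch's copy-from-within request
/// into its two (src offset, dst offset, length) legs, mirroring the
/// |____HDDDDDDDT_____| case above.
fn split_spans(
    cap: usize,
    head: usize,
    tail: usize,
    start: usize,
    len: usize,
) -> ((usize, usize, usize), Option<(usize, usize, usize)>) {
    // Room left before the destination (the tail) hits the end of the buffer.
    let after_tail = usize::min(len, cap - tail);
    let first = (head + start, tail, after_tail);
    // Whatever remains wraps around to the front of the buffer.
    let second = (after_tail < len).then(|| (head + start + after_tail, 0, len - after_tail));
    (first, second)
}

fn main() {
    // cap = 16, data occupies head = 4 .. tail = 12; copying 6 bytes starting
    // at offset 2 of the stored data: 4 bytes fit after the tail, 2 wrap.
    assert_eq!(split_spans(16, 4, 12, 2, 6), ((6, 12, 4), Some((10, 0, 2))));
}
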
@@ -392,6 +399,68 @@ impl Drop for RingBuffer {
     }
 }
 
+/// Similar to ptr::copy_nonoverlapping
+///
+/// But it might overshoot the desired copy length if deemed useful
+///
+/// src and dst specify the entire length they are eligible for reading/writing respectively
+/// in addition to the desired copy length.
+///
+/// This function will then copy in chunks and might copy up to chunk size - 1 more bytes from src to dst
+/// if that operation does not read/write memory that does not belong to src/dst.
+///
+/// The chunk size is not part of the contract and may change depending on the target platform.
+///
+/// If that isn't possible we just fall back to ptr::copy_nonoverlapping
+#[inline(always)]
+unsafe fn copy_bytes_overshooting(
+    src: (*const u8, usize),
+    dst: (*mut u8, usize),
+    copy_at_least: usize,
+) {
+    // By default use usize as the copy size
+    #[cfg(all(not(target_feature = "sse2"), not(target_feature = "neon")))]
+    type CopyType = usize;
+
+    // Use u128 if we detect a simd feature
+    #[cfg(target_feature = "neon")]
+    type CopyType = u128;
+    #[cfg(target_feature = "sse2")]
+    type CopyType = u128;
+
+    const COPY_AT_ONCE_SIZE: usize = std::mem::size_of::<CopyType>();
+    let min_buffer_size = usize::min(src.1, dst.1);
+
+    // Can copy in just one read+write, very common case
+    if min_buffer_size >= COPY_AT_ONCE_SIZE && copy_at_least <= COPY_AT_ONCE_SIZE {
+        dst.0
+            .cast::<CopyType>()
+            .write_unaligned(src.0.cast::<CopyType>().read_unaligned())
+    } else {
+        let copy_multiple = copy_at_least.next_multiple_of(COPY_AT_ONCE_SIZE);
+        // Can copy in multiple simple instructions
+        if min_buffer_size >= copy_multiple {
+            let mut src_ptr = src.0.cast::<CopyType>();
+            let src_ptr_end = src.0.add(copy_multiple).cast::<CopyType>();
+            let mut dst_ptr = dst.0.cast::<CopyType>();
+
+            while src_ptr < src_ptr_end {
+                dst_ptr.write_unaligned(src_ptr.read_unaligned());
+                src_ptr = src_ptr.add(1);
+                dst_ptr = dst_ptr.add(1);
+            }
+        } else {
+            // Fall back to standard memcopy
+            dst.0.copy_from_nonoverlapping(src.0, copy_at_least);
+        }
+    }
+
+    debug_assert_eq!(
+        slice::from_raw_parts(src.0, copy_at_least),
+        slice::from_raw_parts(dst.0, copy_at_least)
+    );
+}
+
 #[allow(dead_code)]
 #[inline(always)]
 #[allow(clippy::too_many_arguments)]
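
To experiment with the overshooting behaviour outside the decoder, here is a minimal self-contained sketch of the same technique, simplified to usize-sized chunks (the committed helper picks u128 chunks when sse2 or neon is available); the helper body mirrors the diff, while the buffer sizes and the main harness are illustrative assumptions, not crate code:

use std::slice;

/// Simplified copy of the helper added in this commit, fixed to
/// usize-sized chunks. src/dst are (pointer, readable/writable bytes)
/// pairs; the copy may exceed copy_at_least by up to a whole chunk as
/// long as it stays inside those lengths.
unsafe fn copy_bytes_overshooting(
    src: (*const u8, usize),
    dst: (*mut u8, usize),
    copy_at_least: usize,
) {
    type CopyType = usize;
    const CHUNK: usize = std::mem::size_of::<CopyType>();
    let min_buffer_size = usize::min(src.1, dst.1);

    if min_buffer_size >= CHUNK && copy_at_least <= CHUNK {
        // One unaligned chunk read+write covers the whole request.
        dst.0
            .cast::<CopyType>()
            .write_unaligned(src.0.cast::<CopyType>().read_unaligned());
    } else {
        // Round the request up to whole chunks if both buffers have the slack.
        let rounded = copy_at_least.next_multiple_of(CHUNK);
        if min_buffer_size >= rounded {
            let mut s = src.0.cast::<CopyType>();
            let end = src.0.add(rounded).cast::<CopyType>();
            let mut d = dst.0.cast::<CopyType>();
            while s < end {
                d.write_unaligned(s.read_unaligned());
                s = s.add(1);
                d = d.add(1);
            }
        } else {
            // Not enough slack: plain exact-length memcpy.
            dst.0.copy_from_nonoverlapping(src.0, copy_at_least);
        }
    }

    debug_assert_eq!(
        slice::from_raw_parts(src.0, copy_at_least),
        slice::from_raw_parts(dst.0, copy_at_least)
    );
}

fn main() {
    // 13 bytes requested, but both buffers are 32 bytes long, so the helper
    // is free to copy 16 bytes (two 8-byte chunks) instead.
    let src: Vec<u8> = (0..32).collect();
    let mut dst = vec![0u8; 32];
    unsafe {
        copy_bytes_overshooting((src.as_ptr(), src.len()), (dst.as_mut_ptr(), dst.len()), 13);
    }
    assert_eq!(&dst[..13], &src[..13]);
    println!("first 13 bytes copied (possibly a few more): {:?}", &dst[..16]);
}

Note that the debug_assert checks only the requested prefix: bytes past copy_at_least may or may not have been written, which is exactly the freedom the ring buffer's call sites grant by passing the full eligible lengths.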
