Optimize memcpy, memmove and memset #405
@@ -1,27 +1,257 @@
use core::intrinsics::likely;

const WORD_SIZE: usize = core::mem::size_of::<usize>();
const WORD_MASK: usize = WORD_SIZE - 1;

// If the number of bytes involved exceeds this threshold we opt into word-wise copy.
// The value selected here is max(2 * WORD_SIZE, 16):
// * We need at least 2 * WORD_SIZE bytes to guarantee that at least 1 word will be copied through
//   word-wise copy.
// * The word-wise copy logic needs to perform some checks so it has some small overhead.
//   The lower bound of 16 ensures that even on 32-bit platforms we have copied at least 8 bytes
//   through word-wise copy, so the saving of word-wise copy outweighs the fixed overhead.
const WORD_COPY_THRESHOLD: usize = if 2 * WORD_SIZE > 16 {
    2 * WORD_SIZE
} else {
    16
};

#[cfg(feature = "mem-unaligned")]
unsafe fn read_usize_unaligned(x: *const usize) -> usize {
    // Do not use `core::ptr::read_unaligned` here, since it calls `copy_nonoverlapping` which
    // is translated to memcpy in LLVM.
    let x_read = (x as *const [u8; core::mem::size_of::<usize>()]).read();
    core::mem::transmute(x_read)
}

#[inline(always)]
-pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) {
-    let mut i = 0;
-    while i < n {
-        *dest.add(i) = *src.add(i);
-        i += 1;
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
    #[inline(always)]
    unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
        let dest_end = dest.add(n);
        while dest < dest_end {
            *dest = *src;
            dest = dest.add(1);
            src = src.add(1);
        }
    }

    #[inline(always)]
    unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
        let mut dest_usize = dest as *mut usize;
        let mut src_usize = src as *mut usize;
        let dest_end = dest.add(n) as *mut usize;

        while dest_usize < dest_end {
            *dest_usize = *src_usize;
            dest_usize = dest_usize.add(1);
            src_usize = src_usize.add(1);
        }
    }

    #[cfg(not(feature = "mem-unaligned"))]
    #[inline(always)]
    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
        let mut dest_usize = dest as *mut usize;
        let dest_end = dest.add(n) as *mut usize;

        // Calculate the misalignment offset and shift needed to reassemble value.
        let offset = src as usize & WORD_MASK;
        let shift = offset * 8;

        // Realign src
        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
        // This will read (but won't use) bytes out of bound.
        let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned);

        while dest_usize < dest_end {
            src_aligned = src_aligned.add(1);
            let cur_word = *src_aligned;
            #[cfg(target_endian = "little")]
            let resembled = prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift);
            #[cfg(target_endian = "big")]
            let resembled = prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift);
            prev_word = cur_word;

            *dest_usize = resembled;
            dest_usize = dest_usize.add(1);
        }
    }

    #[cfg(feature = "mem-unaligned")]
    #[inline(always)]
    unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
        let mut dest_usize = dest as *mut usize;
        let mut src_usize = src as *mut usize;
        let dest_end = dest.add(n) as *mut usize;

        while dest_usize < dest_end {
            *dest_usize = read_usize_unaligned(src_usize);
            dest_usize = dest_usize.add(1);
            src_usize = src_usize.add(1);
        }
    }

    if n >= WORD_COPY_THRESHOLD {
        // Align dest
        // Because of n >= 2 * WORD_SIZE, dst_misalignment < n
        let dest_misalignment = (dest as usize).wrapping_neg() & WORD_MASK;
[Review thread on the `dest_misalignment` computation]
Reviewer: Would it be possible to use the `align_offset` method on pointers here instead of this manual bit trick?
Author: From `align_offset`'s doc: the implementation is permitted to always return `usize::MAX`, so only an algorithm's performance, never its correctness, may depend on getting a usable offset. So that rules out the use of `align_offset`. Also note that this is the very hot path and I would prefer simple bit tricks rather than relying on LLVM to optimise out a very complex function.
Reviewer: Could you try benchmarking to see if there is overhead?
Author: It's not related to performance; `align_offset` simply could not be used here for correctness reasons.
[A short standalone illustration of this bit trick appears after the copy_forward function below.]

        copy_forward_bytes(dest, src, dest_misalignment);
        dest = dest.add(dest_misalignment);
        src = src.add(dest_misalignment);
        n -= dest_misalignment;

        let n_words = n & !WORD_MASK;
        let src_misalignment = src as usize & WORD_MASK;
        if likely(src_misalignment == 0) {
            copy_forward_aligned_words(dest, src, n_words);
        } else {
            copy_forward_misaligned_words(dest, src, n_words);
[Review thread on the misaligned-copy branch]
Reviewer: Out of curiosity, have you tested simply copying with unaligned word reads (`read_unaligned`) here?
Author: Well, first of all, `core::ptr::read_unaligned` calls `copy_nonoverlapping`, which LLVM translates into a memcpy call, so it cannot be used inside the memcpy implementation itself. Secondly, this branch is necessary because we don't want to bear the burden of all the shifts and checks necessary for misaligned loads if dest and src are perfectly co-aligned.
Reviewer: Have you tried this out and seen infinite recursion? Have you tried it out and seen if it's slower?
Author: There'll be an infinite recursion if compiled in debug mode. On architectures that do not support misaligned loads (so most ISAs other than armv8 and x86/64) the performance is much slower, because it generates 8 byte loads and 16 bit-ops rather than 1 word load and 3 bit-ops.
[A worked example of the shift-based word reassembly appears after the copy_backward function below.]
        }
        dest = dest.add(n_words);
        src = src.add(n_words);
        n -= n_words;
    }
    copy_forward_bytes(dest, src, n);
}
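
Editorial note on the `align_offset` discussion above: the expression `(dest as usize).wrapping_neg() & WORD_MASK` computes how many bytes must be copied byte-wise before `dest` reaches a word boundary. The following is a minimal standalone sketch, not part of the PR; the helper name `bytes_until_aligned` and the addresses in the loop are made up for illustration.

// Editorial sketch: why `wrapping_neg() & WORD_MASK` is the distance to the next word boundary.
const WORD_SIZE: usize = core::mem::size_of::<usize>();
const WORD_MASK: usize = WORD_SIZE - 1;

fn bytes_until_aligned(addr: usize) -> usize {
    // wrapping_neg() computes 2^BITS - addr (mod 2^BITS); masking keeps only the low bits,
    // i.e. how many bytes remain until the next multiple of WORD_SIZE.
    addr.wrapping_neg() & WORD_MASK
}

fn main() {
    for addr in 0x1000usize..0x1010 {
        let pad = bytes_until_aligned(addr);
        // The padding is always smaller than a word and lands exactly on a word boundary.
        assert!(pad < WORD_SIZE);
        assert_eq!((addr + pad) % WORD_SIZE, 0);
    }
    println!("ok");
}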

#[inline(always)]
-pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) {
-    // copy from end
-    let mut i = n;
-    while i != 0 {
-        i -= 1;
-        *dest.add(i) = *src.add(i);
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, mut n: usize) {
    // The following backward copy helper functions use the pointers past the end
    // as their inputs instead of pointers to the start!
    #[inline(always)]
    unsafe fn copy_backward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
        let dest_start = dest.sub(n);
        while dest_start < dest {
            dest = dest.sub(1);
            src = src.sub(1);
            *dest = *src;
        }
    }

    #[inline(always)]
    unsafe fn copy_backward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
        let mut dest_usize = dest as *mut usize;
        let mut src_usize = src as *mut usize;
        let dest_start = dest.sub(n) as *mut usize;

        while dest_start < dest_usize {
            dest_usize = dest_usize.sub(1);
            src_usize = src_usize.sub(1);
            *dest_usize = *src_usize;
        }
    }

    #[cfg(not(feature = "mem-unaligned"))]
    #[inline(always)]
    unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
        let mut dest_usize = dest as *mut usize;
        let dest_start = dest.sub(n) as *mut usize;

        // Calculate the misalignment offset and shift needed to reassemble value.
        let offset = src as usize & WORD_MASK;
        let shift = offset * 8;

        // Realign src_aligned
        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
        // This will read (but won't use) bytes out of bound.
        let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned);

        while dest_start < dest_usize {
            src_aligned = src_aligned.sub(1);
            let cur_word = *src_aligned;
            #[cfg(target_endian = "little")]
            let resembled = prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift;
            #[cfg(target_endian = "big")]
            let resembled = prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift;
            prev_word = cur_word;

            dest_usize = dest_usize.sub(1);
            *dest_usize = resembled;
        }
    }

    #[cfg(feature = "mem-unaligned")]
    #[inline(always)]
    unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
        let mut dest_usize = dest as *mut usize;
        let mut src_usize = src as *mut usize;
        let dest_start = dest.sub(n) as *mut usize;

        while dest_start < dest_usize {
            dest_usize = dest_usize.sub(1);
            src_usize = src_usize.sub(1);
            *dest_usize = read_usize_unaligned(src_usize);
        }
    }

    let mut dest = dest.add(n);
    let mut src = src.add(n);

    if n >= WORD_COPY_THRESHOLD {
        // Align dest
        // Because of n >= 2 * WORD_SIZE, dst_misalignment < n
        let dest_misalignment = dest as usize & WORD_MASK;
        copy_backward_bytes(dest, src, dest_misalignment);
        dest = dest.sub(dest_misalignment);
        src = src.sub(dest_misalignment);
        n -= dest_misalignment;

        let n_words = n & !WORD_MASK;
        let src_misalignment = src as usize & WORD_MASK;
        if likely(src_misalignment == 0) {
            copy_backward_aligned_words(dest, src, n_words);
        } else {
            copy_backward_misaligned_words(dest, src, n_words);
        }
        dest = dest.sub(n_words);
        src = src.sub(n_words);
        n -= n_words;
    }
    copy_backward_bytes(dest, src, n);
}
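
To make the shift logic in `copy_forward_misaligned_words` and `copy_backward_misaligned_words` concrete, here is a small standalone worked example, not part of the PR; it assumes a 64-bit little-endian target, a 3-byte misalignment, and made-up byte values.

fn main() {
    // Assume a source misaligned by 3 bytes, so offset = 3 and shift = offset * 8 = 24.
    let shift: u32 = 3 * 8;
    let word_bits: u32 = 64;
    // Two consecutive aligned words as they would be loaded from memory.
    // On a little-endian target, byte 0 of a word is its least significant byte.
    let prev_word: u64 = 0x1716151413121110; // bytes 0x10, 0x11, ..., 0x17
    let cur_word: u64 = 0x1f1e1d1c1b1a1918; // bytes 0x18, 0x19, ..., 0x1f
    // The misaligned word starting 3 bytes in consists of bytes 0x13..=0x1a.
    // This mirrors the little-endian branch of copy_forward_misaligned_words.
    let reassembled = prev_word >> shift | cur_word << (word_bits - shift);
    assert_eq!(reassembled, 0x1a19181716151413);
    println!("{:#018x}", reassembled);
}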

#[inline(always)]
-pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) {
-    let mut i = 0;
-    while i < n {
-        *s.add(i) = c;
-        i += 1;
pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) {
    #[inline(always)]
    pub unsafe fn set_bytes_bytes(mut s: *mut u8, c: u8, n: usize) {
        let end = s.add(n);
        while s < end {
            *s = c;
            s = s.add(1);
        }
    }

    #[inline(always)]
    pub unsafe fn set_bytes_words(s: *mut u8, c: u8, n: usize) {
        let mut broadcast = c as usize;
        let mut bits = 8;
        while bits < WORD_SIZE * 8 {
            broadcast |= broadcast << bits;
            bits *= 2;
        }

        let mut s_usize = s as *mut usize;
        let end = s.add(n) as *mut usize;

        while s_usize < end {
            *s_usize = broadcast;
            s_usize = s_usize.add(1);
        }
    }

    if likely(n >= WORD_COPY_THRESHOLD) {
        // Align s
        // Because of n >= 2 * WORD_SIZE, dst_misalignment < n
        let misalignment = (s as usize).wrapping_neg() & WORD_MASK;
        set_bytes_bytes(s, c, misalignment);
        s = s.add(misalignment);
        n -= misalignment;

        let n_words = n & !WORD_MASK;
        set_bytes_words(s, c, n_words);
        s = s.add(n_words);
        n -= n_words;
    }
    set_bytes_bytes(s, c, n);
}
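
Closing editorial note: the broadcast loop in `set_bytes_words` builds a word in which every byte equals `c`, doubling the filled width each iteration (8, 16, 32, ... bits). The following standalone check is not part of the PR; the helper name `broadcast_byte` is made up for illustration.

fn broadcast_byte(c: u8) -> usize {
    const WORD_SIZE: usize = core::mem::size_of::<usize>();
    // Same doubling trick as in set_bytes_words: 0xAB -> 0xABAB -> 0xABABABAB -> ...
    let mut broadcast = c as usize;
    let mut bits = 8;
    while bits < WORD_SIZE * 8 {
        broadcast |= broadcast << bits;
        bits *= 2;
    }
    broadcast
}

fn main() {
    // Every byte of the resulting word equals the input byte, regardless of word size.
    assert_eq!(
        broadcast_byte(0xab).to_ne_bytes(),
        [0xab; core::mem::size_of::<usize>()]
    );
    // Prints 0xabababababababab on a 64-bit target, 0xabababab on a 32-bit one.
    println!("{:#x}", broadcast_byte(0xab));
}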