@@ -27,14 +27,18 @@ void copy_128_aligned(void* dest, const void* src, size_t count) {
2727void copy_and_swap_16_aligned (void * dest_ptr, const void * src_ptr,
2828 size_t count) {
2929 assert_zero (reinterpret_cast <uintptr_t >(src_ptr) & 0x1 );
30+ assert_true ((reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) ==
31+ (reinterpret_cast <uintptr_t >(dest_ptr) & 0xF ));
3032 auto dest = reinterpret_cast <uint16_t *>(dest_ptr);
3133 auto src = reinterpret_cast <const uint16_t *>(src_ptr);
3234 __m128i shufmask =
3335 _mm_set_epi8 (0x0E , 0x0F , 0x0C , 0x0D , 0x0A , 0x0B , 0x08 , 0x09 , 0x06 , 0x07 ,
3436 0x04 , 0x05 , 0x02 , 0x03 , 0x00 , 0x01 );
3537
3638 size_t i = 0 ;
37- size_t unaligned_words = (reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) / 2 ;
39+ size_t unaligned_words =
40+ ((0x10 - (reinterpret_cast <uintptr_t >(src_ptr) & 0xF )) & 0xF ) /
41+ sizeof (uint16_t );
3842 for (; unaligned_words > 0 && i < count; unaligned_words--, i++) {
3943 // Copy up to 16 byte alignment.
4044 dest[i] = byte_swap (src[i]);
@@ -71,14 +75,18 @@ void copy_and_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
7175void copy_and_swap_32_aligned (void * dest_ptr, const void * src_ptr,
7276 size_t count) {
7377 assert_zero (reinterpret_cast <uintptr_t >(src_ptr) & 0x3 );
78+ assert_true ((reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) ==
79+ (reinterpret_cast <uintptr_t >(dest_ptr) & 0xF ));
7480 auto dest = reinterpret_cast <uint32_t *>(dest_ptr);
7581 auto src = reinterpret_cast <const uint32_t *>(src_ptr);
7682 __m128i shufmask =
7783 _mm_set_epi8 (0x0C , 0x0D , 0x0E , 0x0F , 0x08 , 0x09 , 0x0A , 0x0B , 0x04 , 0x05 ,
7884 0x06 , 0x07 , 0x00 , 0x01 , 0x02 , 0x03 );
7985
8086 size_t i = 0 ;
81- size_t unaligned_dwords = (reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) / 4 ;
87+ size_t unaligned_dwords =
88+ ((0x10 - (reinterpret_cast <uintptr_t >(src_ptr) & 0xF )) & 0xF ) /
89+ sizeof (uint32_t );
8290 for (; unaligned_dwords > 0 && i < count; unaligned_dwords--, i++) {
8391 // Copy up to 16 byte alignment.
8492 dest[i] = byte_swap (src[i]);
@@ -115,14 +123,17 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
115123void copy_and_swap_64_aligned (void * dest_ptr, const void * src_ptr,
116124 size_t count) {
117125 assert_zero (reinterpret_cast <uintptr_t >(src_ptr) & 0x7 );
126+ assert_true ((reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) ==
127+ (reinterpret_cast <uintptr_t >(dest_ptr) & 0xF ));
118128 auto dest = reinterpret_cast <uint64_t *>(dest_ptr);
119129 auto src = reinterpret_cast <const uint64_t *>(src_ptr);
120130 __m128i shufmask =
121131 _mm_set_epi8 (0x08 , 0x09 , 0x0A , 0x0B , 0x0C , 0x0D , 0x0E , 0x0F , 0x00 , 0x01 ,
122132 0x02 , 0x03 , 0x04 , 0x05 , 0x06 , 0x07 );
123133
124134 size_t i = 0 ;
125- size_t unaligned_qwords = (reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) / 8 ;
135+ size_t unaligned_qwords =
136+ (reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) / sizeof (uint64_t );
126137 for (; unaligned_qwords > 0 && i < count; unaligned_qwords--, i++) {
127138 // Copy up to 16 byte alignment.
128139 dest[i] = byte_swap (src[i]);
0 commit comments