@@ -34,15 +34,16 @@ void copy_and_swap_16_aligned(void* dest_ptr, const void* src_ptr,
3434 0x04 , 0x05 , 0x02 , 0x03 , 0x00 , 0x01 );
3535
3636 size_t i = 0 ;
37- size_t unaligned_words = (reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) / 2 ;
37+ size_t unaligned_words =
38+ (0x10 - (reinterpret_cast <uintptr_t >(src_ptr) & 0xF )) / sizeof (uint16_t );
3839 for (; unaligned_words > 0 && i < count; unaligned_words--, i++) {
3940 // Copy up to 16 byte alignment.
4041 dest[i] = byte_swap (src[i]);
4142 }
4243 for (; i + 8 <= count; i += 8 ) {
4344 __m128i input = _mm_load_si128 (reinterpret_cast <const __m128i*>(&src[i]));
4445 __m128i output = _mm_shuffle_epi8 (input, shufmask);
45- _mm_store_si128 (reinterpret_cast <__m128i*>(&dest[i]), output);
46+ _mm_storeu_si128 (reinterpret_cast <__m128i*>(&dest[i]), output);
4647 }
4748 for (; i < count; ++i) { // handle residual elements
4849 dest[i] = byte_swap (src[i]);
@@ -78,15 +79,16 @@ void copy_and_swap_32_aligned(void* dest_ptr, const void* src_ptr,
7879 0x06 , 0x07 , 0x00 , 0x01 , 0x02 , 0x03 );
7980
8081 size_t i = 0 ;
81- size_t unaligned_dwords = (reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) / 4 ;
82+ size_t unaligned_dwords =
83+ (0x10 - (reinterpret_cast <uintptr_t >(src_ptr) & 0xF )) / sizeof (uint32_t );
8284 for (; unaligned_dwords > 0 && i < count; unaligned_dwords--, i++) {
8385 // Copy up to 16 byte alignment.
8486 dest[i] = byte_swap (src[i]);
8587 }
8688 for (; i + 4 <= count; i += 4 ) {
8789 __m128i input = _mm_load_si128 (reinterpret_cast <const __m128i*>(&src[i]));
8890 __m128i output = _mm_shuffle_epi8 (input, shufmask);
89- _mm_store_si128 (reinterpret_cast <__m128i*>(&dest[i]), output);
91+ _mm_storeu_si128 (reinterpret_cast <__m128i*>(&dest[i]), output);
9092 }
9193 for (; i < count; ++i) { // handle residual elements
9294 dest[i] = byte_swap (src[i]);
@@ -122,15 +124,16 @@ void copy_and_swap_64_aligned(void* dest_ptr, const void* src_ptr,
122124 0x02 , 0x03 , 0x04 , 0x05 , 0x06 , 0x07 );
123125
124126 size_t i = 0 ;
125- size_t unaligned_qwords = (reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) / 8 ;
127+ size_t unaligned_qwords =
128+ (reinterpret_cast <uintptr_t >(src_ptr) & 0xF ) / sizeof (uint64_t );
126129 for (; unaligned_qwords > 0 && i < count; unaligned_qwords--, i++) {
127130 // Copy up to 16 byte alignment.
128131 dest[i] = byte_swap (src[i]);
129132 }
130133 for (; i + 2 <= count; i += 2 ) {
131134 __m128i input = _mm_load_si128 (reinterpret_cast <const __m128i*>(&src[i]));
132135 __m128i output = _mm_shuffle_epi8 (input, shufmask);
133- _mm_store_si128 (reinterpret_cast <__m128i*>(&dest[i]), output);
136+ _mm_storeu_si128 (reinterpret_cast <__m128i*>(&dest[i]), output);
134137 }
135138 for (; i < count; ++i) { // handle residual elements
136139 dest[i] = byte_swap (src[i]);
0 commit comments