Skip to content

Commit ac0e0b1

Browse files
committed
[Memory] Fix alignment to 16 bytes
Fix a segmentation fault on Linux that occurs when `src` or `dest` is not aligned to 16 bytes. Use `_mm_storeu_si128` for the store, as there is no guarantee that `dest` is properly aligned. Copy all leading words/dwords that precede the first 16-byte-aligned `src` address with the scalar path before entering the SIMD loop.
1 parent a134721 commit ac0e0b1

File tree

1 file changed

+9
-6
lines changed

1 file changed

+9
-6
lines changed

src/xenia/base/memory.cc

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,16 @@ void copy_and_swap_16_aligned(void* dest_ptr, const void* src_ptr,
3434
0x04, 0x05, 0x02, 0x03, 0x00, 0x01);
3535

3636
size_t i = 0;
37-
size_t unaligned_words = (reinterpret_cast<uintptr_t>(src_ptr) & 0xF) / 2;
37+
size_t unaligned_words =
38+
(0x10 - (reinterpret_cast<uintptr_t>(src_ptr) & 0xF)) / sizeof(uint16_t);
3839
for (; unaligned_words > 0 && i < count; unaligned_words--, i++) {
3940
// Copy up to 16 byte alignment.
4041
dest[i] = byte_swap(src[i]);
4142
}
4243
for (; i + 8 <= count; i += 8) {
4344
__m128i input = _mm_load_si128(reinterpret_cast<const __m128i*>(&src[i]));
4445
__m128i output = _mm_shuffle_epi8(input, shufmask);
45-
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
46+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
4647
}
4748
for (; i < count; ++i) { // handle residual elements
4849
dest[i] = byte_swap(src[i]);
@@ -78,15 +79,16 @@ void copy_and_swap_32_aligned(void* dest_ptr, const void* src_ptr,
7879
0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
7980

8081
size_t i = 0;
81-
size_t unaligned_dwords = (reinterpret_cast<uintptr_t>(src_ptr) & 0xF) / 4;
82+
size_t unaligned_dwords =
83+
(0x10 - (reinterpret_cast<uintptr_t>(src_ptr) & 0xF)) / sizeof(uint32_t);
8284
for (; unaligned_dwords > 0 && i < count; unaligned_dwords--, i++) {
8385
// Copy up to 16 byte alignment.
8486
dest[i] = byte_swap(src[i]);
8587
}
8688
for (; i + 4 <= count; i += 4) {
8789
__m128i input = _mm_load_si128(reinterpret_cast<const __m128i*>(&src[i]));
8890
__m128i output = _mm_shuffle_epi8(input, shufmask);
89-
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
91+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
9092
}
9193
for (; i < count; ++i) { // handle residual elements
9294
dest[i] = byte_swap(src[i]);
@@ -122,15 +124,16 @@ void copy_and_swap_64_aligned(void* dest_ptr, const void* src_ptr,
122124
0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
123125

124126
size_t i = 0;
125-
size_t unaligned_qwords = (reinterpret_cast<uintptr_t>(src_ptr) & 0xF) / 8;
127+
size_t unaligned_qwords =
128+
(reinterpret_cast<uintptr_t>(src_ptr) & 0xF) / sizeof(uint64_t);
126129
for (; unaligned_qwords > 0 && i < count; unaligned_qwords--, i++) {
127130
// Copy up to 16 byte alignment.
128131
dest[i] = byte_swap(src[i]);
129132
}
130133
for (; i + 2 <= count; i += 2) {
131134
__m128i input = _mm_load_si128(reinterpret_cast<const __m128i*>(&src[i]));
132135
__m128i output = _mm_shuffle_epi8(input, shufmask);
133-
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
136+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
134137
}
135138
for (; i < count; ++i) { // handle residual elements
136139
dest[i] = byte_swap(src[i]);

0 commit comments

Comments
 (0)