Skip to content

Commit 2b9764d

Browse files
committed
[x64] Pass m128 args by reference
Clang passes __m128-family arguments (__m128, __m128i, __m128d) in xmm registers, which breaks the guest-to-host thunk's assumption that arguments arrive only in general-purpose registers (rsi/rdx/rcx). Pass these arguments by reference so that Clang generates code compatible with the assumptions made in the thunk.
1 parent fe2b719 commit 2b9764d

File tree

4 files changed

+39
-39
lines changed

4 files changed

+39
-39
lines changed

src/xenia/cpu/backend/x64/x64_seq_vector.cc

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -693,7 +693,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB);
693693
// OPCODE_VECTOR_SHL
694694
// ============================================================================
695695
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
696-
static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
696+
static __m128i EmulateVectorShl(void*, __m128i& src1, __m128i& src2) {
697697
alignas(16) T value[16 / sizeof(T)];
698698
alignas(16) T shamt[16 / sizeof(T)];
699699

@@ -901,7 +901,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
901901
// OPCODE_VECTOR_SHR
902902
// ============================================================================
903903
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
904-
static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
904+
static __m128i EmulateVectorShr(void*, __m128i& src1, __m128i& src2) {
905905
alignas(16) T value[16 / sizeof(T)];
906906
alignas(16) T shamt[16 / sizeof(T)];
907907

@@ -1271,7 +1271,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128);
12711271
// OPCODE_VECTOR_ROTATE_LEFT
12721272
// ============================================================================
12731273
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
1274-
static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
1274+
static __m128i EmulateVectorRotateLeft(void*, __m128i& src1, __m128i& src2) {
12751275
alignas(16) T value[16 / sizeof(T)];
12761276
alignas(16) T shamt[16 / sizeof(T)];
12771277

@@ -1365,7 +1365,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128);
13651365
// OPCODE_VECTOR_AVERAGE
13661366
// ============================================================================
13671367
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
1368-
static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) {
1368+
static __m128i EmulateVectorAverage(void*, __m128i& src1, __m128i& src2) {
13691369
alignas(16) T src1v[16 / sizeof(T)];
13701370
alignas(16) T src2v[16 / sizeof(T)];
13711371
alignas(16) T value[16 / sizeof(T)];
@@ -1937,7 +1937,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
19371937
// ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
19381938
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR));
19391939
}
1940-
static __m128i EmulateFLOAT16_2(void*, __m128 src1) {
1940+
static __m128i EmulateFLOAT16_2(void*, __m128& src1) {
19411941
alignas(16) float a[4];
19421942
alignas(16) uint16_t b[8];
19431943
_mm_store_ps(a, src1);
@@ -1976,7 +1976,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
19761976
e.vmovaps(i.dest, e.xmm0);
19771977
}
19781978
}
1979-
static __m128i EmulateFLOAT16_4(void*, __m128 src1) {
1979+
static __m128i EmulateFLOAT16_4(void*, __m128& src1) {
19801980
alignas(16) float a[4];
19811981
alignas(16) uint16_t b[8];
19821982
_mm_store_ps(a, src1);
@@ -2107,8 +2107,8 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
21072107
// Merge XZ and YW.
21082108
e.vorps(i.dest, e.xmm0);
21092109
}
2110-
static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
2111-
__m128i src2) {
2110+
static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i& src1,
2111+
__m128i& src2) {
21122112
alignas(16) uint16_t a[8];
21132113
alignas(16) uint16_t b[8];
21142114
alignas(16) uint8_t c[16];
@@ -2120,7 +2120,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
21202120
}
21212121
return _mm_load_si128(reinterpret_cast<__m128i*>(c));
21222122
}
2123-
static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) {
2123+
static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i& src1, __m128i& src2) {
21242124
alignas(16) uint8_t a[16];
21252125
alignas(16) uint8_t b[16];
21262126
alignas(16) uint8_t c[16];
@@ -2353,7 +2353,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
23532353
e.vpor(i.dest, e.GetXmmConstPtr(XMMOne));
23542354
// To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081.
23552355
}
2356-
static __m128 EmulateFLOAT16_2(void*, __m128i src1) {
2356+
static __m128 EmulateFLOAT16_2(void*, __m128i& src1) {
23572357
alignas(16) uint16_t a[8];
23582358
alignas(16) float b[4];
23592359
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
@@ -2410,7 +2410,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
24102410
e.vmovaps(i.dest, e.xmm0);
24112411
}
24122412
}
2413-
static __m128 EmulateFLOAT16_4(void*, __m128i src1) {
2413+
static __m128 EmulateFLOAT16_4(void*, __m128i& src1) {
24142414
alignas(16) uint16_t a[8];
24152415
alignas(16) float b[4];
24162416
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);

src/xenia/cpu/backend/x64/x64_sequences.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,7 +2435,7 @@ EMITTER_OPCODE_TABLE(OPCODE_RECIP, RECIP_F32, RECIP_F64, RECIP_V128);
24352435
// TODO(benvanik): use approx here:
24362436
// https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
24372437
struct POW2_F32 : Sequence<POW2_F32, I<OPCODE_POW2, F32Op, F32Op>> {
2438-
static __m128 EmulatePow2(void*, __m128 src) {
2438+
static __m128 EmulatePow2(void*, __m128& src) {
24392439
float src_value;
24402440
_mm_store_ss(&src_value, src);
24412441
float result = std::exp2(src_value);
@@ -2449,7 +2449,7 @@ struct POW2_F32 : Sequence<POW2_F32, I<OPCODE_POW2, F32Op, F32Op>> {
24492449
}
24502450
};
24512451
struct POW2_F64 : Sequence<POW2_F64, I<OPCODE_POW2, F64Op, F64Op>> {
2452-
static __m128d EmulatePow2(void*, __m128d src) {
2452+
static __m128d EmulatePow2(void*, __m128d& src) {
24532453
double src_value;
24542454
_mm_store_sd(&src_value, src);
24552455
double result = std::exp2(src_value);
@@ -2463,7 +2463,7 @@ struct POW2_F64 : Sequence<POW2_F64, I<OPCODE_POW2, F64Op, F64Op>> {
24632463
}
24642464
};
24652465
struct POW2_V128 : Sequence<POW2_V128, I<OPCODE_POW2, V128Op, V128Op>> {
2466-
static __m128 EmulatePow2(void*, __m128 src) {
2466+
static __m128 EmulatePow2(void*, __m128& src) {
24672467
alignas(16) float values[4];
24682468
_mm_store_ps(values, src);
24692469
for (size_t i = 0; i < 4; ++i) {
@@ -2486,7 +2486,7 @@ EMITTER_OPCODE_TABLE(OPCODE_POW2, POW2_F32, POW2_F64, POW2_V128);
24862486
// https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
24872487
// TODO(benvanik): this emulated fn destroys all xmm registers! don't do it!
24882488
struct LOG2_F32 : Sequence<LOG2_F32, I<OPCODE_LOG2, F32Op, F32Op>> {
2489-
static __m128 EmulateLog2(void*, __m128 src) {
2489+
static __m128 EmulateLog2(void*, __m128& src) {
24902490
float src_value;
24912491
_mm_store_ss(&src_value, src);
24922492
float result = std::log2(src_value);
@@ -2504,7 +2504,7 @@ struct LOG2_F32 : Sequence<LOG2_F32, I<OPCODE_LOG2, F32Op, F32Op>> {
25042504
}
25052505
};
25062506
struct LOG2_F64 : Sequence<LOG2_F64, I<OPCODE_LOG2, F64Op, F64Op>> {
2507-
static __m128d EmulateLog2(void*, __m128d src) {
2507+
static __m128d EmulateLog2(void*, __m128d& src) {
25082508
double src_value;
25092509
_mm_store_sd(&src_value, src);
25102510
double result = std::log2(src_value);
@@ -2522,7 +2522,7 @@ struct LOG2_F64 : Sequence<LOG2_F64, I<OPCODE_LOG2, F64Op, F64Op>> {
25222522
}
25232523
};
25242524
struct LOG2_V128 : Sequence<LOG2_V128, I<OPCODE_LOG2, V128Op, V128Op>> {
2525-
static __m128 EmulateLog2(void*, __m128 src) {
2525+
static __m128 EmulateLog2(void*, __m128& src) {
25262526
alignas(16) float values[4];
25272527
_mm_store_ps(values, src);
25282528
for (size_t i = 0; i < 4; ++i) {
@@ -2955,7 +2955,7 @@ struct SHL_V128 : Sequence<SHL_V128, I<OPCODE_SHL, V128Op, V128Op, I8Op>> {
29552955
e.CallNativeSafe(reinterpret_cast<void*>(EmulateShlV128));
29562956
e.vmovaps(i.dest, e.xmm0);
29572957
}
2958-
static __m128i EmulateShlV128(void*, __m128i src1, uint8_t src2) {
2958+
static __m128i EmulateShlV128(void*, __m128i& src1, uint8_t src2) {
29592959
// Almost all instances are shamt = 1, but non-constant.
29602960
// shamt is [0,7]
29612961
uint8_t shamt = src2 & 0x7;
@@ -3032,7 +3032,7 @@ struct SHR_V128 : Sequence<SHR_V128, I<OPCODE_SHR, V128Op, V128Op, I8Op>> {
30323032
e.CallNativeSafe(reinterpret_cast<void*>(EmulateShrV128));
30333033
e.vmovaps(i.dest, e.xmm0);
30343034
}
3035-
static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) {
3035+
static __m128i EmulateShrV128(void*, __m128i& src1, uint8_t src2) {
30363036
// Almost all instances are shamt = 1, but non-constant.
30373037
// shamt is [0,7]
30383038
uint8_t shamt = src2 & 0x7;

src/xenia/cpu/backend/x64/x64_tracers.cc

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value) {
7373
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
7474
DPRINT("{} ({:X}) = ctx i64 +{}\n", (int64_t)value, value, offset);
7575
}
76-
void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value) {
76+
void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128& value) {
7777
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
7878
DPRINT("{} ({:X}) = ctx f32 +{}\n", xe::m128_f32<0>(value),
7979
xe::m128_i32<0>(value), offset);
@@ -85,7 +85,7 @@ void TraceContextLoadF64(void* raw_context, uint64_t offset,
8585
DPRINT("{} ({:X}) = ctx f64 +{}\n", xe::m128_f64<0>(v), xe::m128_i64<0>(v),
8686
offset);
8787
}
88-
void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value) {
88+
void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128& value) {
8989
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
9090
DPRINT("[{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}] = ctx v128 +{}\n",
9191
xe::m128_f32<0>(value), xe::m128_f32<1>(value), xe::m128_f32<2>(value),
@@ -109,7 +109,7 @@ void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value) {
109109
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
110110
DPRINT("ctx i64 +{} = {} ({:X})\n", offset, (int64_t)value, value);
111111
}
112-
void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value) {
112+
void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128& value) {
113113
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
114114
DPRINT("ctx f32 +{} = {} ({:X})\n", offset, xe::m128_f32<0>(value),
115115
xe::m128_i32<0>(value));
@@ -121,7 +121,7 @@ void TraceContextStoreF64(void* raw_context, uint64_t offset,
121121
DPRINT("ctx f64 +{} = {} ({:X})\n", offset, xe::m128_f64<0>(v),
122122
xe::m128_i64<0>(v));
123123
}
124-
void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value) {
124+
void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128& value) {
125125
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
126126
DPRINT("ctx v128 +{} = [{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}]\n",
127127
offset, xe::m128_f32<0>(value), xe::m128_f32<1>(value),
@@ -146,17 +146,17 @@ void TraceMemoryLoadI64(void* raw_context, uint32_t address, uint64_t value) {
146146
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
147147
DPRINT("{} ({:X}) = load.i64 {:08X}\n", (int64_t)value, value, address);
148148
}
149-
void TraceMemoryLoadF32(void* raw_context, uint32_t address, __m128 value) {
149+
void TraceMemoryLoadF32(void* raw_context, uint32_t address, __m128& value) {
150150
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
151151
DPRINT("{} ({:X}) = load.f32 {:08X}\n", xe::m128_f32<0>(value),
152152
xe::m128_i32<0>(value), address);
153153
}
154-
void TraceMemoryLoadF64(void* raw_context, uint32_t address, __m128 value) {
154+
void TraceMemoryLoadF64(void* raw_context, uint32_t address, __m128& value) {
155155
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
156156
DPRINT("{} ({:X}) = load.f64 {:08X}\n", xe::m128_f64<0>(value),
157157
xe::m128_i64<0>(value), address);
158158
}
159-
void TraceMemoryLoadV128(void* raw_context, uint32_t address, __m128 value) {
159+
void TraceMemoryLoadV128(void* raw_context, uint32_t address, __m128& value) {
160160
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
161161
DPRINT(
162162
"[{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}] = load.v128 {:08X}\n",
@@ -181,17 +181,17 @@ void TraceMemoryStoreI64(void* raw_context, uint32_t address, uint64_t value) {
181181
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
182182
DPRINT("store.i64 {:08X} = {} ({:X})\n", address, (int64_t)value, value);
183183
}
184-
void TraceMemoryStoreF32(void* raw_context, uint32_t address, __m128 value) {
184+
void TraceMemoryStoreF32(void* raw_context, uint32_t address, __m128& value) {
185185
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
186186
DPRINT("store.f32 {:08X} = {} ({:X})\n", address, xe::m128_f32<0>(value),
187187
xe::m128_i32<0>(value));
188188
}
189-
void TraceMemoryStoreF64(void* raw_context, uint32_t address, __m128 value) {
189+
void TraceMemoryStoreF64(void* raw_context, uint32_t address, __m128& value) {
190190
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
191191
DPRINT("store.f64 {:08X} = {} ({:X})\n", address, xe::m128_f64<0>(value),
192192
xe::m128_i64<0>(value));
193193
}
194-
void TraceMemoryStoreV128(void* raw_context, uint32_t address, __m128 value) {
194+
void TraceMemoryStoreV128(void* raw_context, uint32_t address, __m128& value) {
195195
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
196196
DPRINT(
197197
"store.v128 {:08X} = [{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}]\n",

src/xenia/cpu/backend/x64/x64_tracers.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,35 +34,35 @@ void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value);
3434
void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value);
3535
void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value);
3636
void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value);
37-
void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value);
37+
void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128& value);
3838
void TraceContextLoadF64(void* raw_context, uint64_t offset,
3939
const double* value);
40-
void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value);
40+
void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128& value);
4141

4242
void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value);
4343
void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value);
4444
void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value);
4545
void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value);
46-
void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value);
46+
void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128& value);
4747
void TraceContextStoreF64(void* raw_context, uint64_t offset,
4848
const double* value);
49-
void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value);
49+
void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128& value);
5050

5151
void TraceMemoryLoadI8(void* raw_context, uint32_t address, uint8_t value);
5252
void TraceMemoryLoadI16(void* raw_context, uint32_t address, uint16_t value);
5353
void TraceMemoryLoadI32(void* raw_context, uint32_t address, uint32_t value);
5454
void TraceMemoryLoadI64(void* raw_context, uint32_t address, uint64_t value);
55-
void TraceMemoryLoadF32(void* raw_context, uint32_t address, __m128 value);
56-
void TraceMemoryLoadF64(void* raw_context, uint32_t address, __m128 value);
57-
void TraceMemoryLoadV128(void* raw_context, uint32_t address, __m128 value);
55+
void TraceMemoryLoadF32(void* raw_context, uint32_t address, __m128& value);
56+
void TraceMemoryLoadF64(void* raw_context, uint32_t address, __m128& value);
57+
void TraceMemoryLoadV128(void* raw_context, uint32_t address, __m128& value);
5858

5959
void TraceMemoryStoreI8(void* raw_context, uint32_t address, uint8_t value);
6060
void TraceMemoryStoreI16(void* raw_context, uint32_t address, uint16_t value);
6161
void TraceMemoryStoreI32(void* raw_context, uint32_t address, uint32_t value);
6262
void TraceMemoryStoreI64(void* raw_context, uint32_t address, uint64_t value);
63-
void TraceMemoryStoreF32(void* raw_context, uint32_t address, __m128 value);
64-
void TraceMemoryStoreF64(void* raw_context, uint32_t address, __m128 value);
65-
void TraceMemoryStoreV128(void* raw_context, uint32_t address, __m128 value);
63+
void TraceMemoryStoreF32(void* raw_context, uint32_t address, __m128& value);
64+
void TraceMemoryStoreF64(void* raw_context, uint32_t address, __m128& value);
65+
void TraceMemoryStoreV128(void* raw_context, uint32_t address, __m128& value);
6666

6767
void TraceMemset(void* raw_context, uint32_t address, uint8_t value,
6868
uint32_t length);

0 commit comments

Comments
 (0)