Skip to content

Commit 376bad5

Browse files
committed
[GPU] Remove register reinterpret_casts + WAIT_REG_MEM volatility
Hopefully prevents some potential #1971-like situations. WAIT_REG_MEM's implementation also allowed the compiler to load the value only once, which caused an infinite loop with the other changes in the commit (even in debug builds), so it's now accessed as volatile. Possibly it would be even better to replace it with some (acquire/release?) atomic load/store some day at least for the registers actually seen as participating in those waits. Also fixes the endianness being handled only on the first wait iteration in WAIT_REG_MEM.
1 parent f0ad4f4 commit 376bad5

19 files changed

+336
-318
lines changed

src/xenia/gpu/command_processor.cc

Lines changed: 49 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "xenia/base/byte_stream.h"
1919
#include "xenia/base/logging.h"
2020
#include "xenia/base/math.h"
21+
#include "xenia/base/memory.h"
2122
#include "xenia/base/profiling.h"
2223
#include "xenia/base/ring_buffer.h"
2324
#include "xenia/gpu/gpu_flags.h"
@@ -334,27 +335,29 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
334335
return;
335336
}
336337

337-
regs.values[index].u32 = value;
338+
// Volatile for the WAIT_REG_MEM loop.
339+
const_cast<volatile uint32_t&>(regs.values[index]) = value;
338340
if (!regs.GetRegisterInfo(index)) {
339341
XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
340342
}
341343

342344
// Scratch register writeback.
343345
if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) {
344346
uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0;
345-
if ((1 << scratch_reg) & regs.values[XE_GPU_REG_SCRATCH_UMSK].u32) {
347+
if ((1 << scratch_reg) & regs.values[XE_GPU_REG_SCRATCH_UMSK]) {
346348
// Enabled - write to address.
347-
uint32_t scratch_addr = regs.values[XE_GPU_REG_SCRATCH_ADDR].u32;
349+
uint32_t scratch_addr = regs.values[XE_GPU_REG_SCRATCH_ADDR];
348350
uint32_t mem_addr = scratch_addr + (scratch_reg * 4);
349351
xe::store_and_swap<uint32_t>(memory_->TranslatePhysical(mem_addr), value);
350352
}
351353
} else {
352354
switch (index) {
353355
// If this is a COHER register, set the dirty flag.
354-
// This will block the command processor the next time it WAIT_MEM_REGs
356+
// This will block the command processor the next time it WAIT_REG_MEMs
355357
// and allow us to synchronize the memory.
356358
case XE_GPU_REG_COHER_STATUS_HOST: {
357-
regs.values[index].u32 |= UINT32_C(0x80000000);
359+
const_cast<volatile uint32_t&>(regs.values[index]) |=
360+
UINT32_C(0x80000000);
358361
} break;
359362

360363
case XE_GPU_REG_DC_LUT_RW_INDEX: {
@@ -365,12 +368,12 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
365368

366369
case XE_GPU_REG_DC_LUT_SEQ_COLOR: {
367370
// Should be in the 256-entry table writing mode.
368-
assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
371+
assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1);
369372
auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
370373
// DC_LUT_SEQ_COLOR is in the red, green, blue order, but the write
371374
// enable mask is blue, green, red.
372375
bool write_gamma_ramp_component =
373-
(regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 &
376+
(regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] &
374377
(UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
375378
if (write_gamma_ramp_component) {
376379
reg::DC_LUT_30_COLOR& gamma_ramp_entry =
@@ -401,14 +404,14 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
401404

402405
case XE_GPU_REG_DC_LUT_PWL_DATA: {
403406
// Should be in the PWL writing mode.
404-
assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
407+
assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1);
405408
auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
406409
// Bit 7 of the index is ignored for PWL.
407410
uint32_t gamma_ramp_rw_index_pwl = gamma_ramp_rw_index.rw_index & 0x7F;
408411
// DC_LUT_PWL_DATA is likely in the red, green, blue order because
409412
// DC_LUT_SEQ_COLOR is, but the write enable mask is blue, green, red.
410413
bool write_gamma_ramp_component =
411-
(regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 &
414+
(regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] &
412415
(UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0;
413416
if (write_gamma_ramp_component) {
414417
reg::DC_LUT_PWL_DATA& gamma_ramp_entry =
@@ -436,10 +439,10 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
436439

437440
case XE_GPU_REG_DC_LUT_30_COLOR: {
438441
// Should be in the 256-entry table writing mode.
439-
assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1);
442+
assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1);
440443
auto& gamma_ramp_rw_index = regs.Get<reg::DC_LUT_RW_INDEX>();
441444
uint32_t gamma_ramp_write_enable_mask =
442-
regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & 0b111;
445+
regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] & 0b111;
443446
if (gamma_ramp_write_enable_mask) {
444447
reg::DC_LUT_30_COLOR& gamma_ramp_entry =
445448
gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index];
@@ -479,10 +482,12 @@ void CommandProcessor::MakeCoherent() {
479482
// https://web.archive.org/web/20160711162346/https://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf
480483
// https://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454
481484

482-
RegisterFile* regs = register_file_;
483-
auto& status_host = regs->Get<reg::COHER_STATUS_HOST>();
484-
auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32;
485-
auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32;
485+
// Volatile because this may be called from the WAIT_REG_MEM loop.
486+
volatile uint32_t* regs_volatile = register_file_->values;
487+
auto status_host = xe::memory::Reinterpret<reg::COHER_STATUS_HOST>(
488+
uint32_t(regs_volatile[XE_GPU_REG_COHER_STATUS_HOST]));
489+
uint32_t base_host = regs_volatile[XE_GPU_REG_COHER_BASE_HOST];
490+
uint32_t size_host = regs_volatile[XE_GPU_REG_COHER_SIZE_HOST];
486491

487492
if (!status_host.status) {
488493
return;
@@ -502,7 +507,7 @@ void CommandProcessor::MakeCoherent() {
502507
base_host + size_host, size_host, action);
503508

504509
// Mark coherent.
505-
status_host.status = 0;
510+
regs_volatile[XE_GPU_REG_COHER_STATUS_HOST] = 0;
506511
}
507512

508513
void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); }
@@ -940,28 +945,33 @@ bool CommandProcessor::ExecutePacketType3_WAIT_REG_MEM(RingBuffer* reader,
940945
SCOPE_profile_cpu_f("gpu");
941946

942947
// wait until a register or memory location is a specific value
948+
943949
uint32_t wait_info = reader->ReadAndSwap<uint32_t>();
944950
uint32_t poll_reg_addr = reader->ReadAndSwap<uint32_t>();
945951
uint32_t ref = reader->ReadAndSwap<uint32_t>();
946952
uint32_t mask = reader->ReadAndSwap<uint32_t>();
947953
uint32_t wait = reader->ReadAndSwap<uint32_t>();
954+
955+
bool is_memory = (wait_info & 0x10) != 0;
956+
957+
assert_true(is_memory || poll_reg_addr < RegisterFile::kRegisterCount);
958+
const volatile uint32_t& value_ref =
959+
is_memory ? *reinterpret_cast<uint32_t*>(memory_->TranslatePhysical(
960+
poll_reg_addr & ~uint32_t(0x3)))
961+
: register_file_->values[poll_reg_addr];
962+
948963
bool matched = false;
949964
do {
950-
uint32_t value;
951-
if (wait_info & 0x10) {
952-
// Memory.
953-
auto endianness = static_cast<xenos::Endian>(poll_reg_addr & 0x3);
954-
poll_reg_addr &= ~0x3;
955-
value = xe::load<uint32_t>(memory_->TranslatePhysical(poll_reg_addr));
956-
value = GpuSwap(value, endianness);
957-
trace_writer_.WriteMemoryRead(CpuToGpu(poll_reg_addr), 4);
965+
uint32_t value = value_ref;
966+
if (is_memory) {
967+
trace_writer_.WriteMemoryRead(CpuToGpu(poll_reg_addr & ~uint32_t(0x3)),
968+
sizeof(uint32_t));
969+
value = xenos::GpuSwap(value,
970+
static_cast<xenos::Endian>(poll_reg_addr & 0x3));
958971
} else {
959-
// Register.
960-
assert_true(poll_reg_addr < RegisterFile::kRegisterCount);
961-
value = register_file_->values[poll_reg_addr].u32;
962972
if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) {
963973
MakeCoherent();
964-
value = register_file_->values[poll_reg_addr].u32;
974+
value = value_ref;
965975
}
966976
}
967977
switch (wait_info & 0x7) {
@@ -1024,17 +1034,17 @@ bool CommandProcessor::ExecutePacketType3_REG_RMW(RingBuffer* reader,
10241034
uint32_t rmw_info = reader->ReadAndSwap<uint32_t>();
10251035
uint32_t and_mask = reader->ReadAndSwap<uint32_t>();
10261036
uint32_t or_mask = reader->ReadAndSwap<uint32_t>();
1027-
uint32_t value = register_file_->values[rmw_info & 0x1FFF].u32;
1037+
uint32_t value = register_file_->values[rmw_info & 0x1FFF];
10281038
if ((rmw_info >> 31) & 0x1) {
10291039
// & reg
1030-
value &= register_file_->values[and_mask & 0x1FFF].u32;
1040+
value &= register_file_->values[and_mask & 0x1FFF];
10311041
} else {
10321042
// & imm
10331043
value &= and_mask;
10341044
}
10351045
if ((rmw_info >> 30) & 0x1) {
10361046
// | reg
1037-
value |= register_file_->values[or_mask & 0x1FFF].u32;
1047+
value |= register_file_->values[or_mask & 0x1FFF];
10381048
} else {
10391049
// | imm
10401050
value |= or_mask;
@@ -1055,7 +1065,7 @@ bool CommandProcessor::ExecutePacketType3_REG_TO_MEM(RingBuffer* reader,
10551065
uint32_t reg_val;
10561066

10571067
assert_true(reg_addr < RegisterFile::kRegisterCount);
1058-
reg_val = register_file_->values[reg_addr].u32;
1068+
reg_val = register_file_->values[reg_addr];
10591069

10601070
auto endianness = static_cast<xenos::Endian>(mem_addr & 0x3);
10611071
mem_addr &= ~0x3;
@@ -1105,7 +1115,7 @@ bool CommandProcessor::ExecutePacketType3_COND_WRITE(RingBuffer* reader,
11051115
} else {
11061116
// Register.
11071117
assert_true(poll_reg_addr < RegisterFile::kRegisterCount);
1108-
value = register_file_->values[poll_reg_addr].u32;
1118+
value = register_file_->values[poll_reg_addr];
11091119
}
11101120
bool matched = false;
11111121
switch (wait_info & 0x7) {
@@ -1240,7 +1250,7 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader,
12401250
if (fake_sample_count >= 0) {
12411251
auto* pSampleCounts =
12421252
memory_->TranslatePhysical<xe_gpu_depth_sample_counts*>(
1243-
register_file_->values[XE_GPU_REG_RB_SAMPLE_COUNT_ADDR].u32);
1253+
register_file_->values[XE_GPU_REG_RB_SAMPLE_COUNT_ADDR]);
12441254
// 0xFFFFFEED is written to this two locations by D3D only on D3DISSUE_END
12451255
// and used to detect a finished query.
12461256
bool is_end_via_z_pass = pSampleCounts->ZPass_A == kQueryFinished &&
@@ -1599,10 +1609,10 @@ bool CommandProcessor::ExecutePacketType3_VIZ_QUERY(RingBuffer* reader,
15991609
// The scan converter writes the internal result back to the register here.
16001610
// We just fake it and say it was visible in case it is read back.
16011611
if (id < 32) {
1602-
register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_0].u32 |=
1603-
uint32_t(1) << id;
1612+
register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_0] |= uint32_t(1)
1613+
<< id;
16041614
} else {
1605-
register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_1].u32 |=
1615+
register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_1] |=
16061616
uint32_t(1) << (id - 32);
16071617
}
16081618
}
@@ -1614,9 +1624,8 @@ void CommandProcessor::InitializeTrace() {
16141624
// Write the initial register values, to be loaded directly into the
16151625
// RegisterFile since all registers, including those that may have side
16161626
// effects on setting, will be saved.
1617-
trace_writer_.WriteRegisters(
1618-
0, reinterpret_cast<const uint32_t*>(register_file_->values),
1619-
RegisterFile::kRegisterCount, false);
1627+
trace_writer_.WriteRegisters(0, register_file_->values,
1628+
RegisterFile::kRegisterCount, false);
16201629

16211630
trace_writer_.WriteGammaRamp(gamma_ramp_256_entry_table(),
16221631
gamma_ramp_pwl_rgb(), gamma_ramp_rw_component_);

0 commit comments

Comments
 (0)