Skip to content

Commit e55ef65

Browse files
committed
Merge tag 'amd-drm-next-6.12-2024-08-26' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.12-2024-08-26:

amdgpu:
- SDMA devcoredump support
- DCN 4.0.1 updates
- DC SUBVP fixes
- Refactor OPP in DC
- Refactor MMHUBBUB in DC
- DC DML 2.1 updates
- DC FAMS2 updates
- RAS updates
- GFX12 updates
- VCN 4.0.3 updates
- JPEG 4.0.3 updates
- Enable wave kill (soft recovery) for compute queues
- Clean up CP error interrupt handling
- Enable CP bad opcode interrupts
- VCN 4.x fixes
- VCN 5.x fixes
- GPU reset fixes
- Fix vbios embedded EDID size handling
- SMU 14.x updates
- Misc code cleanups and spelling fixes
- VCN devcoredump support
- ISP MFD i2c support
- DC vblank fixes
- GFX 12 fixes
- PSR fixes
- Convert vbios embedded EDID to drm_edid
- DCN 3.5 updates
- DMCUB updates
- Cursor fixes
- Overdrive support for SMU 14.x
- GFX CP padding optimizations
- DCC fixes
- DSC fixes
- Preliminary per queue reset infrastructure
- Initial per queue reset support for GFX 9
- Initial per queue reset support for GFX 7, 8
- DCN 3.2 fixes
- DP MST fixes
- SR-IOV fixes
- GFX 9.4.3/4 devcoredump support
- Add process isolation framework
- Enable process isolation support for GFX 9.4.3/4
- Take IOMMU remapping into account for P2P DMA checks

amdkfd:
- CRIU fixes
- Improved input validation for user queues
- HMM fix
- Enable process isolation support for GFX 9.4.3/4
- Initial per queue reset support for GFX 9
- Allow users to target recommended SDMA engines

radeon:
- remove .load and drm_dev_alloc
- Fix vbios embedded EDID size handling
- Convert vbios embedded EDID to drm_edid
- Use GEM references instead of TTM
- r100 cp init cleanup
- Fix potential overflows in evergreen CS offset tracking

UAPI:
- KFD support for targeting queues on recommended SDMA engines
  Proposed userspace:
  ROCm/ROCR-Runtime@2f588a2
  ROCm/ROCR-Runtime@eb30a5b

drm/buddy:
- Add start address support for trim function

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240826201528.55307-1-alexander.deucher@amd.com
2 parents 4461e9e + 3376f92 commit e55ef65

File tree

438 files changed

+13108
-4958
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

438 files changed

+13108
-4958
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@
118118

119119
#define MAX_GPU_INSTANCE 64
120120

121+
#define GFX_SLICE_PERIOD msecs_to_jiffies(250)
122+
121123
struct amdgpu_gpu_instance {
122124
struct amdgpu_device *adev;
123125
int mgpu_fan_enabled;
@@ -347,9 +349,9 @@ enum amdgpu_kiq_irq {
347349
AMDGPU_CP_KIQ_IRQ_DRIVER0 = 0,
348350
AMDGPU_CP_KIQ_IRQ_LAST
349351
};
350-
#define SRIOV_USEC_TIMEOUT 1200000 /* wait 12 * 100ms for SRIOV */
351-
#define MAX_KIQ_REG_WAIT (amdgpu_sriov_vf(adev) ? 50000 : 5000) /* in usecs, extend for VF */
352-
#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */
352+
#define SRIOV_USEC_TIMEOUT 1200000 /* wait 12 * 100ms for SRIOV */
353+
#define MAX_KIQ_REG_WAIT 5000 /* in usecs, 5ms */
354+
#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */
353355
#define MAX_KIQ_REG_TRY 1000
354356

355357
int amdgpu_device_ip_set_clockgating_state(void *dev,
@@ -823,17 +825,6 @@ struct amdgpu_mqd {
823825
struct amdgpu_reset_domain;
824826
struct amdgpu_fru_info;
825827

826-
struct amdgpu_reset_info {
827-
/* reset dump register */
828-
u32 *reset_dump_reg_list;
829-
u32 *reset_dump_reg_value;
830-
int num_regs;
831-
832-
#ifdef CONFIG_DEV_COREDUMP
833-
struct amdgpu_coredump_info *coredump_info;
834-
#endif
835-
};
836-
837828
/*
838829
* Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
839830
*/
@@ -1157,8 +1148,6 @@ struct amdgpu_device {
11571148

11581149
struct mutex benchmark_mutex;
11591150

1160-
struct amdgpu_reset_info reset_info;
1161-
11621151
bool scpm_enabled;
11631152
uint32_t scpm_status;
11641153

@@ -1175,6 +1164,10 @@ struct amdgpu_device {
11751164
bool debug_disable_soft_recovery;
11761165
bool debug_use_vram_fw_buf;
11771166
bool debug_enable_ras_aca;
1167+
1168+
bool enforce_isolation[MAX_XCP];
1169+
/* Added this mutex for cleaner shader isolation between GFX and compute processes */
1170+
struct mutex enforce_isolation_mutex;
11781171
};
11791172

11801173
static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
@@ -1587,13 +1580,6 @@ static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return
15871580
static inline void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { }
15881581
#endif
15891582

1590-
#if defined(CONFIG_DRM_AMD_DC)
1591-
int amdgpu_dm_display_resume(struct amdgpu_device *adev );
1592-
#else
1593-
static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return 0; }
1594-
#endif
1595-
1596-
15971583
void amdgpu_register_gpu_instance(struct amdgpu_device *adev);
15981584
void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev);
15991585

drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ static void aca_banks_release(struct aca_banks *banks)
8080
{
8181
struct aca_bank_node *node, *tmp;
8282

83+
if (list_empty(&banks->list))
84+
return;
85+
8386
list_for_each_entry_safe(node, tmp, &banks->list, node) {
8487
list_del(&node->node);
8588
kvfree(node);
@@ -453,13 +456,13 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er
453456

454457
switch (type) {
455458
case ACA_ERROR_TYPE_UE:
456-
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, count);
459+
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, count);
457460
break;
458461
case ACA_ERROR_TYPE_CE:
459-
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, count);
462+
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, count);
460463
break;
461464
case ACA_ERROR_TYPE_DEFERRED:
462-
amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, count);
465+
amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, count);
463466
break;
464467
default:
465468
break;
@@ -562,9 +565,13 @@ static void aca_error_fini(struct aca_error *aerr)
562565
struct aca_bank_error *bank_error, *tmp;
563566

564567
mutex_lock(&aerr->lock);
568+
if (list_empty(&aerr->list))
569+
goto out_unlock;
570+
565571
list_for_each_entry_safe(bank_error, tmp, &aerr->list, node)
566572
aca_bank_error_remove(aerr, bank_error);
567573

574+
out_unlock:
568575
mutex_destroy(&aerr->lock);
569576
}
570577

@@ -680,6 +687,9 @@ static void aca_manager_fini(struct aca_handle_manager *mgr)
680687
{
681688
struct aca_handle *handle, *tmp;
682689

690+
if (list_empty(&mgr->list))
691+
return;
692+
683693
list_for_each_entry_safe(handle, tmp, &mgr->list, node)
684694
amdgpu_aca_remove_handle(handle);
685695
}

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -364,15 +364,15 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
364364
return r;
365365
}
366366

367-
void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj)
367+
void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj)
368368
{
369-
struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
369+
struct amdgpu_bo **bo = (struct amdgpu_bo **) mem_obj;
370370

371-
amdgpu_bo_reserve(bo, true);
372-
amdgpu_bo_kunmap(bo);
373-
amdgpu_bo_unpin(bo);
374-
amdgpu_bo_unreserve(bo);
375-
amdgpu_bo_unref(&(bo));
371+
amdgpu_bo_reserve(*bo, true);
372+
amdgpu_bo_kunmap(*bo);
373+
amdgpu_bo_unpin(*bo);
374+
amdgpu_bo_unreserve(*bo);
375+
amdgpu_bo_unref(bo);
376376
}
377377

378378
int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
@@ -783,22 +783,6 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
783783
return 0;
784784
}
785785

786-
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
787-
int hub_inst, int hub_type)
788-
{
789-
if (!hub_type) {
790-
if (adev->gfxhub.funcs->query_utcl2_poison_status)
791-
return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
792-
else
793-
return false;
794-
} else {
795-
if (adev->mmhub.funcs->query_utcl2_poison_status)
796-
return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
797-
else
798-
return false;
799-
}
800-
}
801-
802786
int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
803787
{
804788
return kgd2kfd_check_and_lock_kfd();
@@ -887,3 +871,21 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
887871

888872
return r;
889873
}
874+
875+
/* Stop scheduling on KFD */
876+
int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id)
877+
{
878+
if (!adev->kfd.init_complete)
879+
return 0;
880+
881+
return kgd2kfd_stop_sched(adev->kfd.dev, node_id);
882+
}
883+
884+
/* Start scheduling on KFD */
885+
int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id)
886+
{
887+
if (!adev->kfd.init_complete)
888+
return 0;
889+
890+
return kgd2kfd_start_sched(adev->kfd.dev, node_id);
891+
}

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
235235
int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
236236
void **mem_obj, uint64_t *gpu_addr,
237237
void **cpu_ptr, bool mqd_gfx9);
238-
void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj);
238+
void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj);
239239
int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
240240
void **mem_obj);
241241
void amdgpu_amdkfd_free_gws(struct amdgpu_device *adev, void *mem_obj);
@@ -264,6 +264,8 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
264264
uint32_t *payload);
265265
int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
266266
u32 inst);
267+
int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id);
268+
int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id);
267269

268270
/* Read user wptr from a specified user address space with page fault
269271
* disabled. The memory must be pinned and mapped to the hardware when
@@ -322,7 +324,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
322324
void **kptr, uint64_t *size);
323325
void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
324326

325-
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo);
327+
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart);
326328

327329
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
328330
struct dma_fence __rcu **ef);
@@ -345,11 +347,9 @@ void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *ad
345347
pasid_notify pasid_fn, void *data, uint32_t reset);
346348

347349
bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
348-
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
350+
bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
349351
void amdgpu_amdkfd_block_mmu_notifications(void *p);
350352
int amdgpu_amdkfd_criu_resume(void *p);
351-
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
352-
int hub_inst, int hub_type);
353353
int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
354354
uint64_t size, u32 alloc_flag, int8_t xcp_id);
355355
void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
@@ -426,6 +426,8 @@ void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
426426
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask);
427427
int kgd2kfd_check_and_lock_kfd(void);
428428
void kgd2kfd_unlock_kfd(void);
429+
int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
430+
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
429431
#else
430432
static inline int kgd2kfd_init(void)
431433
{
@@ -496,5 +498,15 @@ static inline int kgd2kfd_check_and_lock_kfd(void)
496498
static inline void kgd2kfd_unlock_kfd(void)
497499
{
498500
}
501+
502+
static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
503+
{
504+
return 0;
505+
}
506+
507+
static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
508+
{
509+
return 0;
510+
}
499511
#endif
500512
#endif /* AMDGPU_AMDKFD_H_INCLUDED */

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,4 +191,6 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
191191
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
192192
.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
193193
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
194+
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
195+
.hqd_reset = kgd_gfx_v9_hqd_reset,
194196
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,5 +418,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
418418
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
419419
.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
420420
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
421-
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
421+
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
422+
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
423+
.hqd_reset = kgd_gfx_v9_hqd_reset
422424
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -541,5 +541,7 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
541541
kgd_gfx_v9_4_3_set_wave_launch_trap_override,
542542
.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
543543
.set_address_watch = kgd_gfx_v9_4_3_set_address_watch,
544-
.clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch
544+
.clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch,
545+
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
546+
.hqd_reset = kgd_gfx_v9_hqd_reset
545547
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,20 @@ static void program_trap_handler_settings(struct amdgpu_device *adev,
10701070
unlock_srbm(adev);
10711071
}
10721072

1073+
uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev,
1074+
uint32_t pipe_id, uint32_t queue_id,
1075+
uint32_t inst)
1076+
{
1077+
return 0;
1078+
}
1079+
1080+
uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
1081+
uint32_t pipe_id, uint32_t queue_id,
1082+
uint32_t inst, unsigned int utimeout)
1083+
{
1084+
return 0;
1085+
}
1086+
10731087
const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
10741088
.program_sh_mem_settings = kgd_program_sh_mem_settings,
10751089
.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
@@ -1097,4 +1111,6 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
10971111
.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
10981112
.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
10991113
.program_trap_handler_settings = program_trap_handler_settings,
1114+
.hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
1115+
.hqd_reset = kgd_gfx_v10_hqd_reset
11001116
};

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,12 @@ void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
5656
uint32_t grace_period,
5757
uint32_t *reg_offset,
5858
uint32_t *reg_data);
59+
uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev,
60+
uint32_t pipe_id,
61+
uint32_t queue_id,
62+
uint32_t inst);
63+
uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev,
64+
uint32_t pipe_id,
65+
uint32_t queue_id,
66+
uint32_t inst,
67+
unsigned int utimeout);

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -680,5 +680,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
680680
.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
681681
.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
682682
.set_address_watch = kgd_gfx_v10_set_address_watch,
683-
.clear_address_watch = kgd_gfx_v10_clear_address_watch
683+
.clear_address_watch = kgd_gfx_v10_clear_address_watch,
684+
.hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
685+
.hqd_reset = kgd_gfx_v10_hqd_reset
684686
};

0 commit comments

Comments
 (0)