Skip to content

Commit

Permalink
drm/xe: Introduce the wedged_mode debugfs
Browse files Browse the repository at this point in the history
This allows the wedged mode to be selected per device at runtime,
before running tests or before reproducing an issue.

v2: - s/busted/wedged
    - some locking consistency

v3: - remove mutex
    - toggle guc reset policy on any mode change

Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240423221817.1285081-4-rodrigo.vivi@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
  • Loading branch information
rodrigovivi committed Apr 24, 2024
1 parent 8ed9aaa commit 6b8ef44
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 23 deletions.
55 changes: 55 additions & 0 deletions drivers/gpu/drm/xe/xe_debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_gt_debugfs.h"
#include "xe_gt_printk.h"
#include "xe_guc_ads.h"
#include "xe_pm.h"
#include "xe_sriov.h"
#include "xe_step.h"
Expand Down Expand Up @@ -117,6 +119,56 @@ static const struct file_operations forcewake_all_fops = {
.release = forcewake_release,
};

/* Debugfs read handler: report the device's current wedged.mode to userspace. */
static ssize_t wedged_mode_show(struct file *f, char __user *ubuf,
				size_t size, loff_t *pos)
{
	struct xe_device *xe = file_inode(f)->i_private;
	char buf[32];
	int len = scnprintf(buf, sizeof(buf), "%d\n", xe->wedged.mode);

	return simple_read_from_buffer(ubuf, size, pos, buf, len);
}

/*
 * Debugfs write handler: select the wedged mode (0 = never wedge, 1 = wedge
 * on critical error, 2 = wedge on any timeout, no engine reset) and push the
 * matching GuC engine-reset policy to every GT.
 *
 * Sending the policy update is an H2G CT message, so the device must be
 * awake: take a runtime-PM reference around the GT loop.  If the GuC rejects
 * the update, restore the previous mode so xe->wedged.mode never disagrees
 * with the policy actually programmed into the GuC.
 */
static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
			       size_t size, loff_t *pos)
{
	struct xe_device *xe = file_inode(f)->i_private;
	struct xe_gt *gt;
	u32 wedged_mode;
	int old_mode;
	ssize_t ret;
	u8 id;

	ret = kstrtouint_from_user(ubuf, size, 0, &wedged_mode);
	if (ret)
		return ret;

	/* Only modes 0, 1 and 2 are defined. */
	if (wedged_mode > 2)
		return -EINVAL;

	if (xe->wedged.mode == wedged_mode)
		return 0;

	old_mode = xe->wedged.mode;
	xe->wedged.mode = wedged_mode;

	xe_pm_runtime_get(xe);
	for_each_gt(gt, xe, id) {
		ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads);
		if (ret) {
			xe_gt_err(gt, "Failed to update GuC ADS scheduler policy. GuC may still cause engine reset even with wedged_mode=2\n");
			/* Keep the stored mode consistent with the GuC policy. */
			xe->wedged.mode = old_mode;
			xe_pm_runtime_put(xe);
			return -EIO;
		}
	}
	xe_pm_runtime_put(xe);

	return size;
}

/* File operations for the <debugfs>/dri/<n>/wedged_mode entry. */
static const struct file_operations wedged_mode_fops = {
	.owner = THIS_MODULE,
	.read = wedged_mode_show,
	.write = wedged_mode_set,
};

void xe_debugfs_register(struct xe_device *xe)
{
struct ttm_device *bdev = &xe->ttm;
Expand All @@ -134,6 +186,9 @@ void xe_debugfs_register(struct xe_device *xe)
debugfs_create_file("forcewake_all", 0400, root, xe,
&forcewake_all_fops);

debugfs_create_file("wedged_mode", 0400, root, xe,
&wedged_mode_fops);

for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
man = ttm_manager_type(bdev, mem_type);

Expand Down
10 changes: 7 additions & 3 deletions drivers/gpu/drm/xe/xe_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,8 @@ int xe_device_probe_early(struct xe_device *xe)
if (err)
return err;

xe->wedged.mode = xe_modparam.wedged_mode;

return 0;
}

Expand Down Expand Up @@ -769,7 +771,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
* xe_device_declare_wedged - Declare device wedged
* @xe: xe device instance
*
* This is a final state that can only be cleared with a module
 * This is a final state that can only be cleared with a module
* re-probe (unbind + bind).
* In this state every IOCTL will be blocked so the GT cannot be used.
* In general it will be called upon any critical error such as gt reset
Expand All @@ -781,10 +783,12 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
*/
void xe_device_declare_wedged(struct xe_device *xe)
{
if (xe_modparam.wedged_mode == 0)
if (xe->wedged.mode == 0) {
drm_dbg(&xe->drm, "Wedged mode is forcebly disabled\n");
return;
}

if (!atomic_xchg(&xe->wedged, 1)) {
if (!atomic_xchg(&xe->wedged.flag, 1)) {
xe->needs_flr_on_fini = true;
drm_err(&xe->drm,
"CRITICAL: Xe has declared device %s as wedged.\n"
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/xe/xe_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);

/**
 * xe_device_wedged - Check whether the device has been declared wedged
 * @xe: xe device instance
 *
 * Return: true if xe_device_declare_wedged() has marked this device as
 * wedged, false otherwise.
 */
static inline bool xe_device_wedged(struct xe_device *xe)
{
	return atomic_read(&xe->wedged.flag) != 0;
}

void xe_device_declare_wedged(struct xe_device *xe);
Expand Down
9 changes: 7 additions & 2 deletions drivers/gpu/drm/xe/xe_device_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -459,8 +459,13 @@ struct xe_device {
/** @needs_flr_on_fini: requests function-reset on fini */
bool needs_flr_on_fini;

/** @wedged: Xe device faced a critical error and is now blocked. */
atomic_t wedged;
/** @wedged: Struct to control Wedged States and mode */
struct {
/** @wedged.flag: Xe device faced a critical error and is now blocked. */
atomic_t flag;
/** @wedged.mode: Mode controlled by kernel parameter and debugfs */
int mode;
} wedged;

/* private: */

Expand Down
60 changes: 58 additions & 2 deletions drivers/gpu/drm/xe/xe_guc_ads.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,19 @@

#include <generated/xe_wa_oob.h>

#include "abi/guc_actions_abi.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_gt_regs.h"
#include "regs/xe_guc_regs.h"
#include "xe_bo.h"
#include "xe_gt.h"
#include "xe_gt_ccs_mode.h"
#include "xe_guc.h"
#include "xe_guc_ct.h"
#include "xe_hw_engine.h"
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mmio.h"
#include "xe_module.h"
#include "xe_platform_types.h"
#include "xe_wa.h"

Expand Down Expand Up @@ -441,14 +442,15 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)

static void guc_policies_init(struct xe_guc_ads *ads)
{
struct xe_device *xe = ads_to_xe(ads);
u32 global_flags = 0;

ads_blob_write(ads, policies.dpc_promote_time,
GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US);
ads_blob_write(ads, policies.max_num_work_items,
GLOBAL_POLICY_MAX_NUM_WI);

if (xe_modparam.wedged_mode == 2)
if (xe->wedged.mode == 2)
global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;

ads_blob_write(ads, policies.global_flags, global_flags);
Expand Down Expand Up @@ -806,3 +808,57 @@ void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads)
{
guc_populate_golden_lrc(ads);
}

/* Ask the GuC to reload its scheduling policies from @policy_offset (GGTT). */
static int guc_ads_action_update_policies(struct xe_guc_ads *ads, u32 policy_offset)
{
	u32 action[] = {
		XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE,
		policy_offset,
	};

	return xe_guc_ct_send(&ads_to_guc(ads)->ct, action, ARRAY_SIZE(action), 0, 0);
}

/**
 * xe_guc_ads_scheduler_policy_toggle_reset - Toggle reset policy
 * @ads: Additional data structures object
 *
 * This function updates the GuC's engine reset policy based on wedged.mode:
 * with wedged.mode == 2 engine resets are disabled, otherwise they are
 * (re-)enabled. The updated policy blob is copied into a GGTT-mapped BO and
 * the GuC is told to reload it.
 *
 * Return: 0 on success, and negative error code otherwise.
 */
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads)
{
	struct xe_device *xe = ads_to_xe(ads);
	struct xe_gt *gt = ads_to_gt(ads);
	struct xe_tile *tile = gt_to_tile(gt);
	struct guc_policies *policies;
	struct xe_bo *bo;
	int ret = 0;

	/* kzalloc so any field we do not explicitly set is sent as zero */
	policies = kzalloc(sizeof(*policies), GFP_KERNEL);
	if (!policies)
		return -ENOMEM;

	policies->dpc_promote_time = ads_blob_read(ads, policies.dpc_promote_time);
	policies->max_num_work_items = ads_blob_read(ads, policies.max_num_work_items);
	policies->is_valid = 1;
	/*
	 * Seed global_flags from the current ADS contents so only the
	 * engine-reset bit is toggled; |=/&= on a freshly kmalloc'ed field
	 * would operate on an uninitialized value.
	 */
	policies->global_flags = ads_blob_read(ads, policies.global_flags);
	if (xe->wedged.mode == 2)
		policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
	else
		policies->global_flags &= ~GLOBAL_POLICY_DISABLE_ENGINE_RESET;

	bo = xe_managed_bo_create_from_data(xe, tile, policies, sizeof(struct guc_policies),
					    XE_BO_FLAG_VRAM_IF_DGFX(tile) |
					    XE_BO_FLAG_GGTT);
	if (IS_ERR(bo)) {
		ret = PTR_ERR(bo);
		goto out;
	}

	ret = guc_ads_action_update_policies(ads, xe_bo_ggtt_addr(bo));
out:
	kfree(policies);
	return ret;
}
1 change: 1 addition & 0 deletions drivers/gpu/drm/xe/xe_guc_ads.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads);
void xe_guc_ads_populate(struct xe_guc_ads *ads);
void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads);
void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads);
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads);

#endif
35 changes: 20 additions & 15 deletions drivers/gpu/drm/xe/xe_guc_submit.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
#include "xe_macros.h"
#include "xe_map.h"
#include "xe_mocs.h"
#include "xe_module.h"
#include "xe_ring_ops_types.h"
#include "xe_sched_job.h"
#include "xe_trace.h"
Expand Down Expand Up @@ -868,26 +867,38 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
xe_sched_tdr_queue_imm(&q->guc->sched);
}

static void guc_submit_wedged(struct xe_guc *guc)
static bool guc_submit_hint_wedged(struct xe_guc *guc)
{
struct xe_device *xe = guc_to_xe(guc);
struct xe_exec_queue *q;
unsigned long index;
int err;

xe_device_declare_wedged(guc_to_xe(guc));
if (xe->wedged.mode != 2)
return false;

if (xe_device_wedged(xe))
return true;

xe_device_declare_wedged(xe);

xe_guc_submit_reset_prepare(guc);
xe_guc_ct_stop(&guc->ct);

err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
guc_submit_wedged_fini, guc);
if (err)
return;
if (err) {
drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n");
return true; /* Device is wedged anyway */
}

mutex_lock(&guc->submission_state.lock);
xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
if (xe_exec_queue_get_unless_zero(q))
set_exec_queue_wedged(q);
mutex_unlock(&guc->submission_state.lock);

return true;
}

static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
Expand All @@ -898,15 +909,12 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
struct xe_guc *guc = exec_queue_to_guc(q);
struct xe_device *xe = guc_to_xe(guc);
struct xe_gpu_scheduler *sched = &ge->sched;
bool wedged = xe_device_wedged(xe);
bool wedged;

xe_assert(xe, xe_exec_queue_is_lr(q));
trace_xe_exec_queue_lr_cleanup(q);

if (!wedged && xe_modparam.wedged_mode == 2) {
guc_submit_wedged(exec_queue_to_guc(q));
wedged = true;
}
wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));

/* Kill the run_job / process_msg entry points */
xe_sched_submission_stop(sched);
Expand Down Expand Up @@ -957,7 +965,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
int err = -ETIME;
int i = 0;
bool wedged = xe_device_wedged(xe);
bool wedged;

/*
* TDR has fired before free job worker. Common if exec queue
Expand All @@ -981,10 +989,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)

trace_xe_sched_job_timedout(job);

if (!wedged && xe_modparam.wedged_mode == 2) {
guc_submit_wedged(exec_queue_to_guc(q));
wedged = true;
}
wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));

/* Kill the run_job entry point */
xe_sched_submission_stop(sched);
Expand Down

0 comments on commit 6b8ef44

Please sign in to comment.