Skip to content

Commit b3bd076

Browse files
Moshe Shemesh authored and Saeed Mahameed committed
net/mlx5: Report devlink health on FW fatal issues
Report devlink health on FW fatal issues via fw_fatal_reporter. The driver recovery flow for a FW fatal error is now handled by devlink health.

With the recovery controlled by devlink health, the user has the ability to cancel the auto-recovery for a debug session and run it manually.

Call mlx5_enter_error_state() before calling devlink_health_report() to ensure the device enters the error state even if auto-recovery is off.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
1 parent 9b1f298 commit b3bd076

File tree

3 files changed: +31 -23 lines changed

drivers/net/ethernet/mellanox/mlx5/core/health.c

Lines changed: 26 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -327,19 +327,6 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
327327
return 0;
328328
}
329329

330-
static void health_recover_work(struct work_struct *work)
331-
{
332-
struct mlx5_core_health *health;
333-
struct mlx5_core_dev *dev;
334-
struct mlx5_priv *priv;
335-
336-
health = container_of(work, struct mlx5_core_health, work);
337-
priv = container_of(health, struct mlx5_priv, health);
338-
dev = container_of(priv, struct mlx5_core_dev, priv);
339-
340-
mlx5_health_try_recover(dev);
341-
}
342-
343330
static const char *hsynd_str(u8 synd)
344331
{
345332
switch (synd) {
@@ -614,6 +601,29 @@ mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter,
614601
return err;
615602
}
616603

604+
static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
605+
{
606+
struct mlx5_fw_reporter_ctx fw_reporter_ctx;
607+
struct mlx5_core_health *health;
608+
struct mlx5_core_dev *dev;
609+
struct mlx5_priv *priv;
610+
611+
health = container_of(work, struct mlx5_core_health, fatal_report_work);
612+
priv = container_of(health, struct mlx5_priv, health);
613+
dev = container_of(priv, struct mlx5_core_dev, priv);
614+
615+
mlx5_enter_error_state(dev, false);
616+
if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
617+
if (mlx5_health_try_recover(dev))
618+
mlx5_core_err(dev, "health recovery failed\n");
619+
return;
620+
}
621+
fw_reporter_ctx.err_synd = health->synd;
622+
fw_reporter_ctx.miss_counter = health->miss_counter;
623+
devlink_health_report(health->fw_fatal_reporter,
624+
"FW fatal error reported", &fw_reporter_ctx);
625+
}
626+
617627
static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
618628
.name = "fw_fatal",
619629
.recover = mlx5_fw_fatal_reporter_recover,
@@ -672,7 +682,7 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
672682

673683
spin_lock_irqsave(&health->wq_lock, flags);
674684
if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
675-
queue_work(health->wq, &health->work);
685+
queue_work(health->wq, &health->fatal_report_work);
676686
else
677687
mlx5_core_err(dev, "new health works are not permitted at this stage\n");
678688
spin_unlock_irqrestore(&health->wq_lock, flags);
@@ -758,7 +768,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
758768
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
759769
spin_unlock_irqrestore(&health->wq_lock, flags);
760770
cancel_work_sync(&health->report_work);
761-
cancel_work_sync(&health->work);
771+
cancel_work_sync(&health->fatal_report_work);
762772
}
763773

764774
void mlx5_health_flush(struct mlx5_core_dev *dev)
@@ -795,7 +805,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
795805
if (!health->wq)
796806
goto out_err;
797807
spin_lock_init(&health->wq_lock);
798-
INIT_WORK(&health->work, health_recover_work);
808+
INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work);
799809
INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
800810

801811
return 0;

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1363,11 +1363,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
13631363
mlx5_enter_error_state(dev, false);
13641364
mlx5_error_sw_reset(dev);
13651365
mlx5_unload_one(dev, false);
1366-
/* In case of kernel call drain the health wq */
1367-
if (state) {
1368-
mlx5_drain_health_wq(dev);
1369-
mlx5_pci_disable_device(dev);
1370-
}
1366+
mlx5_drain_health_wq(dev);
1367+
mlx5_pci_disable_device(dev);
13711368

13721369
return state == pci_channel_io_perm_failure ?
13731370
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
@@ -1535,7 +1532,8 @@ MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table);
15351532

15361533
void mlx5_disable_device(struct mlx5_core_dev *dev)
15371534
{
1538-
mlx5_pci_err_detected(dev->pdev, 0);
1535+
mlx5_error_sw_reset(dev);
1536+
mlx5_unload_one(dev, false);
15391537
}
15401538

15411539
void mlx5_recover_device(struct mlx5_core_dev *dev)

include/linux/mlx5/driver.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -442,7 +442,7 @@ struct mlx5_core_health {
442442
spinlock_t wq_lock;
443443
struct workqueue_struct *wq;
444444
unsigned long flags;
445-
struct work_struct work;
445+
struct work_struct fatal_report_work;
446446
struct work_struct report_work;
447447
struct delayed_work recover_work;
448448
struct devlink_health_reporter *fw_reporter;

0 commit comments

Comments
 (0)