Skip to content

Commit 344d760

Browse files
shivasharan-szychao66
authored and committed
scsi: megaraid_sas: Add watchdog thread to detect Firmware fault
ANBZ: torvalds#721 commit 3f6194a upstream. Currently the driver checks for a Firmware state change from ISR context, and only when there are interrupts tied with no I/O completions. We have seen multiple cases where doorbell interrupts sent by firmware to indicate a FW state change are not processed by the driver, and it takes a long time for the driver to trigger OCR. And if there are no IOs running, since we only check the FW state as part of the ISR code, the fault goes undetected by the driver and OCR will not be triggered. This patch introduces a separate workqueue that runs every one second to detect the Firmware FAULT state and trigger a reset immediately. As an additional gain, removing PCI reads from the ISR to check the FW state results in improved performance as well. Signed-off-by: Sumit Saxena <sumit.saxena@broadcom.com> Signed-off-by: Shivasharan S <shivasharan.srikanteshwara@broadcom.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com> Reviewed-by: Liu Song <liusong@linux.alibaba.com> Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com> Link: https://gitee.com/anolis/cloud-kernel/pulls/598
1 parent b1fa0c7 commit 344d760

File tree

3 files changed

+165
-55
lines changed

3 files changed

+165
-55
lines changed

drivers/scsi/megaraid/megaraid_sas.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,6 +1549,10 @@ enum FW_BOOT_CONTEXT {
15491549

15501550
#define MR_CAN_HANDLE_64_BIT_DMA_OFFSET (1 << 25)
15511551

1552+
#define MEGASAS_WATCHDOG_THREAD_INTERVAL 1000
1553+
#define MEGASAS_WAIT_FOR_NEXT_DMA_MSECS 20
1554+
#define MEGASAS_WATCHDOG_WAIT_COUNT 50
1555+
15521556
enum MR_ADAPTER_TYPE {
15531557
MFI_SERIES = 1,
15541558
THUNDERBOLT_SERIES = 2,
@@ -2255,7 +2259,9 @@ struct megasas_instance {
22552259
struct megasas_instance_template *instancet;
22562260
struct tasklet_struct isr_tasklet;
22572261
struct work_struct work_init;
2258-
struct work_struct crash_init;
2262+
struct delayed_work fw_fault_work;
2263+
struct workqueue_struct *fw_fault_work_q;
2264+
char fault_handler_work_q_name[48];
22592265

22602266
u8 flag;
22612267
u8 unload;
@@ -2544,7 +2550,6 @@ int megasas_get_target_prop(struct megasas_instance *instance,
25442550
int megasas_set_crash_dump_params(struct megasas_instance *instance,
25452551
u8 crash_buf_state);
25462552
void megasas_free_host_crash_buffer(struct megasas_instance *instance);
2547-
void megasas_fusion_crash_dump_wq(struct work_struct *work);
25482553

25492554
u32 megasas_readl(struct megasas_instance *instance,
25502555
const volatile void __iomem *addr);
@@ -2567,6 +2572,9 @@ int megasas_reset_target_fusion(struct scsi_cmnd *scmd);
25672572
u32 mega_mod64(u64 dividend, u32 divisor);
25682573
int megasas_alloc_fusion_context(struct megasas_instance *instance);
25692574
void megasas_free_fusion_context(struct megasas_instance *instance);
2575+
int megasas_fusion_start_watchdog(struct megasas_instance *instance);
2576+
void megasas_fusion_stop_watchdog(struct megasas_instance *instance);
2577+
25702578
void megasas_set_dma_settings(struct megasas_instance *instance,
25712579
struct megasas_dcmd_frame *dcmd,
25722580
dma_addr_t dma_addr, u32 dma_len);

drivers/scsi/megaraid/megaraid_sas_base.c

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5657,8 +5657,20 @@ static int megasas_init_fw(struct megasas_instance *instance)
56575657
}
56585658
}
56595659

5660+
/*
5661+
* Create and start watchdog thread which will monitor
5662+
* controller state every 1 sec and trigger OCR when
5663+
* it enters fault state
5664+
*/
5665+
if (instance->adapter_type != MFI_SERIES)
5666+
if (megasas_fusion_start_watchdog(instance) != SUCCESS)
5667+
goto fail_start_watchdog;
5668+
56605669
return 0;
56615670

5671+
fail_start_watchdog:
5672+
if (instance->requestorId && !instance->skip_heartbeat_timer_del)
5673+
del_timer_sync(&instance->sriov_heartbeat_timer);
56625674
fail_get_ld_pd_list:
56635675
instance->instancet->disable_intr(instance);
56645676
megasas_destroy_irqs(instance);
@@ -6520,12 +6532,10 @@ static inline void megasas_init_ctrl_params(struct megasas_instance *instance)
65206532
instance->disableOnlineCtrlReset = 1;
65216533
instance->UnevenSpanSupport = 0;
65226534

6523-
if (instance->adapter_type != MFI_SERIES) {
6535+
if (instance->adapter_type != MFI_SERIES)
65246536
INIT_WORK(&instance->work_init, megasas_fusion_ocr_wq);
6525-
INIT_WORK(&instance->crash_init, megasas_fusion_crash_dump_wq);
6526-
} else {
6537+
else
65276538
INIT_WORK(&instance->work_init, process_fw_state_change_wq);
6528-
}
65296539
}
65306540

65316541
/**
@@ -6798,6 +6808,10 @@ megasas_suspend(struct pci_dev *pdev, pm_message_t state)
67986808
if (instance->requestorId && !instance->skip_heartbeat_timer_del)
67996809
del_timer_sync(&instance->sriov_heartbeat_timer);
68006810

6811+
/* Stop the FW fault detection watchdog */
6812+
if (instance->adapter_type != MFI_SERIES)
6813+
megasas_fusion_stop_watchdog(instance);
6814+
68016815
megasas_flush_cache(instance);
68026816
megasas_shutdown_controller(instance, MR_DCMD_HIBERNATE_SHUTDOWN);
68036817

@@ -6933,8 +6947,16 @@ megasas_resume(struct pci_dev *pdev)
69336947
if (megasas_start_aen(instance))
69346948
dev_err(&instance->pdev->dev, "Start AEN failed\n");
69356949

6950+
/* Re-launch FW fault watchdog */
6951+
if (instance->adapter_type != MFI_SERIES)
6952+
if (megasas_fusion_start_watchdog(instance) != SUCCESS)
6953+
goto fail_start_watchdog;
6954+
69366955
return 0;
69376956

6957+
fail_start_watchdog:
6958+
if (instance->requestorId && !instance->skip_heartbeat_timer_del)
6959+
del_timer_sync(&instance->sriov_heartbeat_timer);
69386960
fail_init_mfi:
69396961
megasas_free_ctrl_dma_buffers(instance);
69406962
megasas_free_ctrl_mem(instance);
@@ -7002,6 +7024,10 @@ static void megasas_detach_one(struct pci_dev *pdev)
70027024
if (instance->requestorId && !instance->skip_heartbeat_timer_del)
70037025
del_timer_sync(&instance->sriov_heartbeat_timer);
70047026

7027+
/* Stop the FW fault detection watchdog */
7028+
if (instance->adapter_type != MFI_SERIES)
7029+
megasas_fusion_stop_watchdog(instance);
7030+
70057031
if (instance->fw_crash_state != UNAVAILABLE)
70067032
megasas_free_host_crash_buffer(instance);
70077033
scsi_remove_host(instance->host);

drivers/scsi/megaraid/megaraid_sas_fusion.c

Lines changed: 125 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
#include <linux/mutex.h>
4949
#include <linux/poll.h>
5050
#include <linux/vmalloc.h>
51+
#include <linux/workqueue.h>
5152

5253
#include <scsi/scsi.h>
5354
#include <scsi/scsi_cmnd.h>
@@ -95,6 +96,7 @@ static void megasas_free_rdpq_fusion(struct megasas_instance *instance);
9596
static void megasas_free_reply_fusion(struct megasas_instance *instance);
9697
static inline
9798
void megasas_configure_queue_sizes(struct megasas_instance *instance);
99+
static void megasas_fusion_crash_dump(struct megasas_instance *instance);
98100

99101
/**
100102
* megasas_check_same_4gb_region - check if allocation
@@ -1764,6 +1766,90 @@ megasas_init_adapter_fusion(struct megasas_instance *instance)
17641766
return 1;
17651767
}
17661768

1769+
/**
1770+
* megasas_fault_detect_work - Worker function of
1771+
* FW fault handling workqueue.
* @work: work_struct embedded in the fw_fault_work delayed work of a
* struct megasas_instance; the owning instance is recovered
* from it with container_of().
*
* Reads the FW status register and, when the state field equals
* MFI_STATE_FAULT, either collects a crash dump (both crash dump
* support flags set and the MFI_STATE_DMADONE bit set) or, when
* instance->unload is 0, calls megasas_reset_fusion(). If that reset
* does not return SUCCESS the function logs an error and returns
* without queueing itself again. In every other case it re-queues
* itself on fw_fault_work_q after MEGASAS_WATCHDOG_THREAD_INTERVAL
* milliseconds, provided the queue pointer is still set.
1772+
*/
1773+
static void
1774+
megasas_fault_detect_work(struct work_struct *work)
1775+
{
1776+
struct megasas_instance *instance =
1777+
container_of(work, struct megasas_instance,
1778+
fw_fault_work.work);
1779+
u32 fw_state, dma_state, status;
1780+
1781+
/* Check the fw state */
1782+
fw_state = instance->instancet->read_fw_status_reg(instance) &
1783+
MFI_STATE_MASK;
1784+
1785+
if (fw_state == MFI_STATE_FAULT) {
/* Second read of the status register, this time for the DMA-done bit */
1786+
dma_state = instance->instancet->read_fw_status_reg(instance) &
1787+
MFI_STATE_DMADONE;
1788+
/* Start collecting crash, if DMA bit is done */
1789+
if (instance->crash_dump_drv_support &&
1790+
instance->crash_dump_app_support && dma_state) {
1791+
megasas_fusion_crash_dump(instance);
1792+
} else {
/* No crash dump to collect: reset the controller unless unload is set */
1793+
if (instance->unload == 0) {
1794+
status = megasas_reset_fusion(instance->host, 0);
1795+
if (status != SUCCESS) {
1796+
dev_err(&instance->pdev->dev,
1797+
"Failed from %s %d, do not re-arm timer\n",
1798+
__func__, __LINE__);
1799+
/* Returning here skips the queue_delayed_work() call below */
return;
1800+
}
1801+
}
1802+
}
1803+
}
1804+
1805+
/*
* Re-arm for the next poll. megasas_fusion_stop_watchdog() sets
* fw_fault_work_q to NULL before cancelling the work, so a NULL
* pointer here means the work must not be queued again.
*/
if (instance->fw_fault_work_q)
1806+
queue_delayed_work(instance->fw_fault_work_q,
1807+
&instance->fw_fault_work,
1808+
msecs_to_jiffies(MEGASAS_WATCHDOG_THREAD_INTERVAL));
1809+
}
1810+
1811+
/**
* megasas_fusion_start_watchdog - Create the FW fault workqueue and
* queue the first fault-detection run.
* @instance: Adapter soft state
*
* Does nothing if instance->fw_fault_work_q is already set. Otherwise
* initialises fw_fault_work with megasas_fault_detect_work() as its
* handler, creates a single-threaded workqueue named
* "poll_megasas<host_no>_status" and queues the work to run after
* MEGASAS_WATCHDOG_THREAD_INTERVAL milliseconds.
*
* Return: SUCCESS if the workqueue already existed or was created and
* the work queued; FAILED if the workqueue could not be created.
*/
int
1812+
megasas_fusion_start_watchdog(struct megasas_instance *instance)
1813+
{
1814+
/* Check if the Fault WQ is already started */
1815+
if (instance->fw_fault_work_q)
1816+
return SUCCESS;
1817+
1818+
INIT_DELAYED_WORK(&instance->fw_fault_work, megasas_fault_detect_work);
1819+
1820+
/* Build a per-host queue name; snprintf() bounds it to the name buffer */
snprintf(instance->fault_handler_work_q_name,
1821+
sizeof(instance->fault_handler_work_q_name),
1822+
"poll_megasas%d_status", instance->host->host_no);
1823+
1824+
instance->fw_fault_work_q =
1825+
create_singlethread_workqueue(instance->fault_handler_work_q_name);
1826+
if (!instance->fw_fault_work_q) {
1827+
dev_err(&instance->pdev->dev, "Failed from %s %d\n",
1828+
__func__, __LINE__);
1829+
return FAILED;
1830+
}
1831+
1832+
/* Schedule the first poll; the worker re-queues itself afterwards */
queue_delayed_work(instance->fw_fault_work_q,
1833+
&instance->fw_fault_work,
1834+
msecs_to_jiffies(MEGASAS_WATCHDOG_THREAD_INTERVAL));
1835+
1836+
return SUCCESS;
1837+
}
1838+
1839+
/**
* megasas_fusion_stop_watchdog - Stop the FW fault work and destroy
* its workqueue.
* @instance: Adapter soft state
*
* Does nothing if instance->fw_fault_work_q is NULL. Otherwise the
* queue pointer is cleared first, the delayed work is cancelled
* synchronously, the queue is flushed when the cancel call returned
* false, and the workqueue is destroyed.
*/
void
1840+
megasas_fusion_stop_watchdog(struct megasas_instance *instance)
1841+
{
1842+
struct workqueue_struct *wq;
1843+
1844+
if (instance->fw_fault_work_q) {
1845+
wq = instance->fw_fault_work_q;
1846+
/*
* Clear the pointer before cancelling: megasas_fault_detect_work()
* only re-queues itself while fw_fault_work_q is non-NULL.
*/
instance->fw_fault_work_q = NULL;
1847+
/* NOTE(review): false presumably means no work was pending - confirm against the workqueue API */
if (!cancel_delayed_work_sync(&instance->fw_fault_work))
1848+
flush_workqueue(wq);
1849+
destroy_workqueue(wq);
1850+
}
1851+
}
1852+
17671853
/**
17681854
* map_cmd_status - Maps FW cmd status to OS cmd status
17691855
* @cmd : Pointer to cmd
@@ -3532,7 +3618,7 @@ irqreturn_t megasas_isr_fusion(int irq, void *devp)
35323618
{
35333619
struct megasas_irq_context *irq_context = devp;
35343620
struct megasas_instance *instance = irq_context->instance;
3535-
u32 mfiStatus, fw_state, dma_state;
3621+
u32 mfiStatus;
35363622

35373623
if (instance->mask_interrupts)
35383624
return IRQ_NONE;
@@ -3549,31 +3635,7 @@ irqreturn_t megasas_isr_fusion(int irq, void *devp)
35493635
return IRQ_HANDLED;
35503636
}
35513637

3552-
if (!complete_cmd_fusion(instance, irq_context->MSIxIndex)) {
3553-
instance->instancet->clear_intr(instance);
3554-
/* If we didn't complete any commands, check for FW fault */
3555-
fw_state = instance->instancet->read_fw_status_reg(instance) &
3556-
MFI_STATE_MASK;
3557-
dma_state = instance->instancet->read_fw_status_reg(instance) &
3558-
MFI_STATE_DMADONE;
3559-
if (instance->crash_dump_drv_support &&
3560-
instance->crash_dump_app_support) {
3561-
/* Start collecting crash, if DMA bit is done */
3562-
if ((fw_state == MFI_STATE_FAULT) && dma_state)
3563-
schedule_work(&instance->crash_init);
3564-
else if (fw_state == MFI_STATE_FAULT) {
3565-
if (instance->unload == 0)
3566-
schedule_work(&instance->work_init);
3567-
}
3568-
} else if (fw_state == MFI_STATE_FAULT) {
3569-
dev_warn(&instance->pdev->dev, "Iop2SysDoorbellInt"
3570-
"for scsi%d\n", instance->host->host_no);
3571-
if (instance->unload == 0)
3572-
schedule_work(&instance->work_init);
3573-
}
3574-
}
3575-
3576-
return IRQ_HANDLED;
3638+
return complete_cmd_fusion(instance, irq_context->MSIxIndex);
35773639
}
35783640

35793641
/**
@@ -4821,14 +4883,11 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int reason)
48214883
return retval;
48224884
}
48234885

4824-
/* Fusion Crash dump collection work queue */
4825-
void megasas_fusion_crash_dump_wq(struct work_struct *work)
4886+
void megasas_fusion_crash_dump(struct megasas_instance *instance)
48264887
{
4827-
struct megasas_instance *instance =
4828-
container_of(work, struct megasas_instance, crash_init);
48294888
u32 status_reg;
48304889
u8 partial_copy = 0;
4831-
4890+
int wait = 0;
48324891

48334892
status_reg = instance->instancet->read_fw_status_reg(instance);
48344893

@@ -4855,21 +4914,41 @@ void megasas_fusion_crash_dump_wq(struct work_struct *work)
48554914
"allocated: %d\n", instance->drv_buf_alloc);
48564915
}
48574916

4858-
/*
4859-
* Driver has allocated max buffers, which can be allocated
4860-
* and FW has more crash dump data, then driver will
4861-
* ignore the data.
4862-
*/
4863-
if (instance->drv_buf_index >= (instance->drv_buf_alloc)) {
4864-
dev_info(&instance->pdev->dev, "Driver is done copying "
4865-
"the buffer: %d\n", instance->drv_buf_alloc);
4866-
status_reg |= MFI_STATE_CRASH_DUMP_DONE;
4867-
partial_copy = 1;
4868-
} else {
4869-
memcpy(instance->crash_buf[instance->drv_buf_index],
4870-
instance->crash_dump_buf, CRASH_DMA_BUF_SIZE);
4871-
instance->drv_buf_index++;
4872-
status_reg &= ~MFI_STATE_DMADONE;
4917+
while (!(status_reg & MFI_STATE_CRASH_DUMP_DONE) &&
4918+
(wait < MEGASAS_WATCHDOG_WAIT_COUNT)) {
4919+
if (!(status_reg & MFI_STATE_DMADONE)) {
4920+
/*
4921+
* Next crash dump buffer is not yet DMA'd by FW
4922+
* Check after 20ms. Wait for 1 second for FW to
4923+
* post the next buffer. If not bail out.
4924+
*/
4925+
wait++;
4926+
msleep(MEGASAS_WAIT_FOR_NEXT_DMA_MSECS);
4927+
status_reg = instance->instancet->read_fw_status_reg(
4928+
instance);
4929+
continue;
4930+
}
4931+
4932+
wait = 0;
4933+
if (instance->drv_buf_index >= instance->drv_buf_alloc) {
4934+
dev_info(&instance->pdev->dev,
4935+
"Driver is done copying the buffer: %d\n",
4936+
instance->drv_buf_alloc);
4937+
status_reg |= MFI_STATE_CRASH_DUMP_DONE;
4938+
partial_copy = 1;
4939+
break;
4940+
} else {
4941+
memcpy(instance->crash_buf[instance->drv_buf_index],
4942+
instance->crash_dump_buf, CRASH_DMA_BUF_SIZE);
4943+
instance->drv_buf_index++;
4944+
status_reg &= ~MFI_STATE_DMADONE;
4945+
}
4946+
4947+
writel(status_reg, &instance->reg_set->outbound_scratch_pad_0);
4948+
readl(&instance->reg_set->outbound_scratch_pad_0);
4949+
4950+
msleep(MEGASAS_WAIT_FOR_NEXT_DMA_MSECS);
4951+
status_reg = instance->instancet->read_fw_status_reg(instance);
48734952
}
48744953

48754954
if (status_reg & MFI_STATE_CRASH_DUMP_DONE) {
@@ -4882,9 +4961,6 @@ void megasas_fusion_crash_dump_wq(struct work_struct *work)
48824961
readl(&instance->reg_set->outbound_scratch_pad_0);
48834962
if (!partial_copy)
48844963
megasas_reset_fusion(instance->host, 0);
4885-
} else {
4886-
writel(status_reg, &instance->reg_set->outbound_scratch_pad_0);
4887-
readl(&instance->reg_set->outbound_scratch_pad_0);
48884964
}
48894965
}
48904966

0 commit comments

Comments
 (0)