scsi: mpi3mr: Graceful handling of surprise removal of PCIe HBA
Implement graceful handling of surprise or orderly removal of the PCIe HBA:
- Detect a hot removal of the controller at certain critical places in the driver. Early detection helps to reduce the time taken to clean up the hot-removed controller at the driver level.
- Poll the status of the port enable issued after reset once every 5 seconds to avoid a long delay in detecting an unavailable controller.

Link: https://lore.kernel.org/r/20220912135742.11764-5-sreekanth.reddy@broadcom.com
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Sreekanth Reddy <sreekanth.reddy@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
This commit is contained in:
Родитель
7f9f953d53
Коммит
f2a79d2030
|
@ -118,6 +118,7 @@ extern atomic64_t event_counter;
|
|||
/* command/controller interaction timeout definitions in seconds */
|
||||
#define MPI3MR_INTADMCMD_TIMEOUT 60
|
||||
#define MPI3MR_PORTENABLE_TIMEOUT 300
|
||||
#define MPI3MR_PORTENABLE_POLL_INTERVAL 5
|
||||
#define MPI3MR_ABORTTM_TIMEOUT 60
|
||||
#define MPI3MR_RESETTM_TIMEOUT 60
|
||||
#define MPI3MR_RESET_HOST_IOWAIT_TIMEOUT 5
|
||||
|
@ -1389,4 +1390,6 @@ void mpi3mr_print_device_event_notice(struct mpi3mr_ioc *mrioc,
|
|||
void mpi3mr_refresh_sas_ports(struct mpi3mr_ioc *mrioc);
|
||||
void mpi3mr_refresh_expanders(struct mpi3mr_ioc *mrioc);
|
||||
void mpi3mr_add_event_wait_for_device_refresh(struct mpi3mr_ioc *mrioc);
|
||||
void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc);
|
||||
void mpi3mr_flush_cmds_for_unrecovered_controller(struct mpi3mr_ioc *mrioc);
|
||||
#endif /*MPI3MR_H_INCLUDED*/
|
||||
|
|
|
@ -431,6 +431,9 @@ static int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc)
|
|||
return 0;
|
||||
|
||||
do {
|
||||
if (mrioc->unrecoverable)
|
||||
break;
|
||||
|
||||
mrioc->admin_req_ci = le16_to_cpu(reply_desc->request_queue_ci);
|
||||
mpi3mr_process_admin_reply_desc(mrioc, reply_desc, &reply_dma);
|
||||
if (reply_dma)
|
||||
|
@ -516,6 +519,9 @@ int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc,
|
|||
}
|
||||
|
||||
do {
|
||||
if (mrioc->unrecoverable)
|
||||
break;
|
||||
|
||||
req_q_idx = le16_to_cpu(reply_desc->request_queue_id) - 1;
|
||||
op_req_q = &mrioc->req_qinfo[req_q_idx];
|
||||
|
||||
|
@ -577,7 +583,8 @@ int mpi3mr_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
|
|||
|
||||
mrioc = (struct mpi3mr_ioc *)shost->hostdata;
|
||||
|
||||
if ((mrioc->reset_in_progress || mrioc->prepare_for_reset))
|
||||
if ((mrioc->reset_in_progress || mrioc->prepare_for_reset ||
|
||||
mrioc->unrecoverable))
|
||||
return 0;
|
||||
|
||||
num_entries = mpi3mr_process_op_reply_q(mrioc,
|
||||
|
@ -673,7 +680,7 @@ static irqreturn_t mpi3mr_isr_poll(int irq, void *privdata)
|
|||
|
||||
/* Poll for pending IOs completions */
|
||||
do {
|
||||
if (!mrioc->intr_enabled)
|
||||
if (!mrioc->intr_enabled || mrioc->unrecoverable)
|
||||
break;
|
||||
|
||||
if (!midx)
|
||||
|
@ -1220,6 +1227,14 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
|
|||
msleep(100);
|
||||
} while (--timeout);
|
||||
|
||||
if (!pci_device_is_present(mrioc->pdev)) {
|
||||
mrioc->unrecoverable = 1;
|
||||
ioc_err(mrioc,
|
||||
"controller is not present while waiting to reset\n");
|
||||
retval = -1;
|
||||
goto out_device_not_present;
|
||||
}
|
||||
|
||||
ioc_state = mpi3mr_get_iocstate(mrioc);
|
||||
ioc_info(mrioc,
|
||||
"controller is in %s state after waiting to reset\n",
|
||||
|
@ -1277,6 +1292,13 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
|
|||
mpi3mr_iocstate_name(ioc_state));
|
||||
return 0;
|
||||
}
|
||||
if (!pci_device_is_present(mrioc->pdev)) {
|
||||
mrioc->unrecoverable = 1;
|
||||
ioc_err(mrioc,
|
||||
"controller is not present at the bringup\n");
|
||||
retval = -1;
|
||||
goto out_device_not_present;
|
||||
}
|
||||
msleep(100);
|
||||
} while (--timeout);
|
||||
|
||||
|
@ -1285,6 +1307,7 @@ out_failed:
|
|||
ioc_err(mrioc,
|
||||
"failed to bring to ready state, current state: %s\n",
|
||||
mpi3mr_iocstate_name(ioc_state));
|
||||
out_device_not_present:
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
@ -2223,6 +2246,17 @@ void mpi3mr_check_rh_fault_ioc(struct mpi3mr_ioc *mrioc, u32 reason_code)
|
|||
{
|
||||
u32 ioc_status, host_diagnostic, timeout;
|
||||
|
||||
if (mrioc->unrecoverable) {
|
||||
ioc_err(mrioc, "controller is unrecoverable\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!pci_device_is_present(mrioc->pdev)) {
|
||||
mrioc->unrecoverable = 1;
|
||||
ioc_err(mrioc, "controller is not present\n");
|
||||
return;
|
||||
}
|
||||
|
||||
ioc_status = readl(&mrioc->sysif_regs->ioc_status);
|
||||
if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) ||
|
||||
(ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) {
|
||||
|
@ -2414,9 +2448,21 @@ static void mpi3mr_watchdog_work(struct work_struct *work)
|
|||
u32 fault, host_diagnostic, ioc_status;
|
||||
u32 reset_reason = MPI3MR_RESET_FROM_FAULT_WATCH;
|
||||
|
||||
if (mrioc->reset_in_progress || mrioc->unrecoverable)
|
||||
if (mrioc->reset_in_progress)
|
||||
return;
|
||||
|
||||
if (!mrioc->unrecoverable && !pci_device_is_present(mrioc->pdev)) {
|
||||
ioc_err(mrioc, "watchdog could not detect the controller\n");
|
||||
mrioc->unrecoverable = 1;
|
||||
}
|
||||
|
||||
if (mrioc->unrecoverable) {
|
||||
ioc_err(mrioc,
|
||||
"flush pending commands for unrecoverable controller\n");
|
||||
mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
|
||||
return;
|
||||
}
|
||||
|
||||
if (mrioc->ts_update_counter++ >= MPI3MR_TSUPDATE_INTERVAL) {
|
||||
mrioc->ts_update_counter = 0;
|
||||
mpi3mr_sync_timestamp(mrioc);
|
||||
|
@ -2460,7 +2506,7 @@ static void mpi3mr_watchdog_work(struct work_struct *work)
|
|||
ioc_info(mrioc,
|
||||
"controller requires system power cycle, marking controller as unrecoverable\n");
|
||||
mrioc->unrecoverable = 1;
|
||||
return;
|
||||
goto schedule_work;
|
||||
case MPI3_SYSIF_FAULT_CODE_SOFT_RESET_IN_PROGRESS:
|
||||
return;
|
||||
case MPI3_SYSIF_FAULT_CODE_CI_ACTIVATION_RESET:
|
||||
|
@ -3396,10 +3442,13 @@ out_failed:
|
|||
static void mpi3mr_port_enable_complete(struct mpi3mr_ioc *mrioc,
|
||||
struct mpi3mr_drv_cmd *drv_cmd)
|
||||
{
|
||||
drv_cmd->state = MPI3MR_CMD_NOTUSED;
|
||||
drv_cmd->callback = NULL;
|
||||
mrioc->scan_failed = drv_cmd->ioc_status;
|
||||
mrioc->scan_started = 0;
|
||||
if (drv_cmd->state & MPI3MR_CMD_RESET)
|
||||
mrioc->scan_failed = MPI3_IOCSTATUS_INTERNAL_ERROR;
|
||||
else
|
||||
mrioc->scan_failed = drv_cmd->ioc_status;
|
||||
drv_cmd->state = MPI3MR_CMD_NOTUSED;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -3897,8 +3946,12 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume)
|
|||
int retval = 0;
|
||||
u8 retry = 0;
|
||||
struct mpi3_ioc_facts_data facts_data;
|
||||
u32 pe_timeout, ioc_status;
|
||||
|
||||
retry_init:
|
||||
pe_timeout =
|
||||
(MPI3MR_PORTENABLE_TIMEOUT / MPI3MR_PORTENABLE_POLL_INTERVAL);
|
||||
|
||||
dprint_reset(mrioc, "bringing up the controller to ready state\n");
|
||||
retval = mpi3mr_bring_ioc_ready(mrioc);
|
||||
if (retval) {
|
||||
|
@ -3994,11 +4047,46 @@ retry_init:
|
|||
}
|
||||
|
||||
ioc_info(mrioc, "sending port enable\n");
|
||||
retval = mpi3mr_issue_port_enable(mrioc, 0);
|
||||
retval = mpi3mr_issue_port_enable(mrioc, 1);
|
||||
if (retval) {
|
||||
ioc_err(mrioc, "failed to issue port enable\n");
|
||||
goto out_failed;
|
||||
}
|
||||
do {
|
||||
ssleep(MPI3MR_PORTENABLE_POLL_INTERVAL);
|
||||
if (mrioc->init_cmds.state == MPI3MR_CMD_NOTUSED)
|
||||
break;
|
||||
if (!pci_device_is_present(mrioc->pdev))
|
||||
mrioc->unrecoverable = 1;
|
||||
if (mrioc->unrecoverable) {
|
||||
retval = -1;
|
||||
goto out_failed_noretry;
|
||||
}
|
||||
ioc_status = readl(&mrioc->sysif_regs->ioc_status);
|
||||
if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) ||
|
||||
(ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) {
|
||||
mpi3mr_print_fault_info(mrioc);
|
||||
mrioc->init_cmds.is_waiting = 0;
|
||||
mrioc->init_cmds.callback = NULL;
|
||||
mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED;
|
||||
goto out_failed;
|
||||
}
|
||||
} while (--pe_timeout);
|
||||
|
||||
if (!pe_timeout) {
|
||||
ioc_err(mrioc, "port enable timed out\n");
|
||||
mpi3mr_check_rh_fault_ioc(mrioc,
|
||||
MPI3MR_RESET_FROM_PE_TIMEOUT);
|
||||
mrioc->init_cmds.is_waiting = 0;
|
||||
mrioc->init_cmds.callback = NULL;
|
||||
mrioc->init_cmds.state = MPI3MR_CMD_NOTUSED;
|
||||
goto out_failed;
|
||||
} else if (mrioc->scan_failed) {
|
||||
ioc_err(mrioc,
|
||||
"port enable failed with status=0x%04x\n",
|
||||
mrioc->scan_failed);
|
||||
} else
|
||||
ioc_info(mrioc, "port enable completed successfully\n");
|
||||
|
||||
ioc_info(mrioc, "controller %s completed successfully\n",
|
||||
(is_resume)?"resume":"re-initialization");
|
||||
|
@ -4417,7 +4505,7 @@ static inline void mpi3mr_drv_cmd_comp_reset(struct mpi3mr_ioc *mrioc,
|
|||
*
|
||||
* Return: Nothing.
|
||||
*/
|
||||
static void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc)
|
||||
void mpi3mr_flush_drv_cmds(struct mpi3mr_ioc *mrioc)
|
||||
{
|
||||
struct mpi3mr_drv_cmd *cmdptr;
|
||||
u8 i;
|
||||
|
@ -4850,6 +4938,7 @@ out:
|
|||
mrioc->unrecoverable = 1;
|
||||
mrioc->reset_in_progress = 0;
|
||||
retval = -1;
|
||||
mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
|
||||
}
|
||||
mrioc->prev_reset_result = retval;
|
||||
mutex_unlock(&mrioc->reset_mutex);
|
||||
|
|
|
@ -582,6 +582,39 @@ void mpi3mr_flush_host_io(struct mpi3mr_ioc *mrioc)
|
|||
mrioc->flush_io_count);
|
||||
}
|
||||
|
||||
/**
|
||||
* mpi3mr_flush_cmds_for_unrecovered_controller - Flush all pending cmds
|
||||
* @mrioc: Adapter instance reference
|
||||
*
|
||||
* This function waits for currently running IO poll threads to
|
||||
* exit and then flushes all host I/Os and any internal pending
|
||||
* cmds. This is executed after controller is marked as
|
||||
* unrecoverable.
* It does nothing when mrioc->unrecoverable is not set. The wait on
* each operational reply queue is a busy-wait (udelay), not a sleep.
|
||||
*
|
||||
* Return: Nothing.
|
||||
*/
|
||||
void mpi3mr_flush_cmds_for_unrecovered_controller(struct mpi3mr_ioc *mrioc)
|
||||
{
|
||||
struct Scsi_Host *shost = mrioc->shost;
|
||||
int i;
|
||||

|
||||
/* Only meaningful once the controller has been marked unrecoverable. */
if (!mrioc->unrecoverable)
|
||||
return;
|
||||

|
||||
/* Skip the per-queue drain when no operational reply queue info exists. */
if (mrioc->op_reply_qinfo) {
|
||||
for (i = 0; i < mrioc->num_queues; i++) {
|
||||
/*
* Spin in 500us steps until the queue's in_use counter reads zero.
* NOTE(review): presumably in_use is held by the reply-queue
* processing/poll paths - confirm against the callers.
*/
while (atomic_read(&mrioc->op_reply_qinfo[i].in_use))
|
||||
udelay(500);
|
||||
/* Queue is idle now; clear its pending I/O count. */
atomic_set(&mrioc->op_reply_qinfo[i].pend_ios, 0);
|
||||
}
|
||||
}
|
||||
/* Reset the flush counter before walking the busy tags. */
mrioc->flush_io_count = 0;
|
||||
/* Invoke mpi3mr_flush_scmd on every busy request in the host tag set. */
blk_mq_tagset_busy_iter(&shost->tag_set,
|
||||
mpi3mr_flush_scmd, (void *)mrioc);
|
||||
/* Finally flush the delayed command lists and the driver-internal cmds. */
mpi3mr_flush_delayed_cmd_lists(mrioc);
|
||||
mpi3mr_flush_drv_cmds(mrioc);
|
||||
}
|
||||
|
||||
/**
|
||||
* mpi3mr_alloc_tgtdev - target device allocator
|
||||
*
|
||||
|
@ -1815,6 +1848,13 @@ static void mpi3mr_fwevt_bh(struct mpi3mr_ioc *mrioc,
|
|||
if (mrioc->stop_drv_processing)
|
||||
goto out;
|
||||
|
||||
if (mrioc->unrecoverable) {
|
||||
dprint_event_bh(mrioc,
|
||||
"ignoring event(0x%02x) in bottom half handler due to unrecoverable controller\n",
|
||||
fwevt->event_id);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!fwevt->process_evt)
|
||||
goto evt_ack;
|
||||
|
||||
|
@ -5024,6 +5064,11 @@ static void mpi3mr_remove(struct pci_dev *pdev)
|
|||
while (mrioc->reset_in_progress || mrioc->is_driver_loading)
|
||||
ssleep(1);
|
||||
|
||||
if (!pci_device_is_present(mrioc->pdev)) {
|
||||
mrioc->unrecoverable = 1;
|
||||
mpi3mr_flush_cmds_for_unrecovered_controller(mrioc);
|
||||
}
|
||||
|
||||
mpi3mr_bsg_exit(mrioc);
|
||||
mrioc->stop_drv_processing = 1;
|
||||
mpi3mr_cleanup_fwevt_list(mrioc);
|
||||
|
|
Загрузка…
Ссылка в новой задаче