From 19fce0470f05031e6af36e49ce222d0f0050d432 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 1 Dec 2020 17:52:43 -0800 Subject: [PATCH 1/8] nvme-fc: avoid calling _nvme_fc_abort_outstanding_ios from interrupt context Recent patches changed calling sequences. nvme_fc_abort_outstanding_ios used to be called from a timeout or work context. Now it is being called in an io completion context, which can be an interrupt handler. Unfortunately, the abort outstanding ios routine attempts to stop nvme queues and nested routines that may try to sleep, which is in conflict with the interrupt handler. Correct replacing the direct call with a work element scheduling, and the abort outstanding ios routine will be called in the work element. Fixes: 95ced8a2c72d ("nvme-fc: eliminate terminate_io use by nvme_fc_error_recovery") Signed-off-by: James Smart Reported-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 38373a0e86ef..5f36cfa8136c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -166,6 +166,7 @@ struct nvme_fc_ctrl { struct blk_mq_tag_set admin_tag_set; struct blk_mq_tag_set tag_set; + struct work_struct ioerr_work; struct delayed_work connect_work; struct kref ref; @@ -1888,6 +1889,15 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, } } +static void +nvme_fc_ctrl_ioerr_work(struct work_struct *work) +{ + struct nvme_fc_ctrl *ctrl = + container_of(work, struct nvme_fc_ctrl, ioerr_work); + + nvme_fc_error_recovery(ctrl, "transport detected io error"); +} + static void nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) { @@ -2046,7 +2056,7 @@ done: check_error: if (terminate_assoc) - nvme_fc_error_recovery(ctrl, "transport detected io error"); + queue_work(nvme_reset_wq, &ctrl->ioerr_work); } static int @@ -3233,6 +3243,7 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + cancel_work_sync(&ctrl->ioerr_work); cancel_delayed_work_sync(&ctrl->connect_work); /* * kill the association on the link side. this will block @@ -3449,6 +3460,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); + INIT_WORK(&ctrl->ioerr_work, nvme_fc_ctrl_ioerr_work); spin_lock_init(&ctrl->lock); /* io queue count */ @@ -3540,6 +3552,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, fail_ctrl: nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); + cancel_work_sync(&ctrl->ioerr_work); cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); From 2b54996b7d56badc563755840838614f2fa9c4de Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 7 Dec 2020 12:29:40 -0800 Subject: [PATCH 2/8] nvme-fcloop: Fix sscanf type and list_first_entry_or_null warnings Kernel robot had the following warnings: >> fcloop.c:1506:6: warning: %x in format string (no. 1) requires >> 'unsigned int *' but the argument type is 'signed int *'. >> [invalidScanfArgType_int] >> if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3) >> ^ Resolve by changing opcode from and int to an unsigned int and >> fcloop.c:1632:32: warning: Uninitialized variable: lport [uninitvar] >> ret = __wait_localport_unreg(lport); >> ^ >> fcloop.c:1615:28: warning: Uninitialized variable: nport [uninitvar] >> ret = __remoteport_unreg(nport, rport); >> ^ These aren't actual issues as the values are assigned prior to use. It appears the tool doesn't understand list_first_entry_or_null(). Regardless, quiet the tool by initializing the pointers to NULL at declaration. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 733d9363900e..68213f0a052b 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1501,7 +1501,8 @@ static ssize_t fcloop_set_cmd_drop(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int opcode, starting, amount; + unsigned int opcode; + int starting, amount; if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3) return -EBADRQC; @@ -1588,8 +1589,8 @@ out_destroy_class: static void __exit fcloop_exit(void) { - struct fcloop_lport *lport; - struct fcloop_nport *nport; + struct fcloop_lport *lport = NULL; + struct fcloop_nport *nport = NULL; struct fcloop_tport *tport; struct fcloop_rport *rport; unsigned long flags; From 7ee5c78ca3895d44e918c38332921983ed678be0 Mon Sep 17 00:00:00 2001 From: Gopal Tiwari Date: Fri, 4 Dec 2020 21:46:57 +0530 Subject: [PATCH 3/8] nvme-pci: mark Samsung PM1725a as IGNORE_DEV_SUBNQN A system with more than one of these SSDs will only have one usable. Hence the kernel fails to detect nvme devices due to duplicate cntlids. [ 6.274554] nvme nvme1: Duplicate cntlid 33 with nvme0, rejecting [ 6.274566] nvme nvme1: Removing after probe failure status: -22 Adding the NVME_QUIRK_IGNORE_DEV_SUBNQN quirk to resolves the issue. Signed-off-by: Gopal Tiwari Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b4385cb0ff60..553871e6962b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3196,7 +3196,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ From 5c11f7d9f843bdd24cd29b95401938bc3f168070 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 21 Dec 2020 00:03:39 -0800 Subject: [PATCH 4/8] nvme-tcp: Fix possible race of io_work and direct send We may send a request (with or without its data) from two paths: 1. From our I/O context nvme_tcp_io_work which is triggered from: - queue_rq - r2t reception - socket data_ready and write_space callbacks 2. Directly from queue_rq if the send_list is empty (because we want to save the context switch associated with scheduling our io_work). However, given that now we have the send_mutex, we may run into a race condition where none of these contexts will send the pending payload to the controller. Both io_work send path and queue_rq send path opportunistically attempt to acquire the send_mutex however queue_rq only attempts to send a single request, and if io_work context fails to acquire the send_mutex it will complete without rescheduling itself. The race can trigger with the following sequence: 1. queue_rq sends request (no incapsule data) and blocks 2. RX path receives r2t - prepares data PDU to send, adds h2cdata PDU to the send_list and schedules io_work 3. io_work triggers and cannot acquire the send_mutex - because of (1), ends without self rescheduling 4. queue_rq completes the send, and completes ==> no context will send the h2cdata - timeout. Fix this by having queue_rq sending as much as it can from the send_list such that if it still has any left, its because the socket buffer is full and the socket write_space callback will trigger, thus guaranteeing that a context will be scheduled to send the h2cdata PDU. Fixes: db5ad6b7f8cd ("nvme-tcp: try to send request in queue_rq context") Reported-by: Potnuri Bharat Teja Reported-by: Samuel Jones Signed-off-by: Sagi Grimberg Tested-by: Potnuri Bharat Teja Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 1ba659927442..979ee31b8dd1 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -262,6 +262,16 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, } } +static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue) +{ + int ret; + + /* drain the send queue as much as we can... */ + do { + ret = nvme_tcp_try_send(queue); + } while (ret > 0); +} + static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, bool sync, bool last) { @@ -279,7 +289,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, if (queue->io_cpu == smp_processor_id() && sync && empty && mutex_trylock(&queue->send_mutex)) { queue->more_requests = !last; - nvme_tcp_try_send(queue); + nvme_tcp_send_all(queue); queue->more_requests = false; mutex_unlock(&queue->send_mutex); } else if (last) { From 62df80165d7f197c9c0652e7416164f294a96661 Mon Sep 17 00:00:00 2001 From: Lalithambika Krishnakumar Date: Wed, 23 Dec 2020 14:09:00 -0800 Subject: [PATCH 5/8] nvme: avoid possible double fetch in handling CQE While handling the completion queue, keep a local copy of the command id from the DMA-accessible completion entry. This silences a time-of-check to time-of-use (TOCTOU) warning from KF/x[1], with respect to a Thunderclap[2] vulnerability analysis. The double-read impact appears benign. There may be a theoretical window for @command_id to be used as an adversary-controlled array-index-value for mounting a speculative execution attack, but that mitigation is saved for a potential follow-on. A man-in-the-middle attack on the data payload is out of scope for this analysis and is hopefully mitigated by filesystem integrity mechanisms. [1] https://github.com/intel/kernel-fuzzer-for-xen-project [2] http://thunderclap.io/thunderclap-paper-ndss2019.pdf Signed-off-by: Lalithambika Krishna Kumar Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 553871e6962b..50d9a20568a2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -967,6 +967,7 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq) static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) { struct nvme_completion *cqe = &nvmeq->cqes[idx]; + __u16 command_id = READ_ONCE(cqe->command_id); struct request *req; /* @@ -975,17 +976,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. */ - if (unlikely(nvme_is_aen_req(nvmeq->qid, cqe->command_id))) { + if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) { nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status, &cqe->result); return; } - req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id); + req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id); if (unlikely(!req)) { dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpu(cqe->sq_id)); + command_id, le16_to_cpu(cqe->sq_id)); return; } From 9b66fc02bec0ca613bc6d4c1d0049f727a95567d Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 30 Dec 2020 20:22:44 +0900 Subject: [PATCH 6/8] nvme: unexport functions with no external caller There are no callers for nvme_reset_ctrl_sync() and nvme_alloc_request_qid() so that we keep the symbols exported. Unexport those functions, mark them static and update the header file respectively. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 ++---- drivers/nvme/host/nvme.h | 3 --- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce1b61519441..70a63d7c1d02 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -179,7 +179,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_reset_ctrl); -int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) { int ret; @@ -192,7 +192,6 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) return ret; } -EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) { @@ -578,7 +577,7 @@ struct request *nvme_alloc_request(struct request_queue *q, } EXPORT_SYMBOL_GPL(nvme_alloc_request); -struct request *nvme_alloc_request_qid(struct request_queue *q, +static struct request *nvme_alloc_request_qid(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) { struct request *req; @@ -589,7 +588,6 @@ struct request *nvme_alloc_request_qid(struct request_queue *q, nvme_init_request(req, cmd); return req; } -EXPORT_SYMBOL_GPL(nvme_alloc_request_qid); static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7e49f61f81df..9c4fbfe44c00 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -610,8 +610,6 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags); -struct request *nvme_alloc_request_qid(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); void nvme_cleanup_cmd(struct request *req); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); @@ -630,7 +628,6 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); -int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); From 9ceb7863537748c67fa43ac4f2f565819bbd36e4 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 5 Jan 2021 10:46:54 +0200 Subject: [PATCH 7/8] nvmet-rdma: Fix list_del corruption on queue establishment failure When a queue is in NVMET_RDMA_Q_CONNECTING state, it may has some requests at rsp_wait_list. In case a disconnect occurs at this state, no one will empty this list and will return the requests to free_rsps list. Normally nvmet_rdma_queue_established() free those requests after moving the queue to NVMET_RDMA_Q_LIVE state, but in this case __nvmet_rdma_queue_disconnect() is called before. The crash happens at nvmet_rdma_free_rsps() when calling list_del(&rsp->free_list), because the request exists only at the wait list. To fix the issue, simply clear rsp_wait_list when destroying the queue. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 5c1e7cb7fe0d..bdfc22eb2a10 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1641,6 +1641,16 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) spin_lock_irqsave(&queue->state_lock, flags); switch (queue->state) { case NVMET_RDMA_Q_CONNECTING: + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *rsp; + + rsp = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, + wait_list); + list_del(&rsp->wait_list); + nvmet_rdma_put_rsp(rsp); + } + fallthrough; case NVMET_RDMA_Q_LIVE: queue->state = NVMET_RDMA_Q_DISCONNECTING; disconnect = true; From 2b59787a223b79228fed9ade1bf6936194ddb8cd Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 5 Jan 2021 10:34:02 +0000 Subject: [PATCH 8/8] nvme: remove the unused status argument from nvme_trace_bio_complete The only used argument in this function is the "req". Signed-off-by: Max Gurtovoy Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/nvme.h | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 70a63d7c1d02..f320273fc672 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -330,7 +330,7 @@ static inline void nvme_end_req(struct request *req) req->__sector = nvme_lba_to_sect(req->q->queuedata, le64_to_cpu(nvme_req(req)->result.u64)); - nvme_trace_bio_complete(req, status); + nvme_trace_bio_complete(req); blk_mq_end_request(req, status); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9c4fbfe44c00..88a6b97247f5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -672,8 +672,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) kblockd_schedule_work(&head->requeue_work); } -static inline void nvme_trace_bio_complete(struct request *req, - blk_status_t status) +static inline void nvme_trace_bio_complete(struct request *req) { struct nvme_ns *ns = req->q->queuedata; @@ -728,8 +727,7 @@ static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { } -static inline void nvme_trace_bio_complete(struct request *req, - blk_status_t status) +static inline void nvme_trace_bio_complete(struct request *req) { } static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,