From 15bfd21fbc5d35834b9ea383dc458a1f0c9e3434 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 26 Jun 2018 09:14:58 -0600 Subject: [PATCH 1/6] block: Fix transfer when chunk sectors exceeds max A device may have boundary restrictions where the number of sectors between boundaries exceeds its max transfer size. In this case, we need to cap the max size to the smaller of the two limits. Reported-by: Jitendra Bhivare Tested-by: Jitendra Bhivare Cc: Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9154570edf29..79226ca8f80f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1119,8 +1119,8 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, if (!q->limits.chunk_sectors) return q->limits.max_sectors; - return q->limits.chunk_sectors - - (offset & (q->limits.chunk_sectors - 1)); + return min(q->limits.max_sectors, (unsigned int)(q->limits.chunk_sectors - + (offset & (q->limits.chunk_sectors - 1)))); } static inline unsigned int blk_rq_get_max_sectors(struct request *rq, From 682630f00a219a1b0696abe9c0967e660068187b Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 25 Jun 2018 20:58:17 +0300 Subject: [PATCH 2/6] nvme-rdma: fix possible double free of controller async event buffer If reconnect/reset failed where the controller async event buffer was freed, we might end up freeing it again as we call nvme_rdma_destroy_admin_queue again in the remove path. Given that the sequence is guaranteed to serialize by .ctrl_stop, we simply set ctrl->async_event_sqe.data to NULL and don't free it in future visits. Reported-by: Max Gurtovoy Tested-by: Max Gurtovoy Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 9544625c0b7d..518c5b09038c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -732,8 +732,11 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_cleanup_queue(ctrl->ctrl.admin_q); nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); } - nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, - sizeof(struct nvme_command), DMA_TO_DEVICE); + if (ctrl->async_event_sqe.data) { + nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + ctrl->async_event_sqe.data = NULL; + } nvme_rdma_free_queue(&ctrl->queues[0]); } From 297ba57dcdec7ea37e702bcf1a577ac32a034e21 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 27 Jun 2018 12:55:18 -0700 Subject: [PATCH 3/6] block: Fix cloning of requests with a special payload This patch avoids that removing a path controlled by the dm-mpath driver while mkfs is running triggers the following kernel bug: kernel BUG at block/blk-core.c:3347! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 20 PID: 24369 Comm: mkfs.ext4 Not tainted 4.18.0-rc1-dbg+ #2 RIP: 0010:blk_end_request_all+0x68/0x70 Call Trace: dm_softirq_done+0x326/0x3d0 [dm_mod] blk_done_softirq+0x19b/0x1e0 __do_softirq+0x128/0x60d irq_exit+0x100/0x110 smp_call_function_single_interrupt+0x90/0x330 call_function_single_interrupt+0xf/0x20 Fixes: f9d03f96b988 ("block: improve handling of the magic discard payload") Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Acked-by: Mike Snitzer Signed-off-by: Bart Van Assche Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index afd2596ea3d3..f84a9b7b6f5a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3473,6 +3473,10 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) dst->cpu = src->cpu; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); + if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { + dst->rq_flags |= RQF_SPECIAL_PAYLOAD; + dst->special_vec = src->special_vec; + } dst->nr_phys_segments = src->nr_phys_segments; dst->ioprio = src->ioprio; dst->extra_len = src->extra_len; From 1f57f8d442f8017587eeebd8617913bfc3661d3d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Jun 2018 11:54:01 -0600 Subject: [PATCH 4/6] blk-mq: don't queue more if we get a busy return Some devices have different queue limits depending on the type of IO. A classic case is SATA NCQ, where some commands can queue, but others cannot. If we have NCQ commands inflight and encounter a non-queueable command, the driver returns busy. Currently we attempt to dispatch more from the scheduler, if we were able to queue some commands. But for the case where we ended up stopping due to BUSY, we should not attempt to retrieve more from the scheduler. If we do, we can get into a situation where we attempt to queue a non-queueable command, get BUSY, then successfully retrieve more commands from that scheduler and queue those. This can repeat forever, starving the non-queuable command indefinitely. Fix this by NOT attempting to pull more commands from the scheduler, if we get a BUSY return. This should also be more optimal in terms of letting requests stay in the scheduler for as long as possible, if we get a BUSY due to the regular out-of-tags condition. Reviewed-by: Omar Sandoval Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index b6888ff556cf..d394cdd8d8c6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1075,6 +1075,9 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ +/* + * Returns true if we did some work AND can potentially do more. + */ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool got_budget) { @@ -1205,8 +1208,17 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, blk_mq_run_hw_queue(hctx, true); else if (needs_restart && (ret == BLK_STS_RESOURCE)) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); + + return false; } + /* + * If the host/device is unable to accept more work, inform the + * caller of that. + */ + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) + return false; + return (queued + errors) != 0; } From fad2d4ef636654e926d374ef038f4cd4286661f6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 Jun 2018 15:51:30 -0700 Subject: [PATCH 5/6] drbd: Fix drbd_request_prepare() discard handling Fix the test that verifies whether bio_op(bio) represents a discard or write zeroes operation. Compile-tested only. Cc: Philipp Reisner Cc: Lars Ellenberg Fixes: 7435e9018f91 ("drbd: zero-out partial unaligned discards on local backend") Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index a47e4987ee46..d146fedc38bb 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1244,8 +1244,8 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long _drbd_start_io_acct(device, req); /* process discards always from our submitter thread */ - if ((bio_op(bio) & REQ_OP_WRITE_ZEROES) || - (bio_op(bio) & REQ_OP_DISCARD)) + if (bio_op(bio) == REQ_OP_WRITE_ZEROES || + bio_op(bio) == REQ_OP_DISCARD) goto queue_for_submitter_thread; if (rw == WRITE && req->private_bio && req->i.size From 9544bc5347207a68eb308cc8aaaed6c3a687cabd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Jun 2018 08:48:06 -0600 Subject: [PATCH 6/6] sg: remove ->sg_magic member This was introduced more than a decade ago when sg chaining was added, but we never really caught anything with it. The scatterlist entry size can be critical, since drivers allocate it, so remove the magic member. Recently it's been triggering allocation stalls and failures in NVMe. Tested-by: Jordan Glover Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/gpu/drm/i915/i915_drv.h | 3 --- include/linux/scatterlist.h | 18 ------------------ lib/scatterlist.c | 6 ------ tools/virtio/linux/scatterlist.h | 18 ------------------ 4 files changed, 45 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 34c125e2d90c..9180f67746b4 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2238,9 +2238,6 @@ static inline struct scatterlist *____sg_next(struct scatterlist *sg) **/ static inline struct scatterlist *__sg_next(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif return sg_is_last(sg) ? NULL : ____sg_next(sg); } diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 51f52020ad5f..093aa57120b0 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -9,9 +9,6 @@ #include struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif unsigned long page_link; unsigned int offset; unsigned int length; @@ -64,7 +61,6 @@ struct sg_table { * */ -#define SG_MAGIC 0x87654321 #define SG_CHAIN 0x01UL #define SG_END 0x02UL @@ -98,7 +94,6 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) */ BUG_ON((unsigned long) page & (SG_CHAIN | SG_END)); #ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); BUG_ON(sg_is_chain(sg)); #endif sg->page_link = page_link | (unsigned long) page; @@ -129,7 +124,6 @@ static inline void sg_set_page(struct scatterlist *sg, struct page *page, static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); BUG_ON(sg_is_chain(sg)); #endif return (struct page *)((sg)->page_link & ~(SG_CHAIN | SG_END)); @@ -195,9 +189,6 @@ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, **/ static inline void sg_mark_end(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif /* * Set termination bit, clear potential chain bit */ @@ -215,9 +206,6 @@ static inline void sg_mark_end(struct scatterlist *sg) **/ static inline void sg_unmark_end(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif sg->page_link &= ~SG_END; } @@ -260,12 +248,6 @@ static inline void *sg_virt(struct scatterlist *sg) static inline void sg_init_marker(struct scatterlist *sgl, unsigned int nents) { -#ifdef CONFIG_DEBUG_SG - unsigned int i; - - for (i = 0; i < nents; i++) - sgl[i].sg_magic = SG_MAGIC; -#endif sg_mark_end(&sgl[nents - 1]); } diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 06dad7a072fd..d4ae67d6cd1e 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -24,9 +24,6 @@ **/ struct scatterlist *sg_next(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif if (sg_is_last(sg)) return NULL; @@ -111,10 +108,7 @@ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) for_each_sg(sgl, sg, nents, i) ret = sg; -#ifdef CONFIG_DEBUG_SG - BUG_ON(sgl[0].sg_magic != SG_MAGIC); BUG_ON(!sg_is_last(ret)); -#endif return ret; } EXPORT_SYMBOL(sg_last); diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h index 9a45f90e2d08..369ee308b668 100644 --- a/tools/virtio/linux/scatterlist.h +++ b/tools/virtio/linux/scatterlist.h @@ -36,7 +36,6 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) */ BUG_ON((unsigned long) page & 0x03); #ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); BUG_ON(sg_is_chain(sg)); #endif sg->page_link = page_link | (unsigned long) page; @@ -67,7 +66,6 @@ static inline void sg_set_page(struct scatterlist *sg, struct page *page, static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); BUG_ON(sg_is_chain(sg)); #endif return (struct page *)((sg)->page_link & ~0x3); @@ -116,9 +114,6 @@ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, **/ static inline void sg_mark_end(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif /* * Set termination bit, clear potential chain bit */ @@ -136,17 +131,11 @@ static inline void sg_mark_end(struct scatterlist *sg) **/ static inline void sg_unmark_end(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif sg->page_link &= ~0x02; } static inline struct scatterlist *sg_next(struct scatterlist *sg) { -#ifdef CONFIG_DEBUG_SG - BUG_ON(sg->sg_magic != SG_MAGIC); -#endif if (sg_is_last(sg)) return NULL; @@ -160,13 +149,6 @@ static inline struct scatterlist *sg_next(struct scatterlist *sg) static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) { memset(sgl, 0, sizeof(*sgl) * nents); -#ifdef CONFIG_DEBUG_SG - { - unsigned int i; - for (i = 0; i < nents; i++) - sgl[i].sg_magic = SG_MAGIC; - } -#endif sg_mark_end(&sgl[nents - 1]); }