From f75782e4e067fd68249635699cb20dfe0489d743 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 14 Sep 2015 18:16:02 +0100 Subject: [PATCH 01/14] block: kmemleak: Track the page allocations for struct request The pages allocated for struct request contain pointers to other slab allocations (via ops->init_request). Since kmemleak does not track/scan page allocations, the slab objects will be reported as leaks (false positives). This patch adds kmemleak callbacks to allow tracking of such pages. Signed-off-by: Catalin Marinas Reported-by: Bart Van Assche Tested-by: Bart Van Assche Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index f2d67b4047a0..2077f0d2f95f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1438,6 +1439,11 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_init_rq_map(). + */ + kmemleak_free(page_address(page)); __free_pages(page, page->private); } @@ -1510,6 +1516,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, list_add_tail(&page->lru, &tags->page_list); p = page_address(page); + /* + * Allow kmemleak to scan these pages as they contain pointers + * to additional allocations like via ops->init_request(). + */ + kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, set->queue_depth - i); left -= to_do * rq_size; From 5948edbcbf43759586cdf2656d293ea7de310280 Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Tue, 15 Sep 2015 08:27:25 -0600 Subject: [PATCH 02/14] fs/mpage.c: forgotten WRITE_SYNC in case of data integrity write In case of wbc->sync_mode == WB_SYNC_ALL we need to do data integrity write, thus mark request as WRITE_SYNC. akpm: afaict this change will cause the data integrity write bios to be placed onto the second queue in cfq_io_cq.cfqq[], which presumably results in special treatment. The documentation for REQ_SYNC is horrid. Signed-off-by: Roman Pen Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Reviewed-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/mpage.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index 778a4ddef77a..2ebf91652ecb 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -482,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -590,7 +591,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); alloc_new: if (bio == NULL) { @@ -617,7 +618,7 @@ alloc_new: wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); goto alloc_new; } @@ -627,7 +628,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -639,7 +640,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); @@ -695,8 +696,11 @@ mpage_writepages(struct address_space *mapping, }; ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } } blk_finish_plug(&plug); return ret; @@ -713,8 +717,11 @@ int mpage_writepage(struct page *page, get_block_t get_block, .use_writepage = 0, }; int ret = __mpage_writepage(page, wbc, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } return ret; } EXPORT_SYMBOL(mpage_writepage); From 53cbf3b157a0428d40989ab1c7df9228a1976fc2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 17 Aug 2015 10:31:46 +0800 Subject: [PATCH 03/14] fs: direct-io: don't dirtying pages for ITER_BVEC/ITER_KVEC direct read When direct read IO is submitted from kernel, it is often unnecessary to dirty pages, for example of loop, dirtying pages have been considered in the upper filesystem(over loop) side already, and they don't need to be dirtied again. So this patch doesn't dirtying pages for ITER_BVEC/ITER_KVEC direct read, and loop should be the 1st case to use ITER_BVEC/ITER_KVEC for direct read I/O. The patch is based on previous Dave's patch. Reviewed-by: Dave Kleikamp Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- fs/direct-io.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 11256291642e..3ae0e0427191 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -120,6 +120,7 @@ struct dio { int page_errors; /* errno from get_user_pages() */ int is_async; /* is IO async ? */ bool defer_completion; /* defer AIO completion to workqueue? */ + bool should_dirty; /* if pages should be dirtied */ int io_error; /* IO error in completion path */ unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ @@ -393,7 +394,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (dio->is_async && dio->rw == READ) + if (dio->is_async && dio->rw == READ && dio->should_dirty) bio_set_pages_dirty(bio); if (sdio->submit_io) @@ -464,14 +465,15 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (bio->bi_error) dio->io_error = -EIO; - if (dio->is_async && dio->rw == READ) { + if (dio->is_async && dio->rw == READ && dio->should_dirty) { bio_check_pages_dirty(bio); /* transfers ownership */ err = bio->bi_error; } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (dio->rw == READ && !PageCompound(page)) + if (dio->rw == READ && !PageCompound(page) && + dio->should_dirty) set_page_dirty_lock(page); page_cache_release(page); } @@ -1219,6 +1221,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, spin_lock_init(&dio->bio_lock); dio->refcount = 1; + dio->should_dirty = (iter->type == ITER_IOVEC); sdio.iter = iter; sdio.final_block_in_request = (offset + iov_iter_count(iter)) >> blkbits; From 8ee1b7b9d8761a7b1a0f37a6e596b5315d27d2f2 Mon Sep 17 00:00:00 2001 From: Kosuke Tatsukawa Date: Fri, 9 Oct 2015 00:35:38 +0000 Subject: [PATCH 04/14] blk-mq: fix waitqueue_active without memory barrier in block/blk-mq-tag.c blk_mq_tag_update_depth() seems to be missing a memory barrier which might cause the waker to not notice the waiter and fail to send a wake_up as in the following figure. blk_mq_tag_update_depth bt_get ------------------------------------------------------------------------ if (waitqueue_active(&bs->wait)) /* The CPU might reorder the test for the waitqueue up here, before prior writes complete */ prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); tag = __bt_get(hctx, bt, last_tag, tags); /* Value set in bt_update_count not visible yet */ bt_update_count(&tags->bitmap_tags, tdepth); /* blk_mq_tag_wakeup_all(tags, false); */ bt = &tags->bitmap_tags; wake_index = atomic_read(&bt->wake_index); ... io_schedule(); ------------------------------------------------------------------------ This patch adds the missing memory barrier. I found this issue when I was looking through the linux source code for places calling waitqueue_active() before wake_up*(), but without preceding memory barriers, after sending a patch to fix a similar issue in drivers/tty/n_tty.c (Details about the original issue can be found here: https://lkml.org/lkml/2015/9/28/849). Signed-off-by: Kosuke Tatsukawa Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ed96474d75cb..7a6b6e27fc26 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -75,6 +75,10 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) struct blk_mq_bitmap_tags *bt; int i, wake_index; + /* + * Make sure all changes prior to this are visible from other CPUs. + */ + smp_mb(); bt = &tags->bitmap_tags; wake_index = atomic_read(&bt->wake_index); for (i = 0; i < BT_WAIT_QUEUES; i++) { From 3380f4589f6d9725e275525fd0580c8ee2b5cbbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Oct 2015 18:57:06 +0200 Subject: [PATCH 05/14] blk-mq: remove unused blk_mq_clone_flush_request prototype Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-mq.h b/block/blk-mq.h index f4fea7964910..b44dce165761 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -29,8 +29,6 @@ void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); -void blk_mq_clone_flush_request(struct request *flush_rq, - struct request *orig_rq); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); From 0809e3ac62319dc7534b64f95ac37e230d740e8a Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 20 Oct 2015 23:13:51 +0800 Subject: [PATCH 06/14] block: fix plug list flushing for nomerge queues Request queues with merging disabled will not flush the plug list after BLK_MAX_REQUEST_COUNT requests have been queued, since the code relies on blk_attempt_plug_merge to compute the request_count. Fix this by computing the number of queued requests even for nomerge queues. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-core.c | 32 +++++++++++++++++++++++++++++--- block/blk-mq.c | 9 ++++++--- block/blk.h | 1 + 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2eb722d48773..f0ae087ead06 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1594,6 +1594,30 @@ out: return ret; } +unsigned int blk_plug_queued_count(struct request_queue *q) +{ + struct blk_plug *plug; + struct request *rq; + struct list_head *plug_list; + unsigned int ret = 0; + + plug = current->plug; + if (!plug) + goto out; + + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry(rq, plug_list, queuelist) { + if (rq->q == q) + ret++; + } +out: + return ret; +} + void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; @@ -1641,9 +1665,11 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (!blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return; + if (!blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return; + } else + request_count = blk_plug_queued_count(q); spin_lock_irq(q->queue_lock); diff --git a/block/blk-mq.c b/block/blk-mq.c index d921cd5177f5..9683a561efcd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1268,9 +1268,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) - return; + if (!is_flush_fua && !blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, + &same_queue_rq)) + return; + } else + request_count = blk_plug_queued_count(q); rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) diff --git a/block/blk.h b/block/blk.h index 98614ad37c81..aa27d0292af1 100644 --- a/block/blk.h +++ b/block/blk.h @@ -86,6 +86,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count, struct request **same_queue_rq); +unsigned int blk_plug_queued_count(struct request_queue *q); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); From bdced438acd83ad83a6c6fc7f50099b820245ddb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:52 +0800 Subject: [PATCH 07/14] block: setup bi_phys_segments after splitting The number of bio->bi_phys_segments is always obtained during bio splitting, so it is natural to setup it just after bio splitting, then we can avoid to compute nr_segment again during merge. Reviewed-by: Jeff Moyer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index c4e9c37f3e38..22293fdf6514 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -11,13 +11,16 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, struct bio *bio, - struct bio_set *bs) + struct bio_set *bs, + unsigned *nsegs) { unsigned int max_discard_sectors, granularity; int alignment; sector_t tmp; unsigned split_sectors; + *nsegs = 1; + /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); @@ -51,8 +54,11 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, static struct bio *blk_bio_write_same_split(struct request_queue *q, struct bio *bio, - struct bio_set *bs) + struct bio_set *bs, + unsigned *nsegs) { + *nsegs = 1; + if (!q->limits.max_write_same_sectors) return NULL; @@ -64,7 +70,8 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, - struct bio_set *bs) + struct bio_set *bs, + unsigned *segs) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; @@ -106,22 +113,30 @@ new_segment: sectors += bv.bv_len >> 9; } + *segs = nsegs; return NULL; split: + *segs = nsegs; return bio_split(bio, sectors, GFP_NOIO, bs); } void blk_queue_split(struct request_queue *q, struct bio **bio, struct bio_set *bs) { - struct bio *split; + struct bio *split, *res; + unsigned nsegs; if ((*bio)->bi_rw & REQ_DISCARD) - split = blk_bio_discard_split(q, *bio, bs); + split = blk_bio_discard_split(q, *bio, bs, &nsegs); else if ((*bio)->bi_rw & REQ_WRITE_SAME) - split = blk_bio_write_same_split(q, *bio, bs); + split = blk_bio_write_same_split(q, *bio, bs, &nsegs); else - split = blk_bio_segment_split(q, *bio, q->bio_split); + split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); + + /* physical segments can be figured out during splitting */ + res = split ? split : *bio; + res->bi_phys_segments = nsegs; + bio_set_flag(res, BIO_SEG_VALID); if (split) { bio_chain(split, *bio); From 6ac45aeb6bcad38a2783a7d6e5da4c469497eeb0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:53 +0800 Subject: [PATCH 08/14] block: avoid to merge splitted bio The splitted bio has been already too fat to merge, so mark it as NOMERGE. Reviewed-by: Jeff Moyer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 22293fdf6514..de5716d8e525 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -139,6 +139,9 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, bio_set_flag(res, BIO_SEG_VALID); if (split) { + /* there isn't chance to merge the splitted bio */ + split->bi_rw |= REQ_NOMERGE; + bio_chain(split, *bio); generic_make_request(*bio); *bio = split; From e18378a60e27ad7b3e11ecc4e2c92159585dee68 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:54 +0800 Subject: [PATCH 09/14] blk-mq: check bio_mergeable() early before merging It isn't necessary to try to merge the bio which is marked as NOMERGE. Reviewed-by: Jeff Moyer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9683a561efcd..d38371160019 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1140,7 +1140,7 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - if (!hctx_allow_merges(hctx)) { + if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: From 7460d389c01741f0dfff733af93d3b3abd9b974e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:55 +0800 Subject: [PATCH 10/14] block: check bio_mergeable() early before merging After bio splitting is introduced, one bio can be splitted and it is marked as NOMERGE because it is too fat to be merged, so check bio_mergeable() earlier to avoid to try to merge it unnecessarily. Signed-off-by: Ming Lei Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/elevator.c b/block/elevator.c index 84d63943f2de..c3555c9c672f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -420,7 +420,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * noxmerges: Only simple one-hit cache try * merges: All merge tries attempted */ - if (blk_queue_nomerges(q)) + if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return ELEVATOR_NO_MERGE; /* From 676d06077f964f06af52c19e59f0409a8880612f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:56 +0800 Subject: [PATCH 11/14] blk-mq: fix for trace_block_plug() The trace point is for tracing plug event of each request queue instead of each task, so we should check the request count in the plug list from current queue instead of current task. Signed-off-by: Ming Lei Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d38371160019..24c528f182ea 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1380,7 +1380,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) plug = current->plug; if (plug) { blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) + if (!request_count) trace_block_plug(q); else if (request_count >= BLK_MAX_REQUEST_COUNT) { blk_flush_plug_list(plug, false); From cfd0c552a8272d691691f40073654d775836e23a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 20 Oct 2015 23:13:57 +0800 Subject: [PATCH 12/14] blk-mq: mark ctx as pending at batch in flush plug path Most of times, flush plug should be the hottest I/O path, so mark ctx as pending after all requests in the list are inserted. Reviewed-by: Jeff Moyer Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 24c528f182ea..159e69bd2c3c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -990,18 +990,25 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_queue); -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, + bool at_head) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - trace_block_rq_insert(hctx->queue, rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); +} +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool at_head) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + + __blk_mq_insert_req_list(hctx, ctx, rq, at_head); blk_mq_hctx_mark_pending(hctx, ctx); } @@ -1057,8 +1064,9 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq, false); + __blk_mq_insert_req_list(hctx, ctx, rq, false); } + blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); blk_mq_run_hw_queue(hctx, from_schedule); From cdea01b2bf98affb7e9c44530108a4a28535eee8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 30 Oct 2015 05:25:59 +0900 Subject: [PATCH 13/14] blktrace: re-write setting q->blk_trace This is really about simplifying the double xchg patterns into a single cmpxchg, with the same logic. Other than the immediate cleanup, there are some subtleties this change deals with: (i) While the load of the old bt is fully ordered wrt everything, ie: old_bt = xchg(&q->blk_trace, bt); [barrier] if (old_bt) (void) xchg(&q->blk_trace, old_bt); [barrier] blk_trace could still be changed between the xchg and the old_bt load. Note that this description is merely theoretical and afaict very small, but doing everything in a single context with cmpxchg closes this potential race. (ii) Ordering guarantees are obviously kept with cmpxchg. (iii) Gets rid of the hacky-by-nature (void)xchg pattern. Signed-off-by: Davidlohr Bueso eviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 90e72a0c3047..e3a26188b95e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -437,7 +437,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts) { - struct blk_trace *old_bt, *bt = NULL; + struct blk_trace *bt = NULL; struct dentry *dir = NULL; int ret; @@ -519,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->trace_state = Blktrace_setup; ret = -EBUSY; - old_bt = xchg(&q->blk_trace, bt); - if (old_bt) { - (void) xchg(&q->blk_trace, old_bt); + if (cmpxchg(&q->blk_trace, NULL, bt)) goto err; - } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); @@ -1481,7 +1478,7 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, struct block_device *bdev) { - struct blk_trace *old_bt, *bt = NULL; + struct blk_trace *bt = NULL; int ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); @@ -1497,12 +1494,9 @@ static int blk_trace_setup_queue(struct request_queue *q, blk_trace_setup_lba(bt, bdev); - old_bt = xchg(&q->blk_trace, bt); - if (old_bt != NULL) { - (void)xchg(&q->blk_trace, old_bt); - ret = -EBUSY; + ret = -EBUSY; + if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); From 2404e607a9ee36db361bebe32787dafa1f7d6c00 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 3 Nov 2015 10:40:06 -0500 Subject: [PATCH 14/14] blk-mq: avoid excessive boot delays with large lun counts Hi, Zhangqing Luo reported long boot times on a system with thousands of LUNs when scsi-mq was enabled. He narrowed the problem down to blk_mq_add_queue_tag_set, where every queue is frozen in order to set the BLK_MQ_F_TAG_SHARED flag. Each added device will freeze all queues added before it in sequence, which involves waiting for an RCU grace period for each one. We don't need to do this. After the second queue is added, only new queues need to be initialized with the shared tag. We can do that by percolating the flag up to the blk_mq_tag_set, and updating the newly added queue's hctxs if the flag is set. This problem was introduced by commit 0d2602ca30e41 (blk-mq: improve support for shared tags maps). Reported-and-tested-by: Jason Luo Reviewed-by: Ming Lei Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-mq.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 159e69bd2c3c..22db728dbe24 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1695,7 +1695,7 @@ static int blk_mq_init_hctx(struct request_queue *q, INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; hctx->queue_num = hctx_idx; - hctx->flags = set->flags; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1882,27 +1882,26 @@ static void blk_mq_map_swqueue(struct request_queue *q, } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) +static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - struct request_queue *q; - bool shared; int i; - if (set->tag_list.next == set->tag_list.prev) - shared = false; - else - shared = true; + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } +} + +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) +{ + struct request_queue *q; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_freeze_queue(q); - - queue_for_each_hw_ctx(q, hctx, i) { - if (shared) - hctx->flags |= BLK_MQ_F_TAG_SHARED; - else - hctx->flags &= ~BLK_MQ_F_TAG_SHARED; - } + queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q); } } @@ -1913,7 +1912,12 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); - blk_mq_update_tag_set_depth(set); + if (list_is_singular(&set->tag_list)) { + /* just transitioned to unshared */ + set->flags &= ~BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, false); + } mutex_unlock(&set->tag_list_lock); } @@ -1923,8 +1927,17 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, q->tag_set = set; mutex_lock(&set->tag_list_lock); + + /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ + if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { + set->flags |= BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, true); + } + if (set->flags & BLK_MQ_F_TAG_SHARED) + queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); - blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); }