From f75782e4e067fd68249635699cb20dfe0489d743 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Mon, 14 Sep 2015 18:16:02 +0100
Subject: [PATCH 01/14] block: kmemleak: Track the page allocations for struct
 request

The pages allocated for struct request contain pointers to other slab
allocations (via ops->init_request). Since kmemleak does not track/scan
page allocations, the slab objects will be reported as leaks (false
positives). This patch adds kmemleak callbacks to allow tracking of such
pages.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Tested-by: Bart Van Assche<bart.vanassche@sandisk.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f2d67b4047a0..2077f0d2f95f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -9,6 +9,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/kmemleak.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -1438,6 +1439,11 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
 	while (!list_empty(&tags->page_list)) {
 		page = list_first_entry(&tags->page_list, struct page, lru);
 		list_del_init(&page->lru);
+		/*
+		 * Remove kmemleak object previously allocated in
+		 * blk_mq_init_rq_map().
+		 */
+		kmemleak_free(page_address(page));
 		__free_pages(page, page->private);
 	}
 
@@ -1510,6 +1516,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 		list_add_tail(&page->lru, &tags->page_list);
 
 		p = page_address(page);
+		/*
+		 * Allow kmemleak to scan these pages as they contain pointers
+		 * to additional allocations like via ops->init_request().
+		 */
+		kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
 		entries_per_page = order_to_size(this_order) / rq_size;
 		to_do = min(entries_per_page, set->queue_depth - i);
 		left -= to_do * rq_size;

From 5948edbcbf43759586cdf2656d293ea7de310280 Mon Sep 17 00:00:00 2001
From: Roman Pen <r.peniaev@gmail.com>
Date: Tue, 15 Sep 2015 08:27:25 -0600
Subject: [PATCH 02/14] fs/mpage.c: forgotten WRITE_SYNC in case of data
 integrity write

In case of wbc->sync_mode == WB_SYNC_ALL we need to do data integrity
write, thus mark request as WRITE_SYNC.

akpm: afaict this change will cause the data integrity write bios to be
placed onto the second queue in cfq_io_cq.cfqq[], which presumably results
in special treatment.  The documentation for REQ_SYNC is horrid.

Signed-off-by: Roman Pen <r.peniaev@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/mpage.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/fs/mpage.c b/fs/mpage.c
index 778a4ddef77a..2ebf91652ecb 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -482,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 	struct buffer_head map_bh;
 	loff_t i_size = i_size_read(inode);
 	int ret = 0;
+	int wr = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
 
 	if (page_has_buffers(page)) {
 		struct buffer_head *head = page_buffers(page);
@@ -590,7 +591,7 @@ page_is_mapped:
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
 	if (bio && mpd->last_block_in_bio != blocks[0] - 1)
-		bio = mpage_bio_submit(WRITE, bio);
+		bio = mpage_bio_submit(wr, bio);
 
 alloc_new:
 	if (bio == NULL) {
@@ -617,7 +618,7 @@ alloc_new:
 	wbc_account_io(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
-		bio = mpage_bio_submit(WRITE, bio);
+		bio = mpage_bio_submit(wr, bio);
 		goto alloc_new;
 	}
 
@@ -627,7 +628,7 @@ alloc_new:
 	set_page_writeback(page);
 	unlock_page(page);
 	if (boundary || (first_unmapped != blocks_per_page)) {
-		bio = mpage_bio_submit(WRITE, bio);
+		bio = mpage_bio_submit(wr, bio);
 		if (boundary_block) {
 			write_boundary_block(boundary_bdev,
 					boundary_block, 1 << blkbits);
@@ -639,7 +640,7 @@ alloc_new:
 
 confused:
 	if (bio)
-		bio = mpage_bio_submit(WRITE, bio);
+		bio = mpage_bio_submit(wr, bio);
 
 	if (mpd->use_writepage) {
 		ret = mapping->a_ops->writepage(page, wbc);
@@ -695,8 +696,11 @@ mpage_writepages(struct address_space *mapping,
 		};
 
 		ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
-		if (mpd.bio)
-			mpage_bio_submit(WRITE, mpd.bio);
+		if (mpd.bio) {
+			int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+				  WRITE_SYNC : WRITE);
+			mpage_bio_submit(wr, mpd.bio);
+		}
 	}
 	blk_finish_plug(&plug);
 	return ret;
@@ -713,8 +717,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
 		.use_writepage = 0,
 	};
 	int ret = __mpage_writepage(page, wbc, &mpd);
-	if (mpd.bio)
-		mpage_bio_submit(WRITE, mpd.bio);
+	if (mpd.bio) {
+		int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+			  WRITE_SYNC : WRITE);
+		mpage_bio_submit(wr, mpd.bio);
+	}
 	return ret;
 }
 EXPORT_SYMBOL(mpage_writepage);

From 53cbf3b157a0428d40989ab1c7df9228a1976fc2 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Mon, 17 Aug 2015 10:31:46 +0800
Subject: [PATCH 03/14] fs: direct-io: don't dirtying pages for
 ITER_BVEC/ITER_KVEC direct read

When direct read IO is submitted from kernel, it is often
unnecessary to dirty pages, for example of loop, dirtying pages
have been considered in the upper filesystem(over loop) side
already, and they don't need to be dirtied again.

So this patch doesn't dirtying pages for ITER_BVEC/ITER_KVEC
direct read, and loop should be the 1st case to use ITER_BVEC/ITER_KVEC
for direct read I/O.

The patch is based on previous Dave's patch.

Reviewed-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/direct-io.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 11256291642e..3ae0e0427191 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -120,6 +120,7 @@ struct dio {
 	int page_errors;		/* errno from get_user_pages() */
 	int is_async;			/* is IO async ? */
 	bool defer_completion;		/* defer AIO completion to workqueue? */
+	bool should_dirty;		/* if pages should be dirtied */
 	int io_error;			/* IO error in completion path */
 	unsigned long refcount;		/* direct_io_worker() and bios */
 	struct bio *bio_list;		/* singly linked via bi_private */
@@ -393,7 +394,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	dio->refcount++;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
-	if (dio->is_async && dio->rw == READ)
+	if (dio->is_async && dio->rw == READ && dio->should_dirty)
 		bio_set_pages_dirty(bio);
 
 	if (sdio->submit_io)
@@ -464,14 +465,15 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	if (bio->bi_error)
 		dio->io_error = -EIO;
 
-	if (dio->is_async && dio->rw == READ) {
+	if (dio->is_async && dio->rw == READ && dio->should_dirty) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 		err = bio->bi_error;
 	} else {
 		bio_for_each_segment_all(bvec, bio, i) {
 			struct page *page = bvec->bv_page;
 
-			if (dio->rw == READ && !PageCompound(page))
+			if (dio->rw == READ && !PageCompound(page) &&
+					dio->should_dirty)
 				set_page_dirty_lock(page);
 			page_cache_release(page);
 		}
@@ -1219,6 +1221,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
 
+	dio->should_dirty = (iter->type == ITER_IOVEC);
 	sdio.iter = iter;
 	sdio.final_block_in_request =
 		(offset + iov_iter_count(iter)) >> blkbits;

From 8ee1b7b9d8761a7b1a0f37a6e596b5315d27d2f2 Mon Sep 17 00:00:00 2001
From: Kosuke Tatsukawa <tatsu@ab.jp.nec.com>
Date: Fri, 9 Oct 2015 00:35:38 +0000
Subject: [PATCH 04/14] blk-mq: fix waitqueue_active without memory barrier in
 block/blk-mq-tag.c

blk_mq_tag_update_depth() seems to be missing a memory barrier which
might cause the waker to not notice the waiter and fail to send a
wake_up as in the following figure.

	blk_mq_tag_update_depth			bt_get
------------------------------------------------------------------------
if (waitqueue_active(&bs->wait))
/* The CPU might reorder the test for
   the waitqueue up here, before
   prior writes complete */
					prepare_to_wait(&bs->wait, &wait,
					  TASK_UNINTERRUPTIBLE);
					tag = __bt_get(hctx, bt, last_tag,
					  tags);
					/* Value set in bt_update_count not
					   visible yet */
bt_update_count(&tags->bitmap_tags, tdepth);
/* blk_mq_tag_wakeup_all(tags, false); */
 bt = &tags->bitmap_tags;
 wake_index = atomic_read(&bt->wake_index);
					...
					io_schedule();
------------------------------------------------------------------------

This patch adds the missing memory barrier.

I found this issue when I was looking through the linux source code
for places calling waitqueue_active() before wake_up*(), but without
preceding memory barriers, after sending a patch to fix a similar
issue in drivers/tty/n_tty.c  (Details about the original issue can be
found here: https://lkml.org/lkml/2015/9/28/849).

Signed-off-by: Kosuke Tatsukawa <tatsu@ab.jp.nec.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index ed96474d75cb..7a6b6e27fc26 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -75,6 +75,10 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 	struct blk_mq_bitmap_tags *bt;
 	int i, wake_index;
 
+	/*
+	 * Make sure all changes prior to this are visible from other CPUs.
+	 */
+	smp_mb();
 	bt = &tags->bitmap_tags;
 	wake_index = atomic_read(&bt->wake_index);
 	for (i = 0; i < BT_WAIT_QUEUES; i++) {

From 3380f4589f6d9725e275525fd0580c8ee2b5cbbc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Oct 2015 18:57:06 +0200
Subject: [PATCH 05/14] blk-mq: remove unused blk_mq_clone_flush_request
 prototype

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/blk-mq.h b/block/blk-mq.h
index f4fea7964910..b44dce165761 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -29,8 +29,6 @@ void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
-void blk_mq_clone_flush_request(struct request *flush_rq,
-		struct request *orig_rq);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
 

From 0809e3ac62319dc7534b64f95ac37e230d740e8a Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 20 Oct 2015 23:13:51 +0800
Subject: [PATCH 06/14] block: fix plug list flushing for nomerge queues

Request queues with merging disabled will not flush the plug list after
BLK_MAX_REQUEST_COUNT requests have been queued, since the code relies
on blk_attempt_plug_merge to compute the request_count.  Fix this by
computing the number of queued requests even for nomerge queues.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 32 +++++++++++++++++++++++++++++---
 block/blk-mq.c   |  9 ++++++---
 block/blk.h      |  1 +
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 2eb722d48773..f0ae087ead06 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1594,6 +1594,30 @@ out:
 	return ret;
 }
 
+unsigned int blk_plug_queued_count(struct request_queue *q)
+{
+	struct blk_plug *plug;
+	struct request *rq;
+	struct list_head *plug_list;
+	unsigned int ret = 0;
+
+	plug = current->plug;
+	if (!plug)
+		goto out;
+
+	if (q->mq_ops)
+		plug_list = &plug->mq_list;
+	else
+		plug_list = &plug->list;
+
+	list_for_each_entry(rq, plug_list, queuelist) {
+		if (rq->q == q)
+			ret++;
+	}
+out:
+	return ret;
+}
+
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
 	req->cmd_type = REQ_TYPE_FS;
@@ -1641,9 +1665,11 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * Check if we can merge with the plugged list before grabbing
 	 * any locks.
 	 */
-	if (!blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
-		return;
+	if (!blk_queue_nomerges(q)) {
+		if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
+			return;
+	} else
+		request_count = blk_plug_queued_count(q);
 
 	spin_lock_irq(q->queue_lock);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d921cd5177f5..9683a561efcd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1268,9 +1268,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_queue_split(q, &bio, q->bio_split);
 
-	if (!is_flush_fua && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
-		return;
+	if (!is_flush_fua && !blk_queue_nomerges(q)) {
+		if (blk_attempt_plug_merge(q, bio, &request_count,
+					   &same_queue_rq))
+			return;
+	} else
+		request_count = blk_plug_queued_count(q);
 
 	rq = blk_mq_map_request(q, bio, &data);
 	if (unlikely(!rq))
diff --git a/block/blk.h b/block/blk.h
index 98614ad37c81..aa27d0292af1 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -86,6 +86,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 			    unsigned int *request_count,
 			    struct request **same_queue_rq);
+unsigned int blk_plug_queued_count(struct request_queue *q);
 
 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);

From bdced438acd83ad83a6c6fc7f50099b820245ddb Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 20 Oct 2015 23:13:52 +0800
Subject: [PATCH 07/14] block: setup bi_phys_segments after splitting

The number of bio->bi_phys_segments is always obtained
during bio splitting, so it is natural to setup it
just after bio splitting, then we can avoid to compute
nr_segment again during merge.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index c4e9c37f3e38..22293fdf6514 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -11,13 +11,16 @@
 
 static struct bio *blk_bio_discard_split(struct request_queue *q,
 					 struct bio *bio,
-					 struct bio_set *bs)
+					 struct bio_set *bs,
+					 unsigned *nsegs)
 {
 	unsigned int max_discard_sectors, granularity;
 	int alignment;
 	sector_t tmp;
 	unsigned split_sectors;
 
+	*nsegs = 1;
+
 	/* Zero-sector (unknown) and one-sector granularities are the same.  */
 	granularity = max(q->limits.discard_granularity >> 9, 1U);
 
@@ -51,8 +54,11 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 
 static struct bio *blk_bio_write_same_split(struct request_queue *q,
 					    struct bio *bio,
-					    struct bio_set *bs)
+					    struct bio_set *bs,
+					    unsigned *nsegs)
 {
+	*nsegs = 1;
+
 	if (!q->limits.max_write_same_sectors)
 		return NULL;
 
@@ -64,7 +70,8 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
 
 static struct bio *blk_bio_segment_split(struct request_queue *q,
 					 struct bio *bio,
-					 struct bio_set *bs)
+					 struct bio_set *bs,
+					 unsigned *segs)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
@@ -106,22 +113,30 @@ new_segment:
 		sectors += bv.bv_len >> 9;
 	}
 
+	*segs = nsegs;
 	return NULL;
 split:
+	*segs = nsegs;
 	return bio_split(bio, sectors, GFP_NOIO, bs);
 }
 
 void blk_queue_split(struct request_queue *q, struct bio **bio,
 		     struct bio_set *bs)
 {
-	struct bio *split;
+	struct bio *split, *res;
+	unsigned nsegs;
 
 	if ((*bio)->bi_rw & REQ_DISCARD)
-		split = blk_bio_discard_split(q, *bio, bs);
+		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
 	else if ((*bio)->bi_rw & REQ_WRITE_SAME)
-		split = blk_bio_write_same_split(q, *bio, bs);
+		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
 	else
-		split = blk_bio_segment_split(q, *bio, q->bio_split);
+		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
+
+	/* physical segments can be figured out during splitting */
+	res = split ? split : *bio;
+	res->bi_phys_segments = nsegs;
+	bio_set_flag(res, BIO_SEG_VALID);
 
 	if (split) {
 		bio_chain(split, *bio);

From 6ac45aeb6bcad38a2783a7d6e5da4c469497eeb0 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 20 Oct 2015 23:13:53 +0800
Subject: [PATCH 08/14] block: avoid to merge splitted bio

The splitted bio has been already too fat to merge, so mark it
as NOMERGE.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 22293fdf6514..de5716d8e525 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -139,6 +139,9 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
 	bio_set_flag(res, BIO_SEG_VALID);
 
 	if (split) {
+		/* there isn't chance to merge the splitted bio */
+		split->bi_rw |= REQ_NOMERGE;
+
 		bio_chain(split, *bio);
 		generic_make_request(*bio);
 		*bio = split;

From e18378a60e27ad7b3e11ecc4e2c92159585dee68 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 20 Oct 2015 23:13:54 +0800
Subject: [PATCH 09/14] blk-mq: check bio_mergeable() early before merging

It isn't necessary to try to merge the bio which is marked
as NOMERGE.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9683a561efcd..d38371160019 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1140,7 +1140,7 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
 					 struct blk_mq_ctx *ctx,
 					 struct request *rq, struct bio *bio)
 {
-	if (!hctx_allow_merges(hctx)) {
+	if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
 		blk_mq_bio_to_request(rq, bio);
 		spin_lock(&ctx->lock);
 insert_rq:

From 7460d389c01741f0dfff733af93d3b3abd9b974e Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 20 Oct 2015 23:13:55 +0800
Subject: [PATCH 10/14] block: check bio_mergeable() early before merging

After bio splitting is introduced, one bio can be splitted
and it is marked as NOMERGE because it is too fat to be merged,
so check bio_mergeable() earlier to avoid to try to merge it
unnecessarily.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/elevator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/elevator.c b/block/elevator.c
index 84d63943f2de..c3555c9c672f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -420,7 +420,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 	 * 	noxmerges: Only simple one-hit cache try
 	 * 	merges:	   All merge tries attempted
 	 */
-	if (blk_queue_nomerges(q))
+	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
 		return ELEVATOR_NO_MERGE;
 
 	/*

From 676d06077f964f06af52c19e59f0409a8880612f Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 20 Oct 2015 23:13:56 +0800
Subject: [PATCH 11/14] blk-mq: fix for trace_block_plug()

The trace point is for tracing plug event of each request
queue instead of each task, so we should check the request
count in the plug list from current queue instead of
current task.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d38371160019..24c528f182ea 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1380,7 +1380,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	plug = current->plug;
 	if (plug) {
 		blk_mq_bio_to_request(rq, bio);
-		if (list_empty(&plug->mq_list))
+		if (!request_count)
 			trace_block_plug(q);
 		else if (request_count >= BLK_MAX_REQUEST_COUNT) {
 			blk_flush_plug_list(plug, false);

From cfd0c552a8272d691691f40073654d775836e23a Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 20 Oct 2015 23:13:57 +0800
Subject: [PATCH 12/14] blk-mq: mark ctx as pending at batch in flush plug path

Most of times, flush plug should be the hottest I/O path,
so mark ctx as pending after all requests in the list are
inserted.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 24c528f182ea..159e69bd2c3c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -990,18 +990,25 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
-static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
-				    struct request *rq, bool at_head)
+static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
+					    struct blk_mq_ctx *ctx,
+					    struct request *rq,
+					    bool at_head)
 {
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-
 	trace_block_rq_insert(hctx->queue, rq);
 
 	if (at_head)
 		list_add(&rq->queuelist, &ctx->rq_list);
 	else
 		list_add_tail(&rq->queuelist, &ctx->rq_list);
+}
 
+static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
+				    struct request *rq, bool at_head)
+{
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+
+	__blk_mq_insert_req_list(hctx, ctx, rq, at_head);
 	blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
@@ -1057,8 +1064,9 @@ static void blk_mq_insert_requests(struct request_queue *q,
 		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 		rq->mq_ctx = ctx;
-		__blk_mq_insert_request(hctx, rq, false);
+		__blk_mq_insert_req_list(hctx, ctx, rq, false);
 	}
+	blk_mq_hctx_mark_pending(hctx, ctx);
 	spin_unlock(&ctx->lock);
 
 	blk_mq_run_hw_queue(hctx, from_schedule);

From cdea01b2bf98affb7e9c44530108a4a28535eee8 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.ne>
Date: Fri, 30 Oct 2015 05:25:59 +0900
Subject: [PATCH 13/14] blktrace: re-write setting q->blk_trace

This is really about simplifying the double xchg patterns into
a single cmpxchg, with the same logic. Other than the immediate
cleanup, there are some subtleties this change deals with:

(i) While the load of the old bt is fully ordered wrt everything,
ie:

        old_bt = xchg(&q->blk_trace, bt);             [barrier]
        if (old_bt)
	     (void) xchg(&q->blk_trace, old_bt);    [barrier]

blk_trace could still be changed between the xchg and the old_bt
load. Note that this description is merely theoretical and afaict
very small, but doing everything in a single context with cmpxchg
closes this potential race.

(ii) Ordering guarantees are obviously kept with cmpxchg.

(iii) Gets rid of the hacky-by-nature (void)xchg pattern.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
eviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 90e72a0c3047..e3a26188b95e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -437,7 +437,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		       struct block_device *bdev,
 		       struct blk_user_trace_setup *buts)
 {
-	struct blk_trace *old_bt, *bt = NULL;
+	struct blk_trace *bt = NULL;
 	struct dentry *dir = NULL;
 	int ret;
 
@@ -519,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->trace_state = Blktrace_setup;
 
 	ret = -EBUSY;
-	old_bt = xchg(&q->blk_trace, bt);
-	if (old_bt) {
-		(void) xchg(&q->blk_trace, old_bt);
+	if (cmpxchg(&q->blk_trace, NULL, bt))
 		goto err;
-	}
 
 	if (atomic_inc_return(&blk_probes_ref) == 1)
 		blk_register_tracepoints();
@@ -1481,7 +1478,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
 static int blk_trace_setup_queue(struct request_queue *q,
 				 struct block_device *bdev)
 {
-	struct blk_trace *old_bt, *bt = NULL;
+	struct blk_trace *bt = NULL;
 	int ret = -ENOMEM;
 
 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
@@ -1497,12 +1494,9 @@ static int blk_trace_setup_queue(struct request_queue *q,
 
 	blk_trace_setup_lba(bt, bdev);
 
-	old_bt = xchg(&q->blk_trace, bt);
-	if (old_bt != NULL) {
-		(void)xchg(&q->blk_trace, old_bt);
-		ret = -EBUSY;
+	ret = -EBUSY;
+	if (cmpxchg(&q->blk_trace, NULL, bt))
 		goto free_bt;
-	}
 
 	if (atomic_inc_return(&blk_probes_ref) == 1)
 		blk_register_tracepoints();

From 2404e607a9ee36db361bebe32787dafa1f7d6c00 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 3 Nov 2015 10:40:06 -0500
Subject: [PATCH 14/14] blk-mq: avoid excessive boot delays with large lun
 counts

Hi,

Zhangqing Luo reported long boot times on a system with thousands of
LUNs when scsi-mq was enabled.  He narrowed the problem down to
blk_mq_add_queue_tag_set, where every queue is frozen in order to set
the BLK_MQ_F_TAG_SHARED flag.  Each added device will freeze all queues
added before it in sequence, which involves waiting for an RCU grace
period for each one.  We don't need to do this.  After the second queue
is added, only new queues need to be initialized with the shared tag.
We can do that by percolating the flag up to the blk_mq_tag_set, and
updating the newly added queue's hctxs if the flag is set.

This problem was introduced by commit 0d2602ca30e41 (blk-mq: improve
support for shared tags maps).

Reported-and-tested-by: Jason Luo <zhangqing.luo@oracle.com>
Reviewed-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 47 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 159e69bd2c3c..22db728dbe24 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1695,7 +1695,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	INIT_LIST_HEAD(&hctx->dispatch);
 	hctx->queue = q;
 	hctx->queue_num = hctx_idx;
-	hctx->flags = set->flags;
+	hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
 
 	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
 					blk_mq_hctx_notify, hctx);
@@ -1882,27 +1882,26 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 	}
 }
 
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
+static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 {
 	struct blk_mq_hw_ctx *hctx;
-	struct request_queue *q;
-	bool shared;
 	int i;
 
-	if (set->tag_list.next == set->tag_list.prev)
-		shared = false;
-	else
-		shared = true;
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (shared)
+			hctx->flags |= BLK_MQ_F_TAG_SHARED;
+		else
+			hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+	}
+}
+
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
+{
+	struct request_queue *q;
 
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_freeze_queue(q);
-
-		queue_for_each_hw_ctx(q, hctx, i) {
-			if (shared)
-				hctx->flags |= BLK_MQ_F_TAG_SHARED;
-			else
-				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
-		}
+		queue_set_hctx_shared(q, shared);
 		blk_mq_unfreeze_queue(q);
 	}
 }
@@ -1913,7 +1912,12 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 
 	mutex_lock(&set->tag_list_lock);
 	list_del_init(&q->tag_set_list);
-	blk_mq_update_tag_set_depth(set);
+	if (list_is_singular(&set->tag_list)) {
+		/* just transitioned to unshared */
+		set->flags &= ~BLK_MQ_F_TAG_SHARED;
+		/* update existing queue */
+		blk_mq_update_tag_set_depth(set, false);
+	}
 	mutex_unlock(&set->tag_list_lock);
 }
 
@@ -1923,8 +1927,17 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 	q->tag_set = set;
 
 	mutex_lock(&set->tag_list_lock);
+
+	/* Check to see if we're transitioning to shared (from 1 to 2 queues). */
+	if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
+		set->flags |= BLK_MQ_F_TAG_SHARED;
+		/* update existing queue */
+		blk_mq_update_tag_set_depth(set, true);
+	}
+	if (set->flags & BLK_MQ_F_TAG_SHARED)
+		queue_set_hctx_shared(q, true);
 	list_add_tail(&q->tag_set_list, &set->tag_list);
-	blk_mq_update_tag_set_depth(set);
+
 	mutex_unlock(&set->tag_list_lock);
 }