From 414b4ff5eecff0097d09c4a7da12e435fd503692 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Jan 2011 12:43:49 +0100
Subject: [PATCH 01/65] block: add REQ_FLUSH_SEQ

rq == &q->flush_rq was used to determine whether a rq is part of a
flush sequence, which worked because all requests in a flush sequence
were sequenced using the single dedicated request.  This is about to
change, so introduce REQ_FLUSH_SEQ flag to distinguish flush sequence
requests.

This patch doesn't cause any behavior change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c          | 4 ++--
 block/blk-flush.c         | 1 +
 block/blk.h               | 2 +-
 include/linux/blk_types.h | 2 ++
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 4ce953f1b390..fc7d8ad76f44 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -136,7 +136,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 {
 	struct request_queue *q = rq->q;
 
-	if (&q->flush_rq != rq) {
+	if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) {
 		if (error)
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -1789,7 +1789,7 @@ static void blk_account_io_done(struct request *req)
 	 * normal IO on queueing nor completion.  Accounting the
 	 * containing request is enough.
 	 */
-	if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
+	if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 54b123d6563e..8592869bcbe7 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,6 +130,7 @@ static struct request *queue_next_fseq(struct request_queue *q)
 		BUG();
 	}
 
+	rq->cmd_flags |= REQ_FLUSH_SEQ;
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 	return rq;
 }
diff --git a/block/blk.h b/block/blk.h
index 2db8f32838e7..9d2ee8f4d9af 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -61,7 +61,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
 			if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
-			    rq == &q->flush_rq)
+			    (rq->cmd_flags & REQ_FLUSH_SEQ))
 				return rq;
 			rq = blk_do_flush(q, rq);
 			if (rq)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 46ad5197537a..dddedfc0af81 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -148,6 +148,7 @@ enum rq_flag_bits {
 	__REQ_ALLOCED,		/* request came from our alloc pool */
 	__REQ_COPY_USER,	/* contains copies of user pages */
 	__REQ_FLUSH,		/* request for cache flush */
+	__REQ_FLUSH_SEQ,	/* request for flush sequence */
 	__REQ_IO_STAT,		/* account I/O stat */
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
 	__REQ_SECURE,		/* secure discard (used with __REQ_DISCARD) */
@@ -188,6 +189,7 @@ enum rq_flag_bits {
 #define REQ_ALLOCED		(1 << __REQ_ALLOCED)
 #define REQ_COPY_USER		(1 << __REQ_COPY_USER)
 #define REQ_FLUSH		(1 << __REQ_FLUSH)
+#define REQ_FLUSH_SEQ		(1 << __REQ_FLUSH_SEQ)
 #define REQ_IO_STAT		(1 << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
 #define REQ_SECURE		(1 << __REQ_SECURE)

From 143a87f4c9c629067afea5b6703d66ea88c82f8e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Jan 2011 12:43:52 +0100
Subject: [PATCH 02/65] block: improve flush bio completion

bio's for flush are completed twice - once during the data phase and
one more time after the whole sequence is complete.  The first
completion shouldn't notify completion to the issuer.

This was achieved by skipping all bio completion steps in
req_bio_endio() for the first completion; however, this has two
drawbacks.

* Error is not recorded in bio and must be tracked somewhere else.

* Partial completion is not supported.

Both don't cause problems for the current users; however, they make
further improvements difficult.  Change req_bio_endio() such that it
only skips the actual notification part for the first completion.  bio
completion is implemented with partial completions on mind anyway so
this is as simple as moving the REQ_FLUSH_SEQ conditional such that
only calling of bio_endio() is skipped.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 52 +++++++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 29 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index fc7d8ad76f44..617bb9e40927 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -136,37 +136,31 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 {
 	struct request_queue *q = rq->q;
 
-	if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) {
-		if (error)
-			clear_bit(BIO_UPTODATE, &bio->bi_flags);
-		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-			error = -EIO;
+	if (error)
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = -EIO;
 
-		if (unlikely(nbytes > bio->bi_size)) {
-			printk(KERN_ERR "%s: want %u bytes done, %u left\n",
-			       __func__, nbytes, bio->bi_size);
-			nbytes = bio->bi_size;
-		}
-
-		if (unlikely(rq->cmd_flags & REQ_QUIET))
-			set_bit(BIO_QUIET, &bio->bi_flags);
-
-		bio->bi_size -= nbytes;
-		bio->bi_sector += (nbytes >> 9);
-
-		if (bio_integrity(bio))
-			bio_integrity_advance(bio, nbytes);
-
-		if (bio->bi_size == 0)
-			bio_endio(bio, error);
-	} else {
-		/*
-		 * Okay, this is the sequenced flush request in
-		 * progress, just record the error;
-		 */
-		if (error && !q->flush_err)
-			q->flush_err = error;
+	if (unlikely(nbytes > bio->bi_size)) {
+		printk(KERN_ERR "%s: want %u bytes done, %u left\n",
+		       __func__, nbytes, bio->bi_size);
+		nbytes = bio->bi_size;
 	}
+
+	if (unlikely(rq->cmd_flags & REQ_QUIET))
+		set_bit(BIO_QUIET, &bio->bi_flags);
+
+	bio->bi_size -= nbytes;
+	bio->bi_sector += (nbytes >> 9);
+
+	if (bio_integrity(bio))
+		bio_integrity_advance(bio, nbytes);
+
+	/* don't actually finish bio if it's part of flush sequence */
+	if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+		bio_endio(bio, error);
+	else if (error && !q->flush_err)
+		q->flush_err = error;
 }
 
 void blk_dump_rq_flags(struct request *rq, char *msg)

From ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Jan 2011 12:43:54 +0100
Subject: [PATCH 03/65] block: reimplement FLUSH/FUA to support merge

The current FLUSH/FUA support has evolved from the implementation
which had to perform queue draining.  As such, sequencing is done
queue-wide one flush request after another.  However, with the
draining requirement gone, there's no reason to keep the queue-wide
sequential approach.

This patch reimplements FLUSH/FUA support such that each FLUSH/FUA
request is sequenced individually.  The actual FLUSH execution is
double buffered and whenever a request wants to execute one for either
PRE or POSTFLUSH, it queues on the pending queue.  Once certain
conditions are met, a flush request is issued and on its completion
all pending requests proceed to the next sequence.

This allows arbitrary merging of different type of flushes.  How they
are merged can be primarily controlled and tuned by adjusting the
above said 'conditions' used to determine when to issue the next
flush.

This is inspired by Darrick's patches to merge multiple zero-data
flushes which helps workloads with highly concurrent fsync requests.

* As flush requests are never put on the IO scheduler, request fields
  used for flush share space with rq->rb_node.  rq->completion_data is
  moved out of the union.  This increases the request size by one
  pointer.

  As rq->elevator_private* are used only by the iosched too, it is
  possible to reduce the request size further.  However, to do that,
  we need to modify request allocation path such that iosched data is
  not allocated for flush requests.

* FLUSH/FUA processing happens on insertion now instead of dispatch.

- Comments updated as per Vivek and Mike.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "Darrick J. Wong" <djwong@us.ibm.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c         |  10 +-
 block/blk-flush.c        | 452 +++++++++++++++++++++++++++------------
 block/blk.h              |  12 +-
 block/elevator.c         |   7 +
 include/linux/blkdev.h   |  18 +-
 include/linux/elevator.h |   1 +
 6 files changed, 338 insertions(+), 162 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 617bb9e40927..05746093b45e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -134,8 +134,6 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	struct request_queue *q = rq->q;
-
 	if (error)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -159,8 +157,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	/* don't actually finish bio if it's part of flush sequence */
 	if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
 		bio_endio(bio, error);
-	else if (error && !q->flush_err)
-		q->flush_err = error;
 }
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -519,7 +515,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
-	INIT_LIST_HEAD(&q->pending_flushes);
+	INIT_LIST_HEAD(&q->flush_queue[0]);
+	INIT_LIST_HEAD(&q->flush_queue[1]);
+	INIT_LIST_HEAD(&q->flush_data_in_flight);
 	INIT_WORK(&q->unplug_work, blk_unplug_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1198,7 +1196,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	spin_lock_irq(q->queue_lock);
 
 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
-		where = ELEVATOR_INSERT_FRONT;
+		where = ELEVATOR_INSERT_FLUSH;
 		goto get_rq;
 	}
 
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8592869bcbe7..a867e3f524f3 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,6 +1,69 @@
 /*
  * Functions to sequence FLUSH and FUA writes.
+ *
+ * Copyright (C) 2011		Max Planck Institute for Gravitational Physics
+ * Copyright (C) 2011		Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three
+ * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
+ * properties and hardware capability.
+ *
+ * If a request doesn't have data, only REQ_FLUSH makes sense, which
+ * indicates a simple flush request.  If there is data, REQ_FLUSH indicates
+ * that the device cache should be flushed before the data is executed, and
+ * REQ_FUA means that the data must be on non-volatile media on request
+ * completion.
+ *
+ * If the device doesn't have writeback cache, FLUSH and FUA don't make any
+ * difference.  The requests are either completed immediately if there's no
+ * data or executed as normal requests otherwise.
+ *
+ * If the device has writeback cache and supports FUA, REQ_FLUSH is
+ * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
+ *
+ * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
+ * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
+ *
+ * The actual execution of flush is double buffered.  Whenever a request
+ * needs to execute PRE or POSTFLUSH, it queues at
+ * q->flush_queue[q->flush_pending_idx].  Once certain criteria are met, a
+ * flush is issued and the pending_idx is toggled.  When the flush
+ * completes, all the requests which were pending are proceeded to the next
+ * step.  This allows arbitrary merging of different types of FLUSH/FUA
+ * requests.
+ *
+ * Currently, the following conditions are used to determine when to issue
+ * flush.
+ *
+ * C1. At any given time, only one flush shall be in progress.  This makes
+ *     double buffering sufficient.
+ *
+ * C2. Flush is deferred if any request is executing DATA of its sequence.
+ *     This avoids issuing separate POSTFLUSHes for requests which shared
+ *     PREFLUSH.
+ *
+ * C3. The second condition is ignored if there is a request which has
+ *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
+ *     starvation in the unlikely case where there are continuous stream of
+ *     FUA (without FLUSH) requests.
+ *
+ * For devices which support FUA, it isn't clear whether C2 (and thus C3)
+ * is beneficial.
+ *
+ * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
+ * Once while executing DATA and again after the whole sequence is
+ * complete.  The first completion updates the contained bio but doesn't
+ * finish it so that the bio submitter is notified only after the whole
+ * sequence is complete.  This is implemented by testing REQ_FLUSH_SEQ in
+ * req_bio_endio().
+ *
+ * The above peculiarity requires that each FLUSH/FUA request has only one
+ * bio attached to it, which is guaranteed as they aren't allowed to be
+ * merged in the usual way.
  */
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/bio.h>
@@ -11,185 +74,290 @@
 
 /* FLUSH/FUA sequences */
 enum {
-	QUEUE_FSEQ_STARTED	= (1 << 0), /* flushing in progress */
-	QUEUE_FSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
-	QUEUE_FSEQ_DATA		= (1 << 2), /* data write in progress */
-	QUEUE_FSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
-	QUEUE_FSEQ_DONE		= (1 << 4),
-};
+	REQ_FSEQ_PREFLUSH	= (1 << 0), /* pre-flushing in progress */
+	REQ_FSEQ_DATA		= (1 << 1), /* data write in progress */
+	REQ_FSEQ_POSTFLUSH	= (1 << 2), /* post-flushing in progress */
+	REQ_FSEQ_DONE		= (1 << 3),
 
-static struct request *queue_next_fseq(struct request_queue *q);
-
-unsigned blk_flush_cur_seq(struct request_queue *q)
-{
-	if (!q->flush_seq)
-		return 0;
-	return 1 << ffz(q->flush_seq);
-}
-
-static struct request *blk_flush_complete_seq(struct request_queue *q,
-					      unsigned seq, int error)
-{
-	struct request *next_rq = NULL;
-
-	if (error && !q->flush_err)
-		q->flush_err = error;
-
-	BUG_ON(q->flush_seq & seq);
-	q->flush_seq |= seq;
-
-	if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
-		/* not complete yet, queue the next flush sequence */
-		next_rq = queue_next_fseq(q);
-	} else {
-		/* complete this flush request */
-		__blk_end_request_all(q->orig_flush_rq, q->flush_err);
-		q->orig_flush_rq = NULL;
-		q->flush_seq = 0;
-
-		/* dispatch the next flush if there's one */
-		if (!list_empty(&q->pending_flushes)) {
-			next_rq = list_entry_rq(q->pending_flushes.next);
-			list_move(&next_rq->queuelist, &q->queue_head);
-		}
-	}
-	return next_rq;
-}
-
-static void blk_flush_complete_seq_end_io(struct request_queue *q,
-					  unsigned seq, int error)
-{
-	bool was_empty = elv_queue_empty(q);
-	struct request *next_rq;
-
-	next_rq = blk_flush_complete_seq(q, seq, error);
+	REQ_FSEQ_ACTIONS	= REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
+				  REQ_FSEQ_POSTFLUSH,
 
 	/*
-	 * Moving a request silently to empty queue_head may stall the
-	 * queue.  Kick the queue in those cases.
+	 * If flush has been pending longer than the following timeout,
+	 * it's issued even if flush_data requests are still in flight.
 	 */
-	if (was_empty && next_rq)
-		__blk_run_queue(q);
+	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
+};
+
+static bool blk_kick_flush(struct request_queue *q);
+
+static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
+{
+	unsigned int policy = 0;
+
+	if (fflags & REQ_FLUSH) {
+		if (rq->cmd_flags & REQ_FLUSH)
+			policy |= REQ_FSEQ_PREFLUSH;
+		if (blk_rq_sectors(rq))
+			policy |= REQ_FSEQ_DATA;
+		if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
+			policy |= REQ_FSEQ_POSTFLUSH;
+	}
+	return policy;
 }
 
-static void pre_flush_end_io(struct request *rq, int error)
+static unsigned int blk_flush_cur_seq(struct request *rq)
 {
-	elv_completed_request(rq->q, rq);
-	blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error);
+	return 1 << ffz(rq->flush.seq);
 }
 
-static void flush_data_end_io(struct request *rq, int error)
+static void blk_flush_restore_request(struct request *rq)
 {
-	elv_completed_request(rq->q, rq);
-	blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
+	/*
+	 * After flush data completion, @rq->bio is %NULL but we need to
+	 * complete the bio again.  @rq->biotail is guaranteed to equal the
+	 * original @rq->bio.  Restore it.
+	 */
+	rq->bio = rq->biotail;
+
+	/* make @rq a normal request */
+	rq->cmd_flags &= ~REQ_FLUSH_SEQ;
+	rq->end_io = NULL;
 }
 
-static void post_flush_end_io(struct request *rq, int error)
+/**
+ * blk_flush_complete_seq - complete flush sequence
+ * @rq: FLUSH/FUA request being sequenced
+ * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
+ * @error: whether an error occurred
+ *
+ * @rq just completed @seq part of its flush sequence, record the
+ * completion and trigger the next step.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ *
+ * RETURNS:
+ * %true if requests were added to the dispatch queue, %false otherwise.
+ */
+static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
+				   int error)
 {
-	elv_completed_request(rq->q, rq);
-	blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
-}
+	struct request_queue *q = rq->q;
+	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+	bool queued = false;
 
-static void init_flush_request(struct request *rq, struct gendisk *disk)
-{
-	rq->cmd_type = REQ_TYPE_FS;
-	rq->cmd_flags = WRITE_FLUSH;
-	rq->rq_disk = disk;
-}
+	BUG_ON(rq->flush.seq & seq);
+	rq->flush.seq |= seq;
 
-static struct request *queue_next_fseq(struct request_queue *q)
-{
-	struct request *orig_rq = q->orig_flush_rq;
-	struct request *rq = &q->flush_rq;
+	if (likely(!error))
+		seq = blk_flush_cur_seq(rq);
+	else
+		seq = REQ_FSEQ_DONE;
 
-	blk_rq_init(q, rq);
-
-	switch (blk_flush_cur_seq(q)) {
-	case QUEUE_FSEQ_PREFLUSH:
-		init_flush_request(rq, orig_rq->rq_disk);
-		rq->end_io = pre_flush_end_io;
+	switch (seq) {
+	case REQ_FSEQ_PREFLUSH:
+	case REQ_FSEQ_POSTFLUSH:
+		/* queue for flush */
+		if (list_empty(pending))
+			q->flush_pending_since = jiffies;
+		list_move_tail(&rq->flush.list, pending);
 		break;
-	case QUEUE_FSEQ_DATA:
-		init_request_from_bio(rq, orig_rq->bio);
+
+	case REQ_FSEQ_DATA:
+		list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
+		list_add(&rq->queuelist, &q->queue_head);
+		queued = true;
+		break;
+
+	case REQ_FSEQ_DONE:
 		/*
-		 * orig_rq->rq_disk may be different from
-		 * bio->bi_bdev->bd_disk if orig_rq got here through
-		 * remapping drivers.  Make sure rq->rq_disk points
-		 * to the same one as orig_rq.
+		 * @rq was previously adjusted by blk_flush_issue() for
+		 * flush sequencing and may already have gone through the
+		 * flush data request completion path.  Restore @rq for
+		 * normal completion and end it.
 		 */
-		rq->rq_disk = orig_rq->rq_disk;
-		rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
-		rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
-		rq->end_io = flush_data_end_io;
-		break;
-	case QUEUE_FSEQ_POSTFLUSH:
-		init_flush_request(rq, orig_rq->rq_disk);
-		rq->end_io = post_flush_end_io;
+		BUG_ON(!list_empty(&rq->queuelist));
+		list_del_init(&rq->flush.list);
+		blk_flush_restore_request(rq);
+		__blk_end_request_all(rq, error);
 		break;
+
 	default:
 		BUG();
 	}
 
-	rq->cmd_flags |= REQ_FLUSH_SEQ;
-	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
-	return rq;
+	return blk_kick_flush(q) | queued;
 }
 
-struct request *blk_do_flush(struct request_queue *q, struct request *rq)
+static void flush_end_io(struct request *flush_rq, int error)
 {
-	unsigned int fflags = q->flush_flags; /* may change, cache it */
-	bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
-	bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
-	bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
-	unsigned skip = 0;
+	struct request_queue *q = flush_rq->q;
+	struct list_head *running = &q->flush_queue[q->flush_running_idx];
+	bool was_empty = elv_queue_empty(q);
+	bool queued = false;
+	struct request *rq, *n;
 
-	/*
-	 * Special case.  If there's data but flush is not necessary,
-	 * the request can be issued directly.
-	 *
-	 * Flush w/o data should be able to be issued directly too but
-	 * currently some drivers assume that rq->bio contains
-	 * non-zero data if it isn't NULL and empty FLUSH requests
-	 * getting here usually have bio's without data.
-	 */
-	if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
-		rq->cmd_flags &= ~REQ_FLUSH;
-		if (!has_fua)
-			rq->cmd_flags &= ~REQ_FUA;
-		return rq;
+	BUG_ON(q->flush_pending_idx == q->flush_running_idx);
+
+	/* account completion of the flush request */
+	q->flush_running_idx ^= 1;
+	elv_completed_request(q, flush_rq);
+
+	/* and push the waiting requests to the next stage */
+	list_for_each_entry_safe(rq, n, running, flush.list) {
+		unsigned int seq = blk_flush_cur_seq(rq);
+
+		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
+		queued |= blk_flush_complete_seq(rq, seq, error);
 	}
 
-	/*
-	 * Sequenced flushes can't be processed in parallel.  If
-	 * another one is already in progress, queue for later
-	 * processing.
-	 */
-	if (q->flush_seq) {
-		list_move_tail(&rq->queuelist, &q->pending_flushes);
-		return NULL;
-	}
+	/* after populating an empty queue, kick it to avoid stall */
+	if (queued && was_empty)
+		__blk_run_queue(q);
+}
+
+/**
+ * blk_kick_flush - consider issuing flush request
+ * @q: request_queue being kicked
+ *
+ * Flush related states of @q have changed, consider issuing flush request.
+ * Please read the comment at the top of this file for more info.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ *
+ * RETURNS:
+ * %true if flush was issued, %false otherwise.
+ */
+static bool blk_kick_flush(struct request_queue *q)
+{
+	struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+	struct request *first_rq =
+		list_first_entry(pending, struct request, flush.list);
+
+	/* C1 described at the top of this file */
+	if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
+		return false;
+
+	/* C2 and C3 */
+	if (!list_empty(&q->flush_data_in_flight) &&
+	    time_before(jiffies,
+			q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
+		return false;
 
 	/*
-	 * Start a new flush sequence
+	 * Issue flush and toggle pending_idx.  This makes pending_idx
+	 * different from running_idx, which means flush is in flight.
 	 */
-	q->flush_err = 0;
-	q->flush_seq |= QUEUE_FSEQ_STARTED;
+	blk_rq_init(q, &q->flush_rq);
+	q->flush_rq.cmd_type = REQ_TYPE_FS;
+	q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+	q->flush_rq.rq_disk = first_rq->rq_disk;
+	q->flush_rq.end_io = flush_end_io;
 
-	/* adjust FLUSH/FUA of the original request and stash it away */
+	q->flush_pending_idx ^= 1;
+	elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_FRONT);
+	return true;
+}
+
+static void flush_data_end_io(struct request *rq, int error)
+{
+	struct request_queue *q = rq->q;
+	bool was_empty = elv_queue_empty(q);
+
+	/* after populating an empty queue, kick it to avoid stall */
+	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty)
+		__blk_run_queue(q);
+}
+
+/**
+ * blk_insert_flush - insert a new FLUSH/FUA request
+ * @rq: request to insert
+ *
+ * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions.
+ * @rq is being submitted.  Analyze what needs to be done and put it on the
+ * right queue.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ */
+void blk_insert_flush(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	unsigned int fflags = q->flush_flags;	/* may change, cache */
+	unsigned int policy = blk_flush_policy(fflags, rq);
+
+	BUG_ON(rq->end_io);
+	BUG_ON(!rq->bio || rq->bio != rq->biotail);
+
+	/*
+	 * @policy now records what operations need to be done.  Adjust
+	 * REQ_FLUSH and FUA for the driver.
+	 */
 	rq->cmd_flags &= ~REQ_FLUSH;
-	if (!has_fua)
+	if (!(fflags & REQ_FUA))
 		rq->cmd_flags &= ~REQ_FUA;
-	blk_dequeue_request(rq);
-	q->orig_flush_rq = rq;
 
-	/* skip unneded sequences and return the first one */
-	if (!do_preflush)
-		skip |= QUEUE_FSEQ_PREFLUSH;
-	if (!blk_rq_sectors(rq))
-		skip |= QUEUE_FSEQ_DATA;
-	if (!do_postflush)
-		skip |= QUEUE_FSEQ_POSTFLUSH;
-	return blk_flush_complete_seq(q, skip, 0);
+	/*
+	 * If there's data but flush is not necessary, the request can be
+	 * processed directly without going through flush machinery.  Queue
+	 * for normal execution.
+	 */
+	if ((policy & REQ_FSEQ_DATA) &&
+	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+		list_add(&rq->queuelist, &q->queue_head);
+		return;
+	}
+
+	/*
+	 * @rq should go through flush machinery.  Mark it part of flush
+	 * sequence and submit for further processing.
+	 */
+	memset(&rq->flush, 0, sizeof(rq->flush));
+	INIT_LIST_HEAD(&rq->flush.list);
+	rq->cmd_flags |= REQ_FLUSH_SEQ;
+	rq->end_io = flush_data_end_io;
+
+	blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
+}
+
+/**
+ * blk_abort_flushes - @q is being aborted, abort flush requests
+ * @q: request_queue being aborted
+ *
+ * To be called from elv_abort_queue().  @q is being aborted.  Prepare all
+ * FLUSH/FUA requests for abortion.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ */
+void blk_abort_flushes(struct request_queue *q)
+{
+	struct request *rq, *n;
+	int i;
+
+	/*
+	 * Requests in flight for data are already owned by the dispatch
+	 * queue or the device driver.  Just restore for normal completion.
+	 */
+	list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
+		list_del_init(&rq->flush.list);
+		blk_flush_restore_request(rq);
+	}
+
+	/*
+	 * We need to give away requests on flush queues.  Restore for
+	 * normal completion and put them on the dispatch queue.
+	 */
+	for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
+		list_for_each_entry_safe(rq, n, &q->flush_queue[i],
+					 flush.list) {
+			list_del_init(&rq->flush.list);
+			blk_flush_restore_request(rq);
+			list_add_tail(&rq->queuelist, &q->queue_head);
+		}
+	}
 }
 
 static void bio_end_flush(struct bio *bio, int err)
diff --git a/block/blk.h b/block/blk.h
index 9d2ee8f4d9af..284b500852bd 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,21 +51,17 @@ static inline void blk_clear_rq_complete(struct request *rq)
  */
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
-struct request *blk_do_flush(struct request_queue *q, struct request *rq);
+void blk_insert_flush(struct request *rq);
+void blk_abort_flushes(struct request_queue *q);
 
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
 
 	while (1) {
-		while (!list_empty(&q->queue_head)) {
+		if (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
-			if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
-			    (rq->cmd_flags & REQ_FLUSH_SEQ))
-				return rq;
-			rq = blk_do_flush(q, rq);
-			if (rq)
-				return rq;
+			return rq;
 		}
 
 		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
diff --git a/block/elevator.c b/block/elevator.c
index 2569512830d3..270e0972eb9f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -673,6 +673,11 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		q->elevator->ops->elevator_add_req_fn(q, rq);
 		break;
 
+	case ELEVATOR_INSERT_FLUSH:
+		rq->cmd_flags |= REQ_SOFTBARRIER;
+		blk_insert_flush(rq);
+		break;
+
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __func__, where);
@@ -785,6 +790,8 @@ void elv_abort_queue(struct request_queue *q)
 {
 	struct request *rq;
 
+	blk_abort_flushes(q);
+
 	while (!list_empty(&q->queue_head)) {
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 36ab42c9bb99..6d7e9afd08c3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -99,13 +99,18 @@ struct request {
 	/*
 	 * The rb_node is only used inside the io scheduler, requests
 	 * are pruned when moved to the dispatch queue. So let the
-	 * completion_data share space with the rb_node.
+	 * flush fields share space with the rb_node.
 	 */
 	union {
 		struct rb_node rb_node;	/* sort/lookup */
-		void *completion_data;
+		struct {
+			unsigned int			seq;
+			struct list_head		list;
+		} flush;
 	};
 
+	void *completion_data;
+
 	/*
 	 * Three pointers are available for the IO schedulers, if they need
 	 * more they have to dynamically allocate it.
@@ -362,11 +367,12 @@ struct request_queue
 	 * for flush operations
 	 */
 	unsigned int		flush_flags;
-	unsigned int		flush_seq;
-	int			flush_err;
+	unsigned int		flush_pending_idx:1;
+	unsigned int		flush_running_idx:1;
+	unsigned long		flush_pending_since;
+	struct list_head	flush_queue[2];
+	struct list_head	flush_data_in_flight;
 	struct request		flush_rq;
-	struct request		*orig_flush_rq;
-	struct list_head	pending_flushes;
 
 	struct mutex		sysfs_lock;
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4fd978e7eb83..86120c916fcc 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -167,6 +167,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_BACK	2
 #define ELEVATOR_INSERT_SORT	3
 #define ELEVATOR_INSERT_REQUEUE	4
+#define ELEVATOR_INSERT_FLUSH	5
 
 /*
  * return values from elevator_may_queue_fn

From 9d5a4e946ce5352f19400b6370f4cd8e72806278 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 11 Feb 2011 11:05:46 +0100
Subject: [PATCH 04/65] block: skip elevator data initialization for flush
 requests

Skip elevator initialization for flush requests by passing priv=0 to
blk_alloc_request() in get_request().  As such elv_set_request() is
never called for flush requests.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 05746093b45e..ab4a7696956d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -737,6 +737,25 @@ static void freed_request(struct request_queue *q, int sync, int priv)
 		__freed_request(q, sync ^ 1);
 }
 
+/*
+ * Determine if elevator data should be initialized when allocating the
+ * request associated with @bio.
+ */
+static bool blk_rq_should_init_elevator(struct bio *bio)
+{
+	if (!bio)
+		return true;
+
+	/*
+	 * Flush requests do not use the elevator so skip initialization.
+	 * This allows a request to share the flush and elevator data.
+	 */
+	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
+		return false;
+
+	return true;
+}
+
 /*
  * Get a free request, queue_lock must be held.
  * Returns NULL on failure, with queue_lock held.
@@ -749,7 +768,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	struct request_list *rl = &q->rq;
 	struct io_context *ioc = NULL;
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
-	int may_queue, priv;
+	int may_queue, priv = 0;
 
 	may_queue = elv_may_queue(q, rw_flags);
 	if (may_queue == ELV_MQUEUE_NO)
@@ -793,9 +812,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	rl->count[is_sync]++;
 	rl->starved[is_sync] = 0;
 
-	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	if (priv)
-		rl->elvpriv++;
+	if (blk_rq_should_init_elevator(bio)) {
+		priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+		if (priv)
+			rl->elvpriv++;
+	}
 
 	if (blk_queue_io_stat(q))
 		rw_flags |= REQ_IO_STAT;

From c186794dbb466b45cf40f942f2d09d6d5b4b0e42 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 11 Feb 2011 11:08:00 +0100
Subject: [PATCH 05/65] block: share request flush fields with 
 elevator_private

Flush requests are never put on the IO scheduler.  Convert request
structure's elevator_private* into an array and have the flush fields
share a union with it.

Reclaim the space lost in 'struct request' by moving 'completion_data'
back in the union with 'rb_node'.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c    | 18 +++++++++---------
 block/elevator.c       |  2 +-
 include/linux/blkdev.h | 23 ++++++++++++-----------
 3 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4cd59b0d7c15..968455c57e1a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
 #define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
 
 #define RQ_CIC(rq)		\
-	((struct cfq_io_context *) (rq)->elevator_private)
-#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private2)
-#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elevator_private3)
+	((struct cfq_io_context *) (rq)->elevator_private[0])
+#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private[1])
+#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elevator_private[2])
 
 static struct kmem_cache *cfq_pool;
 static struct kmem_cache *cfq_ioc_pool;
@@ -3589,12 +3589,12 @@ static void cfq_put_request(struct request *rq)
 
 		put_io_context(RQ_CIC(rq)->ioc);
 
-		rq->elevator_private = NULL;
-		rq->elevator_private2 = NULL;
+		rq->elevator_private[0] = NULL;
+		rq->elevator_private[1] = NULL;
 
 		/* Put down rq reference on cfqg */
 		cfq_put_cfqg(RQ_CFQG(rq));
-		rq->elevator_private3 = NULL;
+		rq->elevator_private[2] = NULL;
 
 		cfq_put_queue(cfqq);
 	}
@@ -3685,9 +3685,9 @@ new_queue:
 
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
-	rq->elevator_private = cic;
-	rq->elevator_private2 = cfqq;
-	rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
+	rq->elevator_private[0] = cic;
+	rq->elevator_private[1] = cfqq;
+	rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
 	return 0;
 
 queue_fail:
diff --git a/block/elevator.c b/block/elevator.c
index 270e0972eb9f..f98e92edc937 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -764,7 +764,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 	if (e->ops->elevator_set_req_fn)
 		return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
 
-	rq->elevator_private = NULL;
+	rq->elevator_private[0] = NULL;
 	return 0;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6d7e9afd08c3..12bb426949e9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -99,25 +99,26 @@ struct request {
 	/*
 	 * The rb_node is only used inside the io scheduler, requests
 	 * are pruned when moved to the dispatch queue. So let the
-	 * flush fields share space with the rb_node.
+	 * completion_data share space with the rb_node.
 	 */
 	union {
 		struct rb_node rb_node;	/* sort/lookup */
-		struct {
-			unsigned int			seq;
-			struct list_head		list;
-		} flush;
+		void *completion_data;
 	};
 
-	void *completion_data;
-
 	/*
 	 * Three pointers are available for the IO schedulers, if they need
-	 * more they have to dynamically allocate it.
+	 * more they have to dynamically allocate it.  Flush requests are
+	 * never put on the IO scheduler. So let the flush fields share
+	 * space with the three elevator_private pointers.
 	 */
-	void *elevator_private;
-	void *elevator_private2;
-	void *elevator_private3;
+	union {
+		void *elevator_private[3];
+		struct {
+			unsigned int		seq;
+			struct list_head	list;
+		} flush;
+	};
 
 	struct gendisk *rq_disk;
 	unsigned long start_time;

From 0bbfeb8320421989d3e12bd95fae86b9ac0712aa Mon Sep 17 00:00:00 2001
From: Justin TerAvest <teravest@google.com>
Date: Tue, 1 Mar 2011 15:05:08 -0500
Subject: [PATCH 06/65] cfq-iosched: Always provide group isolation.

Effectively, make group_isolation=1 the default and remove the tunable.
The setting group_isolation=0 was because by default we idle on
sync-noidle tree and on fast devices, this can be very harmful for
throughput.

However, this problem can also be addressed by tuning slice_idle and
possibly group_idle on faster storage devices.

This change simplifies the CFQ code by removing the feature entirely.

Signed-off-by: Justin TerAvest <teravest@google.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/cgroups/blkio-controller.txt | 28 ----------------
 block/cfq-iosched.c                        | 37 +---------------------
 2 files changed, 1 insertion(+), 64 deletions(-)

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 4ed7b5ceeed2..d915c16df42c 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -343,34 +343,6 @@ Common files among various policies
 
 CFQ sysfs tunable
 =================
-/sys/block/<disk>/queue/iosched/group_isolation
------------------------------------------------
-
-If group_isolation=1, it provides stronger isolation between groups at the
-expense of throughput. By default group_isolation is 0. In general that
-means that if group_isolation=0, expect fairness for sequential workload
-only. Set group_isolation=1 to see fairness for random IO workload also.
-
-Generally CFQ will put random seeky workload in sync-noidle category. CFQ
-will disable idling on these queues and it does a collective idling on group
-of such queues. Generally these are slow moving queues and if there is a
-sync-noidle service tree in each group, that group gets exclusive access to
-disk for certain period. That means it will bring the throughput down if
-group does not have enough IO to drive deeper queue depths and utilize disk
-capacity to the fullest in the slice allocated to it. But the flip side is
-that even a random reader should get better latencies and overall throughput
-if there are lots of sequential readers/sync-idle workload running in the
-system.
-
-If group_isolation=0, then CFQ automatically moves all the random seeky queues
-in the root group. That means there will be no service differentiation for
-that kind of workload. This leads to better throughput as we do collective
-idling on root sync-noidle tree.
-
-By default one should run with group_isolation=0. If that is not sufficient
-and one wants stronger isolation between groups, then set group_isolation=1
-but this will come at cost of reduced throughput.
-
 /sys/block/<disk>/queue/iosched/slice_idle
 ------------------------------------------
 On a faster hardware CFQ can be slow, especially with sequential workload.
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f27ff3efe6cd..3202c7e87fb3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -146,7 +146,6 @@ struct cfq_queue {
 	struct cfq_rb_root *service_tree;
 	struct cfq_queue *new_cfqq;
 	struct cfq_group *cfqg;
-	struct cfq_group *orig_cfqg;
 	/* Number of sectors dispatched from queue in single dispatch round */
 	unsigned long nr_sectors;
 };
@@ -285,7 +284,6 @@ struct cfq_data {
 	unsigned int cfq_slice_idle;
 	unsigned int cfq_group_idle;
 	unsigned int cfq_latency;
-	unsigned int cfq_group_isolation;
 
 	unsigned int cic_index;
 	struct list_head cic_list;
@@ -1187,32 +1185,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	int new_cfqq = 1;
 	int group_changed = 0;
 
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	if (!cfqd->cfq_group_isolation
-	    && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
-	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
-		/* Move this cfq to root group */
-		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
-		if (!RB_EMPTY_NODE(&cfqq->rb_node))
-			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
-		cfqq->orig_cfqg = cfqq->cfqg;
-		cfqq->cfqg = &cfqd->root_group;
-		cfqd->root_group.ref++;
-		group_changed = 1;
-	} else if (!cfqd->cfq_group_isolation
-		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
-		/* cfqq is sequential now needs to go to its original group */
-		BUG_ON(cfqq->cfqg != &cfqd->root_group);
-		if (!RB_EMPTY_NODE(&cfqq->rb_node))
-			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
-		cfq_put_cfqg(cfqq->cfqg);
-		cfqq->cfqg = cfqq->orig_cfqg;
-		cfqq->orig_cfqg = NULL;
-		group_changed = 1;
-		cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
-	}
-#endif
-
 	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
 						cfqq_type(cfqq));
 	if (cfq_class_idle(cfqq)) {
@@ -2542,7 +2514,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 static void cfq_put_queue(struct cfq_queue *cfqq)
 {
 	struct cfq_data *cfqd = cfqq->cfqd;
-	struct cfq_group *cfqg, *orig_cfqg;
+	struct cfq_group *cfqg;
 
 	BUG_ON(cfqq->ref <= 0);
 
@@ -2554,7 +2526,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	BUG_ON(rb_first(&cfqq->sort_list));
 	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
 	cfqg = cfqq->cfqg;
-	orig_cfqg = cfqq->orig_cfqg;
 
 	if (unlikely(cfqd->active_queue == cfqq)) {
 		__cfq_slice_expired(cfqd, cfqq, 0);
@@ -2564,8 +2535,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	kmem_cache_free(cfq_pool, cfqq);
 	cfq_put_cfqg(cfqg);
-	if (orig_cfqg)
-		cfq_put_cfqg(orig_cfqg);
 }
 
 /*
@@ -3953,7 +3922,6 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_slice_idle = cfq_slice_idle;
 	cfqd->cfq_group_idle = cfq_group_idle;
 	cfqd->cfq_latency = 1;
-	cfqd->cfq_group_isolation = 0;
 	cfqd->hw_tag = -1;
 	/*
 	 * we optimistically start assuming sync ops weren't delayed in last
@@ -4029,7 +3997,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
-SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -4063,7 +4030,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
 		UINT_MAX, 0);
 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
-STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
 #undef STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
@@ -4081,7 +4047,6 @@ static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(slice_idle),
 	CFQ_ATTR(group_idle),
 	CFQ_ATTR(low_latency),
-	CFQ_ATTR(group_isolation),
 	__ATTR_NULL
 };
 

From 53f22956effe1c9e7961b8c6e4362ecca5e460b7 Mon Sep 17 00:00:00 2001
From: Liu Yuan <tailai.ly@taobao.com>
Date: Wed, 2 Mar 2011 11:00:15 -0500
Subject: [PATCH 07/65] block/genhd: Change some numerals into macros

Rename the numerals in the diskstats_show() into the macros.

Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 6a5b772aa201..73d85a8e3c85 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1158,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   "%u %lu %lu %llu %u %u %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
-			   part_stat_read(hd, ios[0]),
-			   part_stat_read(hd, merges[0]),
-			   (unsigned long long)part_stat_read(hd, sectors[0]),
-			   jiffies_to_msecs(part_stat_read(hd, ticks[0])),
-			   part_stat_read(hd, ios[1]),
-			   part_stat_read(hd, merges[1]),
-			   (unsigned long long)part_stat_read(hd, sectors[1]),
-			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
+			   part_stat_read(hd, ios[READ]),
+			   part_stat_read(hd, merges[READ]),
+			   (unsigned long long)part_stat_read(hd, sectors[READ]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
+			   part_stat_read(hd, ios[WRITE]),
+			   part_stat_read(hd, merges[WRITE]),
+			   (unsigned long long)part_stat_read(hd, sectors[WRITE]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
 			   part_in_flight(hd),
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))

From c94a96ac93b4f5b8d1ff8430b1afa1a25610cf53 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 2 Mar 2011 19:04:42 -0500
Subject: [PATCH 08/65] block: Initialize ->queue_lock to internal lock at
 queue allocation time

There does not seem to be a clear convention whether q->queue_lock is
initialized or not when blk_cleanup_queue() is called. In the past it
was not necessary but now blk_throtl_exit() takes up queue lock by
default and needs queue lock to be available.

In fact elevator_exit() code also has similar requirement just that it
is less stringent in the sense that elevator_exit() is called only if
elevator is initialized.

Two problems have been noticed because of ambiguity about spin lock
status.

      - If a driver calls blk_alloc_queue() and then soon calls
        blk_cleanup_queue() almost immediately, (because some other
	driver structure allocation failed or some other error happened)
	then blk_throtl_exit() will run into issues as queue lock is not
	initialized. Loop driver ran into this issue recently and I
	noticed error paths in md driver too. Similar error paths should
	exist in other drivers too.

      - If some driver provided external spin lock and zapped the lock
        before blk_cleanup_queue(), then it can lead to issues.

So this patch initializes the default queue lock at queue allocation time.

block throttling code is one of the users of queue lock and it is
initialized at the queue allocation time, so it makes sense to
initialize ->queue_lock also to internal lock. A driver can overide that
lock later. This will take care of the issue where a driver does not have
to worry about initializing the queue lock to default before calling
blk_cleanup_queue()

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c     | 16 +++++++++++++++-
 block/blk-settings.c |  7 -------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3cc17e6064d6..bc2b7c5004e1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -446,6 +446,11 @@ void blk_put_queue(struct request_queue *q)
 	kobject_put(&q->kobj);
 }
 
+/*
+ * Note: If a driver supplied the queue lock, it should not zap that lock
+ * unexpectedly as some queue cleanup components like elevator_exit() and
+ * blk_throtl_exit() need queue lock.
+ */
 void blk_cleanup_queue(struct request_queue *q)
 {
 	/*
@@ -540,6 +545,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	mutex_init(&q->sysfs_lock);
 	spin_lock_init(&q->__queue_lock);
 
+	/*
+	 * By default initialize queue_lock to internal lock and driver can
+	 * override it later if need be.
+	 */
+	q->queue_lock = &q->__queue_lock;
+
 	return q;
 }
 EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -624,7 +635,10 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 	q->unprep_rq_fn		= NULL;
 	q->unplug_fn		= generic_unplug_device;
 	q->queue_flags		= QUEUE_FLAG_DEFAULT;
-	q->queue_lock		= lock;
+
+	/* Override internal queue lock with supplied lock pointer */
+	if (lock)
+		q->queue_lock		= lock;
 
 	/*
 	 * This also sets hw/phys segments, boundary and size
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 36c8c1f2af18..df649fa59ded 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -175,13 +175,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	blk_set_default_limits(&q->limits);
 	blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
 
-	/*
-	 * If the caller didn't supply a lock, fall back to our embedded
-	 * per-queue locks
-	 */
-	if (!q->queue_lock)
-		q->queue_lock = &q->__queue_lock;
-
 	/*
 	 * by default assume old behaviour and bounce for any highmem page
 	 */

From cd25f54961273c2e4cbd47441e04832468382a5e Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 2 Mar 2011 19:04:50 -0500
Subject: [PATCH 09/65] loop: No need to initialize ->queue_lock explicitly
 before calling blk_cleanup_queue()

Now we initialize ->queue_lock at queue allocation time so driver does
not have to worry about initializing it before calling
blk_cleanup_queue().

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/loop.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 49e6a545eb63..44e18c073c44 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1641,9 +1641,6 @@ out:
 
 static void loop_free(struct loop_device *lo)
 {
-	if (!lo->lo_queue->queue_lock)
-		lo->lo_queue->queue_lock = &lo->lo_queue->__queue_lock;
-
 	blk_cleanup_queue(lo->lo_queue);
 	put_disk(lo->lo_disk);
 	list_del(&lo->lo_list);

From da527770007fce8e4541947d47918248286da875 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 2 Mar 2011 19:05:33 -0500
Subject: [PATCH 10/65] block: Move blk_throtl_exit() call to
 blk_cleanup_queue()

Move blk_throtl_exit() in blk_cleanup_queue() as blk_throtl_exit() is
written in such a way that it needs queue lock. In blk_release_queue()
there is no gurantee that ->queue_lock is still around.

Initially blk_throtl_exit() was in blk_cleanup_queue() but Ingo reported
one problem.

  https://lkml.org/lkml/2010/10/23/86

  And a quick fix moved blk_throtl_exit() to blk_release_queue().

        commit 7ad58c028652753814054f4e3ac58f925e7343f4
        Author: Jens Axboe <jaxboe@fusionio.com>
        Date:   Sat Oct 23 20:40:26 2010 +0200

        block: fix use-after-free bug in blk throttle code

This patch reverts above change and does not try to shutdown the
throtl work in blk_sync_queue(). By avoiding call to
throtl_shutdown_timer_wq() from blk_sync_queue(), we should also avoid
the problem reported by Ingo.

blk_sync_queue() seems to be used only by md driver and it seems to be
using it to make sure q->unplug_fn is not called as md registers its
own unplug functions and it is about to free up the data structures
used by unplug_fn(). Block throttle does not call back into unplug_fn()
or into md. So there is no need to cancel blk throttle work.

In fact I think cancelling block throttle work is bad because it might
happen that some bios are throttled and scheduled to be dispatched later
with the help of pending work and if work is cancelled, these bios might
never be dispatched.

Block layer also uses blk_sync_queue() during blk_cleanup_queue() and
blk_release_queue() time. That should be safe as we are also calling
blk_throtl_exit() which should make sure all the throttling related
data structures are cleaned up.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       | 7 ++++++-
 block/blk-sysfs.c      | 2 --
 block/blk-throttle.c   | 6 +++---
 include/linux/blkdev.h | 2 --
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index bc2b7c5004e1..accff29ad674 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -380,13 +380,16 @@ EXPORT_SYMBOL(blk_stop_queue);
  *     that its ->make_request_fn will not re-add plugging prior to calling
  *     this function.
  *
+ *     This function does not cancel any asynchronous activity arising
+ *     out of elevator or throttling code. That would require elevaotor_exit()
+ *     and blk_throtl_exit() to be called with queue lock initialized.
+ *
  */
 void blk_sync_queue(struct request_queue *q)
 {
 	del_timer_sync(&q->unplug_timer);
 	del_timer_sync(&q->timeout);
 	cancel_work_sync(&q->unplug_work);
-	throtl_shutdown_timer_wq(q);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -469,6 +472,8 @@ void blk_cleanup_queue(struct request_queue *q)
 	if (q->elevator)
 		elevator_exit(q->elevator);
 
+	blk_throtl_exit(q);
+
 	blk_put_queue(q);
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 41fb69150b4d..261c75c665ae 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -471,8 +471,6 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_sync_queue(q);
 
-	blk_throtl_exit(q);
-
 	if (rl->rq_pool)
 		mempool_destroy(rl->rq_pool);
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a89043a3caa4..c0f623742165 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -965,7 +965,7 @@ static void throtl_update_blkio_group_write_iops(void *key,
 	throtl_schedule_delayed_work(td->queue, 0);
 }
 
-void throtl_shutdown_timer_wq(struct request_queue *q)
+static void throtl_shutdown_wq(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
 
@@ -1099,7 +1099,7 @@ void blk_throtl_exit(struct request_queue *q)
 
 	BUG_ON(!td);
 
-	throtl_shutdown_timer_wq(q);
+	throtl_shutdown_wq(q);
 
 	spin_lock_irq(q->queue_lock);
 	throtl_release_tgs(td);
@@ -1129,7 +1129,7 @@ void blk_throtl_exit(struct request_queue *q)
 	 * update limits through cgroup and another work got queued, cancel
 	 * it.
 	 */
-	throtl_shutdown_timer_wq(q);
+	throtl_shutdown_wq(q);
 	throtl_td_free(td);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e3ee74fc5903..23fb92506c31 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1144,7 +1144,6 @@ extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
 extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
 extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
-extern void throtl_shutdown_timer_wq(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
 {
@@ -1154,7 +1153,6 @@ static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
 static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
-static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
 #endif /* CONFIG_BLK_DEV_THROTTLING */
 
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \

From 93803e0140c6216b68fe926ccc611297120da273 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 7 Mar 2011 08:59:06 +0100
Subject: [PATCH 11/65] cfq-iosched: fix race in cfq_set_request()

We need to hold the queue lock over the reference increment,
it's not atomic anymore.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3202c7e87fb3..fb2141ec205c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3675,12 +3675,11 @@ new_queue:
 
 	cfqq->allocated[rw]++;
 
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
 	cfqq->ref++;
 	rq->elevator_private[0] = cic;
 	rq->elevator_private[1] = cfqq;
 	rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 	return 0;
 
 queue_fail:

From ef8a41df8c140f10108de75b01b6369d6e49113c Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 7 Mar 2011 09:26:29 +0100
Subject: [PATCH 12/65] cfq-iosched: give busy sync queue no dispatch limit

If there are a sync and an async queue and the sync queue's think time
is small, we can ignore the sync queue's dispatch quantum. Because the
sync queue will always preempt the async queue, we don't need to care
about async's latency.  This can fix a performance regression of
aiostress test, which is introduced by commit f8ae6e3eb825. The issue
should exist even without the commit, but the commit amplifies the
impact.

The initial post does the same optimization for RT queue too, but since
I have no real workload for it, Vivek suggests to drop it.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fb2141ec205c..135b1a48da23 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -237,6 +237,7 @@ struct cfq_data {
 	struct rb_root prio_trees[CFQ_PRIO_LISTS];
 
 	unsigned int busy_queues;
+	unsigned int busy_sync_queues;
 
 	int rq_in_driver;
 	int rq_in_flight[2];
@@ -1344,6 +1345,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
+	if (cfq_cfqq_sync(cfqq))
+		cfqd->busy_sync_queues++;
 
 	cfq_resort_rr_list(cfqd, cfqq);
 }
@@ -1370,6 +1373,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_group_service_tree_del(cfqd, cfqq->cfqg);
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
+	if (cfq_cfqq_sync(cfqq))
+		cfqd->busy_sync_queues--;
 }
 
 /*
@@ -2377,22 +2382,39 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 * Does this cfqq already have too much IO in flight?
 	 */
 	if (cfqq->dispatched >= max_dispatch) {
+		bool promote_sync = false;
 		/*
 		 * idle queue must always only have a single IO in flight
 		 */
 		if (cfq_class_idle(cfqq))
 			return false;
 
+		/*
+		 * If there is only one sync queue, and its think time is
+		 * small, we can ignore async queue here and give the sync
+		 * queue no dispatch limit. The reason is a sync queue can
+		 * preempt async queue, limiting the sync queue doesn't make
+		 * sense. This is useful for aiostress test.
+		 */
+		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) {
+			struct cfq_io_context *cic = RQ_CIC(cfqq->next_rq);
+
+			if (sample_valid(cic->ttime_samples) &&
+				cic->ttime_mean < cfqd->cfq_slice_idle)
+				promote_sync = true;
+		}
+
 		/*
 		 * We have other queues, don't allow more IO from this one
 		 */
-		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
+		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
+				!promote_sync)
 			return false;
 
 		/*
 		 * Sole queue user, no limit
 		 */
-		if (cfqd->busy_queues == 1)
+		if (cfqd->busy_queues == 1 || promote_sync)
 			max_dispatch = -1;
 		else
 			/*

From a60327107b56573c305ecc78e471dbdbb4d2f426 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Mon, 7 Mar 2011 09:28:09 +0100
Subject: [PATCH 13/65] cfq-iosched: Fix update_vdisktime logic

The update_vdisktime logic is broken since commit
b54ce60eb7f61f8e314b8b241b0469eda3bb1d42, st->min_vdisktime never makes
a progress. Fix it.

Thanks Vivek for pointing it out.

Signed-off-by: Gui Jianfeng <guijianfen@cn.fujitsu.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 135b1a48da23..938ae52aa927 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -557,15 +557,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
 
 static void update_min_vdisktime(struct cfq_rb_root *st)
 {
-	u64 vdisktime = st->min_vdisktime;
 	struct cfq_group *cfqg;
 
 	if (st->left) {
 		cfqg = rb_entry_cfqg(st->left);
-		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
+		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
+						  cfqg->vdisktime);
 	}
-
-	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
 }
 
 /*

From 231d704b4ab7491473c0b1a9cd0c6e0d1cba85b9 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Mon, 7 Mar 2011 21:05:14 +0100
Subject: [PATCH 14/65] blk-throttle: process limit change only through one
 function

With the help of cgroup interface one can go and upate the bps/iops
limits of existing group. Once the limits are udpated, a thread is
woken up to see if some blocked group needs recalculation based on new
limits and needs to be requeued.

There was also a piece of code where I was checking for group limit
update when a fresh bio comes in. This patch gets rid of that piece of
code and keeps processing the limit change at one place
throtl_process_limit_change().  It just keeps the code simple and easy
to understand.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 061dee66e2a6..a29f09240c0f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1009,14 +1009,8 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 		/*
 		 * There is already another bio queued in same dir. No
 		 * need to update dispatch time.
-		 * Still update the disptime if rate limits on this group
-		 * were changed.
 		 */
-		if (!tg->limits_changed)
-			update_disptime = false;
-		else
-			tg->limits_changed = false;
-
+		update_disptime = false;
 		goto queue_bio;
 	}
 

From de701c74a34005e637e1ca2634fbf28fd1debba2 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Mon, 7 Mar 2011 21:09:32 +0100
Subject: [PATCH 15/65] blk-throttle: Some cleanups and race fixes in limit 
 update code

When throttle group limits are updated through cgroups, a thread is
woken up to process these updates. While reviewing that code, oleg noted
couple of race conditions existed in the code and he also suggested that
code can be simplified.

This patch fixes the races simplifies the code based on Oleg's suggestions:

	- Use xchg().
	- Introduced a common function throtl_update_blkio_group_common()
          which is shared now by all iops/bps update functions.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>

Fixed a merge issue, throtl_schedule_delayed_work() takes throtl_data
as the argument now, not the queue.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 98 ++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 57 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a29f09240c0f..32dd3e4b041d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -102,7 +102,7 @@ struct throtl_data
 	/* Work for dispatching throttled bios */
 	struct delayed_work throtl_work;
 
-	atomic_t limits_changed;
+	bool limits_changed;
 };
 
 enum tg_state_flags {
@@ -201,6 +201,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
 	RB_CLEAR_NODE(&tg->rb_node);
 	bio_list_init(&tg->bio_lists[0]);
 	bio_list_init(&tg->bio_lists[1]);
+	td->limits_changed = false;
 
 	/*
 	 * Take the initial reference that will be released on destroy
@@ -737,34 +738,27 @@ static void throtl_process_limit_change(struct throtl_data *td)
 	struct throtl_grp *tg;
 	struct hlist_node *pos, *n;
 
-	if (!atomic_read(&td->limits_changed))
+	if (!td->limits_changed)
 		return;
 
-	throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
+	xchg(&td->limits_changed, false);
 
-	/*
-	 * Make sure updates from throtl_update_blkio_group_read_bps() group
-	 * of functions to tg->limits_changed are visible. We do not
-	 * want update td->limits_changed to be visible but update to
-	 * tg->limits_changed not being visible yet on this cpu. Hence
-	 * the read barrier.
-	 */
-	smp_rmb();
+	throtl_log(td, "limits changed");
 
 	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
-		if (throtl_tg_on_rr(tg) && tg->limits_changed) {
-			throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
-				" riops=%u wiops=%u", tg->bps[READ],
-				tg->bps[WRITE], tg->iops[READ],
-				tg->iops[WRITE]);
-			tg_update_disptime(td, tg);
-			tg->limits_changed = false;
-		}
-	}
+		if (!tg->limits_changed)
+			continue;
 
-	smp_mb__before_atomic_dec();
-	atomic_dec(&td->limits_changed);
-	smp_mb__after_atomic_dec();
+		if (!xchg(&tg->limits_changed, false))
+			continue;
+
+		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
+			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
+			tg->iops[READ], tg->iops[WRITE]);
+
+		if (throtl_tg_on_rr(tg))
+			tg_update_disptime(td, tg);
+	}
 }
 
 /* Dispatch throttled bios. Should be called without queue lock held. */
@@ -898,6 +892,15 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
 	spin_unlock_irqrestore(td->queue->queue_lock, flags);
 }
 
+static void throtl_update_blkio_group_common(struct throtl_data *td,
+				struct throtl_grp *tg)
+{
+	xchg(&tg->limits_changed, true);
+	xchg(&td->limits_changed, true);
+	/* Schedule a work now to process the limit change */
+	throtl_schedule_delayed_work(td, 0);
+}
+
 /*
  * For all update functions, key should be a valid pointer because these
  * update functions are called under blkcg_lock, that means, blkg is
@@ -911,61 +914,40 @@ static void throtl_update_blkio_group_read_bps(void *key,
 				struct blkio_group *blkg, u64 read_bps)
 {
 	struct throtl_data *td = key;
+	struct throtl_grp *tg = tg_of_blkg(blkg);
 
-	tg_of_blkg(blkg)->bps[READ] = read_bps;
-	/* Make sure read_bps is updated before setting limits_changed */
-	smp_wmb();
-	tg_of_blkg(blkg)->limits_changed = true;
-
-	/* Make sure tg->limits_changed is updated before td->limits_changed */
-	smp_mb__before_atomic_inc();
-	atomic_inc(&td->limits_changed);
-	smp_mb__after_atomic_inc();
-
-	/* Schedule a work now to process the limit change */
-	throtl_schedule_delayed_work(td, 0);
+	tg->bps[READ] = read_bps;
+	throtl_update_blkio_group_common(td, tg);
 }
 
 static void throtl_update_blkio_group_write_bps(void *key,
 				struct blkio_group *blkg, u64 write_bps)
 {
 	struct throtl_data *td = key;
+	struct throtl_grp *tg = tg_of_blkg(blkg);
 
-	tg_of_blkg(blkg)->bps[WRITE] = write_bps;
-	smp_wmb();
-	tg_of_blkg(blkg)->limits_changed = true;
-	smp_mb__before_atomic_inc();
-	atomic_inc(&td->limits_changed);
-	smp_mb__after_atomic_inc();
-	throtl_schedule_delayed_work(td, 0);
+	tg->bps[WRITE] = write_bps;
+	throtl_update_blkio_group_common(td, tg);
 }
 
 static void throtl_update_blkio_group_read_iops(void *key,
 			struct blkio_group *blkg, unsigned int read_iops)
 {
 	struct throtl_data *td = key;
+	struct throtl_grp *tg = tg_of_blkg(blkg);
 
-	tg_of_blkg(blkg)->iops[READ] = read_iops;
-	smp_wmb();
-	tg_of_blkg(blkg)->limits_changed = true;
-	smp_mb__before_atomic_inc();
-	atomic_inc(&td->limits_changed);
-	smp_mb__after_atomic_inc();
-	throtl_schedule_delayed_work(td, 0);
+	tg->iops[READ] = read_iops;
+	throtl_update_blkio_group_common(td, tg);
 }
 
 static void throtl_update_blkio_group_write_iops(void *key,
 			struct blkio_group *blkg, unsigned int write_iops)
 {
 	struct throtl_data *td = key;
+	struct throtl_grp *tg = tg_of_blkg(blkg);
 
-	tg_of_blkg(blkg)->iops[WRITE] = write_iops;
-	smp_wmb();
-	tg_of_blkg(blkg)->limits_changed = true;
-	smp_mb__before_atomic_inc();
-	atomic_inc(&td->limits_changed);
-	smp_mb__after_atomic_inc();
-	throtl_schedule_delayed_work(td, 0);
+	tg->iops[WRITE] = write_iops;
+	throtl_update_blkio_group_common(td, tg);
 }
 
 static void throtl_shutdown_wq(struct request_queue *q)
@@ -1012,6 +994,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 		 */
 		update_disptime = false;
 		goto queue_bio;
+
 	}
 
 	/* Bio is with-in rate limit of group */
@@ -1052,7 +1035,7 @@ int blk_throtl_init(struct request_queue *q)
 
 	INIT_HLIST_HEAD(&td->tg_list);
 	td->tg_service_tree = THROTL_RB_ROOT;
-	atomic_set(&td->limits_changed, 0);
+	td->limits_changed = false;
 
 	/* Init root group */
 	tg = &td->root_tg;
@@ -1064,6 +1047,7 @@ int blk_throtl_init(struct request_queue *q)
 	/* Practically unlimited BW */
 	tg->bps[0] = tg->bps[1] = -1;
 	tg->iops[0] = tg->iops[1] = -1;
+	td->limits_changed = false;
 
 	/*
 	 * Set root group reference to 2. One reference will be dropped when

From df677140281beb608f6748c341af7612f7bfe7a0 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 8 Mar 2011 08:28:01 +0100
Subject: [PATCH 16/65] block: biovec_slab vs. CONFIG_BLK_DEV_INTEGRITY

The block integrity subsystem no longer uses the bio_vec slabs so this
code can safely be compiled in.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/bio.c            | 4 +---
 include/linux/bio.h | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index 4bd454fa844e..5694b756ed01 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -43,7 +43,7 @@ static mempool_t *bio_split_pool __read_mostly;
  * unsigned short
  */
 #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
+static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
 };
 #undef BV
@@ -1656,12 +1656,10 @@ static void __init biovec_init_slabs(void)
 		int size;
 		struct biovec_slab *bvs = bvec_slabs + i;
 
-#ifndef CONFIG_BLK_DEV_INTEGRITY
 		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
 			bvs->slab = NULL;
 			continue;
 		}
-#endif
 
 		size = bvs->nr_vecs * sizeof(struct bio_vec);
 		bvs->slab = kmem_cache_create(bvs->name, size, 0,
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 35dcdb3589bc..ce33e6868a2f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -304,7 +304,6 @@ struct biovec_slab {
 };
 
 extern struct bio_set *fs_bio_set;
-extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;
 
 /*
  * a small number of entries is fine, not going to be performance critical.

From df457f845e5449be2e7d96668791f789b3770ac7 Mon Sep 17 00:00:00 2001
From: Justin TerAvest <teravest@google.com>
Date: Tue, 8 Mar 2011 19:45:00 +0100
Subject: [PATCH 17/65] blk-cgroup: Lower minimum weight from 100 to 10.

We've found that we still get good, useful isolation at weights this
low. I'd like to adjust the minimum so that any other changes can take
these values into account.

Signed-off-by: Justin TerAvest <teravest@google.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/cgroups/blkio-controller.txt | 2 +-
 block/blk-cgroup.h                         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index d915c16df42c..465351d4cf85 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -140,7 +140,7 @@ Proportional weight policy files
 	- Specifies per cgroup weight. This is default weight of the group
 	  on all the devices until and unless overridden by per device rule.
 	  (See blkio.weight_device).
-	  Currently allowed range of weights is from 100 to 1000.
+	  Currently allowed range of weights is from 10 to 1000.
 
 - blkio.weight_device
 	- One can specify per cgroup per device rules using this interface.
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861bdd549..57e7234c5ae5 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -240,7 +240,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
 
 #endif
 
-#define BLKIO_WEIGHT_MIN	100
+#define BLKIO_WEIGHT_MIN	10
 #define BLKIO_WEIGHT_MAX	1000
 #define BLKIO_WEIGHT_DEFAULT	500
 

From facc31ddc3570a3a0d8951c94f16b898e01b464d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:27 +0100
Subject: [PATCH 18/65] block: Don't implicitly trigger event check on
 disk_unblock_events()

Currently, disk_unblock_events() implicitly kick event check if the
block count reaches zero.  This behavior is not described in the
comment and hinders with future changes.  Make the unblocker
explicitly check events by calling disk_check_events() as necessary.

This patch doesn't cause any behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 block/genhd.c  | 2 +-
 fs/block_dev.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index 3e2b57b55e38..c91a2dac6b6b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1494,7 +1494,7 @@ void disk_block_events(struct gendisk *disk)
 void disk_unblock_events(struct gendisk *disk)
 {
 	if (disk->ev)
-		__disk_unblock_events(disk, true);
+		__disk_unblock_events(disk, false);
 }
 
 /**
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 889287019599..fffdf86c1757 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1446,6 +1446,7 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 		if (bdev_free) {
 			if (bdev->bd_write_holder) {
 				disk_unblock_events(bdev->bd_disk);
+				disk_check_events(bdev->bd_disk);
 				bdev->bd_write_holder = false;
 			} else
 				disk_check_events(bdev->bd_disk);

From 6936217cc7e58573026bdba25b1bfb778e8f2267 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:27 +0100
Subject: [PATCH 19/65] block: Don't check events on close unless it was
 blocked

The block event mechanism currently always checks events when the
device is being closed regardless of the open mode.  The intention was
to allow detection of EJECT_REQUEST when a device is closed whether
disk event polling is enabled or not.

This is unnecessary as, for devices of interest, events are checked
from either userland or kernel and in the former case ->check_events()
is performed on open of each poll attempt anyway.  Furthermore, this
unconditional event check on close makes the code susceptible to event
loop if the block driver doesn't clear reported events correctly - an
event triggers userland to open and close the device which in turn
causes another event, rinse and repeat.

Check events on close only if it was blocked by excl write open.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 fs/block_dev.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fffdf86c1757..7dd2c658d429 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1448,13 +1448,11 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 				disk_unblock_events(bdev->bd_disk);
 				disk_check_events(bdev->bd_disk);
 				bdev->bd_write_holder = false;
-			} else
-				disk_check_events(bdev->bd_disk);
+			}
 		}
 
 		mutex_unlock(&bdev->bd_mutex);
-	} else
-		disk_check_events(bdev->bd_disk);
+	}
 
 	return __blkdev_put(bdev, mode, 0);
 }

From 69e02c59a7d962dced8047401b81a8d897e1702e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:27 +0100
Subject: [PATCH 20/65] block: Don't check events while open is in progress

Not all block drivers clear events immediately after reporting.  Some
do so in ->revalidate_disk() or other steps during ->open().  There is
a slim chance event poll may happen between the clearing event check
from check_disk_change() and the actual clearing of the events which
would result in spurious events.

Block event checks while block device open is in progress.  There is
no need to kick explicit event check afterwards as events are always
checked during open.

-v2: The original patch could have called disk_unblock_events() with
     an already released or %NULL @disk causing oops.  Fixed by making
     sure references are put after disk_unblock_events() is called.
     It also makes the error path of __blkdev_get() a bit simpler.
     This problem was reported by Jens.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 fs/block_dev.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7dd2c658d429..d42cad2757ae 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1087,6 +1087,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!disk)
 		goto out;
 
+	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
@@ -1108,10 +1109,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					 */
 					disk_put_part(bdev->bd_part);
 					bdev->bd_part = NULL;
-					module_put(disk->fops->owner);
-					put_disk(disk);
 					bdev->bd_disk = NULL;
 					mutex_unlock(&bdev->bd_mutex);
+					disk_unblock_events(disk);
+					module_put(disk->fops->owner);
+					put_disk(disk);
 					goto restart;
 				}
 				if (ret)
@@ -1148,9 +1150,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
 		}
 	} else {
-		module_put(disk->fops->owner);
-		put_disk(disk);
-		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
 				ret = bdev->bd_disk->fops->open(bdev, mode);
@@ -1160,11 +1159,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (bdev->bd_invalidated)
 				rescan_partitions(bdev->bd_disk, bdev);
 		}
+		/* only one opener holds refs to the module and disk */
+		module_put(disk->fops->owner);
+		put_disk(disk);
 	}
 	bdev->bd_openers++;
 	if (for_part)
 		bdev->bd_part_count++;
 	mutex_unlock(&bdev->bd_mutex);
+	disk_unblock_events(disk);
 	return 0;
 
  out_clear:
@@ -1177,9 +1180,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	bdev->bd_contains = NULL;
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
+	disk_unblock_events(disk);
  out:
-	if (disk)
-		module_put(disk->fops->owner);
+	module_put(disk->fops->owner);
 	put_disk(disk);
 	bdput(bdev);
 

From 5b03a1b140e13a28ff6be1526892a9dc538ddef6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:27 +0100
Subject: [PATCH 21/65] ide: Convert to bdops->check_events()

Convert ->media_changed() to the new ->check_events() method.  The
conversion is mostly mechanical.  The only notable change is that
cdrom now doesn't generate any event if @slot_nr isn't CDSL_CURRENT.
It used to return -EINVAL which would be treated as media changed.  As
media changer isn't supported anyway, this doesn't make any
difference.

This makes ide emit the standard disk events and allows kernel event
polling.  Currently, only MEDIA_CHANGE event is implemented.  Adding
support for EJECT_REQUEST shouldn't be difficult; however, given that
ide driver is already deprecated, it probably is best to leave it
alone.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: linux-ide@vger.kernel.org
---
 drivers/ide/ide-cd.c       | 10 ++++++----
 drivers/ide/ide-cd.h       |  3 ++-
 drivers/ide/ide-cd_ioctl.c |  8 ++++----
 drivers/ide/ide-gd.c       | 14 ++++++++------
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 0c73fe39a236..a2e29099ee0c 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1177,7 +1177,7 @@ static struct cdrom_device_ops ide_cdrom_dops = {
 	.open			= ide_cdrom_open_real,
 	.release		= ide_cdrom_release_real,
 	.drive_status		= ide_cdrom_drive_status,
-	.media_changed		= ide_cdrom_check_media_change_real,
+	.check_events		= ide_cdrom_check_events_real,
 	.tray_move		= ide_cdrom_tray_move,
 	.lock_door		= ide_cdrom_lock_door,
 	.select_speed		= ide_cdrom_select_speed,
@@ -1702,10 +1702,11 @@ static int idecd_ioctl(struct block_device *bdev, fmode_t mode,
 }
 
 
-static int idecd_media_changed(struct gendisk *disk)
+static unsigned int idecd_check_events(struct gendisk *disk,
+				       unsigned int clearing)
 {
 	struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
-	return cdrom_media_changed(&info->devinfo);
+	return cdrom_check_events(&info->devinfo, clearing);
 }
 
 static int idecd_revalidate_disk(struct gendisk *disk)
@@ -1723,7 +1724,7 @@ static const struct block_device_operations idecd_ops = {
 	.open			= idecd_open,
 	.release		= idecd_release,
 	.ioctl			= idecd_ioctl,
-	.media_changed		= idecd_media_changed,
+	.check_events		= idecd_check_events,
 	.revalidate_disk	= idecd_revalidate_disk
 };
 
@@ -1790,6 +1791,7 @@ static int ide_cd_probe(ide_drive_t *drive)
 	ide_cd_read_toc(drive, &sense);
 	g->fops = &idecd_ops;
 	g->flags |= GENHD_FL_REMOVABLE;
+	g->events = DISK_EVENT_MEDIA_CHANGE;
 	add_disk(g);
 	return 0;
 
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 93a3cf1b0f3f..1efc936f5b66 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -111,7 +111,8 @@ int cdrom_check_status(ide_drive_t *, struct request_sense *);
 int ide_cdrom_open_real(struct cdrom_device_info *, int);
 void ide_cdrom_release_real(struct cdrom_device_info *);
 int ide_cdrom_drive_status(struct cdrom_device_info *, int);
-int ide_cdrom_check_media_change_real(struct cdrom_device_info *, int);
+unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *,
+					 unsigned int clearing, int slot_nr);
 int ide_cdrom_tray_move(struct cdrom_device_info *, int);
 int ide_cdrom_lock_door(struct cdrom_device_info *, int);
 int ide_cdrom_select_speed(struct cdrom_device_info *, int);
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 766b3deeb23c..2a6bc50e8a41 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -79,8 +79,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
 	return CDS_DRIVE_NOT_READY;
 }
 
-int ide_cdrom_check_media_change_real(struct cdrom_device_info *cdi,
-				       int slot_nr)
+unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi,
+					 unsigned int clearing, int slot_nr)
 {
 	ide_drive_t *drive = cdi->handle;
 	int retval;
@@ -89,9 +89,9 @@ int ide_cdrom_check_media_change_real(struct cdrom_device_info *cdi,
 		(void) cdrom_check_status(drive, NULL);
 		retval = (drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED) ? 1 : 0;
 		drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
-		return retval;
+		return retval ? DISK_EVENT_MEDIA_CHANGE : 0;
 	} else {
-		return -EINVAL;
+		return 0;
 	}
 }
 
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 35c4b43585e3..c4ffd4888939 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -285,11 +285,12 @@ static int ide_gd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int ide_gd_media_changed(struct gendisk *disk)
+static unsigned int ide_gd_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
 	ide_drive_t *drive = idkp->drive;
-	int ret;
+	bool ret;
 
 	/* do not scan partitions twice if this is a removable device */
 	if (drive->dev_flags & IDE_DFLAG_ATTACH) {
@@ -297,10 +298,10 @@ static int ide_gd_media_changed(struct gendisk *disk)
 		return 0;
 	}
 
-	ret = !!(drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED);
+	ret = drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED;
 	drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
 
-	return ret;
+	return ret ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static void ide_gd_unlock_native_capacity(struct gendisk *disk)
@@ -318,7 +319,7 @@ static int ide_gd_revalidate_disk(struct gendisk *disk)
 	struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
 	ide_drive_t *drive = idkp->drive;
 
-	if (ide_gd_media_changed(disk))
+	if (ide_gd_check_events(disk, 0))
 		drive->disk_ops->get_capacity(drive);
 
 	set_capacity(disk, ide_gd_capacity(drive));
@@ -340,7 +341,7 @@ static const struct block_device_operations ide_gd_ops = {
 	.release		= ide_gd_release,
 	.ioctl			= ide_gd_ioctl,
 	.getgeo			= ide_gd_getgeo,
-	.media_changed		= ide_gd_media_changed,
+	.check_events		= ide_gd_check_events,
 	.unlock_native_capacity	= ide_gd_unlock_native_capacity,
 	.revalidate_disk	= ide_gd_revalidate_disk
 };
@@ -412,6 +413,7 @@ static int ide_gd_probe(ide_drive_t *drive)
 	if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
 		g->flags = GENHD_FL_REMOVABLE;
 	g->fops = &ide_gd_ops;
+	g->events = DISK_EVENT_MEDIA_CHANGE;
 	add_disk(g);
 	return 0;
 

From 1a8a74f03fcde1725ff26520220f3d21dbd8faa1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:27 +0100
Subject: [PATCH 22/65] floppy,{ami|ata}flop: Convert to bdops->check_events()

Convert the floppy drivers from ->media_changed() to ->check_events().
Both floppy and ataflop buffer media changed state bit and clear them
on revalidation and will behave correctly with kernel event polling.

I can't tell how amiflop clears its event and it's possible that it
may generate spurious events when polled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 drivers/block/amiflop.c |  9 +++++----
 drivers/block/ataflop.c | 14 ++++++++------
 drivers/block/floppy.c  | 10 ++++++----
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 7888501ad9ee..5e7d3fe2c13b 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1658,12 +1658,12 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 }
 
 /*
- * floppy-change is never called from an interrupt, so we can relax a bit
+ * check_events is never called from an interrupt, so we can relax a bit
  * here, sleep etc. Note that floppy-on tries to set current_DOR to point
  * to the desired drive, but it will probably not survive the sleep if
  * several floppies are used at the same time: thus the loop.
  */
-static int amiga_floppy_change(struct gendisk *disk)
+static unsigned amiga_check_events(struct gendisk *disk, unsigned int clearing)
 {
 	struct amiga_floppy_struct *p = disk->private_data;
 	int drive = p - unit;
@@ -1686,7 +1686,7 @@ static int amiga_floppy_change(struct gendisk *disk)
 		p->dirty = 0;
 		writepending = 0; /* if this was true before, too bad! */
 		writefromint = 0;
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	return 0;
 }
@@ -1697,7 +1697,7 @@ static const struct block_device_operations floppy_fops = {
 	.release	= floppy_release,
 	.ioctl		= fd_ioctl,
 	.getgeo		= fd_getgeo,
-	.media_changed	= amiga_floppy_change,
+	.check_events	= amiga_check_events,
 };
 
 static int __init fd_probe_drives(void)
@@ -1736,6 +1736,7 @@ static int __init fd_probe_drives(void)
 		disk->major = FLOPPY_MAJOR;
 		disk->first_minor = drive;
 		disk->fops = &floppy_fops;
+		disk->events = DISK_EVENT_MEDIA_CHANGE;
 		sprintf(disk->disk_name, "fd%d", drive);
 		disk->private_data = &unit[drive];
 		set_capacity(disk, 880*2);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 605a67e40bbf..c871eae14120 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1324,23 +1324,24 @@ static void finish_fdc_done( int dummy )
  * due to unrecognised disk changes.
  */
 
-static int check_floppy_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct atari_floppy_struct *p = disk->private_data;
 	unsigned int drive = p - unit;
 	if (test_bit (drive, &fake_change)) {
 		/* simulated change (e.g. after formatting) */
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	if (test_bit (drive, &changed_floppies)) {
 		/* surely changed (the WP signal changed at least once) */
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	if (UD.wpstat) {
 		/* WP is on -> could be changed: to be sure, buffers should be
 		 * invalidated...
 		 */
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 
 	return 0;
@@ -1570,7 +1571,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
 		 * or the next access will revalidate - and clear UDT :-(
 		 */
 
-		if (check_floppy_change(disk))
+		if (floppy_check_events(disk, 0))
 		        floppy_revalidate(disk);
 
 		if (UD.flags & FTD_MSG)
@@ -1904,7 +1905,7 @@ static const struct block_device_operations floppy_fops = {
 	.open		= floppy_unlocked_open,
 	.release	= floppy_release,
 	.ioctl		= fd_ioctl,
-	.media_changed	= check_floppy_change,
+	.check_events	= floppy_check_events,
 	.revalidate_disk= floppy_revalidate,
 };
 
@@ -1963,6 +1964,7 @@ static int __init atari_floppy_init (void)
 		unit[i].disk->first_minor = i;
 		sprintf(unit[i].disk->disk_name, "fd%d", i);
 		unit[i].disk->fops = &floppy_fops;
+		unit[i].disk->events = DISK_EVENT_MEDIA_CHANGE;
 		unit[i].disk->private_data = &unit[i];
 		unit[i].disk->queue = blk_init_queue(do_fd_request,
 					&ataflop_lock);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 77fc76f8aea9..3851dbc03e05 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3770,13 +3770,14 @@ out2:
 /*
  * Check if the disk has been changed or if a change has been faked.
  */
-static int check_floppy_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	int drive = (long)disk->private_data;
 
 	if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
 	    test_bit(FD_VERIFY_BIT, &UDRS->flags))
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 
 	if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
 		lock_fdc(drive, false);
@@ -3788,7 +3789,7 @@ static int check_floppy_change(struct gendisk *disk)
 	    test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
 	    test_bit(drive, &fake_change) ||
 	    drive_no_geom(drive))
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	return 0;
 }
 
@@ -3898,7 +3899,7 @@ static const struct block_device_operations floppy_fops = {
 	.release		= floppy_release,
 	.ioctl			= fd_ioctl,
 	.getgeo			= fd_getgeo,
-	.media_changed		= check_floppy_change,
+	.check_events		= floppy_check_events,
 	.revalidate_disk	= floppy_revalidate,
 };
 
@@ -4205,6 +4206,7 @@ static int __init floppy_init(void)
 		disks[dr]->major = FLOPPY_MAJOR;
 		disks[dr]->first_minor = TOMINOR(dr);
 		disks[dr]->fops = &floppy_fops;
+		disks[dr]->events = DISK_EVENT_MEDIA_CHANGE;
 		sprintf(disks[dr]->disk_name, "fd%d", dr);
 
 		init_timer(&motor_off_timer[dr]);

From 1c27030bd21e7e2c68ef5be9f28c63778cf4b27f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 23/65] gdrom,viocd: Convert to bdops->check_events()

Convert gdrom and viocd from ->media_changed() to ->check_events().

It's unclear how the conditions are cleared and it's possible that it
may generate spurious events when polled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 drivers/cdrom/gdrom.c | 16 ++++++++++------
 drivers/cdrom/viocd.c | 17 ++++++++++-------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 64a21461c408..b2b034fea34e 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -395,10 +395,12 @@ static int gdrom_drivestatus(struct cdrom_device_info *cd_info, int ignore)
 	return CDS_NO_INFO;
 }
 
-static int gdrom_mediachanged(struct cdrom_device_info *cd_info, int ignore)
+static unsigned int gdrom_check_events(struct cdrom_device_info *cd_info,
+				       unsigned int clearing, int ignore)
 {
 	/* check the sense key */
-	return (__raw_readb(GDROM_ERROR_REG) & 0xF0) == 0x60;
+	return (__raw_readb(GDROM_ERROR_REG) & 0xF0) == 0x60 ?
+		DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 /* reset the G1 bus */
@@ -483,7 +485,7 @@ static struct cdrom_device_ops gdrom_ops = {
 	.open			= gdrom_open,
 	.release		= gdrom_release,
 	.drive_status		= gdrom_drivestatus,
-	.media_changed		= gdrom_mediachanged,
+	.check_events		= gdrom_check_events,
 	.get_last_session	= gdrom_get_last_session,
 	.reset			= gdrom_hardreset,
 	.audio_ioctl		= gdrom_audio_ioctl,
@@ -509,9 +511,10 @@ static int gdrom_bdops_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-static int gdrom_bdops_mediachanged(struct gendisk *disk)
+static unsigned int gdrom_bdops_check_events(struct gendisk *disk,
+					     unsigned int clearing)
 {
-	return cdrom_media_changed(gd.cd_info);
+	return cdrom_check_events(gd.cd_info, clearing);
 }
 
 static int gdrom_bdops_ioctl(struct block_device *bdev, fmode_t mode,
@@ -530,7 +533,7 @@ static const struct block_device_operations gdrom_bdops = {
 	.owner			= THIS_MODULE,
 	.open			= gdrom_bdops_open,
 	.release		= gdrom_bdops_release,
-	.media_changed		= gdrom_bdops_mediachanged,
+	.check_events		= gdrom_bdops_check_events,
 	.ioctl			= gdrom_bdops_ioctl,
 };
 
@@ -800,6 +803,7 @@ static int __devinit probe_gdrom(struct platform_device *devptr)
 		goto probe_fail_cdrom_register;
 	}
 	gd.disk->fops = &gdrom_bdops;
+	gd.disk->events = DISK_EVENT_MEDIA_CHANGE;
 	/* latch on to the interrupt */
 	err = gdrom_set_interrupt_handlers();
 	if (err)
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index be73a9b493a6..4e874c5fa605 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -186,10 +186,11 @@ static int viocd_blk_ioctl(struct block_device *bdev, fmode_t mode,
 	return ret;
 }
 
-static int viocd_blk_media_changed(struct gendisk *disk)
+static unsigned int viocd_blk_check_events(struct gendisk *disk,
+					   unsigned int clearing)
 {
 	struct disk_info *di = disk->private_data;
-	return cdrom_media_changed(&di->viocd_info);
+	return cdrom_check_events(&di->viocd_info, clearing);
 }
 
 static const struct block_device_operations viocd_fops = {
@@ -197,7 +198,7 @@ static const struct block_device_operations viocd_fops = {
 	.open =			viocd_blk_open,
 	.release =		viocd_blk_release,
 	.ioctl =		viocd_blk_ioctl,
-	.media_changed =	viocd_blk_media_changed,
+	.check_events =		viocd_blk_check_events,
 };
 
 static int viocd_open(struct cdrom_device_info *cdi, int purpose)
@@ -320,7 +321,8 @@ static void do_viocd_request(struct request_queue *q)
 	}
 }
 
-static int viocd_media_changed(struct cdrom_device_info *cdi, int disc_nr)
+static unsigned int viocd_check_events(struct cdrom_device_info *cdi,
+				       unsigned int clearing, int disc_nr)
 {
 	struct viocd_waitevent we;
 	HvLpEvent_Rc hvrc;
@@ -340,7 +342,7 @@ static int viocd_media_changed(struct cdrom_device_info *cdi, int disc_nr)
 	if (hvrc != 0) {
 		pr_warning("bad rc on HvCallEvent_signalLpEventFast %d\n",
 			   (int)hvrc);
-		return -EIO;
+		return 0;
 	}
 
 	wait_for_completion(&we.com);
@@ -354,7 +356,7 @@ static int viocd_media_changed(struct cdrom_device_info *cdi, int disc_nr)
 		return 0;
 	}
 
-	return we.changed;
+	return we.changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int viocd_lock_door(struct cdrom_device_info *cdi, int locking)
@@ -550,7 +552,7 @@ static int viocd_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
 static struct cdrom_device_ops viocd_dops = {
 	.open = viocd_open,
 	.release = viocd_release,
-	.media_changed = viocd_media_changed,
+	.check_events = viocd_check_events,
 	.lock_door = viocd_lock_door,
 	.generic_packet = viocd_packet,
 	.audio_ioctl = viocd_audio_ioctl,
@@ -624,6 +626,7 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	gendisk->queue = q;
 	gendisk->fops = &viocd_fops;
 	gendisk->flags = GENHD_FL_CD|GENHD_FL_REMOVABLE;
+	gendisk->events = DISK_EVENT_MEDIA_CHANGE;
 	set_capacity(gendisk, 0);
 	gendisk->private_data = d;
 	d->viocd_disk = gendisk;

From b1b56b93f331bd61492fdb99e7986f7a528ca730 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 24/65] paride: Convert to bdops->check_events()

Convert paride drivers from ->media_changed() to ->check_events().

pcd and pd buffer and clear events after reporting; however, pf
unconditionally reports MEDIA_CHANGE and will generate spurious events
when polled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Tim Waugh <tim@cyberelk.net>
---
 drivers/block/paride/pcd.c | 18 +++++++++++-------
 drivers/block/paride/pd.c  |  7 ++++---
 drivers/block/paride/pf.c  | 10 ++++++----
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 62cec6afd7ad..2f2ccf686251 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -172,7 +172,8 @@ module_param_array(drive3, int, NULL, 0);
 static int pcd_open(struct cdrom_device_info *cdi, int purpose);
 static void pcd_release(struct cdrom_device_info *cdi);
 static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr);
-static int pcd_media_changed(struct cdrom_device_info *cdi, int slot_nr);
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+				     unsigned int clearing, int slot_nr);
 static int pcd_tray_move(struct cdrom_device_info *cdi, int position);
 static int pcd_lock_door(struct cdrom_device_info *cdi, int lock);
 static int pcd_drive_reset(struct cdrom_device_info *cdi);
@@ -257,10 +258,11 @@ static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
 	return ret;
 }
 
-static int pcd_block_media_changed(struct gendisk *disk)
+static unsigned int pcd_block_check_events(struct gendisk *disk,
+					   unsigned int clearing)
 {
 	struct pcd_unit *cd = disk->private_data;
-	return cdrom_media_changed(&cd->info);
+	return cdrom_check_events(&cd->info, clearing);
 }
 
 static const struct block_device_operations pcd_bdops = {
@@ -268,14 +270,14 @@ static const struct block_device_operations pcd_bdops = {
 	.open		= pcd_block_open,
 	.release	= pcd_block_release,
 	.ioctl		= pcd_block_ioctl,
-	.media_changed	= pcd_block_media_changed,
+	.check_events	= pcd_block_check_events,
 };
 
 static struct cdrom_device_ops pcd_dops = {
 	.open		= pcd_open,
 	.release	= pcd_release,
 	.drive_status	= pcd_drive_status,
-	.media_changed	= pcd_media_changed,
+	.check_events	= pcd_check_events,
 	.tray_move	= pcd_tray_move,
 	.lock_door	= pcd_lock_door,
 	.get_mcn	= pcd_get_mcn,
@@ -318,6 +320,7 @@ static void pcd_init_units(void)
 		disk->first_minor = unit;
 		strcpy(disk->disk_name, cd->name);	/* umm... */
 		disk->fops = &pcd_bdops;
+		disk->events = DISK_EVENT_MEDIA_CHANGE;
 	}
 }
 
@@ -502,13 +505,14 @@ static int pcd_packet(struct cdrom_device_info *cdi, struct packet_command *cgc)
 
 #define DBMSG(msg)	((verbose>1)?(msg):NULL)
 
-static int pcd_media_changed(struct cdrom_device_info *cdi, int slot_nr)
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+				     unsigned int clearing, int slot_nr)
 {
 	struct pcd_unit *cd = cdi->handle;
 	int res = cd->changed;
 	if (res)
 		cd->changed = 0;
-	return res;
+	return res ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int pcd_lock_door(struct cdrom_device_info *cdi, int lock)
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index c0ee1558b9bb..21dfdb776869 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -794,7 +794,7 @@ static int pd_release(struct gendisk *p, fmode_t mode)
 	return 0;
 }
 
-static int pd_check_media(struct gendisk *p)
+static unsigned int pd_check_events(struct gendisk *p, unsigned int clearing)
 {
 	struct pd_unit *disk = p->private_data;
 	int r;
@@ -803,7 +803,7 @@ static int pd_check_media(struct gendisk *p)
 	pd_special_command(disk, pd_media_check);
 	r = disk->changed;
 	disk->changed = 0;
-	return r;
+	return r ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int pd_revalidate(struct gendisk *p)
@@ -822,7 +822,7 @@ static const struct block_device_operations pd_fops = {
 	.release	= pd_release,
 	.ioctl		= pd_ioctl,
 	.getgeo		= pd_getgeo,
-	.media_changed	= pd_check_media,
+	.check_events	= pd_check_events,
 	.revalidate_disk= pd_revalidate
 };
 
@@ -837,6 +837,7 @@ static void pd_probe_drive(struct pd_unit *disk)
 	p->fops = &pd_fops;
 	p->major = major;
 	p->first_minor = (disk - pd) << PD_BITS;
+	p->events = DISK_EVENT_MEDIA_CHANGE;
 	disk->gd = p;
 	p->private_data = disk;
 	p->queue = pd_queue;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 635f25dd9e10..7adeb1edbf43 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -243,7 +243,8 @@ static struct pf_unit units[PF_UNITS];
 static int pf_identify(struct pf_unit *pf);
 static void pf_lock(struct pf_unit *pf, int func);
 static void pf_eject(struct pf_unit *pf);
-static int pf_check_media(struct gendisk *disk);
+static unsigned int pf_check_events(struct gendisk *disk,
+				    unsigned int clearing);
 
 static char pf_scratch[512];	/* scratch block buffer */
 
@@ -270,7 +271,7 @@ static const struct block_device_operations pf_fops = {
 	.release	= pf_release,
 	.ioctl		= pf_ioctl,
 	.getgeo		= pf_getgeo,
-	.media_changed	= pf_check_media,
+	.check_events	= pf_check_events,
 };
 
 static void __init pf_init_units(void)
@@ -293,6 +294,7 @@ static void __init pf_init_units(void)
 		disk->first_minor = unit;
 		strcpy(disk->disk_name, pf->name);
 		disk->fops = &pf_fops;
+		disk->events = DISK_EVENT_MEDIA_CHANGE;
 		if (!(*drives[unit])[D_PRT])
 			pf_drive_count++;
 	}
@@ -377,9 +379,9 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
 
 }
 
-static int pf_check_media(struct gendisk *disk)
+static unsigned int pf_check_events(struct gendisk *disk, unsigned int clearing)
 {
-	return 1;
+	return DISK_EVENT_MEDIA_CHANGE;
 }
 
 static inline int status_reg(struct pf_unit *pf)

From 507daea2276677785ba8096d90b9e8049632fc46 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 25/65] dac960: Convert to bdops->check_events()

Convert from ->media_changed() to ->check_events().

DAC960 media change notification seems to be one way (once set, never
cleared) and will generate spurious events when polled once the
condition triggers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 drivers/block/DAC960.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 1f286ab461d3..79882104e431 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -140,13 +140,14 @@ static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int DAC960_media_changed(struct gendisk *disk)
+static unsigned int DAC960_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	DAC960_Controller_T *p = disk->queue->queuedata;
 	int drive_nr = (long)disk->private_data;
 
 	if (!p->LogicalDriveInitiallyAccessible[drive_nr])
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	return 0;
 }
 
@@ -163,7 +164,7 @@ static const struct block_device_operations DAC960_BlockDeviceOperations = {
 	.owner			= THIS_MODULE,
 	.open			= DAC960_open,
 	.getgeo			= DAC960_getgeo,
-	.media_changed		= DAC960_media_changed,
+	.check_events		= DAC960_check_events,
 	.revalidate_disk	= DAC960_revalidate_disk,
 };
 
@@ -2546,6 +2547,7 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller)
 	disk->major = MajorNumber;
 	disk->first_minor = n << DAC960_MaxPartitionsBits;
 	disk->fops = &DAC960_BlockDeviceOperations;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
    }
   /*
     Indicate the Block Device Registration completed successfully,

From 4bbde77787270e17418dd32c7eb32e42ad16cfc7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 26/65] swim[3]: Convert to bdops->check_events()

Convert from ->media_changed() to ->check_events().

Both swim and swim3 buffer media changed state and clear it on
revalidation.  They will behave correctly with kernel event polling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Laurent Vivier <laurent@lvivier.info>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 drivers/block/swim.c  |  8 +++++---
 drivers/block/swim3.c | 11 +++++++----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 75333d0a3327..24a482f2fbd6 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -741,11 +741,12 @@ static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-static int floppy_check_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct floppy_state *fs = disk->private_data;
 
-	return fs->ejected;
+	return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int floppy_revalidate(struct gendisk *disk)
@@ -772,7 +773,7 @@ static const struct block_device_operations floppy_fops = {
 	.release	 = floppy_release,
 	.ioctl		 = floppy_ioctl,
 	.getgeo		 = floppy_getgeo,
-	.media_changed	 = floppy_check_change,
+	.check_events	 = floppy_check_events,
 	.revalidate_disk = floppy_revalidate,
 };
 
@@ -857,6 +858,7 @@ static int __devinit swim_floppy_init(struct swim_priv *swd)
 		swd->unit[drive].disk->first_minor = drive;
 		sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive);
 		swd->unit[drive].disk->fops = &floppy_fops;
+		swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE;
 		swd->unit[drive].disk->private_data = &swd->unit[drive];
 		swd->unit[drive].disk->queue = swd->queue;
 		set_capacity(swd->unit[drive].disk, 2880);
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index bf3a5b859299..4c10f56facbf 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -250,7 +250,8 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long param);
 static int floppy_open(struct block_device *bdev, fmode_t mode);
 static int floppy_release(struct gendisk *disk, fmode_t mode);
-static int floppy_check_change(struct gendisk *disk);
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing);
 static int floppy_revalidate(struct gendisk *disk);
 
 static bool swim3_end_request(int err, unsigned int nr_bytes)
@@ -975,10 +976,11 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-static int floppy_check_change(struct gendisk *disk)
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
 {
 	struct floppy_state *fs = disk->private_data;
-	return fs->ejected;
+	return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int floppy_revalidate(struct gendisk *disk)
@@ -1025,7 +1027,7 @@ static const struct block_device_operations floppy_fops = {
 	.open		= floppy_unlocked_open,
 	.release	= floppy_release,
 	.ioctl		= floppy_ioctl,
-	.media_changed	= floppy_check_change,
+	.check_events	= floppy_check_events,
 	.revalidate_disk= floppy_revalidate,
 };
 
@@ -1161,6 +1163,7 @@ static int __devinit swim3_attach(struct macio_dev *mdev, const struct of_device
 	disk->major = FLOPPY_MAJOR;
 	disk->first_minor = i;
 	disk->fops = &floppy_fops;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
 	disk->private_data = &floppy_states[i];
 	disk->queue = swim3_queue;
 	disk->flags |= GENHD_FL_REMOVABLE;

From aaa7c01546d3dc944f5758f671284177131ccce3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 27/65] ub: Convert to bdops->check_events()

Convert from ->media_changed() to ->check_events().

ub buffers media changed state and clears it on revalidation.  It will
behave correctly with kernel event polling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Pete Zaitcev <zaitcev@redhat.com>
---
 drivers/block/ub.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 9ae3bb713286..68b9430c7cfe 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -1788,7 +1788,8 @@ static int ub_bd_revalidate(struct gendisk *disk)
  *
  * The return code is bool!
  */
-static int ub_bd_media_changed(struct gendisk *disk)
+static unsigned int ub_bd_check_events(struct gendisk *disk,
+				       unsigned int clearing)
 {
 	struct ub_lun *lun = disk->private_data;
 
@@ -1806,10 +1807,10 @@ static int ub_bd_media_changed(struct gendisk *disk)
 	 */
 	if (ub_sync_tur(lun->udev, lun) != 0) {
 		lun->changed = 1;
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 
-	return lun->changed;
+	return lun->changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static const struct block_device_operations ub_bd_fops = {
@@ -1817,7 +1818,7 @@ static const struct block_device_operations ub_bd_fops = {
 	.open		= ub_bd_unlocked_open,
 	.release	= ub_bd_release,
 	.ioctl		= ub_bd_ioctl,
-	.media_changed	= ub_bd_media_changed,
+	.check_events	= ub_bd_check_events,
 	.revalidate_disk = ub_bd_revalidate,
 };
 
@@ -2333,6 +2334,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum)
 	disk->major = UB_MAJOR;
 	disk->first_minor = lun->id * UB_PARTS_PER_LUN;
 	disk->fops = &ub_bd_fops;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
 	disk->private_data = lun;
 	disk->driverfs_dev = &sc->intf->dev;
 

From 3a200911ada2d5b955c2c6daa22364c6de83c52c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 28/65] xsysace: Convert to bdops->check_events()

Convert from ->media_changed() to ->check_events().

xsysace buffers media changed state and clears it on revalidation.  It
will behave correctly with kernel event polling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 drivers/block/xsysace.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 829161edae53..16a2fa96b810 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -867,12 +867,12 @@ static void ace_request(struct request_queue * q)
 	}
 }
 
-static int ace_media_changed(struct gendisk *gd)
+static unsigned int ace_check_events(struct gendisk *gd, unsigned int clearing)
 {
 	struct ace_device *ace = gd->private_data;
-	dev_dbg(ace->dev, "ace_media_changed(): %i\n", ace->media_change);
+	dev_dbg(ace->dev, "ace_check_events(): %i\n", ace->media_change);
 
-	return ace->media_change;
+	return ace->media_change ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 static int ace_revalidate_disk(struct gendisk *gd)
@@ -953,7 +953,7 @@ static const struct block_device_operations ace_fops = {
 	.owner = THIS_MODULE,
 	.open = ace_open,
 	.release = ace_release,
-	.media_changed = ace_media_changed,
+	.check_events = ace_check_events,
 	.revalidate_disk = ace_revalidate_disk,
 	.getgeo = ace_getgeo,
 };
@@ -1005,6 +1005,7 @@ static int __devinit ace_setup(struct ace_device *ace)
 	ace->gd->major = ace_major;
 	ace->gd->first_minor = ace->id * ACE_NUM_MINORS;
 	ace->gd->fops = &ace_fops;
+	ace->gd->events = DISK_EVENT_MEDIA_CHANGE;
 	ace->gd->queue = ace->queue;
 	ace->gd->private_data = ace;
 	snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a');

From f47350fdecb7ce9fe715e2df4431c4b51c4ef46b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 29/65] i2o_block: Convert to bdops->check_events()

Convert from ->media_changed() to ->check_events().

i2o_block buffers media changed state and clears it after reporting.
It will behave correctly with kernel event polling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
---
 drivers/message/i2o/i2o_block.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index ae7cad185898..f81c25d4a125 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -695,20 +695,22 @@ static int i2o_block_ioctl(struct block_device *bdev, fmode_t mode,
 };
 
 /**
- *	i2o_block_media_changed - Have we seen a media change?
+ *	i2o_block_check_events - Have we seen a media change?
  *	@disk: gendisk which should be verified
+ *	@clearing: events being cleared
  *
  *	Verifies if the media has changed.
  *
  *	Returns 1 if the media was changed or 0 otherwise.
  */
-static int i2o_block_media_changed(struct gendisk *disk)
+static unsigned int i2o_block_check_events(struct gendisk *disk,
+					   unsigned int clearing)
 {
 	struct i2o_block_device *p = disk->private_data;
 
 	if (p->media_change_flag) {
 		p->media_change_flag = 0;
-		return 1;
+		return DISK_EVENT_MEDIA_CHANGE;
 	}
 	return 0;
 }
@@ -950,7 +952,7 @@ static const struct block_device_operations i2o_block_fops = {
 	.ioctl = i2o_block_ioctl,
 	.compat_ioctl = i2o_block_ioctl,
 	.getgeo = i2o_block_getgeo,
-	.media_changed = i2o_block_media_changed
+	.check_events = i2o_block_check_events,
 };
 
 /**
@@ -1002,6 +1004,7 @@ static struct i2o_block_device *i2o_block_device_alloc(void)
 	gd->major = I2O_MAJOR;
 	gd->queue = queue;
 	gd->fops = &i2o_block_fops;
+	gd->events = DISK_EVENT_MEDIA_CHANGE;
 	gd->private_data = dev;
 
 	dev->gd = gd;

From ffe80cea354519d846767b8165598c2903706c2a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 30/65] s390/tape_block: Convert to bdops->check_events()

Convert from ->media_changed() to ->check_events().

s390/tape_block buffers media changed state and clears it on
revalidation.  It will behave correctly with kernel event polling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 drivers/s390/char/tape_block.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 55d2d0f4eabc..ad8431636012 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -48,14 +48,14 @@
 static DEFINE_MUTEX(tape_block_mutex);
 static int tapeblock_open(struct block_device *, fmode_t);
 static int tapeblock_release(struct gendisk *, fmode_t);
-static int tapeblock_medium_changed(struct gendisk *);
+static unsigned int tapeblock_check_events(struct gendisk *, unsigned int);
 static int tapeblock_revalidate_disk(struct gendisk *);
 
 static const struct block_device_operations tapeblock_fops = {
 	.owner		 = THIS_MODULE,
 	.open		 = tapeblock_open,
 	.release	 = tapeblock_release,
-	.media_changed   = tapeblock_medium_changed,
+	.check_events	 = tapeblock_check_events,
 	.revalidate_disk = tapeblock_revalidate_disk,
 };
 
@@ -237,6 +237,7 @@ tapeblock_setup_device(struct tape_device * device)
 	disk->major = tapeblock_major;
 	disk->first_minor = device->first_minor;
 	disk->fops = &tapeblock_fops;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
 	disk->private_data = tape_get_device(device);
 	disk->queue = blkdat->request_queue;
 	set_capacity(disk, 0);
@@ -340,8 +341,8 @@ tapeblock_revalidate_disk(struct gendisk *disk)
 	return 0;
 }
 
-static int
-tapeblock_medium_changed(struct gendisk *disk)
+static unsigned int
+tapeblock_check_events(struct gendisk *disk, unsigned int clearing)
 {
 	struct tape_device *device;
 
@@ -349,7 +350,7 @@ tapeblock_medium_changed(struct gendisk *disk)
 	DBF_LH(6, "tapeblock_medium_changed(%p) = %d\n",
 		device, device->blk_data.medium_changed);
 
-	return device->blk_data.medium_changed;
+	return device->blk_data.medium_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
 /*

From 6fac80e3aa9e5bfc6cadb9ea8d0584c557c4b198 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 31/65] umem: Drop dummy ->media_changed()

umem doesn't implement media changed detection and there's no need to
implement dummy callback anymore.  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 drivers/block/umem.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 8be57151f5d6..9b102abe209c 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -779,20 +779,10 @@ static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return 0;
 }
 
-/*
- * Future support for removable devices
- */
-static int mm_check_change(struct gendisk *disk)
-{
-/*  struct cardinfo *dev = disk->private_data; */
-	return 0;
-}
-
 static const struct block_device_operations mm_fops = {
 	.owner		= THIS_MODULE,
 	.getgeo		= mm_getgeo,
 	.revalidate_disk = mm_revalidate,
-	.media_changed	= mm_check_change,
 };
 
 static int __devinit mm_pci_probe(struct pci_dev *dev,

From 3c0d206092f50be82523dba2b0e0a02de9e6aad3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:28 +0100
Subject: [PATCH 32/65] pktcdvd: Convert to bdops->check_events()

Convert from ->media_changed() to ->check_events().

pktcdvd needs to forward all event related operations to the
underlying device.  Forward ->check_events() instead of
->media_changed() and inherit disk->[async_]events.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Peter Osterlund <petero2@telia.com>
---
 drivers/block/pktcdvd.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 77d70eebb6b2..a077db27b7c9 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2796,7 +2796,8 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 	return ret;
 }
 
-static int pkt_media_changed(struct gendisk *disk)
+static unsigned int pkt_check_events(struct gendisk *disk,
+				     unsigned int clearing)
 {
 	struct pktcdvd_device *pd = disk->private_data;
 	struct gendisk *attached_disk;
@@ -2806,9 +2807,9 @@ static int pkt_media_changed(struct gendisk *disk)
 	if (!pd->bdev)
 		return 0;
 	attached_disk = pd->bdev->bd_disk;
-	if (!attached_disk)
+	if (!attached_disk || !attached_disk->fops->check_events)
 		return 0;
-	return attached_disk->fops->media_changed(attached_disk);
+	return attached_disk->fops->check_events(attached_disk, clearing);
 }
 
 static const struct block_device_operations pktcdvd_ops = {
@@ -2816,7 +2817,7 @@ static const struct block_device_operations pktcdvd_ops = {
 	.open =			pkt_open,
 	.release =		pkt_close,
 	.ioctl =		pkt_ioctl,
-	.media_changed =	pkt_media_changed,
+	.check_events =		pkt_check_events,
 };
 
 static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode)
@@ -2889,6 +2890,10 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	if (ret)
 		goto out_new_dev;
 
+	/* inherit events of the host device */
+	disk->events = pd->bdev->bd_disk->events;
+	disk->async_events = pd->bdev->bd_disk->async_events;
+
 	add_disk(disk);
 
 	pkt_sysfs_dev_new(pd);

From cafb0bfca1a73efd6d8a4a6a6a716e6134b96c24 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:29 +0100
Subject: [PATCH 33/65] staging: Convert to bdops->check_events()

Convert two staging drivers - blkvsc_drv and cyasblkdev_block - from
->media_changed() to ->check_events().  The former always indicated
media changed while the latter always indicated media not changed.
Not sure what the drivers are trying to achieve but keep the original
behavior.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 drivers/staging/hv/blkvsc_drv.c                       | 11 +++++++----
 .../westbridge/astoria/block/cyasblkdev_block.c       | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/staging/hv/blkvsc_drv.c b/drivers/staging/hv/blkvsc_drv.c
index 4fb809485d9e..1ecd7e17f571 100644
--- a/drivers/staging/hv/blkvsc_drv.c
+++ b/drivers/staging/hv/blkvsc_drv.c
@@ -131,7 +131,8 @@ static void blkvsc_shutdown(struct device *device);
 
 static int blkvsc_open(struct block_device *bdev,  fmode_t mode);
 static int blkvsc_release(struct gendisk *disk, fmode_t mode);
-static int blkvsc_media_changed(struct gendisk *gd);
+static unsigned int blkvsc_check_events(struct gendisk *gd,
+					unsigned int clearing);
 static int blkvsc_revalidate_disk(struct gendisk *gd);
 static int blkvsc_getgeo(struct block_device *bd, struct hd_geometry *hg);
 static int blkvsc_ioctl(struct block_device *bd, fmode_t mode,
@@ -162,7 +163,7 @@ static const struct block_device_operations block_ops = {
 	.owner = THIS_MODULE,
 	.open = blkvsc_open,
 	.release = blkvsc_release,
-	.media_changed = blkvsc_media_changed,
+	.check_events = blkvsc_check_events,
 	.revalidate_disk = blkvsc_revalidate_disk,
 	.getgeo = blkvsc_getgeo,
 	.ioctl  = blkvsc_ioctl,
@@ -367,6 +368,7 @@ static int blkvsc_probe(struct device *device)
 	else
 		blkdev->gd->first_minor = 0;
 	blkdev->gd->fops = &block_ops;
+	blkdev->gd->events = DISK_EVENT_MEDIA_CHANGE;
 	blkdev->gd->private_data = blkdev;
 	blkdev->gd->driverfs_dev = &(blkdev->device_ctx->device);
 	sprintf(blkdev->gd->disk_name, "hd%c", 'a' + devnum);
@@ -1352,10 +1354,11 @@ static int blkvsc_release(struct gendisk *disk, fmode_t mode)
 	return 0;
 }
 
-static int blkvsc_media_changed(struct gendisk *gd)
+static unsigned int blkvsc_check_events(struct gendisk *gd,
+					unsigned int clearing)
 {
 	DPRINT_DBG(BLKVSC_DRV, "- enter\n");
-	return 1;
+	return DISK_EVENT_MEDIA_CHANGE;
 }
 
 static int blkvsc_revalidate_disk(struct gendisk *gd)
diff --git a/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c b/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c
index e1851f00be56..842cd9214a5e 100644
--- a/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c
+++ b/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c
@@ -381,10 +381,10 @@ static int cyasblkdev_blk_ioctl(
 	return -ENOTTY;
 }
 
-/* Media_changed block_device opp
+/* check_events block_device opp
  * this one is called by kernel to confirm if the media really changed
  * as we indicated by issuing check_disk_change() call */
-int cyasblkdev_media_changed(struct gendisk *gd)
+unsigned int cyasblkdev_check_events(struct gendisk *gd, unsigned int clearing)
 {
 	struct cyasblkdev_blk_data *bd;
 
@@ -402,7 +402,7 @@ int cyasblkdev_media_changed(struct gendisk *gd)
 		#endif
 	}
 
-	/* return media change state "1" yes, 0 no */
+	/* return media change state - DISK_EVENT_MEDIA_CHANGE yes, 0 no */
 	return 0;
 }
 
@@ -432,7 +432,7 @@ static struct block_device_operations cyasblkdev_bdops = {
 	.ioctl			= cyasblkdev_blk_ioctl,
 	/* .getgeo		= cyasblkdev_blk_getgeo, */
 	/* added to support media removal( real and simulated) media */
-	.media_changed  = cyasblkdev_media_changed,
+	.check_events		= cyasblkdev_check_events,
 	/* added to support media removal( real and simulated) media */
 	.revalidate_disk = cyasblkdev_revalidate_disk,
 	.owner			= THIS_MODULE,
@@ -1090,6 +1090,7 @@ static int cyasblkdev_add_disks(int bus_num,
 		bd->user_disk_0->first_minor = devidx << CYASBLKDEV_SHIFT;
 		bd->user_disk_0->minors = 8;
 		bd->user_disk_0->fops = &cyasblkdev_bdops;
+		bd->user_disk_0->events = DISK_EVENT_MEDIA_CHANGE;
 		bd->user_disk_0->private_data = bd;
 		bd->user_disk_0->queue = bd->queue.queue;
 		bd->dbgprn_flags = DBGPRN_RD_RQ;
@@ -1190,6 +1191,7 @@ static int cyasblkdev_add_disks(int bus_num,
 		bd->user_disk_1->first_minor = (devidx + 1) << CYASBLKDEV_SHIFT;
 		bd->user_disk_1->minors = 8;
 		bd->user_disk_1->fops = &cyasblkdev_bdops;
+		bd->user_disk_0->events = DISK_EVENT_MEDIA_CHANGE;
 		bd->user_disk_1->private_data = bd;
 		bd->user_disk_1->queue = bd->queue.queue;
 		bd->dbgprn_flags = DBGPRN_RD_RQ;
@@ -1278,6 +1280,7 @@ static int cyasblkdev_add_disks(int bus_num,
 				(devidx + 2) << CYASBLKDEV_SHIFT;
 			bd->system_disk->minors = 8;
 			bd->system_disk->fops = &cyasblkdev_bdops;
+			bd->system_disk->events = DISK_EVENT_MEDIA_CHANGE;
 			bd->system_disk->private_data = bd;
 			bd->system_disk->queue = bd->queue.queue;
 			/* don't search for vfat

From 3cca6dc1c81e2407928dc4c6105252146fd3924f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 2 Mar 2011 11:08:00 -0500
Subject: [PATCH 34/65] block: add API for delaying work/request_fn a little
 bit

Currently we use plugging for that, but as plugging is going away,
we need an alternative mechanism.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       | 29 +++++++++++++++++++++++++++++
 include/linux/blkdev.h |  6 ++++++
 2 files changed, 35 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3cc17e6064d6..e958c7a1e462 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -197,6 +197,32 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 }
 EXPORT_SYMBOL(blk_dump_rq_flags);
 
+static void blk_delay_work(struct work_struct *work)
+{
+	struct request_queue *q;
+
+	q = container_of(work, struct request_queue, delay_work.work);
+	spin_lock_irq(q->queue_lock);
+	q->request_fn(q);
+	spin_unlock_irq(q->queue_lock);
+}
+
+/**
+ * blk_delay_queue - restart queueing after defined interval
+ * @q:		The &struct request_queue in question
+ * @msecs:	Delay in msecs
+ *
+ * Description:
+ *   Sometimes queueing needs to be postponed for a little while, to allow
+ *   resources to come back. This function will make sure that queueing is
+ *   restarted around the specified time.
+ */
+void blk_delay_queue(struct request_queue *q, unsigned long msecs)
+{
+	schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs));
+}
+EXPORT_SYMBOL(blk_delay_queue);
+
 /*
  * "plug" the device if there are no outstanding requests: this will
  * force the transfer to start only after we have put all the requests
@@ -363,6 +389,7 @@ EXPORT_SYMBOL(blk_start_queue);
 void blk_stop_queue(struct request_queue *q)
 {
 	blk_remove_plug(q);
+	cancel_delayed_work(&q->delay_work);
 	queue_flag_set(QUEUE_FLAG_STOPPED, q);
 }
 EXPORT_SYMBOL(blk_stop_queue);
@@ -387,6 +414,7 @@ void blk_sync_queue(struct request_queue *q)
 	del_timer_sync(&q->timeout);
 	cancel_work_sync(&q->unplug_work);
 	throtl_shutdown_timer_wq(q);
+	cancel_delayed_work_sync(&q->delay_work);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -534,6 +562,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	INIT_LIST_HEAD(&q->flush_queue[1]);
 	INIT_LIST_HEAD(&q->flush_data_in_flight);
 	INIT_WORK(&q->unplug_work, blk_unplug_work);
+	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e3ee74fc5903..f55b2a8b6610 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -300,6 +300,11 @@ struct request_queue
 	unsigned long		unplug_delay;	/* After this many jiffies */
 	struct work_struct	unplug_work;
 
+	/*
+	 * Delayed queue handling
+	 */
+	struct delayed_work	delay_work;
+
 	struct backing_dev_info	backing_dev_info;
 
 	/*
@@ -677,6 +682,7 @@ extern int blk_insert_cloned_request(struct request_queue *q,
 extern void blk_plug_device(struct request_queue *);
 extern void blk_plug_device_unlocked(struct request_queue *);
 extern int blk_remove_plug(struct request_queue *);
+extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			  unsigned int, void __user *);

From 0a41e90bb7c931fd53d73ea770f5b251af6c91ce Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Fri, 16 Apr 2010 21:11:21 +0200
Subject: [PATCH 35/65] ide-cd: convert to blk_delay_queue() for a short pause

It was always abuse to reuse the plugging infrastructure for this,
convert it to the (new) real API for delaying queueing a bit.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Acked-by: David S. Miller <davem@davemloft.net>
---
 drivers/ide/ide-cd.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 0c73fe39a236..7ce9caf4521b 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -258,17 +258,10 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
 	if (time_after(jiffies, info->write_timeout))
 		return 0;
 	else {
-		struct request_queue *q = drive->queue;
-		unsigned long flags;
-
 		/*
-		 * take a breather relying on the unplug timer to kick us again
+		 * take a breather
 		 */
-
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_plug_device(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-
+		blk_delay_queue(drive->queue, 1);
 		return 1;
 	}
 }
@@ -1514,8 +1507,6 @@ static int ide_cdrom_setup(ide_drive_t *drive)
 	blk_queue_dma_alignment(q, 31);
 	blk_queue_update_dma_pad(q, 15);
 
-	q->unplug_delay = max((1 * HZ) / 1000, 1);
-
 	drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
 	drive->atapi_flags = IDE_AFLAG_NO_EJECT | ide_cd_flags(id);
 

From a488e74976bf0a9bccecdd094378394942dacef1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Fri, 16 Apr 2010 21:13:15 +0200
Subject: [PATCH 36/65] scsi: convert to blk_delay_queue()

It was always abuse to reuse the plugging infrastructure for this,
convert it to the (new) real API for delaying queueing a bit. A
default delay of 3 msec is defined, to match the previous
behaviour.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/scsi/scsi_lib.c | 44 ++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9045c52abd25..5a0ae7a944ce 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -67,6 +67,13 @@ static struct scsi_host_sg_pool scsi_sg_pools[] = {
 
 struct kmem_cache *scsi_sdb_cache;
 
+/*
+ * When to reinvoke queueing after a resource shortage. It's 3 msecs to
+ * not change behaviour from the previous unplug mechanism, experimentation
+ * may prove this needs changing.
+ */
+#define SCSI_QUEUE_DELAY	3
+
 static void scsi_run_queue(struct request_queue *q);
 
 /*
@@ -149,14 +156,7 @@ static int __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
 	/*
 	 * Requeue this command.  It will go before all other commands
 	 * that are already in the queue.
-	 *
-	 * NOTE: there is magic here about the way the queue is plugged if
-	 * we have no outstanding commands.
-	 * 
-	 * Although we *don't* plug the queue, we call the request
-	 * function.  The SCSI request function detects the blocked condition
-	 * and plugs the queue appropriately.
-         */
+	 */
 	spin_lock_irqsave(q->queue_lock, flags);
 	blk_requeue_request(q, cmd->request);
 	spin_unlock_irqrestore(q->queue_lock, flags);
@@ -1194,11 +1194,11 @@ int scsi_prep_return(struct request_queue *q, struct request *req, int ret)
 	case BLKPREP_DEFER:
 		/*
 		 * If we defer, the blk_peek_request() returns NULL, but the
-		 * queue must be restarted, so we plug here if no returning
-		 * command will automatically do that.
+		 * queue must be restarted, so we schedule a callback to happen
+		 * shortly.
 		 */
 		if (sdev->device_busy == 0)
-			blk_plug_device(q);
+			blk_delay_queue(q, SCSI_QUEUE_DELAY);
 		break;
 	default:
 		req->cmd_flags |= REQ_DONTPREP;
@@ -1237,7 +1237,7 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
 				   sdev_printk(KERN_INFO, sdev,
 				   "unblocking device at zero depth\n"));
 		} else {
-			blk_plug_device(q);
+			blk_delay_queue(q, SCSI_QUEUE_DELAY);
 			return 0;
 		}
 	}
@@ -1467,7 +1467,7 @@ static void scsi_request_fn(struct request_queue *q)
 	 * the host is no longer able to accept any more requests.
 	 */
 	shost = sdev->host;
-	while (!blk_queue_plugged(q)) {
+	for (;;) {
 		int rtn;
 		/*
 		 * get next queueable request.  We do this early to make sure
@@ -1546,15 +1546,8 @@ static void scsi_request_fn(struct request_queue *q)
 		 */
 		rtn = scsi_dispatch_cmd(cmd);
 		spin_lock_irq(q->queue_lock);
-		if(rtn) {
-			/* we're refusing the command; because of
-			 * the way locks get dropped, we need to 
-			 * check here if plugging is required */
-			if(sdev->device_busy == 0)
-				blk_plug_device(q);
-
-			break;
-		}
+		if (rtn)
+			goto out_delay;
 	}
 
 	goto out;
@@ -1573,9 +1566,10 @@ static void scsi_request_fn(struct request_queue *q)
 	spin_lock_irq(q->queue_lock);
 	blk_requeue_request(q, req);
 	sdev->device_busy--;
-	if(sdev->device_busy == 0)
-		blk_plug_device(q);
- out:
+out_delay:
+	if (sdev->device_busy == 0)
+		blk_delay_queue(q, SCSI_QUEUE_DELAY);
+out:
 	/* must be careful here...if we trigger the ->remove() function
 	 * we cannot be holding the q lock */
 	spin_unlock_irq(q->queue_lock);

From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Tue, 8 Mar 2011 13:19:51 +0100
Subject: [PATCH 37/65] block: initial patch for on-stack per-task plugging

This patch adds support for creating a queuing context outside
of the queue itself. This enables us to batch up pieces of IO
before grabbing the block device queue lock and submitting them to
the IO scheduler.

The context is created on the stack of the process and assigned in
the task structure, so that we can auto-unplug it if we hit a schedule
event.

The current queue plugging happens implicitly if IO is submitted to
an empty device, yet callers have to remember to unplug that IO when
they are going to wait for it. This is an ugly API and has caused bugs
in the past. Additionally, it requires hacks in the vm (->sync_page()
callback) to handle that logic. By switching to an explicit plugging
scheme we make the API a lot nicer and can get rid of the ->sync_page()
hack in the vm.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c          | 369 ++++++++++++++++++++++++++++----------
 block/blk-flush.c         |   3 +-
 block/elevator.c          |   6 +-
 include/linux/blk_types.h |   2 +
 include/linux/blkdev.h    |  42 +++++
 include/linux/elevator.h  |   1 +
 include/linux/sched.h     |   6 +
 kernel/exit.c             |   1 +
 kernel/fork.c             |   3 +
 kernel/sched.c            |  12 ++
 10 files changed, 344 insertions(+), 101 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index e958c7a1e462..6efb55cc5af0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/fault-inject.h>
+#include <linux/list_sort.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -203,7 +204,7 @@ static void blk_delay_work(struct work_struct *work)
 
 	q = container_of(work, struct request_queue, delay_work.work);
 	spin_lock_irq(q->queue_lock);
-	q->request_fn(q);
+	__blk_run_queue(q);
 	spin_unlock_irq(q->queue_lock);
 }
 
@@ -686,6 +687,8 @@ int blk_get_queue(struct request_queue *q)
 
 static inline void blk_free_request(struct request_queue *q, struct request *rq)
 {
+	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
 	if (rq->cmd_flags & REQ_ELVPRIV)
 		elv_put_request(q, rq);
 	mempool_free(rq, q->rq.rq_pool);
@@ -1051,6 +1054,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL(blk_requeue_request);
 
+static void add_acct_request(struct request_queue *q, struct request *rq,
+			     int where)
+{
+	drive_stat_acct(rq, 1);
+	__elv_add_request(q, rq, where, 0);
+}
+
 /**
  * blk_insert_request - insert a special request into a request queue
  * @q:		request queue where request should be inserted
@@ -1093,8 +1103,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
-	drive_stat_acct(rq, 1);
-	__elv_add_request(q, rq, where, 0);
+	add_acct_request(q, rq, where);
 	__blk_run_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
@@ -1215,6 +1224,113 @@ void blk_add_request_payload(struct request *rq, struct page *page,
 }
 EXPORT_SYMBOL_GPL(blk_add_request_payload);
 
+static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+				   struct bio *bio)
+{
+	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+
+	/*
+	 * Debug stuff, kill later
+	 */
+	if (!rq_mergeable(req)) {
+		blk_dump_rq_flags(req, "back");
+		return false;
+	}
+
+	if (!ll_back_merge_fn(q, req, bio))
+		return false;
+
+	trace_block_bio_backmerge(q, bio);
+
+	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+		blk_rq_set_mixed_merge(req);
+
+	req->biotail->bi_next = bio;
+	req->biotail = bio;
+	req->__data_len += bio->bi_size;
+	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+	drive_stat_acct(req, 0);
+	return true;
+}
+
+static bool bio_attempt_front_merge(struct request_queue *q,
+				    struct request *req, struct bio *bio)
+{
+	const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+	sector_t sector;
+
+	/*
+	 * Debug stuff, kill later
+	 */
+	if (!rq_mergeable(req)) {
+		blk_dump_rq_flags(req, "front");
+		return false;
+	}
+
+	if (!ll_front_merge_fn(q, req, bio))
+		return false;
+
+	trace_block_bio_frontmerge(q, bio);
+
+	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+		blk_rq_set_mixed_merge(req);
+
+	sector = bio->bi_sector;
+
+	bio->bi_next = req->bio;
+	req->bio = bio;
+
+	/*
+	 * may not be valid. if the low level driver said
+	 * it didn't need a bounce buffer then it better
+	 * not touch req->buffer either...
+	 */
+	req->buffer = bio_data(bio);
+	req->__sector = bio->bi_sector;
+	req->__data_len += bio->bi_size;
+	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+	drive_stat_acct(req, 0);
+	return true;
+}
+
+/*
+ * Attempts to merge with the plugged list in the current process. Returns
+ * true if merge was succesful, otherwise false.
+ */
+static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
+			       struct bio *bio)
+{
+	struct blk_plug *plug;
+	struct request *rq;
+	bool ret = false;
+
+	plug = tsk->plug;
+	if (!plug)
+		goto out;
+
+	list_for_each_entry_reverse(rq, &plug->list, queuelist) {
+		int el_ret;
+
+		if (rq->q != q)
+			continue;
+
+		el_ret = elv_try_merge(rq, bio);
+		if (el_ret == ELEVATOR_BACK_MERGE) {
+			ret = bio_attempt_back_merge(q, rq, bio);
+			if (ret)
+				break;
+		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
+			ret = bio_attempt_front_merge(q, rq, bio);
+			if (ret)
+				break;
+		}
+	}
+out:
+	return ret;
+}
+
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
 	req->cpu = bio->bi_comp_cpu;
@@ -1230,26 +1346,12 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-/*
- * Only disabling plugging for non-rotational devices if it does tagging
- * as well, otherwise we do need the proper merging
- */
-static inline bool queue_should_plug(struct request_queue *q)
-{
-	return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
-}
-
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
-	struct request *req;
-	int el_ret;
-	unsigned int bytes = bio->bi_size;
-	const unsigned short prio = bio_prio(bio);
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
-	const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
-	const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
-	int where = ELEVATOR_INSERT_SORT;
-	int rw_flags;
+	struct blk_plug *plug;
+	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
+	struct request *req;
 
 	/*
 	 * low level driver can indicate that it wants pages above a
@@ -1258,78 +1360,36 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	 */
 	blk_queue_bounce(q, &bio);
 
-	spin_lock_irq(q->queue_lock);
-
 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+		spin_lock_irq(q->queue_lock);
 		where = ELEVATOR_INSERT_FLUSH;
 		goto get_rq;
 	}
 
-	if (elv_queue_empty(q))
-		goto get_rq;
+	/*
+	 * Check if we can merge with the plugged list before grabbing
+	 * any locks.
+	 */
+	if (attempt_plug_merge(current, q, bio))
+		goto out;
+
+	spin_lock_irq(q->queue_lock);
 
 	el_ret = elv_merge(q, &req, bio);
-	switch (el_ret) {
-	case ELEVATOR_BACK_MERGE:
-		BUG_ON(!rq_mergeable(req));
-
-		if (!ll_back_merge_fn(q, req, bio))
-			break;
-
-		trace_block_bio_backmerge(q, bio);
-
-		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
-			blk_rq_set_mixed_merge(req);
-
-		req->biotail->bi_next = bio;
-		req->biotail = bio;
-		req->__data_len += bytes;
-		req->ioprio = ioprio_best(req->ioprio, prio);
-		if (!blk_rq_cpu_valid(req))
-			req->cpu = bio->bi_comp_cpu;
-		drive_stat_acct(req, 0);
-		elv_bio_merged(q, req, bio);
-		if (!attempt_back_merge(q, req))
-			elv_merged_request(q, req, el_ret);
-		goto out;
-
-	case ELEVATOR_FRONT_MERGE:
-		BUG_ON(!rq_mergeable(req));
-
-		if (!ll_front_merge_fn(q, req, bio))
-			break;
-
-		trace_block_bio_frontmerge(q, bio);
-
-		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
-			blk_rq_set_mixed_merge(req);
-			req->cmd_flags &= ~REQ_FAILFAST_MASK;
-			req->cmd_flags |= ff;
+	if (el_ret == ELEVATOR_BACK_MERGE) {
+		BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+		if (bio_attempt_back_merge(q, req, bio)) {
+			if (!attempt_back_merge(q, req))
+				elv_merged_request(q, req, el_ret);
+			goto out_unlock;
+		}
+	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
+		BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+		if (bio_attempt_front_merge(q, req, bio)) {
+			if (!attempt_front_merge(q, req))
+				elv_merged_request(q, req, el_ret);
+			goto out_unlock;
 		}
-
-		bio->bi_next = req->bio;
-		req->bio = bio;
-
-		/*
-		 * may not be valid. if the low level driver said
-		 * it didn't need a bounce buffer then it better
-		 * not touch req->buffer either...
-		 */
-		req->buffer = bio_data(bio);
-		req->__sector = bio->bi_sector;
-		req->__data_len += bytes;
-		req->ioprio = ioprio_best(req->ioprio, prio);
-		if (!blk_rq_cpu_valid(req))
-			req->cpu = bio->bi_comp_cpu;
-		drive_stat_acct(req, 0);
-		elv_bio_merged(q, req, bio);
-		if (!attempt_front_merge(q, req))
-			elv_merged_request(q, req, el_ret);
-		goto out;
-
-	/* ELV_NO_MERGE: elevator says don't/can't merge. */
-	default:
-		;
 	}
 
 get_rq:
@@ -1356,20 +1416,35 @@ get_rq:
 	 */
 	init_request_from_bio(req, bio);
 
-	spin_lock_irq(q->queue_lock);
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-	    bio_flagged(bio, BIO_CPU_AFFINE))
-		req->cpu = blk_cpu_to_group(smp_processor_id());
-	if (queue_should_plug(q) && elv_queue_empty(q))
-		blk_plug_device(q);
+	    bio_flagged(bio, BIO_CPU_AFFINE)) {
+		req->cpu = blk_cpu_to_group(get_cpu());
+		put_cpu();
+	}
 
-	/* insert the request into the elevator */
-	drive_stat_acct(req, 1);
-	__elv_add_request(q, req, where, 0);
+	plug = current->plug;
+	if (plug && !sync) {
+		if (!plug->should_sort && !list_empty(&plug->list)) {
+			struct request *__rq;
+
+			__rq = list_entry_rq(plug->list.prev);
+			if (__rq->q != q)
+				plug->should_sort = 1;
+		}
+		/*
+		 * Debug flag, kill later
+		 */
+		req->cmd_flags |= REQ_ON_PLUG;
+		list_add_tail(&req->queuelist, &plug->list);
+		drive_stat_acct(req, 1);
+	} else {
+		spin_lock_irq(q->queue_lock);
+		add_acct_request(q, req, where);
+		__blk_run_queue(q);
+out_unlock:
+		spin_unlock_irq(q->queue_lock);
+	}
 out:
-	if (unplug || !queue_should_plug(q))
-		__generic_unplug_device(q);
-	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
 
@@ -1772,9 +1847,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	 */
 	BUG_ON(blk_queued_rq(rq));
 
-	drive_stat_acct(rq, 1);
-	__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
-
+	add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	return 0;
@@ -2659,6 +2732,106 @@ int kblockd_schedule_delayed_work(struct request_queue *q,
 }
 EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 
+#define PLUG_MAGIC	0x91827364
+
+void blk_start_plug(struct blk_plug *plug)
+{
+	struct task_struct *tsk = current;
+
+	plug->magic = PLUG_MAGIC;
+	INIT_LIST_HEAD(&plug->list);
+	plug->should_sort = 0;
+
+	/*
+	 * If this is a nested plug, don't actually assign it. It will be
+	 * flushed on its own.
+	 */
+	if (!tsk->plug) {
+		/*
+		 * Store ordering should not be needed here, since a potential
+		 * preempt will imply a full memory barrier
+		 */
+		tsk->plug = plug;
+	}
+}
+EXPORT_SYMBOL(blk_start_plug);
+
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct request *rqa = container_of(a, struct request, queuelist);
+	struct request *rqb = container_of(b, struct request, queuelist);
+
+	return !(rqa->q == rqb->q);
+}
+
+static void flush_plug_list(struct blk_plug *plug)
+{
+	struct request_queue *q;
+	unsigned long flags;
+	struct request *rq;
+
+	BUG_ON(plug->magic != PLUG_MAGIC);
+
+	if (list_empty(&plug->list))
+		return;
+
+	if (plug->should_sort)
+		list_sort(NULL, &plug->list, plug_rq_cmp);
+
+	q = NULL;
+	local_irq_save(flags);
+	while (!list_empty(&plug->list)) {
+		rq = list_entry_rq(plug->list.next);
+		list_del_init(&rq->queuelist);
+		BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
+		BUG_ON(!rq->q);
+		if (rq->q != q) {
+			if (q) {
+				__blk_run_queue(q);
+				spin_unlock(q->queue_lock);
+			}
+			q = rq->q;
+			spin_lock(q->queue_lock);
+		}
+		rq->cmd_flags &= ~REQ_ON_PLUG;
+
+		/*
+		 * rq is already accounted, so use raw insert
+		 */
+		__elv_add_request(q, rq, ELEVATOR_INSERT_SORT, 0);
+	}
+
+	if (q) {
+		__blk_run_queue(q);
+		spin_unlock(q->queue_lock);
+	}
+
+	BUG_ON(!list_empty(&plug->list));
+	local_irq_restore(flags);
+}
+
+static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+	flush_plug_list(plug);
+
+	if (plug == tsk->plug)
+		tsk->plug = NULL;
+}
+
+void blk_finish_plug(struct blk_plug *plug)
+{
+	if (plug)
+		__blk_finish_plug(current, plug);
+}
+EXPORT_SYMBOL(blk_finish_plug);
+
+void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+	__blk_finish_plug(tsk, plug);
+	tsk->plug = plug;
+}
+EXPORT_SYMBOL(__blk_flush_plug);
+
 int __init blk_dev_init(void)
 {
 	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-flush.c b/block/blk-flush.c
index a867e3f524f3..1e2aa8a8908c 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -264,10 +264,9 @@ static bool blk_kick_flush(struct request_queue *q)
 static void flush_data_end_io(struct request *rq, int error)
 {
 	struct request_queue *q = rq->q;
-	bool was_empty = elv_queue_empty(q);
 
 	/* after populating an empty queue, kick it to avoid stall */
-	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty)
+	if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
 		__blk_run_queue(q);
 }
 
diff --git a/block/elevator.c b/block/elevator.c
index f98e92edc937..25713927c0d3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 }
 EXPORT_SYMBOL(elv_rq_merge_ok);
 
-static inline int elv_try_merge(struct request *__rq, struct bio *bio)
+int elv_try_merge(struct request *__rq, struct bio *bio)
 {
 	int ret = ELEVATOR_NO_MERGE;
 
@@ -421,6 +421,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 	struct list_head *entry;
 	int stop_flags;
 
+	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
 	if (q->last_merge == rq)
 		q->last_merge = NULL;
 
@@ -696,6 +698,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		       int plug)
 {
+	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
 	if (rq->cmd_flags & REQ_SOFTBARRIER) {
 		/* barriers are scheduling boundary, update end_sector */
 		if (rq->cmd_type == REQ_TYPE_FS ||
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dddedfc0af81..16b286473042 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -152,6 +152,7 @@ enum rq_flag_bits {
 	__REQ_IO_STAT,		/* account I/O stat */
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
 	__REQ_SECURE,		/* secure discard (used with __REQ_DISCARD) */
+	__REQ_ON_PLUG,		/* on plug list */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -193,5 +194,6 @@ enum rq_flag_bits {
 #define REQ_IO_STAT		(1 << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
 #define REQ_SECURE		(1 << __REQ_SECURE)
+#define REQ_ON_PLUG		(1 << __REQ_ON_PLUG)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f55b2a8b6610..5873037eeb91 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -871,6 +871,31 @@ struct request_queue *blk_alloc_queue(gfp_t);
 struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);
 
+struct blk_plug {
+	unsigned long magic;
+	struct list_head list;
+	unsigned int should_sort;
+};
+
+extern void blk_start_plug(struct blk_plug *);
+extern void blk_finish_plug(struct blk_plug *);
+extern void __blk_flush_plug(struct task_struct *, struct blk_plug *);
+
+static inline void blk_flush_plug(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
+	if (unlikely(plug))
+		__blk_flush_plug(tsk, plug);
+}
+
+static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
+	return plug && !list_empty(&plug->list);
+}
+
 /*
  * tag stuff
  */
@@ -1294,6 +1319,23 @@ static inline long nr_blockdev_pages(void)
 	return 0;
 }
 
+static inline void blk_start_plug(struct list_head *list)
+{
+}
+
+static inline void blk_finish_plug(struct list_head *list)
+{
+}
+
+static inline void blk_flush_plug(struct task_struct *tsk)
+{
+}
+
+static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+{
+	return false;
+}
+
 #endif /* CONFIG_BLOCK */
 
 #endif
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 39b68edb388d..8857cf9adbb7 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -105,6 +105,7 @@ extern void elv_add_request(struct request_queue *, struct request *, int, int);
 extern void __elv_add_request(struct request_queue *, struct request *, int, int);
 extern void elv_insert(struct request_queue *, struct request *, int);
 extern int elv_merge(struct request_queue *, struct request **, struct bio *);
+extern int elv_try_merge(struct request *, struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *, int);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5ed06b..96ac22643742 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -99,6 +99,7 @@ struct robust_list_head;
 struct bio_list;
 struct fs_struct;
 struct perf_event_context;
+struct blk_plug;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -1429,6 +1430,11 @@ struct task_struct {
 /* stacked block device info */
 	struct bio_list *bio_list;
 
+#ifdef CONFIG_BLOCK
+/* stack plugging */
+	struct blk_plug *plug;
+#endif
+
 /* VM state */
 	struct reclaim_state *reclaim_state;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..6a488ad2dce5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
 	profile_task_exit(tsk);
 
 	WARN_ON(atomic_read(&tsk->fs_excl));
+	WARN_ON(blk_needs_flush_plug(tsk));
 
 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..027c80e5162f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1204,6 +1204,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * Clear TID on mm_release()?
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+	p->plug = NULL;
+#endif
 #ifdef CONFIG_FUTEX
 	p->robust_list = NULL;
 #ifdef CONFIG_COMPAT
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..ca098bf4cc65 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3978,6 +3978,16 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
+	/*
+	 * If we are going to sleep and we have plugged IO queued, make
+	 * sure to submit it to avoid deadlocks.
+	 */
+	if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
+		raw_spin_unlock(&rq->lock);
+		blk_flush_plug(prev);
+		raw_spin_lock(&rq->lock);
+	}
+
 	pre_schedule(rq, prev);
 
 	if (unlikely(!rq->nr_running))
@@ -5333,6 +5343,7 @@ void __sched io_schedule(void)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	schedule();
 	current->in_iowait = 0;
@@ -5348,6 +5359,7 @@ long __sched io_schedule_timeout(long timeout)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
 	current->in_iowait = 0;

From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 10 Mar 2011 08:52:07 +0100
Subject: [PATCH 38/65] block: remove per-queue plugging

Code has been converted over to the new explicit on-stack plugging,
and delay users have been converted to use the new API for that.
So lets kill off the old plugging along with aops->sync_page().

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/block/biodoc.txt      |   5 -
 block/blk-core.c                    | 173 ++++------------------------
 block/blk-exec.c                    |   4 +-
 block/blk-flush.c                   |   3 +-
 block/blk-settings.c                |   8 --
 block/blk-throttle.c                |   1 -
 block/blk.h                         |   2 -
 block/cfq-iosched.c                 |   8 --
 block/deadline-iosched.c            |   9 --
 block/elevator.c                    |  43 +------
 block/noop-iosched.c                |   8 --
 drivers/block/cciss.c               |   6 -
 drivers/block/cpqarray.c            |   3 -
 drivers/block/drbd/drbd_actlog.c    |   2 -
 drivers/block/drbd/drbd_bitmap.c    |   1 -
 drivers/block/drbd/drbd_int.h       |  14 ---
 drivers/block/drbd/drbd_main.c      |  33 +-----
 drivers/block/drbd/drbd_receiver.c  |  20 +---
 drivers/block/drbd/drbd_req.c       |   4 -
 drivers/block/drbd/drbd_worker.c    |   1 -
 drivers/block/drbd/drbd_wrappers.h  |  18 ---
 drivers/block/floppy.c              |   1 -
 drivers/block/loop.c                |  13 ---
 drivers/block/pktcdvd.c             |   2 -
 drivers/block/umem.c                |  16 +--
 drivers/ide/ide-atapi.c             |   3 +-
 drivers/ide/ide-io.c                |   4 -
 drivers/ide/ide-park.c              |   2 +-
 drivers/md/bitmap.c                 |   3 +-
 drivers/md/dm-crypt.c               |   9 +-
 drivers/md/dm-kcopyd.c              |  52 +--------
 drivers/md/dm-raid.c                |   2 +-
 drivers/md/dm-raid1.c               |   2 -
 drivers/md/dm-table.c               |  24 ----
 drivers/md/dm.c                     |  33 +-----
 drivers/md/linear.c                 |  17 ---
 drivers/md/md.c                     |   7 --
 drivers/md/multipath.c              |  31 -----
 drivers/md/raid0.c                  |  16 ---
 drivers/md/raid1.c                  |  83 +++----------
 drivers/md/raid10.c                 |  87 +++-----------
 drivers/md/raid5.c                  |  62 ++--------
 drivers/md/raid5.h                  |   2 +-
 drivers/message/i2o/i2o_block.c     |   6 +-
 drivers/mmc/card/queue.c            |   3 +-
 drivers/s390/block/dasd.c           |   2 +-
 drivers/s390/char/tape_block.c      |   1 -
 drivers/scsi/scsi_transport_fc.c    |   2 +-
 drivers/scsi/scsi_transport_sas.c   |   6 +-
 drivers/target/target_core_iblock.c |   7 +-
 fs/adfs/inode.c                     |   1 -
 fs/affs/file.c                      |   2 -
 fs/aio.c                            |   4 +-
 fs/befs/linuxvfs.c                  |   1 -
 fs/bfs/file.c                       |   1 -
 fs/block_dev.c                      |   1 -
 fs/btrfs/disk-io.c                  |  79 -------------
 fs/btrfs/inode.c                    |   1 -
 fs/btrfs/volumes.c                  |  91 ++-------------
 fs/buffer.c                         |  31 +----
 fs/cifs/file.c                      |  30 -----
 fs/direct-io.c                      |   5 +-
 fs/efs/inode.c                      |   1 -
 fs/exofs/inode.c                    |   1 -
 fs/ext2/inode.c                     |   2 -
 fs/ext3/inode.c                     |   3 -
 fs/ext4/inode.c                     |   4 -
 fs/fat/inode.c                      |   1 -
 fs/freevxfs/vxfs_subr.c             |   1 -
 fs/fuse/inode.c                     |   1 -
 fs/gfs2/aops.c                      |   3 -
 fs/gfs2/meta_io.c                   |   1 -
 fs/hfs/inode.c                      |   2 -
 fs/hfsplus/inode.c                  |   2 -
 fs/hpfs/file.c                      |   1 -
 fs/isofs/inode.c                    |   1 -
 fs/jfs/inode.c                      |   1 -
 fs/jfs/jfs_metapage.c               |   1 -
 fs/logfs/dev_bdev.c                 |   2 -
 fs/minix/inode.c                    |   1 -
 fs/nilfs2/btnode.c                  |   6 +-
 fs/nilfs2/gcinode.c                 |   1 -
 fs/nilfs2/inode.c                   |   1 -
 fs/nilfs2/mdt.c                     |   9 +-
 fs/nilfs2/page.c                    |   5 +-
 fs/nilfs2/page.h                    |   3 +-
 fs/ntfs/aops.c                      |   4 -
 fs/ntfs/compress.c                  |   3 +-
 fs/ocfs2/aops.c                     |   1 -
 fs/ocfs2/cluster/heartbeat.c        |   4 -
 fs/omfs/file.c                      |   1 -
 fs/qnx4/inode.c                     |   1 -
 fs/reiserfs/inode.c                 |   1 -
 fs/sysv/itree.c                     |   1 -
 fs/ubifs/super.c                    |   1 -
 fs/udf/file.c                       |   1 -
 fs/udf/inode.c                      |   1 -
 fs/ufs/inode.c                      |   1 -
 fs/ufs/truncate.c                   |   2 +-
 fs/xfs/linux-2.6/xfs_aops.c         |   1 -
 fs/xfs/linux-2.6/xfs_buf.c          |  13 +--
 include/linux/backing-dev.h         |  16 ---
 include/linux/blkdev.h              |  31 ++---
 include/linux/buffer_head.h         |   1 -
 include/linux/device-mapper.h       |   5 -
 include/linux/elevator.h            |   7 +-
 include/linux/fs.h                  |   1 -
 include/linux/pagemap.h             |  12 --
 include/linux/swap.h                |   2 -
 mm/backing-dev.c                    |   6 -
 mm/filemap.c                        |  67 +----------
 mm/memory-failure.c                 |   8 +-
 mm/nommu.c                          |   4 -
 mm/page-writeback.c                 |   2 +-
 mm/readahead.c                      |  12 --
 mm/shmem.c                          |   1 -
 mm/swap_state.c                     |   5 +-
 mm/swapfile.c                       |  37 ------
 mm/vmscan.c                         |   2 +-
 119 files changed, 151 insertions(+), 1269 deletions(-)

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index b9a83dd24732..2a7b38c832c7 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -963,11 +963,6 @@ elevator_dispatch_fn*		fills the dispatch queue with ready requests.
 
 elevator_add_req_fn*		called to add a new request into the scheduler
 
-elevator_queue_empty_fn		returns true if the merge queue is empty.
-				Drivers shouldn't use this, but rather check
-				if elv_next_request is NULL (without losing the
-				request if one exists!)
-
 elevator_former_req_fn
 elevator_latter_req_fn		These return the request before or after the
 				one specified in disk sort order. Used by the
diff --git a/block/blk-core.c b/block/blk-core.c
index 6efb55cc5af0..82a45898ba76 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -198,6 +198,19 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 }
 EXPORT_SYMBOL(blk_dump_rq_flags);
 
+/*
+ * Make sure that plugs that were pending when this function was entered,
+ * are now complete and requests pushed to the queue.
+*/
+static inline void queue_sync_plugs(struct request_queue *q)
+{
+	/*
+	 * If the current process is plugged and has barriers submitted,
+	 * we will livelock if we don't unplug first.
+	 */
+	blk_flush_plug(current);
+}
+
 static void blk_delay_work(struct work_struct *work)
 {
 	struct request_queue *q;
@@ -224,137 +237,6 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs)
 }
 EXPORT_SYMBOL(blk_delay_queue);
 
-/*
- * "plug" the device if there are no outstanding requests: this will
- * force the transfer to start only after we have put all the requests
- * on the list.
- *
- * This is called with interrupts off and no requests on the queue and
- * with the queue lock held.
- */
-void blk_plug_device(struct request_queue *q)
-{
-	WARN_ON(!irqs_disabled());
-
-	/*
-	 * don't plug a stopped queue, it must be paired with blk_start_queue()
-	 * which will restart the queueing
-	 */
-	if (blk_queue_stopped(q))
-		return;
-
-	if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
-		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		trace_block_plug(q);
-	}
-}
-EXPORT_SYMBOL(blk_plug_device);
-
-/**
- * blk_plug_device_unlocked - plug a device without queue lock held
- * @q:    The &struct request_queue to plug
- *
- * Description:
- *   Like @blk_plug_device(), but grabs the queue lock and disables
- *   interrupts.
- **/
-void blk_plug_device_unlocked(struct request_queue *q)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_plug_device(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-EXPORT_SYMBOL(blk_plug_device_unlocked);
-
-/*
- * remove the queue from the plugged list, if present. called with
- * queue lock held and interrupts disabled.
- */
-int blk_remove_plug(struct request_queue *q)
-{
-	WARN_ON(!irqs_disabled());
-
-	if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
-		return 0;
-
-	del_timer(&q->unplug_timer);
-	return 1;
-}
-EXPORT_SYMBOL(blk_remove_plug);
-
-/*
- * remove the plug and let it rip..
- */
-void __generic_unplug_device(struct request_queue *q)
-{
-	if (unlikely(blk_queue_stopped(q)))
-		return;
-	if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
-		return;
-
-	q->request_fn(q);
-}
-
-/**
- * generic_unplug_device - fire a request queue
- * @q:    The &struct request_queue in question
- *
- * Description:
- *   Linux uses plugging to build bigger requests queues before letting
- *   the device have at them. If a queue is plugged, the I/O scheduler
- *   is still adding and merging requests on the queue. Once the queue
- *   gets unplugged, the request_fn defined for the queue is invoked and
- *   transfers started.
- **/
-void generic_unplug_device(struct request_queue *q)
-{
-	if (blk_queue_plugged(q)) {
-		spin_lock_irq(q->queue_lock);
-		__generic_unplug_device(q);
-		spin_unlock_irq(q->queue_lock);
-	}
-}
-EXPORT_SYMBOL(generic_unplug_device);
-
-static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
-				   struct page *page)
-{
-	struct request_queue *q = bdi->unplug_io_data;
-
-	blk_unplug(q);
-}
-
-void blk_unplug_work(struct work_struct *work)
-{
-	struct request_queue *q =
-		container_of(work, struct request_queue, unplug_work);
-
-	trace_block_unplug_io(q);
-	q->unplug_fn(q);
-}
-
-void blk_unplug_timeout(unsigned long data)
-{
-	struct request_queue *q = (struct request_queue *)data;
-
-	trace_block_unplug_timer(q);
-	kblockd_schedule_work(q, &q->unplug_work);
-}
-
-void blk_unplug(struct request_queue *q)
-{
-	/*
-	 * devices don't necessarily have an ->unplug_fn defined
-	 */
-	if (q->unplug_fn) {
-		trace_block_unplug_io(q);
-		q->unplug_fn(q);
-	}
-}
-EXPORT_SYMBOL(blk_unplug);
-
 /**
  * blk_start_queue - restart a previously stopped queue
  * @q:    The &struct request_queue in question
@@ -389,7 +271,6 @@ EXPORT_SYMBOL(blk_start_queue);
  **/
 void blk_stop_queue(struct request_queue *q)
 {
-	blk_remove_plug(q);
 	cancel_delayed_work(&q->delay_work);
 	queue_flag_set(QUEUE_FLAG_STOPPED, q);
 }
@@ -411,11 +292,10 @@ EXPORT_SYMBOL(blk_stop_queue);
  */
 void blk_sync_queue(struct request_queue *q)
 {
-	del_timer_sync(&q->unplug_timer);
 	del_timer_sync(&q->timeout);
-	cancel_work_sync(&q->unplug_work);
 	throtl_shutdown_timer_wq(q);
 	cancel_delayed_work_sync(&q->delay_work);
+	queue_sync_plugs(q);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -430,14 +310,9 @@ EXPORT_SYMBOL(blk_sync_queue);
  */
 void __blk_run_queue(struct request_queue *q)
 {
-	blk_remove_plug(q);
-
 	if (unlikely(blk_queue_stopped(q)))
 		return;
 
-	if (elv_queue_empty(q))
-		return;
-
 	/*
 	 * Only recurse once to avoid overrunning the stack, let the unplug
 	 * handling reinvoke the handler shortly if we already got there.
@@ -445,10 +320,8 @@ void __blk_run_queue(struct request_queue *q)
 	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
 		q->request_fn(q);
 		queue_flag_clear(QUEUE_FLAG_REENTER, q);
-	} else {
-		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
-		kblockd_schedule_work(q, &q->unplug_work);
-	}
+	} else
+		queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
@@ -535,8 +408,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (!q)
 		return NULL;
 
-	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
-	q->backing_dev_info.unplug_io_data = q;
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	q->backing_dev_info.state = 0;
@@ -556,13 +427,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
-	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
 	INIT_LIST_HEAD(&q->flush_queue[0]);
 	INIT_LIST_HEAD(&q->flush_queue[1]);
 	INIT_LIST_HEAD(&q->flush_data_in_flight);
-	INIT_WORK(&q->unplug_work, blk_unplug_work);
 	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
@@ -652,7 +521,6 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 	q->request_fn		= rfn;
 	q->prep_rq_fn		= NULL;
 	q->unprep_rq_fn		= NULL;
-	q->unplug_fn		= generic_unplug_device;
 	q->queue_flags		= QUEUE_FLAG_DEFAULT;
 	q->queue_lock		= lock;
 
@@ -910,8 +778,8 @@ out:
 }
 
 /*
- * No available requests for this queue, unplug the device and wait for some
- * requests to become available.
+ * No available requests for this queue, wait for some requests to become
+ * available.
  *
  * Called with q->queue_lock held, and returns with it unlocked.
  */
@@ -932,7 +800,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 
 		trace_block_sleeprq(q, bio, rw_flags & 1);
 
-		__generic_unplug_device(q);
 		spin_unlock_irq(q->queue_lock);
 		io_schedule();
 
@@ -1058,7 +925,7 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
 			     int where)
 {
 	drive_stat_acct(rq, 1);
-	__elv_add_request(q, rq, where, 0);
+	__elv_add_request(q, rq, where);
 }
 
 /**
@@ -2798,7 +2665,7 @@ static void flush_plug_list(struct blk_plug *plug)
 		/*
 		 * rq is already accounted, so use raw insert
 		 */
-		__elv_add_request(q, rq, ELEVATOR_INSERT_SORT, 0);
+		__elv_add_request(q, rq, ELEVATOR_INSERT_SORT);
 	}
 
 	if (q) {
diff --git a/block/blk-exec.c b/block/blk-exec.c
index cf1456a02acd..81e31819a597 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -54,8 +54,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	rq->end_io = done;
 	WARN_ON(irqs_disabled());
 	spin_lock_irq(q->queue_lock);
-	__elv_add_request(q, rq, where, 1);
-	__generic_unplug_device(q);
+	__elv_add_request(q, rq, where);
+	__blk_run_queue(q);
 	/* the queue is stopped so it won't be plugged+unplugged */
 	if (rq->cmd_type == REQ_TYPE_PM_RESUME)
 		q->request_fn(q);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 1e2aa8a8908c..671fa9da7560 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -194,7 +194,6 @@ static void flush_end_io(struct request *flush_rq, int error)
 {
 	struct request_queue *q = flush_rq->q;
 	struct list_head *running = &q->flush_queue[q->flush_running_idx];
-	bool was_empty = elv_queue_empty(q);
 	bool queued = false;
 	struct request *rq, *n;
 
@@ -213,7 +212,7 @@ static void flush_end_io(struct request *flush_rq, int error)
 	}
 
 	/* after populating an empty queue, kick it to avoid stall */
-	if (queued && was_empty)
+	if (queued)
 		__blk_run_queue(q);
 }
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 36c8c1f2af18..c8d68921dddb 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -164,14 +164,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	blk_queue_congestion_threshold(q);
 	q->nr_batching = BLK_BATCH_REQ;
 
-	q->unplug_thresh = 4;		/* hmm */
-	q->unplug_delay = msecs_to_jiffies(3);	/* 3 milliseconds */
-	if (q->unplug_delay == 0)
-		q->unplug_delay = 1;
-
-	q->unplug_timer.function = blk_unplug_timeout;
-	q->unplug_timer.data = (unsigned long)q;
-
 	blk_set_default_limits(&q->limits);
 	blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a89043a3caa4..b8dcdc2663a1 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -800,7 +800,6 @@ out:
 	if (nr_disp) {
 		while((bio = bio_list_pop(&bio_list_on_stack)))
 			generic_make_request(bio);
-		blk_unplug(q);
 	}
 	return nr_disp;
 }
diff --git a/block/blk.h b/block/blk.h
index 284b500852bd..49d21af81d07 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 
-void blk_unplug_work(struct work_struct *work);
-void blk_unplug_timeout(unsigned long data);
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_delete_timer(struct request *);
 void blk_add_timer(struct request *);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3202c7e87fb3..ef631539dd2a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -499,13 +499,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 	}
 }
 
-static int cfq_queue_empty(struct request_queue *q)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-
-	return !cfqd->rq_queued;
-}
-
 /*
  * Scale schedule slice based on io priority. Use the sync time slice only
  * if a queue is marked sync and has sync io queued. A sync queue with async
@@ -4061,7 +4054,6 @@ static struct elevator_type iosched_cfq = {
 		.elevator_add_req_fn =		cfq_insert_request,
 		.elevator_activate_req_fn =	cfq_activate_request,
 		.elevator_deactivate_req_fn =	cfq_deactivate_request,
-		.elevator_queue_empty_fn =	cfq_queue_empty,
 		.elevator_completed_req_fn =	cfq_completed_request,
 		.elevator_former_req_fn =	elv_rb_former_request,
 		.elevator_latter_req_fn =	elv_rb_latter_request,
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b547cbca7b23..5139c0ea1864 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -326,14 +326,6 @@ dispatch_request:
 	return 1;
 }
 
-static int deadline_queue_empty(struct request_queue *q)
-{
-	struct deadline_data *dd = q->elevator->elevator_data;
-
-	return list_empty(&dd->fifo_list[WRITE])
-		&& list_empty(&dd->fifo_list[READ]);
-}
-
 static void deadline_exit_queue(struct elevator_queue *e)
 {
 	struct deadline_data *dd = e->elevator_data;
@@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = {
 		.elevator_merge_req_fn =	deadline_merged_requests,
 		.elevator_dispatch_fn =		deadline_dispatch_requests,
 		.elevator_add_req_fn =		deadline_add_request,
-		.elevator_queue_empty_fn =	deadline_queue_empty,
 		.elevator_former_req_fn =	elv_rb_former_request,
 		.elevator_latter_req_fn =	elv_rb_latter_request,
 		.elevator_init_fn =		deadline_init_queue,
diff --git a/block/elevator.c b/block/elevator.c
index 25713927c0d3..3ea208256e78 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -619,21 +619,12 @@ void elv_quiesce_end(struct request_queue *q)
 
 void elv_insert(struct request_queue *q, struct request *rq, int where)
 {
-	int unplug_it = 1;
-
 	trace_block_rq_insert(q, rq);
 
 	rq->q = q;
 
 	switch (where) {
 	case ELEVATOR_INSERT_REQUEUE:
-		/*
-		 * Most requeues happen because of a busy condition,
-		 * don't force unplug of the queue for that case.
-		 * Clear unplug_it and fall through.
-		 */
-		unplug_it = 0;
-
 	case ELEVATOR_INSERT_FRONT:
 		rq->cmd_flags |= REQ_SOFTBARRIER;
 		list_add(&rq->queuelist, &q->queue_head);
@@ -679,24 +670,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		rq->cmd_flags |= REQ_SOFTBARRIER;
 		blk_insert_flush(rq);
 		break;
-
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __func__, where);
 		BUG();
 	}
-
-	if (unplug_it && blk_queue_plugged(q)) {
-		int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
-				- queue_in_flight(q);
-
-		if (nrq >= q->unplug_thresh)
-			__generic_unplug_device(q);
-	}
 }
 
-void __elv_add_request(struct request_queue *q, struct request *rq, int where,
-		       int plug)
+void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 {
 	BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
 
@@ -711,38 +692,20 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		    where == ELEVATOR_INSERT_SORT)
 		where = ELEVATOR_INSERT_BACK;
 
-	if (plug)
-		blk_plug_device(q);
-
 	elv_insert(q, rq, where);
 }
 EXPORT_SYMBOL(__elv_add_request);
 
-void elv_add_request(struct request_queue *q, struct request *rq, int where,
-		     int plug)
+void elv_add_request(struct request_queue *q, struct request *rq, int where)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	__elv_add_request(q, rq, where, plug);
+	__elv_add_request(q, rq, where);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL(elv_add_request);
 
-int elv_queue_empty(struct request_queue *q)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (!list_empty(&q->queue_head))
-		return 0;
-
-	if (e->ops->elevator_queue_empty_fn)
-		return e->ops->elevator_queue_empty_fn(q);
-
-	return 1;
-}
-EXPORT_SYMBOL(elv_queue_empty);
-
 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 232c4b38cd37..06389e9ef96d 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq)
 	list_add_tail(&rq->queuelist, &nd->queue);
 }
 
-static int noop_queue_empty(struct request_queue *q)
-{
-	struct noop_data *nd = q->elevator->elevator_data;
-
-	return list_empty(&nd->queue);
-}
-
 static struct request *
 noop_former_request(struct request_queue *q, struct request *rq)
 {
@@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = {
 		.elevator_merge_req_fn		= noop_merged_requests,
 		.elevator_dispatch_fn		= noop_dispatch,
 		.elevator_add_req_fn		= noop_add_request,
-		.elevator_queue_empty_fn	= noop_queue_empty,
 		.elevator_former_req_fn		= noop_former_request,
 		.elevator_latter_req_fn		= noop_latter_request,
 		.elevator_init_fn		= noop_init_queue,
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 9279272b3732..35658f445fca 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -3170,12 +3170,6 @@ static void do_cciss_request(struct request_queue *q)
 	int sg_index = 0;
 	int chained = 0;
 
-	/* We call start_io here in case there is a command waiting on the
-	 * queue that has not been sent.
-	 */
-	if (blk_queue_plugged(q))
-		goto startio;
-
       queue:
 	creq = blk_peek_request(q);
 	if (!creq)
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 946dad4caef3..b2fceb53e809 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -911,9 +911,6 @@ static void do_ida_request(struct request_queue *q)
 	struct scatterlist tmp_sg[SG_MAX];
 	int i, dir, seg;
 
-	if (blk_queue_plugged(q))
-		goto startio;
-
 queue_next:
 	creq = blk_peek_request(q);
 	if (!creq)
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index ba95cba192be..2096628d6e65 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -689,8 +689,6 @@ void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
 		}
 	}
 
-	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
-
 	/* always (try to) flush bitmap to stable storage */
 	drbd_md_flush(mdev);
 
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index fd42832f785b..0645ca829a94 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -840,7 +840,6 @@ static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
 	for (i = 0; i < num_pages; i++)
 		bm_page_io_async(mdev, b, i, rw);
 
-	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
 	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
 
 	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 3803a0348937..0b5718e19586 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -2382,20 +2382,6 @@ static inline int drbd_queue_order_type(struct drbd_conf *mdev)
 	return QUEUE_ORDERED_NONE;
 }
 
-static inline void drbd_blk_run_queue(struct request_queue *q)
-{
-	if (q && q->unplug_fn)
-		q->unplug_fn(q);
-}
-
-static inline void drbd_kick_lo(struct drbd_conf *mdev)
-{
-	if (get_ldev(mdev)) {
-		drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
-		put_ldev(mdev);
-	}
-}
-
 static inline void drbd_md_flush(struct drbd_conf *mdev)
 {
 	int r;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 29cd0dc9fe4f..6049cb85310d 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2719,35 +2719,6 @@ static int drbd_release(struct gendisk *gd, fmode_t mode)
 	return 0;
 }
 
-static void drbd_unplug_fn(struct request_queue *q)
-{
-	struct drbd_conf *mdev = q->queuedata;
-
-	/* unplug FIRST */
-	spin_lock_irq(q->queue_lock);
-	blk_remove_plug(q);
-	spin_unlock_irq(q->queue_lock);
-
-	/* only if connected */
-	spin_lock_irq(&mdev->req_lock);
-	if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
-		D_ASSERT(mdev->state.role == R_PRIMARY);
-		if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
-			/* add to the data.work queue,
-			 * unless already queued.
-			 * XXX this might be a good addition to drbd_queue_work
-			 * anyways, to detect "double queuing" ... */
-			if (list_empty(&mdev->unplug_work.list))
-				drbd_queue_work(&mdev->data.work,
-						&mdev->unplug_work);
-		}
-	}
-	spin_unlock_irq(&mdev->req_lock);
-
-	if (mdev->state.disk >= D_INCONSISTENT)
-		drbd_kick_lo(mdev);
-}
-
 static void drbd_set_defaults(struct drbd_conf *mdev)
 {
 	/* This way we get a compile error when sync_conf grows,
@@ -3222,9 +3193,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
 	blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
 	blk_queue_merge_bvec(q, drbd_merge_bvec);
-	q->queue_lock = &mdev->req_lock; /* needed since we use */
-		/* plugging on a queue, that actually has no requests! */
-	q->unplug_fn = drbd_unplug_fn;
+	q->queue_lock = &mdev->req_lock;
 
 	mdev->md_io_page = alloc_page(GFP_KERNEL);
 	if (!mdev->md_io_page)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 24487d4fb202..84132f8bf8a4 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -187,15 +187,6 @@ static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int
 	return NULL;
 }
 
-/* kick lower level device, if we have more than (arbitrary number)
- * reference counts on it, which typically are locally submitted io
- * requests.  don't use unacked_cnt, so we speed up proto A and B, too. */
-static void maybe_kick_lo(struct drbd_conf *mdev)
-{
-	if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
-		drbd_kick_lo(mdev);
-}
-
 static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
 {
 	struct drbd_epoch_entry *e;
@@ -219,7 +210,6 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
 	LIST_HEAD(reclaimed);
 	struct drbd_epoch_entry *e, *t;
 
-	maybe_kick_lo(mdev);
 	spin_lock_irq(&mdev->req_lock);
 	reclaim_net_ee(mdev, &reclaimed);
 	spin_unlock_irq(&mdev->req_lock);
@@ -436,8 +426,7 @@ void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
 	while (!list_empty(head)) {
 		prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(&mdev->req_lock);
-		drbd_kick_lo(mdev);
-		schedule();
+		io_schedule();
 		finish_wait(&mdev->ee_wait, &wait);
 		spin_lock_irq(&mdev->req_lock);
 	}
@@ -1147,7 +1136,6 @@ next_bio:
 
 		drbd_generic_make_request(mdev, fault_type, bio);
 	} while (bios);
-	maybe_kick_lo(mdev);
 	return 0;
 
 fail:
@@ -1167,9 +1155,6 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
 
 	inc_unacked(mdev);
 
-	if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
-		drbd_kick_lo(mdev);
-
 	mdev->current_epoch->barrier_nr = p->barrier;
 	rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
 
@@ -3556,9 +3541,6 @@ static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 
 static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	if (mdev->state.disk >= D_INCONSISTENT)
-		drbd_kick_lo(mdev);
-
 	/* Make sure we've acked all the TCP data associated
 	 * with the data requests being unplugged */
 	drbd_tcp_quickack(mdev->data.socket);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 11a75d32a2e2..ad3fc6228f27 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -960,10 +960,6 @@ allocate_barrier:
 			bio_endio(req->private_bio, -EIO);
 	}
 
-	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
-	 * we plug after submit, so we won't miss an unplug event */
-	drbd_plug_device(mdev);
-
 	return 0;
 
 fail_conflicting:
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 34f224b018b3..e027446590d3 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -792,7 +792,6 @@ int drbd_resync_finished(struct drbd_conf *mdev)
 		 * queue (or even the read operations for those packets
 		 * is not finished by now).   Retry in 100ms. */
 
-		drbd_kick_lo(mdev);
 		__set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ / 10);
 		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
index defdb5013ea3..53586fa5ae1b 100644
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -45,24 +45,6 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev,
 		generic_make_request(bio);
 }
 
-static inline void drbd_plug_device(struct drbd_conf *mdev)
-{
-	struct request_queue *q;
-	q = bdev_get_queue(mdev->this_bdev);
-
-	spin_lock_irq(q->queue_lock);
-
-/* XXX the check on !blk_queue_plugged is redundant,
- * implicitly checked in blk_plug_device */
-
-	if (!blk_queue_plugged(q)) {
-		blk_plug_device(q);
-		del_timer(&q->unplug_timer);
-		/* unplugging should not happen automatically... */
-	}
-	spin_unlock_irq(q->queue_lock);
-}
-
 static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
 {
         return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index b9ba04fc2b34..271142b9e2cd 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3837,7 +3837,6 @@ static int __floppy_read_block_0(struct block_device *bdev)
 	bio.bi_end_io = floppy_rb0_complete;
 
 	submit_bio(READ, &bio);
-	generic_unplug_device(bdev_get_queue(bdev));
 	process_fd_request();
 	wait_for_completion(&complete);
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 49e6a545eb63..01b8e4a87c9f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -541,17 +541,6 @@ out:
 	return 0;
 }
 
-/*
- * kick off io on the underlying address space
- */
-static void loop_unplug(struct request_queue *q)
-{
-	struct loop_device *lo = q->queuedata;
-
-	queue_flag_clear_unlocked(QUEUE_FLAG_PLUGGED, q);
-	blk_run_address_space(lo->lo_backing_file->f_mapping);
-}
-
 struct switch_request {
 	struct file *file;
 	struct completion wait;
@@ -918,7 +907,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	 */
 	blk_queue_make_request(lo->lo_queue, loop_make_request);
 	lo->lo_queue->queuedata = lo;
-	lo->lo_queue->unplug_fn = loop_unplug;
 
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
 		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
@@ -1020,7 +1008,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 
 	kthread_stop(lo->lo_thread);
 
-	lo->lo_queue->unplug_fn = NULL;
 	lo->lo_backing_file = NULL;
 
 	loop_release_xfer(lo);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 77d70eebb6b2..d20e13f80001 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1606,8 +1606,6 @@ static int kcdrwd(void *foobar)
 					min_sleep_time = pkt->sleep_time;
 			}
 
-			generic_unplug_device(bdev_get_queue(pd->bdev));
-
 			VPRINTK("kcdrwd: sleeping\n");
 			residue = schedule_timeout(min_sleep_time);
 			VPRINTK("kcdrwd: wake up\n");
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 8be57151f5d6..653439faa729 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -241,8 +241,7 @@ static void dump_dmastat(struct cardinfo *card, unsigned int dmastat)
  *
  * Whenever IO on the active page completes, the Ready page is activated
  * and the ex-Active page is clean out and made Ready.
- * Otherwise the Ready page is only activated when it becomes full, or
- * when mm_unplug_device is called via the unplug_io_fn.
+ * Otherwise the Ready page is only activated when it becomes full.
  *
  * If a request arrives while both pages a full, it is queued, and b_rdev is
  * overloaded to record whether it was a read or a write.
@@ -333,17 +332,6 @@ static inline void reset_page(struct mm_page *page)
 	page->biotail = &page->bio;
 }
 
-static void mm_unplug_device(struct request_queue *q)
-{
-	struct cardinfo *card = q->queuedata;
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->lock, flags);
-	if (blk_remove_plug(q))
-		activate(card);
-	spin_unlock_irqrestore(&card->lock, flags);
-}
-
 /*
  * If there is room on Ready page, take
  * one bh off list and add it.
@@ -535,7 +523,6 @@ static int mm_make_request(struct request_queue *q, struct bio *bio)
 	*card->biotail = bio;
 	bio->bi_next = NULL;
 	card->biotail = &bio->bi_next;
-	blk_plug_device(q);
 	spin_unlock_irq(&card->lock);
 
 	return 0;
@@ -907,7 +894,6 @@ static int __devinit mm_pci_probe(struct pci_dev *dev,
 	blk_queue_make_request(card->queue, mm_make_request);
 	card->queue->queue_lock = &card->lock;
 	card->queue->queuedata = card;
-	card->queue->unplug_fn = mm_unplug_device;
 
 	tasklet_init(&card->tasklet, process_page, (unsigned long)card);
 
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index e88a2cf17711..6f218e014e99 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -233,8 +233,7 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special)
 
 	drive->hwif->rq = NULL;
 
-	elv_add_request(drive->queue, &drive->sense_rq,
-			ELEVATOR_INSERT_FRONT, 0);
+	elv_add_request(drive->queue, &drive->sense_rq, ELEVATOR_INSERT_FRONT);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 999dac054bcc..f4077840d3ab 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -549,8 +549,6 @@ plug_device_2:
 
 	if (rq)
 		blk_requeue_request(q, rq);
-	if (!elv_queue_empty(q))
-		blk_plug_device(q);
 }
 
 void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
@@ -562,8 +560,6 @@ void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
 
 	if (rq)
 		blk_requeue_request(q, rq);
-	if (!elv_queue_empty(q))
-		blk_plug_device(q);
 
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 88a380c5a470..6ab9ab2a5081 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -52,7 +52,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	rq->cmd[0] = REQ_UNPARK_HEADS;
 	rq->cmd_len = 1;
 	rq->cmd_type = REQ_TYPE_SPECIAL;
-	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
+	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
 
 out:
 	return;
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9a35320fb59f..54bfc274b39a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1339,8 +1339,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 			prepare_to_wait(&bitmap->overflow_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
 			spin_unlock_irq(&bitmap->lock);
-			md_unplug(bitmap->mddev);
-			schedule();
+			io_schedule();
 			finish_wait(&bitmap->overflow_wait, &__wait);
 			continue;
 		}
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4e054bd91664..2c62c1169f78 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -991,11 +991,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
-static void kcryptd_unplug(struct crypt_config *cc)
-{
-	blk_unplug(bdev_get_queue(cc->dev->bdev));
-}
-
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
 	struct crypt_config *cc = io->target->private;
@@ -1008,10 +1003,8 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 	 * one in order to decrypt the whole bio data *afterwards*.
 	 */
 	clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
-	if (!clone) {
-		kcryptd_unplug(cc);
+	if (!clone)
 		return 1;
-	}
 
 	crypt_inc_pending(io);
 
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 924f5f0084c2..400cf35094a4 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,13 +37,6 @@ struct dm_kcopyd_client {
 	unsigned int nr_pages;
 	unsigned int nr_free_pages;
 
-	/*
-	 * Block devices to unplug.
-	 * Non-NULL pointer means that a block device has some pending requests
-	 * and needs to be unplugged.
-	 */
-	struct block_device *unplug[2];
-
 	struct dm_io_client *io_client;
 
 	wait_queue_head_t destroyq;
@@ -315,31 +308,6 @@ static int run_complete_job(struct kcopyd_job *job)
 	return 0;
 }
 
-/*
- * Unplug the block device at the specified index.
- */
-static void unplug(struct dm_kcopyd_client *kc, int rw)
-{
-	if (kc->unplug[rw] != NULL) {
-		blk_unplug(bdev_get_queue(kc->unplug[rw]));
-		kc->unplug[rw] = NULL;
-	}
-}
-
-/*
- * Prepare block device unplug. If there's another device
- * to be unplugged at the same array index, we unplug that
- * device first.
- */
-static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
-			   struct block_device *bdev)
-{
-	if (likely(kc->unplug[rw] == bdev))
-		return;
-	unplug(kc, rw);
-	kc->unplug[rw] = bdev;
-}
-
 static void complete_io(unsigned long error, void *context)
 {
 	struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -386,15 +354,12 @@ static int run_io_job(struct kcopyd_job *job)
 		.client = job->kc->io_client,
 	};
 
-	if (job->rw == READ) {
+	if (job->rw == READ)
 		r = dm_io(&io_req, 1, &job->source, NULL);
-		prepare_unplug(job->kc, READ, job->source.bdev);
-	} else {
+	else {
 		if (job->num_dests > 1)
 			io_req.bi_rw |= REQ_UNPLUG;
 		r = dm_io(&io_req, job->num_dests, job->dests, NULL);
-		if (!(io_req.bi_rw & REQ_UNPLUG))
-			prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
 	}
 
 	return r;
@@ -466,6 +431,7 @@ static void do_work(struct work_struct *work)
 {
 	struct dm_kcopyd_client *kc = container_of(work,
 					struct dm_kcopyd_client, kcopyd_work);
+	struct blk_plug plug;
 
 	/*
 	 * The order that these are called is *very* important.
@@ -473,18 +439,12 @@ static void do_work(struct work_struct *work)
 	 * Pages jobs when successful will jump onto the io jobs
 	 * list.  io jobs call wake when they complete and it all
 	 * starts again.
-	 *
-	 * Note that io_jobs add block devices to the unplug array,
-	 * this array is cleared with "unplug" calls. It is thus
-	 * forbidden to run complete_jobs after io_jobs and before
-	 * unplug because the block device could be destroyed in
-	 * job completion callback.
 	 */
+	blk_start_plug(&plug);
 	process_jobs(&kc->complete_jobs, kc, run_complete_job);
 	process_jobs(&kc->pages_jobs, kc, run_pages_job);
 	process_jobs(&kc->io_jobs, kc, run_io_job);
-	unplug(kc, READ);
-	unplug(kc, WRITE);
+	blk_finish_plug(&plug);
 }
 
 /*
@@ -665,8 +625,6 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
 	INIT_LIST_HEAD(&kc->io_jobs);
 	INIT_LIST_HEAD(&kc->pages_jobs);
 
-	memset(kc->unplug, 0, sizeof(kc->unplug));
-
 	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
 	if (!kc->job_pool)
 		goto bad_slab;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b9e1e15ef11c..5ef136cdba91 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -394,7 +394,7 @@ static void raid_unplug(struct dm_target_callbacks *cb)
 {
 	struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
 
-	md_raid5_unplug_device(rs->md.private);
+	md_raid5_kick_device(rs->md.private);
 }
 
 /*
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index dee326775c60..976ad4688afc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -842,8 +842,6 @@ static void do_mirror(struct work_struct *work)
 	do_reads(ms, &reads);
 	do_writes(ms, &writes);
 	do_failures(ms, &failures);
-
-	dm_table_unplug_all(ms->ti->table);
 }
 
 /*-----------------------------------------------------------------
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 38e4eb1bb965..f50a7b952257 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1275,29 +1275,6 @@ int dm_table_any_busy_target(struct dm_table *t)
 	return 0;
 }
 
-void dm_table_unplug_all(struct dm_table *t)
-{
-	struct dm_dev_internal *dd;
-	struct list_head *devices = dm_table_get_devices(t);
-	struct dm_target_callbacks *cb;
-
-	list_for_each_entry(dd, devices, list) {
-		struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
-		char b[BDEVNAME_SIZE];
-
-		if (likely(q))
-			blk_unplug(q);
-		else
-			DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s",
-				     dm_device_name(t->md),
-				     bdevname(dd->dm_dev.bdev, b));
-	}
-
-	list_for_each_entry(cb, &t->target_callbacks, list)
-		if (cb->unplug_fn)
-			cb->unplug_fn(cb);
-}
-
 struct mapped_device *dm_table_get_md(struct dm_table *t)
 {
 	return t->md;
@@ -1345,4 +1322,3 @@ EXPORT_SYMBOL(dm_table_get_mode);
 EXPORT_SYMBOL(dm_table_get_md);
 EXPORT_SYMBOL(dm_table_put);
 EXPORT_SYMBOL(dm_table_get);
-EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index eaa3af0e0632..d22b9905c168 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -807,8 +807,6 @@ void dm_requeue_unmapped_request(struct request *clone)
 	dm_unprep_request(rq);
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (elv_queue_empty(q))
-		blk_plug_device(q);
 	blk_requeue_request(q, rq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
@@ -1613,10 +1611,10 @@ static void dm_request_fn(struct request_queue *q)
 	 * number of in-flight I/Os after the queue is stopped in
 	 * dm_suspend().
 	 */
-	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+	while (!blk_queue_stopped(q)) {
 		rq = blk_peek_request(q);
 		if (!rq)
-			goto plug_and_out;
+			goto delay_and_out;
 
 		/* always use block 0 to find the target for flushes for now */
 		pos = 0;
@@ -1627,7 +1625,7 @@ static void dm_request_fn(struct request_queue *q)
 		BUG_ON(!dm_target_is_valid(ti));
 
 		if (ti->type->busy && ti->type->busy(ti))
-			goto plug_and_out;
+			goto delay_and_out;
 
 		blk_start_request(rq);
 		clone = rq->special;
@@ -1647,11 +1645,8 @@ requeued:
 	BUG_ON(!irqs_disabled());
 	spin_lock(q->queue_lock);
 
-plug_and_out:
-	if (!elv_queue_empty(q))
-		/* Some requests still remain, retry later */
-		blk_plug_device(q);
-
+delay_and_out:
+	blk_delay_queue(q, HZ / 10);
 out:
 	dm_table_put(map);
 
@@ -1680,20 +1675,6 @@ static int dm_lld_busy(struct request_queue *q)
 	return r;
 }
 
-static void dm_unplug_all(struct request_queue *q)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
-
-	if (map) {
-		if (dm_request_based(md))
-			generic_unplug_device(q);
-
-		dm_table_unplug_all(map);
-		dm_table_put(map);
-	}
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r = bdi_bits;
@@ -1817,7 +1798,6 @@ static void dm_init_md_queue(struct mapped_device *md)
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
-	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
@@ -2263,8 +2243,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	int r = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
-	dm_unplug_all(md->queue);
-
 	add_wait_queue(&md->wait, &wait);
 
 	while (1) {
@@ -2539,7 +2517,6 @@ int dm_resume(struct mapped_device *md)
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
-	dm_table_unplug_all(map);
 	r = 0;
 out:
 	dm_table_put(map);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 8a2f767f26d8..38861b5b9d90 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -87,22 +87,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
 	return maxsectors << 9;
 }
 
-static void linear_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	linear_conf_t *conf;
-	int i;
-
-	rcu_read_lock();
-	conf = rcu_dereference(mddev->private);
-
-	for (i=0; i < mddev->raid_disks; i++) {
-		struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
-		blk_unplug(r_queue);
-	}
-	rcu_read_unlock();
-}
-
 static int linear_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -225,7 +209,6 @@ static int linear_run (mddev_t *mddev)
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
-	mddev->queue->unplug_fn = linear_unplug;
 	mddev->queue->backing_dev_info.congested_fn = linear_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	md_integrity_register(mddev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0cc30ecda4c1..ca0d79c264b9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4812,7 +4812,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		__md_stop_writes(mddev);
 		md_stop(mddev);
 		mddev->queue->merge_bvec_fn = NULL;
-		mddev->queue->unplug_fn = NULL;
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 
 		/* tell userspace to handle 'inactive' */
@@ -6669,8 +6668,6 @@ EXPORT_SYMBOL_GPL(md_allow_write);
 
 void md_unplug(mddev_t *mddev)
 {
-	if (mddev->queue)
-		blk_unplug(mddev->queue);
 	if (mddev->plug)
 		mddev->plug->unplug_fn(mddev->plug);
 }
@@ -6853,7 +6850,6 @@ void md_do_sync(mddev_t *mddev)
 		     >= mddev->resync_max - mddev->curr_resync_completed
 			    )) {
 			/* time to update curr_resync_completed */
-			md_unplug(mddev);
 			wait_event(mddev->recovery_wait,
 				   atomic_read(&mddev->recovery_active) == 0);
 			mddev->curr_resync_completed = j;
@@ -6929,7 +6925,6 @@ void md_do_sync(mddev_t *mddev)
 		 * about not overloading the IO subsystem. (things like an
 		 * e2fsck being done on the RAID array should execute fast)
 		 */
-		md_unplug(mddev);
 		cond_resched();
 
 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
@@ -6948,8 +6943,6 @@ void md_do_sync(mddev_t *mddev)
 	 * this also signals 'finished resyncing' to md_stop
 	 */
  out:
-	md_unplug(mddev);
-
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
 	/* tell personality that we are finished */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 6d7ddf32ef2e..1cc8ed44e4ad 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -106,36 +106,6 @@ static void multipath_end_request(struct bio *bio, int error)
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	multipath_conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags)
-		    && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void multipath_unplug(struct request_queue *q)
-{
-	unplug_slaves(q->queuedata);
-}
-
-
 static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 {
 	multipath_conf_t *conf = mddev->private;
@@ -518,7 +488,6 @@ static int multipath_run (mddev_t *mddev)
 	 */
 	md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
 
-	mddev->queue->unplug_fn = multipath_unplug;
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	md_integrity_register(mddev);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 637a96855edb..6338c0fe6208 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,21 +25,6 @@
 #include "raid0.h"
 #include "raid5.h"
 
-static void raid0_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	raid0_conf_t *conf = mddev->private;
-	mdk_rdev_t **devlist = conf->devlist;
-	int raid_disks = conf->strip_zone[0].nb_dev;
-	int i;
-
-	for (i=0; i < raid_disks; i++) {
-		struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
-
-		blk_unplug(r_queue);
-	}
-}
-
 static int raid0_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -272,7 +257,6 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
 		       mdname(mddev),
 		       (unsigned long long)smallest->sectors);
 	}
-	mddev->queue->unplug_fn = raid0_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a23ffa397ba9..b67d822d57ae 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,23 +52,16 @@
 #define	NR_RAID1_BIOS 256
 
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
-	r1bio_t *r1_bio;
 	int size = offsetof(r1bio_t, bios[pi->raid_disks]);
 
 	/* allocate a r1bio with room for raid_disks entries in the bios array */
-	r1_bio = kzalloc(size, gfp_flags);
-	if (!r1_bio && pi->mddev)
-		unplug_slaves(pi->mddev);
-
-	return r1_bio;
+	return kzalloc(size, gfp_flags);
 }
 
 static void r1bio_pool_free(void *r1_bio, void *data)
@@ -91,10 +84,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int i, j;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
-	if (!r1_bio) {
-		unplug_slaves(pi->mddev);
+	if (!r1_bio)
 		return NULL;
-	}
 
 	/*
 	 * Allocate bios : 1 for reading, n-1 for writing
@@ -520,37 +511,6 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 	return new_disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i<mddev->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid1_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	unplug_slaves(mddev);
-	md_wakeup_thread(mddev->thread);
-}
-
 static int raid1_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -580,20 +540,16 @@ static int raid1_congested(void *data, int bits)
 }
 
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
-	 * We return 1 if any requests were actually submitted.
 	 */
-	int rv = 0;
-
 	spin_lock_irq(&conf->device_lock);
 
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
-		blk_remove_plug(conf->mddev->queue);
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to
 		 * disk before proceeding w/ I/O */
@@ -605,10 +561,14 @@ static int flush_pending_writes(conf_t *conf)
 			generic_make_request(bio);
 			bio = next;
 		}
-		rv = 1;
 	} else
 		spin_unlock_irq(&conf->device_lock);
-	return rv;
+}
+
+static void md_kick_device(mddev_t *mddev)
+{
+	blk_flush_plug(current);
+	md_wakeup_thread(mddev->thread);
 }
 
 /* Barriers....
@@ -640,8 +600,7 @@ static void raise_barrier(conf_t *conf)
 
 	/* Wait until no block IO is waiting */
 	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
-			    conf->resync_lock,
-			    raid1_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	/* block any new IO from starting */
 	conf->barrier++;
@@ -649,8 +608,7 @@ static void raise_barrier(conf_t *conf)
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock,
-			    raid1_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -672,7 +630,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    raid1_unplug(conf->mddev->queue));
+				    md_kick_device(conf->mddev));
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -709,7 +667,7 @@ static void freeze_array(conf_t *conf)
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
 			    ({ flush_pending_writes(conf);
-			       raid1_unplug(conf->mddev->queue); }));
+			       md_kick_device(conf->mddev); }));
 	spin_unlock_irq(&conf->resync_lock);
 }
 static void unfreeze_array(conf_t *conf)
@@ -959,7 +917,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		atomic_inc(&r1_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
-		blk_plug_device(mddev->queue);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
@@ -968,7 +925,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync)
+	if (do_sync || !bitmap)
 		md_wakeup_thread(mddev->thread);
 
 	return 0;
@@ -1558,7 +1515,6 @@ static void raid1d(mddev_t *mddev)
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	int unplug=0;
 	mdk_rdev_t *rdev;
 
 	md_check_recovery(mddev);
@@ -1566,7 +1522,7 @@ static void raid1d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		unplug += flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -1580,10 +1536,9 @@ static void raid1d(mddev_t *mddev)
 
 		mddev = r1_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
+		if (test_bit(R1BIO_IsSync, &r1_bio->state))
 			sync_request_write(mddev, r1_bio);
-			unplug = 1;
-		} else {
+		else {
 			int disk;
 
 			/* we got a read error. Maybe the drive is bad.  Maybe just
@@ -1633,14 +1588,11 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_end_io = raid1_end_read_request;
 				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r1_bio;
-				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 		cond_resched();
 	}
-	if (unplug)
-		unplug_slaves(mddev);
 }
 
 
@@ -2064,7 +2016,6 @@ static int run(mddev_t *mddev)
 
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
-	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	md_integrity_register(mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3b607b28741b..e79f1c5bf71b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -57,23 +57,16 @@
  */
 #define	NR_RAID10_BIOS 256
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
-	r10bio_t *r10_bio;
 	int size = offsetof(struct r10bio_s, devs[conf->copies]);
 
 	/* allocate a r10bio with room for raid_disks entries in the bios array */
-	r10_bio = kzalloc(size, gfp_flags);
-	if (!r10_bio && conf->mddev)
-		unplug_slaves(conf->mddev);
-
-	return r10_bio;
+	return kzalloc(size, gfp_flags);
 }
 
 static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int nalloc;
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-	if (!r10_bio) {
-		unplug_slaves(conf->mddev);
+	if (!r10_bio)
 		return NULL;
-	}
 
 	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 		nalloc = conf->copies; /* resync */
@@ -597,37 +588,6 @@ rb_out:
 	return disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-	conf_t *conf = mddev->private;
-	int i;
-
-	rcu_read_lock();
-	for (i=0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	unplug_slaves(q->queuedata);
-	md_wakeup_thread(mddev->thread);
-}
-
 static int raid10_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
@@ -649,20 +609,16 @@ static int raid10_congested(void *data, int bits)
 	return ret;
 }
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
-	 * We return 1 if any requests were actually submitted.
 	 */
-	int rv = 0;
-
 	spin_lock_irq(&conf->device_lock);
 
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
-		blk_remove_plug(conf->mddev->queue);
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to disk
 		 * before proceeding w/ I/O */
@@ -674,11 +630,16 @@ static int flush_pending_writes(conf_t *conf)
 			generic_make_request(bio);
 			bio = next;
 		}
-		rv = 1;
 	} else
 		spin_unlock_irq(&conf->device_lock);
-	return rv;
 }
+
+static void md_kick_device(mddev_t *mddev)
+{
+	blk_flush_plug(current);
+	md_wakeup_thread(mddev->thread);
+}
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
@@ -708,8 +669,7 @@ static void raise_barrier(conf_t *conf, int force)
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	/* block any new IO from starting */
 	conf->barrier++;
@@ -717,8 +677,7 @@ static void raise_barrier(conf_t *conf, int force)
 	/* No wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock,
-			    raid10_unplug(conf->mddev->queue));
+			    conf->resync_lock, md_kick_device(conf->mddev));
 
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -739,7 +698,7 @@ static void wait_barrier(conf_t *conf)
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
-				    raid10_unplug(conf->mddev->queue));
+				    md_kick_device(conf->mddev));
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
@@ -776,7 +735,7 @@ static void freeze_array(conf_t *conf)
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
 			    ({ flush_pending_writes(conf);
-			       raid10_unplug(conf->mddev->queue); }));
+			       md_kick_device(conf->mddev); }));
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -971,7 +930,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		atomic_inc(&r10_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
-		blk_plug_device(mddev->queue);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
@@ -988,7 +946,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
-	if (do_sync)
+	if (do_sync || !mddev->bitmap)
 		md_wakeup_thread(mddev->thread);
 
 	return 0;
@@ -1681,7 +1639,6 @@ static void raid10d(mddev_t *mddev)
 	unsigned long flags;
 	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
-	int unplug=0;
 	mdk_rdev_t *rdev;
 
 	md_check_recovery(mddev);
@@ -1689,7 +1646,7 @@ static void raid10d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 
-		unplug += flush_pending_writes(conf);
+		flush_pending_writes(conf);
 
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
@@ -1703,13 +1660,11 @@ static void raid10d(mddev_t *mddev)
 
 		mddev = r10_bio->mddev;
 		conf = mddev->private;
-		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+		if (test_bit(R10BIO_IsSync, &r10_bio->state))
 			sync_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else 	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
-			unplug = 1;
-		} else {
+		else {
 			int mirror;
 			/* we got a read error. Maybe the drive is bad.  Maybe just
 			 * the block and we can fix it.
@@ -1756,14 +1711,11 @@ static void raid10d(mddev_t *mddev)
 				bio->bi_rw = READ | do_sync;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = raid10_end_read_request;
-				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 		cond_resched();
 	}
-	if (unplug)
-		unplug_slaves(mddev);
 }
 
 
@@ -2376,7 +2328,6 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 
-	mddev->queue->unplug_fn = raid10_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 702812824195..e867ee42b152 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -433,8 +433,6 @@ static int has_failed(raid5_conf_t *conf)
 	return 0;
 }
 
-static void unplug_slaves(mddev_t *mddev);
-
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
 		  int previous, int noblock, int noquiesce)
@@ -463,8 +461,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
 						     < (conf->max_nr_stripes *3/4)
 						     || !conf->inactive_blocked),
 						    conf->device_lock,
-						    md_raid5_unplug_device(conf)
-					);
+						    md_raid5_kick_device(conf));
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, previous);
@@ -1473,8 +1470,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    !list_empty(&conf->inactive_list),
 				    conf->device_lock,
-				    unplug_slaves(conf->mddev)
-			);
+				    blk_flush_plug(current));
 		osh = get_free_stripe(conf);
 		spin_unlock_irq(&conf->device_lock);
 		atomic_set(&nsh->count, 1);
@@ -3645,58 +3641,19 @@ static void activate_bit_delay(raid5_conf_t *conf)
 	}
 }
 
-static void unplug_slaves(mddev_t *mddev)
+void md_raid5_kick_device(raid5_conf_t *conf)
 {
-	raid5_conf_t *conf = mddev->private;
-	int i;
-	int devs = max(conf->raid_disks, conf->previous_raid_disks);
-
-	rcu_read_lock();
-	for (i = 0; i < devs; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
-
-			blk_unplug(r_queue);
-
-			rdev_dec_pending(rdev, mddev);
-			rcu_read_lock();
-		}
-	}
-	rcu_read_unlock();
-}
-
-void md_raid5_unplug_device(raid5_conf_t *conf)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&conf->device_lock, flags);
-
-	if (plugger_remove_plug(&conf->plug)) {
-		conf->seq_flush++;
-		raid5_activate_delayed(conf);
-	}
+	blk_flush_plug(current);
+	raid5_activate_delayed(conf);
 	md_wakeup_thread(conf->mddev->thread);
-
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-
-	unplug_slaves(conf->mddev);
 }
-EXPORT_SYMBOL_GPL(md_raid5_unplug_device);
+EXPORT_SYMBOL_GPL(md_raid5_kick_device);
 
 static void raid5_unplug(struct plug_handle *plug)
 {
 	raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
-	md_raid5_unplug_device(conf);
-}
 
-static void raid5_unplug_queue(struct request_queue *q)
-{
-	mddev_t *mddev = q->queuedata;
-	md_raid5_unplug_device(mddev->private);
+	md_raid5_kick_device(conf);
 }
 
 int md_raid5_congested(mddev_t *mddev, int bits)
@@ -4100,7 +4057,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 				 * add failed due to overlap.  Flush everything
 				 * and wait a while
 				 */
-				md_raid5_unplug_device(conf);
+				md_raid5_kick_device(conf);
 				release_stripe(sh);
 				schedule();
 				goto retry;
@@ -4365,7 +4322,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
-		unplug_slaves(mddev);
 
 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
 			end_reshape(conf);
@@ -4569,7 +4525,6 @@ static void raid5d(mddev_t *mddev)
 	spin_unlock_irq(&conf->device_lock);
 
 	async_tx_issue_pending_all();
-	unplug_slaves(mddev);
 
 	pr_debug("--- raid5d inactive\n");
 }
@@ -5205,7 +5160,6 @@ static int run(mddev_t *mddev)
 		mddev->queue->backing_dev_info.congested_data = mddev;
 		mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 		mddev->queue->queue_lock = &conf->device_lock;
-		mddev->queue->unplug_fn = raid5_unplug_queue;
 
 		chunk_size = mddev->chunk_sectors << 9;
 		blk_queue_io_min(mddev->queue, chunk_size);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2ace0582b409..8d563a4f022a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -503,6 +503,6 @@ static inline int algorithm_is_DDF(int layout)
 }
 
 extern int md_raid5_congested(mddev_t *mddev, int bits);
-extern void md_raid5_unplug_device(raid5_conf_t *conf);
+extern void md_raid5_kick_device(raid5_conf_t *conf);
 extern int raid5_set_cache_size(mddev_t *mddev, int size);
 #endif
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index ae7cad185898..b29eb4eaa86e 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -895,11 +895,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 {
 	struct request *req;
 
-	while (!blk_queue_plugged(q)) {
-		req = blk_peek_request(q);
-		if (!req)
-			break;
-
+	while ((req = blk_peek_request(q)) != NULL) {
 		if (req->cmd_type == REQ_TYPE_FS) {
 			struct i2o_block_delayed_request *dreq;
 			struct i2o_block_request *ireq = req->special;
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 4e42d030e097..2ae727568df9 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -55,8 +55,7 @@ static int mmc_queue_thread(void *d)
 
 		spin_lock_irq(q->queue_lock);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!blk_queue_plugged(q))
-			req = blk_fetch_request(q);
+		req = blk_fetch_request(q);
 		mq->req = req;
 		spin_unlock_irq(q->queue_lock);
 
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 794bfd962266..4d2df2f76ea0 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1917,7 +1917,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 		return;
 	}
 	/* Now we try to fetch requests from the request queue */
-	while (!blk_queue_plugged(queue) && (req = blk_peek_request(queue))) {
+	while ((req = blk_peek_request(queue))) {
 		if (basedev->features & DASD_FEATURE_READONLY &&
 		    rq_data_dir(req) == WRITE) {
 			DBF_DEV_EVENT(DBF_ERR, basedev,
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 55d2d0f4eabc..f061b2527b44 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -161,7 +161,6 @@ tapeblock_requeue(struct work_struct *work) {
 
 	spin_lock_irq(&device->blk_data.request_queue_lock);
 	while (
-		!blk_queue_plugged(queue) &&
 		blk_peek_request(queue) &&
 		nr_queued < TAPEBLOCK_MIN_REQUEUE
 	) {
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 998c01be3234..2cefabd5bdb5 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3913,7 +3913,7 @@ fc_bsg_request_handler(struct request_queue *q, struct Scsi_Host *shost,
 	if (!get_device(dev))
 		return;
 
-	while (!blk_queue_plugged(q)) {
+	while (1) {
 		if (rport && (rport->port_state == FC_PORTSTATE_BLOCKED) &&
 		    !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT))
 			break;
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 927e99cb7225..c6fcf76cade5 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -173,11 +173,7 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost,
 	int ret;
 	int (*handler)(struct Scsi_Host *, struct sas_rphy *, struct request *);
 
-	while (!blk_queue_plugged(q)) {
-		req = blk_fetch_request(q);
-		if (!req)
-			break;
-
+	while ((req = blk_fetch_request(q)) != NULL) {
 		spin_unlock_irq(q->queue_lock);
 
 		handler = to_sas_internal(shost->transportt)->f->smp_handler;
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 67f0c09983c8..c1b539d7b0d3 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -392,9 +392,8 @@ static int iblock_do_task(struct se_task *task)
 {
 	struct se_device *dev = task->task_se_cmd->se_dev;
 	struct iblock_req *req = IBLOCK_REQ(task);
-	struct iblock_dev *ibd = (struct iblock_dev *)req->ib_dev;
-	struct request_queue *q = bdev_get_queue(ibd->ibd_bd);
 	struct bio *bio = req->ib_bio, *nbio = NULL;
+	struct blk_plug plug;
 	int rw;
 
 	if (task->task_data_direction == DMA_TO_DEVICE) {
@@ -412,6 +411,7 @@ static int iblock_do_task(struct se_task *task)
 		rw = READ;
 	}
 
+	blk_start_plug(&plug);
 	while (bio) {
 		nbio = bio->bi_next;
 		bio->bi_next = NULL;
@@ -421,9 +421,8 @@ static int iblock_do_task(struct se_task *task)
 		submit_bio(rw, bio);
 		bio = nbio;
 	}
+	blk_finish_plug(&plug);
 
-	if (q->unplug_fn)
-		q->unplug_fn(q);
 	return PYX_TRANSPORT_SENT_TO_TRANSPORT;
 }
 
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 65794b8fe79e..1cc84b276131 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -73,7 +73,6 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
 static const struct address_space_operations adfs_aops = {
 	.readpage	= adfs_readpage,
 	.writepage	= adfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= adfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= _adfs_bmap
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0a90dcd46de2..acf321b70fcd 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,7 +429,6 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations affs_aops = {
 	.readpage = affs_readpage,
 	.writepage = affs_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = affs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = _affs_bmap
@@ -786,7 +785,6 @@ out:
 const struct address_space_operations affs_aops_ofs = {
 	.readpage = affs_readpage_ofs,
 	//.writepage = affs_writepage_ofs,
-	//.sync_page = affs_sync_page_ofs,
 	.write_begin = affs_write_begin_ofs,
 	.write_end = affs_write_end_ofs
 };
diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a9..c5ea494ea9e2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1550,9 +1550,11 @@ static void aio_batch_free(struct hlist_head *batch_hash)
 	struct hlist_node *pos, *n;
 	int i;
 
+	/*
+	 * TODO: kill this
+	 */
 	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
 		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
-			blk_run_address_space(abe->mapping);
 			iput(abe->mapping->host);
 			hlist_del(&abe->list);
 			mempool_free(abe, abe_pool);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b1d0c794747b..06457ed8f3e7 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -75,7 +75,6 @@ static const struct inode_operations befs_dir_inode_operations = {
 
 static const struct address_space_operations befs_aops = {
 	.readpage	= befs_readpage,
-	.sync_page	= block_sync_page,
 	.bmap		= befs_bmap,
 };
 
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index eb67edd0f8ea..f20e8a71062f 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -186,7 +186,6 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations bfs_aops = {
 	.readpage	= bfs_readpage,
 	.writepage	= bfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= bfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= bfs_bmap,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4fb8a3431531..fffc2c672396 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1520,7 +1520,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
 	.writepages	= generic_writepages,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e1aa8d607bc7..ada1f6bd0a57 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -847,7 +847,6 @@ static const struct address_space_operations btree_aops = {
 	.writepages	= btree_writepages,
 	.releasepage	= btree_releasepage,
 	.invalidatepage = btree_invalidatepage,
-	.sync_page	= block_sync_page,
 #ifdef CONFIG_MIGRATION
 	.migratepage	= btree_migratepage,
 #endif
@@ -1330,82 +1329,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	return ret;
 }
 
-/*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-	struct btrfs_device *device;
-	struct btrfs_fs_info *info;
-
-	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
-		if (!device->bdev)
-			continue;
-
-		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi->unplug_io_fn)
-			bdi->unplug_io_fn(bdi, page);
-	}
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-	struct inode *inode;
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
-	struct address_space *mapping;
-	u64 offset;
-
-	/* the generic O_DIRECT read code does this */
-	if (1 || !page) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	/*
-	 * page->mapping may change at any time.  Get a consistent copy
-	 * and use that for everything below
-	 */
-	smp_mb();
-	mapping = page->mapping;
-	if (!mapping)
-		return;
-
-	inode = mapping->host;
-
-	/*
-	 * don't do the expensive searching for a small number of
-	 * devices
-	 */
-	if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	offset = page_offset(page);
-
-	em_tree = &BTRFS_I(inode)->extent_tree;
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-	read_unlock(&em_tree->lock);
-	if (!em) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		free_extent_map(em);
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-	offset = offset - em->start;
-	btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
-			  em->block_start + offset, page);
-	free_extent_map(em);
-}
-
 /*
  * If this fails, caller must call bdi_destroy() to get rid of the
  * bdi again.
@@ -1420,8 +1343,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 		return err;
 
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
-	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
-	bdi->unplug_io_data	= info;
 	bdi->congested_fn	= btrfs_congested_fn;
 	bdi->congested_data	= info;
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fb9bd7832b6d..462e08e724b0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7218,7 +7218,6 @@ static const struct address_space_operations btrfs_aops = {
 	.writepage	= btrfs_writepage,
 	.writepages	= btrfs_writepages,
 	.readpages	= btrfs_readpages,
-	.sync_page	= block_sync_page,
 	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index af7dbca15276..6e0e82a1b188 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -162,7 +162,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	struct bio *cur;
 	int again = 0;
 	unsigned long num_run;
-	unsigned long num_sync_run;
 	unsigned long batch_run = 0;
 	unsigned long limit;
 	unsigned long last_waited = 0;
@@ -173,11 +172,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
-	/* we want to make sure that every time we switch from the sync
-	 * list to the normal list, we unplug
-	 */
-	num_sync_run = 0;
-
 loop:
 	spin_lock(&device->io_lock);
 
@@ -223,15 +217,6 @@ loop_lock:
 
 	spin_unlock(&device->io_lock);
 
-	/*
-	 * if we're doing the regular priority list, make sure we unplug
-	 * for any high prio bios we've sent down
-	 */
-	if (pending_bios == &device->pending_bios && num_sync_run > 0) {
-		num_sync_run = 0;
-		blk_run_backing_dev(bdi, NULL);
-	}
-
 	while (pending) {
 
 		rmb();
@@ -259,19 +244,11 @@ loop_lock:
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 
-		if (cur->bi_rw & REQ_SYNC)
-			num_sync_run++;
-
 		submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
-		if (need_resched()) {
-			if (num_sync_run) {
-				blk_run_backing_dev(bdi, NULL);
-				num_sync_run = 0;
-			}
+		if (need_resched())
 			cond_resched();
-		}
 
 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -304,13 +281,8 @@ loop_lock:
 				 * against it before looping
 				 */
 				last_waited = ioc->last_waited;
-				if (need_resched()) {
-					if (num_sync_run) {
-						blk_run_backing_dev(bdi, NULL);
-						num_sync_run = 0;
-					}
+				if (need_resched())
 					cond_resched();
-				}
 				continue;
 			}
 			spin_lock(&device->io_lock);
@@ -323,22 +295,6 @@ loop_lock:
 		}
 	}
 
-	if (num_sync_run) {
-		num_sync_run = 0;
-		blk_run_backing_dev(bdi, NULL);
-	}
-	/*
-	 * IO has already been through a long path to get here.  Checksumming,
-	 * async helper threads, perhaps compression.  We've done a pretty
-	 * good job of collecting a batch of IO and should just unplug
-	 * the device right away.
-	 *
-	 * This will help anyone who is waiting on the IO, they might have
-	 * already unplugged, but managed to do so before the bio they
-	 * cared about found its way down here.
-	 */
-	blk_run_backing_dev(bdi, NULL);
-
 	cond_resched();
 	if (again)
 		goto loop;
@@ -2948,7 +2904,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_multi_bio **multi_ret,
-			     int mirror_num, struct page *unplug_page)
+			     int mirror_num)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -2980,11 +2936,6 @@ again:
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	read_unlock(&em_tree->lock);
 
-	if (!em && unplug_page) {
-		kfree(multi);
-		return 0;
-	}
-
 	if (!em) {
 		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
 		       (unsigned long long)logical,
@@ -3040,13 +2991,13 @@ again:
 		*length = em->len - offset;
 	}
 
-	if (!multi_ret && !unplug_page)
+	if (!multi_ret)
 		goto out;
 
 	num_stripes = 1;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (unplug_page || (rw & REQ_WRITE))
+		if (rw & REQ_WRITE)
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
@@ -3068,7 +3019,7 @@ again:
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
 
-		if (unplug_page || (rw & REQ_WRITE))
+		if (rw & REQ_WRITE)
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
@@ -3088,22 +3039,10 @@ again:
 	BUG_ON(stripe_index >= map->num_stripes);
 
 	for (i = 0; i < num_stripes; i++) {
-		if (unplug_page) {
-			struct btrfs_device *device;
-			struct backing_dev_info *bdi;
-
-			device = map->stripes[stripe_index].dev;
-			if (device->bdev) {
-				bdi = blk_get_backing_dev_info(device->bdev);
-				if (bdi->unplug_io_fn)
-					bdi->unplug_io_fn(bdi, unplug_page);
-			}
-		} else {
-			multi->stripes[i].physical =
-				map->stripes[stripe_index].physical +
-				stripe_offset + stripe_nr * map->stripe_len;
-			multi->stripes[i].dev = map->stripes[stripe_index].dev;
-		}
+		multi->stripes[i].physical =
+			map->stripes[stripe_index].physical +
+			stripe_offset + stripe_nr * map->stripe_len;
+		multi->stripes[i].dev = map->stripes[stripe_index].dev;
 		stripe_index++;
 	}
 	if (multi_ret) {
@@ -3121,7 +3060,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		      struct btrfs_multi_bio **multi_ret, int mirror_num)
 {
 	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
-				 mirror_num, NULL);
+				 mirror_num);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3189,14 +3128,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
-int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
-		      u64 logical, struct page *page)
-{
-	u64 length = PAGE_CACHE_SIZE;
-	return __btrfs_map_block(map_tree, READ, logical, &length,
-				 NULL, 0, page);
-}
-
 static void end_bio_multi_stripe(struct bio *bio, int err)
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2caf..f903f2e5b4fe 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -54,23 +54,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 }
 EXPORT_SYMBOL(init_buffer);
 
-static int sync_buffer(void *word)
+static int sleep_on_buffer(void *word)
 {
-	struct block_device *bd;
-	struct buffer_head *bh
-		= container_of(word, struct buffer_head, b_state);
-
-	smp_mb();
-	bd = bh->b_bdev;
-	if (bd)
-		blk_run_address_space(bd->bd_inode->i_mapping);
 	io_schedule();
 	return 0;
 }
 
 void __lock_buffer(struct buffer_head *bh)
 {
-	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
+	wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_buffer);
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(unlock_buffer);
  */
 void __wait_on_buffer(struct buffer_head * bh)
 {
-	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
+	wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__wait_on_buffer);
 
@@ -749,7 +741,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
 	struct list_head tmp;
-	struct address_space *mapping, *prev_mapping = NULL;
+	struct address_space *mapping;
 	int err = 0, err2;
 
 	INIT_LIST_HEAD(&tmp);
@@ -783,10 +775,6 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * wait_on_buffer() will do that for us
 				 * through sync_buffer().
 				 */
-				if (prev_mapping && prev_mapping != mapping)
-					blk_run_address_space(prev_mapping);
-				prev_mapping = mapping;
-
 				brelse(bh);
 				spin_lock(lock);
 			}
@@ -3138,17 +3126,6 @@ out:
 }
 EXPORT_SYMBOL(try_to_free_buffers);
 
-void block_sync_page(struct page *page)
-{
-	struct address_space *mapping;
-
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping)
-		blk_run_backing_dev(mapping->backing_dev_info, page);
-}
-EXPORT_SYMBOL(block_sync_page);
-
 /*
  * There are no bdflush tunables left.  But distributions are
  * still running obsolete flush daemons, so we terminate them here.
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e964b1cd5dd0..c27d236738fc 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1569,34 +1569,6 @@ int cifs_fsync(struct file *file, int datasync)
 	return rc;
 }
 
-/* static void cifs_sync_page(struct page *page)
-{
-	struct address_space *mapping;
-	struct inode *inode;
-	unsigned long index = page->index;
-	unsigned int rpages = 0;
-	int rc = 0;
-
-	cFYI(1, "sync page %p", page);
-	mapping = page->mapping;
-	if (!mapping)
-		return 0;
-	inode = mapping->host;
-	if (!inode)
-		return; */
-
-/*	fill in rpages then
-	result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
-
-/*	cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
-
-#if 0
-	if (rc < 0)
-		return rc;
-	return 0;
-#endif
-} */
-
 /*
  * As file closes, flush all cached write data for this inode checking
  * for write behind errors.
@@ -2510,7 +2482,6 @@ const struct address_space_operations cifs_addr_ops = {
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.releasepage = cifs_release_page,
 	.invalidatepage = cifs_invalidate_page,
-	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
 };
 
@@ -2528,6 +2499,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.releasepage = cifs_release_page,
 	.invalidatepage = cifs_invalidate_page,
-	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
 };
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b044705eedd4..df709b3b860a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1110,11 +1110,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	    ((rw & READ) || (dio->result == dio->size)))
 		ret = -EIOCBQUEUED;
 
-	if (ret != -EIOCBQUEUED) {
-		/* All IO is now issued, send it on its way */
-		blk_run_address_space(inode->i_mapping);
+	if (ret != -EIOCBQUEUED)
 		dio_await_completion(dio);
-	}
 
 	/*
 	 * Sync will always be dropping the final ref and completing the
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index a8e7797b9477..9c13412e6c99 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -23,7 +23,6 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
 }
 static const struct address_space_operations efs_aops = {
 	.readpage = efs_readpage,
-	.sync_page = block_sync_page,
 	.bmap = _efs_bmap
 };
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a7555238c41a..82b94c8f5d22 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -795,7 +795,6 @@ const struct address_space_operations exofs_aops = {
 	.direct_IO	= NULL, /* TODO: Should be trivial to do */
 
 	/* With these NULL has special meaning or default is not exported */
-	.sync_page	= NULL,
 	.get_xip_mem	= NULL,
 	.migratepage	= NULL,
 	.launder_page	= NULL,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 40ad210a5049..c47f706878b5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -860,7 +860,6 @@ const struct address_space_operations ext2_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext2_write_begin,
 	.write_end		= ext2_write_end,
 	.bmap			= ext2_bmap,
@@ -880,7 +879,6 @@ const struct address_space_operations ext2_nobh_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_nobh_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext2_nobh_write_begin,
 	.write_end		= nobh_write_end,
 	.bmap			= ext2_bmap,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ae94f6d949f5..fe2541d250e4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1894,7 +1894,6 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.readpage		= ext3_readpage,
 	.readpages		= ext3_readpages,
 	.writepage		= ext3_ordered_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext3_write_begin,
 	.write_end		= ext3_ordered_write_end,
 	.bmap			= ext3_bmap,
@@ -1910,7 +1909,6 @@ static const struct address_space_operations ext3_writeback_aops = {
 	.readpage		= ext3_readpage,
 	.readpages		= ext3_readpages,
 	.writepage		= ext3_writeback_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext3_write_begin,
 	.write_end		= ext3_writeback_write_end,
 	.bmap			= ext3_bmap,
@@ -1926,7 +1924,6 @@ static const struct address_space_operations ext3_journalled_aops = {
 	.readpage		= ext3_readpage,
 	.readpages		= ext3_readpages,
 	.writepage		= ext3_journalled_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext3_write_begin,
 	.write_end		= ext3_journalled_write_end,
 	.set_page_dirty		= ext3_journalled_set_page_dirty,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f7f9e49914f..9297ad46c465 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3903,7 +3903,6 @@ static const struct address_space_operations ext4_ordered_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_ordered_write_end,
 	.bmap			= ext4_bmap,
@@ -3919,7 +3918,6 @@ static const struct address_space_operations ext4_writeback_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_writeback_write_end,
 	.bmap			= ext4_bmap,
@@ -3935,7 +3933,6 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_journalled_write_end,
 	.set_page_dirty		= ext4_journalled_set_page_dirty,
@@ -3951,7 +3948,6 @@ static const struct address_space_operations ext4_da_aops = {
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
 	.writepages		= ext4_da_writepages,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext4_da_write_begin,
 	.write_end		= ext4_da_write_end,
 	.bmap			= ext4_bmap,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd1..f4ff09fb79b1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -236,7 +236,6 @@ static const struct address_space_operations fat_aops = {
 	.readpages	= fat_readpages,
 	.writepage	= fat_writepage,
 	.writepages	= fat_writepages,
-	.sync_page	= block_sync_page,
 	.write_begin	= fat_write_begin,
 	.write_end	= fat_write_end,
 	.direct_IO	= fat_direct_IO,
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 1429f3ae1e86..5d318c44f855 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -44,7 +44,6 @@ static sector_t		vxfs_bmap(struct address_space *, sector_t);
 const struct address_space_operations vxfs_aops = {
 	.readpage =		vxfs_readpage,
 	.bmap =			vxfs_bmap,
-	.sync_page =		block_sync_page,
 };
 
 inline void
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd1..09e8d51eeb64 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -868,7 +868,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 
 	fc->bdi.name = "fuse";
 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-	fc->bdi.unplug_io_fn = default_unplug_io_fn;
 	/* fuse does it's own writeback accounting */
 	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
 
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4f36f8832b9b..2f87ad27efd0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1116,7 +1116,6 @@ static const struct address_space_operations gfs2_writeback_aops = {
 	.writepages = gfs2_writeback_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
-	.sync_page = block_sync_page,
 	.write_begin = gfs2_write_begin,
 	.write_end = gfs2_write_end,
 	.bmap = gfs2_bmap,
@@ -1132,7 +1131,6 @@ static const struct address_space_operations gfs2_ordered_aops = {
 	.writepage = gfs2_ordered_writepage,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
-	.sync_page = block_sync_page,
 	.write_begin = gfs2_write_begin,
 	.write_end = gfs2_write_end,
 	.set_page_dirty = gfs2_set_page_dirty,
@@ -1150,7 +1148,6 @@ static const struct address_space_operations gfs2_jdata_aops = {
 	.writepages = gfs2_jdata_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
-	.sync_page = block_sync_page,
 	.write_begin = gfs2_write_begin,
 	.write_end = gfs2_write_end,
 	.set_page_dirty = gfs2_set_page_dirty,
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939739c7b3f9..a566331db4e1 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -94,7 +94,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 const struct address_space_operations gfs2_meta_aops = {
 	.writepage = gfs2_aspace_writepage,
 	.releasepage = gfs2_releasepage,
-	.sync_page = block_sync_page,
 };
 
 /**
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index dffb4e996643..fff16c968e67 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -150,7 +150,6 @@ static int hfs_writepages(struct address_space *mapping,
 const struct address_space_operations hfs_btree_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= hfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= hfs_bmap,
@@ -160,7 +159,6 @@ const struct address_space_operations hfs_btree_aops = {
 const struct address_space_operations hfs_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= hfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= hfs_bmap,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a8df651747f0..b248a6cfcad9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -146,7 +146,6 @@ static int hfsplus_writepages(struct address_space *mapping,
 const struct address_space_operations hfsplus_btree_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= hfsplus_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= hfsplus_bmap,
@@ -156,7 +155,6 @@ const struct address_space_operations hfsplus_btree_aops = {
 const struct address_space_operations hfsplus_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= hfsplus_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= hfsplus_bmap,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c0340887c7ea..9e84257b3ad5 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -120,7 +120,6 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations hpfs_aops = {
 	.readpage = hpfs_readpage,
 	.writepage = hpfs_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = hpfs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = _hpfs_bmap
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a0f3833c0dbf..3db5ba4568fc 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1158,7 +1158,6 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
 
 static const struct address_space_operations isofs_aops = {
 	.readpage = isofs_readpage,
-	.sync_page = block_sync_page,
 	.bmap = _isofs_bmap
 };
 
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9978803ceedc..eddbb373209e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -352,7 +352,6 @@ const struct address_space_operations jfs_aops = {
 	.readpages	= jfs_readpages,
 	.writepage	= jfs_writepage,
 	.writepages	= jfs_writepages,
-	.sync_page	= block_sync_page,
 	.write_begin	= jfs_write_begin,
 	.write_end	= nobh_write_end,
 	.bmap		= jfs_bmap,
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 48b44bd8267b..6740d34cd82b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -583,7 +583,6 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
 const struct address_space_operations jfs_metapage_aops = {
 	.readpage	= metapage_readpage,
 	.writepage	= metapage_writepage,
-	.sync_page	= block_sync_page,
 	.releasepage	= metapage_releasepage,
 	.invalidatepage	= metapage_invalidatepage,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 723bc5bca09a..1adc8d455f0e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -39,7 +39,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
 	bio.bi_end_io = request_complete;
 
 	submit_bio(rw, &bio);
-	generic_unplug_device(bdev_get_queue(bdev));
 	wait_for_completion(&complete);
 	return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
 }
@@ -168,7 +167,6 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
 	}
 	len = PAGE_ALIGN(len);
 	__bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
-	generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
 }
 
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index ae0b83f476a6..adcdc0a4e182 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -399,7 +399,6 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
 static const struct address_space_operations minix_aops = {
 	.readpage = minix_readpage,
 	.writepage = minix_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = minix_write_begin,
 	.write_end = generic_write_end,
 	.bmap = minix_bmap
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f5286..f4f1c08807ed 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -40,14 +40,10 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
 	nilfs_mapping_init_once(btnc);
 }
 
-static const struct address_space_operations def_btnode_aops = {
-	.sync_page		= block_sync_page,
-};
-
 void nilfs_btnode_cache_init(struct address_space *btnc,
 			     struct backing_dev_info *bdi)
 {
-	nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
+	nilfs_mapping_init(btnc, bdi);
 }
 
 void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index caf9a6a3fb54..1c2a3e23f8b2 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -49,7 +49,6 @@
 #include "ifile.h"
 
 static const struct address_space_operations def_gcinode_aops = {
-	.sync_page		= block_sync_page,
 };
 
 /*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2fd440d8d6b8..c89d5d1ea7c7 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -262,7 +262,6 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 const struct address_space_operations nilfs_aops = {
 	.writepage		= nilfs_writepage,
 	.readpage		= nilfs_readpage,
-	.sync_page		= block_sync_page,
 	.writepages		= nilfs_writepages,
 	.set_page_dirty		= nilfs_set_page_dirty,
 	.readpages		= nilfs_readpages,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f60..3fdb61d79c9a 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -399,7 +399,6 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 
 static const struct address_space_operations def_mdt_aops = {
 	.writepage		= nilfs_mdt_write_page,
-	.sync_page		= block_sync_page,
 };
 
 static const struct inode_operations def_mdt_iops;
@@ -438,10 +437,6 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
 	mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
 }
 
-static const struct address_space_operations shadow_map_aops = {
-	.sync_page		= block_sync_page,
-};
-
 /**
  * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
  * @inode: inode of the metadata file
@@ -455,9 +450,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
 	nilfs_mapping_init_once(&shadow->frozen_data);
-	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
+	nilfs_mapping_init(&shadow->frozen_data, bdi);
 	nilfs_mapping_init_once(&shadow->frozen_btnodes);
-	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
+	nilfs_mapping_init(&shadow->frozen_btnodes, bdi);
 	mi->mi_shadow = shadow;
 	return 0;
 }
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfef..3da37cc5de34 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -506,15 +506,14 @@ void nilfs_mapping_init_once(struct address_space *mapping)
 }
 
 void nilfs_mapping_init(struct address_space *mapping,
-			struct backing_dev_info *bdi,
-			const struct address_space_operations *aops)
+			struct backing_dev_info *bdi)
 {
 	mapping->host = NULL;
 	mapping->flags = 0;
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
 	mapping->assoc_mapping = NULL;
 	mapping->backing_dev_info = bdi;
-	mapping->a_ops = aops;
+	mapping->a_ops = NULL;
 }
 
 /*
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd891..ba4d6fd40b04 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -63,8 +63,7 @@ void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
 void nilfs_mapping_init_once(struct address_space *mapping);
 void nilfs_mapping_init(struct address_space *mapping,
-			struct backing_dev_info *bdi,
-			const struct address_space_operations *aops);
+			struct backing_dev_info *bdi);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
 					    sector_t start_blk,
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index c3c2c7ac9020..0b1e885b8cf8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1543,8 +1543,6 @@ err_out:
  */
 const struct address_space_operations ntfs_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
-	.sync_page	= block_sync_page,	/* Currently, just unplugs the
-						   disk request queue. */
 #ifdef NTFS_RW
 	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
 #endif /* NTFS_RW */
@@ -1560,8 +1558,6 @@ const struct address_space_operations ntfs_aops = {
  */
 const struct address_space_operations ntfs_mst_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
-	.sync_page	= block_sync_page,	/* Currently, just unplugs the
-						   disk request queue. */
 #ifdef NTFS_RW
 	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
 	.set_page_dirty	= __set_page_dirty_nobuffers,	/* Set the page dirty
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6551c7cbad92..ef9ed854255c 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -698,8 +698,7 @@ lock_retry_remap:
 					"uptodate! Unplugging the disk queue "
 					"and rescheduling.");
 			get_bh(tbh);
-			blk_run_address_space(mapping);
-			schedule();
+			io_schedule();
 			put_bh(tbh);
 			if (unlikely(!buffer_uptodate(tbh)))
 				goto read_err;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1fbb0e20131b..daea0359e974 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2043,7 +2043,6 @@ const struct address_space_operations ocfs2_aops = {
 	.write_begin		= ocfs2_write_begin,
 	.write_end		= ocfs2_write_end,
 	.bmap			= ocfs2_bmap,
-	.sync_page		= block_sync_page,
 	.direct_IO		= ocfs2_direct_IO,
 	.invalidatepage		= ocfs2_invalidatepage,
 	.releasepage		= ocfs2_releasepage,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b108e863d8f6..1adab287bd24 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -367,11 +367,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
 static void o2hb_wait_on_io(struct o2hb_region *reg,
 			    struct o2hb_bio_wait_ctxt *wc)
 {
-	struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
-
-	blk_run_address_space(mapping);
 	o2hb_bio_wait_dec(wc, 1);
-
 	wait_for_completion(&wc->wc_io_complete);
 }
 
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 8a6d34fa668a..d738a7e493dd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,6 @@ const struct address_space_operations omfs_aops = {
 	.readpages = omfs_readpages,
 	.writepage = omfs_writepage,
 	.writepages = omfs_writepages,
-	.sync_page = block_sync_page,
 	.write_begin = omfs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = omfs_bmap,
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e63b4171d583..2b0646613f5a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -335,7 +335,6 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 static const struct address_space_operations qnx4_aops = {
 	.readpage	= qnx4_readpage,
 	.writepage	= qnx4_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= qnx4_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= qnx4_bmap
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e2..03674675f886 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3212,7 +3212,6 @@ const struct address_space_operations reiserfs_address_space_operations = {
 	.readpages = reiserfs_readpages,
 	.releasepage = reiserfs_releasepage,
 	.invalidatepage = reiserfs_invalidatepage,
-	.sync_page = block_sync_page,
 	.write_begin = reiserfs_write_begin,
 	.write_end = reiserfs_write_end,
 	.bmap = reiserfs_aop_bmap,
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 9ca66276315e..fa8d43c92bb8 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -488,7 +488,6 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations sysv_aops = {
 	.readpage = sysv_readpage,
 	.writepage = sysv_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = sysv_write_begin,
 	.write_end = generic_write_end,
 	.bmap = sysv_bmap
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6e11c2975dcf..81368d4d4a2c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1979,7 +1979,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	c->bdi.name = "ubifs",
 	c->bdi.capabilities = BDI_CAP_MAP_COPY;
-	c->bdi.unplug_io_fn = default_unplug_io_fn;
 	err  = bdi_init(&c->bdi);
 	if (err)
 		goto out_close;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 89c78486cbbe..94e4553491c9 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -98,7 +98,6 @@ static int udf_adinicb_write_end(struct file *file,
 const struct address_space_operations udf_adinicb_aops = {
 	.readpage	= udf_adinicb_readpage,
 	.writepage	= udf_adinicb_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin = simple_write_begin,
 	.write_end = udf_adinicb_write_end,
 };
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c6a2e782b97b..fa96fc0fe12b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -133,7 +133,6 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations udf_aops = {
 	.readpage	= udf_readpage,
 	.writepage	= udf_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin		= udf_write_begin,
 	.write_end		= generic_write_end,
 	.bmap		= udf_bmap,
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b251f2093af..83b28444eb17 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -588,7 +588,6 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations ufs_aops = {
 	.readpage = ufs_readpage,
 	.writepage = ufs_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = ufs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = ufs_bmap
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index a58f9155fc9a..ff0e79276f2d 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -481,7 +481,7 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
 			break;
 		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
 			ufs_sync_inode (inode);
-		blk_run_address_space(inode->i_mapping);
+		blk_flush_plug(current);
 		yield();
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index ec7bbb5645b6..83c1c20d145a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1495,7 +1495,6 @@ const struct address_space_operations xfs_address_space_operations = {
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
-	.sync_page		= block_sync_page,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378dd..4f8f53c4d42c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -991,7 +991,7 @@ xfs_buf_lock(
 	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
 		xfs_log_force(bp->b_target->bt_mount, 0);
 	if (atomic_read(&bp->b_io_remaining))
-		blk_run_address_space(bp->b_target->bt_mapping);
+		blk_flush_plug(current);
 	down(&bp->b_sema);
 	XB_SET_OWNER(bp);
 
@@ -1035,9 +1035,7 @@ xfs_buf_wait_unpin(
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&bp->b_pin_count) == 0)
 			break;
-		if (atomic_read(&bp->b_io_remaining))
-			blk_run_address_space(bp->b_target->bt_mapping);
-		schedule();
+		io_schedule();
 	}
 	remove_wait_queue(&bp->b_waiters, &wait);
 	set_current_state(TASK_RUNNING);
@@ -1443,7 +1441,7 @@ xfs_buf_iowait(
 	trace_xfs_buf_iowait(bp, _RET_IP_);
 
 	if (atomic_read(&bp->b_io_remaining))
-		blk_run_address_space(bp->b_target->bt_mapping);
+		blk_flush_plug(current);
 	wait_for_completion(&bp->b_iowait);
 
 	trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1667,7 +1665,6 @@ xfs_mapping_buftarg(
 	struct inode		*inode;
 	struct address_space	*mapping;
 	static const struct address_space_operations mapping_aops = {
-		.sync_page = block_sync_page,
 		.migratepage = fail_migrate_page,
 	};
 
@@ -1948,7 +1945,7 @@ xfsbufd(
 			count++;
 		}
 		if (count)
-			blk_run_address_space(target->bt_mapping);
+			blk_flush_plug(current);
 
 	} while (!kthread_should_stop());
 
@@ -1996,7 +1993,7 @@ xfs_flush_buftarg(
 
 	if (wait) {
 		/* Expedite and wait for IO to complete. */
-		blk_run_address_space(target->bt_mapping);
+		blk_flush_plug(current);
 		while (!list_empty(&wait_list)) {
 			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 4ce34fa937d4..96f4094b706d 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -66,8 +66,6 @@ struct backing_dev_info {
 	unsigned int capabilities; /* Device capabilities */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
-	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
-	void *unplug_io_data;
 
 	char *name;
 
@@ -251,7 +249,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 
 extern struct backing_dev_info default_backing_dev_info;
 extern struct backing_dev_info noop_backing_dev_info;
-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);
 
 int writeback_in_progress(struct backing_dev_info *bdi);
 
@@ -336,17 +333,4 @@ static inline int bdi_sched_wait(void *word)
 	return 0;
 }
 
-static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
-				       struct page *page)
-{
-	if (bdi && bdi->unplug_io_fn)
-		bdi->unplug_io_fn(bdi, page);
-}
-
-static inline void blk_run_address_space(struct address_space *mapping)
-{
-	if (mapping)
-		blk_run_backing_dev(mapping->backing_dev_info, NULL);
-}
-
 #endif		/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5873037eeb91..64ab2a1bb167 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -196,7 +196,6 @@ typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
-typedef void (unplug_fn) (struct request_queue *);
 
 struct bio_vec;
 struct bvec_merge_data {
@@ -279,7 +278,6 @@ struct request_queue
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
 	unprep_rq_fn		*unprep_rq_fn;
-	unplug_fn		*unplug_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
@@ -292,14 +290,6 @@ struct request_queue
 	sector_t		end_sector;
 	struct request		*boundary_rq;
 
-	/*
-	 * Auto-unplugging state
-	 */
-	struct timer_list	unplug_timer;
-	int			unplug_thresh;	/* After this many requests */
-	unsigned long		unplug_delay;	/* After this many jiffies */
-	struct work_struct	unplug_work;
-
 	/*
 	 * Delayed queue handling
 	 */
@@ -399,14 +389,13 @@ struct request_queue
 #define QUEUE_FLAG_ASYNCFULL	4	/* write queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
-#define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
-#define QUEUE_FLAG_ELVSWITCH	8	/* don't use elevator, just do FIFO */
-#define QUEUE_FLAG_BIDI		9	/* queue supports bidi requests */
-#define QUEUE_FLAG_NOMERGES    10	/* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP   11	/* force complete on same CPU */
-#define QUEUE_FLAG_FAIL_IO     12	/* fake timeout */
-#define QUEUE_FLAG_STACKABLE   13	/* supports request stacking */
-#define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
+#define QUEUE_FLAG_ELVSWITCH	7	/* don't use elevator, just do FIFO */
+#define QUEUE_FLAG_BIDI		8	/* queue supports bidi requests */
+#define QUEUE_FLAG_NOMERGES     9	/* disable merge attempts */
+#define QUEUE_FLAG_SAME_COMP   10	/* force complete on same CPU */
+#define QUEUE_FLAG_FAIL_IO     11	/* fake timeout */
+#define QUEUE_FLAG_STACKABLE   12	/* supports request stacking */
+#define QUEUE_FLAG_NONROT      13	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT     15	/* do IO stats */
 #define QUEUE_FLAG_DISCARD     16	/* supports DISCARD */
@@ -484,7 +473,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 	__clear_bit(flag, &q->queue_flags);
 }
 
-#define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
@@ -679,9 +667,6 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 extern void blk_rq_unprep_clone(struct request *rq);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
-extern void blk_plug_device(struct request_queue *);
-extern void blk_plug_device_unlocked(struct request_queue *);
-extern int blk_remove_plug(struct request_queue *);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
@@ -726,7 +711,6 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
-extern void blk_unplug(struct request_queue *q);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
@@ -863,7 +847,6 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-extern void generic_unplug_device(struct request_queue *);
 extern long nr_blockdev_pages(void);
 
 int blk_get_queue(struct request_queue *);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 68d1fe7b877c..f5df23561b96 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -219,7 +219,6 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 				get_block_t get_block);
-void block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
 int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned,
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 272496d1fae4..e2768834f397 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -285,11 +285,6 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback
  */
 int dm_table_complete(struct dm_table *t);
 
-/*
- * Unplug all devices in a table.
- */
-void dm_table_unplug_all(struct dm_table *t);
-
 /*
  * Table reference counting.
  */
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 8857cf9adbb7..ec6f72b84477 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -20,7 +20,6 @@ typedef void (elevator_bio_merged_fn) (struct request_queue *,
 typedef int (elevator_dispatch_fn) (struct request_queue *, int);
 
 typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
-typedef int (elevator_queue_empty_fn) (struct request_queue *);
 typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
 typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
 typedef int (elevator_may_queue_fn) (struct request_queue *, int);
@@ -46,7 +45,6 @@ struct elevator_ops
 	elevator_activate_req_fn *elevator_activate_req_fn;
 	elevator_deactivate_req_fn *elevator_deactivate_req_fn;
 
-	elevator_queue_empty_fn *elevator_queue_empty_fn;
 	elevator_completed_req_fn *elevator_completed_req_fn;
 
 	elevator_request_list_fn *elevator_former_req_fn;
@@ -101,8 +99,8 @@ struct elevator_queue
  */
 extern void elv_dispatch_sort(struct request_queue *, struct request *);
 extern void elv_dispatch_add_tail(struct request_queue *, struct request *);
-extern void elv_add_request(struct request_queue *, struct request *, int, int);
-extern void __elv_add_request(struct request_queue *, struct request *, int, int);
+extern void elv_add_request(struct request_queue *, struct request *, int);
+extern void __elv_add_request(struct request_queue *, struct request *, int);
 extern void elv_insert(struct request_queue *, struct request *, int);
 extern int elv_merge(struct request_queue *, struct request **, struct bio *);
 extern int elv_try_merge(struct request *, struct bio *);
@@ -112,7 +110,6 @@ extern void elv_merged_request(struct request_queue *, struct request *, int);
 extern void elv_bio_merged(struct request_queue *q, struct request *,
 				struct bio *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
-extern int elv_queue_empty(struct request_queue *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
 extern int elv_register_queue(struct request_queue *q);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bd3215940c37..9f2cf69911b8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -583,7 +583,6 @@ typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
-	void (*sync_page)(struct page *);
 
 	/* Write back some dirty pages from this mapping. */
 	int (*writepages)(struct address_space *, struct writeback_control *);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 9c66e994540f..e112b8db2f3c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -298,7 +298,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 
 extern void __lock_page(struct page *page);
 extern int __lock_page_killable(struct page *page);
-extern void __lock_page_nosync(struct page *page);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
 extern void unlock_page(struct page *page);
@@ -341,17 +340,6 @@ static inline int lock_page_killable(struct page *page)
 	return 0;
 }
 
-/*
- * lock_page_nosync should only be used if we can't pin the page's inode.
- * Doesn't play quite so well with block device plugging.
- */
-static inline void lock_page_nosync(struct page *page)
-{
-	might_sleep();
-	if (!trylock_page(page))
-		__lock_page_nosync(page);
-}
-	
 /*
  * lock_page_or_retry - Lock the page, unless this would block and the
  * caller indicated that it can handle a retry.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4d559325d919..9ee321833b21 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -299,8 +299,6 @@ extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
 					struct page **pagep, swp_entry_t *ent);
 #endif
 
-extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
-
 #ifdef CONFIG_SWAP
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d30227..c91e139a652e 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@
 
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-EXPORT_SYMBOL(default_unplug_io_fn);
-
 struct backing_dev_info default_backing_dev_info = {
 	.name		= "default",
 	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
 	.state		= 0,
 	.capabilities	= BDI_CAP_MAP_COPY,
-	.unplug_io_fn	= default_unplug_io_fn,
 };
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468b..380776c2a9ac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -155,45 +155,15 @@ void remove_from_page_cache(struct page *page)
 }
 EXPORT_SYMBOL(remove_from_page_cache);
 
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
 {
-	struct address_space *mapping;
-	struct page *page;
-
-	page = container_of((unsigned long *)word, struct page, flags);
-
-	/*
-	 * page_mapping() is being called without PG_locked held.
-	 * Some knowledge of the state and use of the page is used to
-	 * reduce the requirements down to a memory barrier.
-	 * The danger here is of a stale page_mapping() return value
-	 * indicating a struct address_space different from the one it's
-	 * associated with when it is associated with one.
-	 * After smp_mb(), it's either the correct page_mapping() for
-	 * the page, or an old page_mapping() and the page's own
-	 * page_mapping() has gone NULL.
-	 * The ->sync_page() address_space operation must tolerate
-	 * page_mapping() going NULL. By an amazing coincidence,
-	 * this comes about because none of the users of the page
-	 * in the ->sync_page() methods make essential use of the
-	 * page_mapping(), merely passing the page down to the backing
-	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page_private(page) is
-	 * of interest. When page_mapping() does go NULL, the entire
-	 * call stack gracefully ignores the page and returns.
-	 * -- wli
-	 */
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		mapping->a_ops->sync_page(page);
 	io_schedule();
 	return 0;
 }
 
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
 {
-	sync_page(word);
+	sleep_on_page(word);
 	return fatal_signal_pending(current) ? -EINTR : 0;
 }
 
@@ -479,12 +449,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -512,7 +476,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
 	if (test_bit(bit_nr, &page->flags))
-		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
+		__wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
@@ -576,17 +540,12 @@ EXPORT_SYMBOL(end_page_writeback);
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
 void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+	__wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
@@ -596,24 +555,10 @@ int __lock_page_killable(struct page *page)
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	return __wait_on_bit_lock(page_waitqueue(page), &wait,
-					sync_page_killable, TASK_KILLABLE);
+					sleep_on_page_killable, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
-	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-							TASK_UNINTERRUPTIBLE);
-}
-
 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 			 unsigned int flags)
 {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0207c2f6f8bd..bfba796d374d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -945,7 +945,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 		collect_procs(ppage, &tokill);
 
 	if (hpage != ppage)
-		lock_page_nosync(ppage);
+		lock_page(ppage);
 
 	ret = try_to_unmap(ppage, ttu);
 	if (ret != SWAP_SUCCESS)
@@ -1038,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 			 * Check "just unpoisoned", "filter hit", and
 			 * "race with other subpage."
 			 */
-			lock_page_nosync(hpage);
+			lock_page(hpage);
 			if (!PageHWPoison(hpage)
 			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
 			    || (p != hpage && TestSetPageHWPoison(hpage))) {
@@ -1088,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
 	 */
-	lock_page_nosync(hpage);
+	lock_page(hpage);
 
 	/*
 	 * unpoison always clear PG_hwpoison inside page lock
@@ -1231,7 +1231,7 @@ int unpoison_memory(unsigned long pfn)
 		return 0;
 	}
 
-	lock_page_nosync(page);
+	lock_page(page);
 	/*
 	 * This test is racy because PG_hwpoison is set outside of page lock.
 	 * That's acceptable because that won't trigger kernel panic. Instead,
diff --git a/mm/nommu.c b/mm/nommu.c
index f59e1424d3db..fb6cbd6abe16 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1842,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 }
 EXPORT_SYMBOL(remap_vmalloc_range);
 
-void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-
 unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long pgoff, unsigned long flags)
 {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2cb01f6ec5d0..cc0ede169e41 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1239,7 +1239,7 @@ int set_page_dirty_lock(struct page *page)
 {
 	int ret;
 
-	lock_page_nosync(page);
+	lock_page(page);
 	ret = set_page_dirty(page);
 	unlock_page(page);
 	return ret;
diff --git a/mm/readahead.c b/mm/readahead.c
index 77506a291a2d..cbddc3e17246 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -554,17 +554,5 @@ page_cache_async_readahead(struct address_space *mapping,
 
 	/* do read-ahead */
 	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
-
-#ifdef CONFIG_BLOCK
-	/*
-	 * Normally the current page is !uptodate and lock_page() will be
-	 * immediately called to implicitly unplug the device. However this
-	 * is not always true for RAID conifgurations, where data arrives
-	 * not strictly in their submission order. In this case we need to
-	 * explicitly kick off the IO.
-	 */
-	if (PageUptodate(page))
-		blk_run_backing_dev(mapping->backing_dev_info, NULL);
-#endif
 }
 EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ee67c990602..24d23f5bedf1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops;
 static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-	.unplug_io_fn	= default_unplug_io_fn,
 };
 
 static LIST_HEAD(shmem_swaplist);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5c8cfabbc9bc..46680461785b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,12 +24,10 @@
 
 /*
  * swapper_space is a fiction, retained to simplify the path through
- * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
- * future use of radix_tree tags in the swap cache.
+ * vmscan's shrink_page_list.
  */
 static const struct address_space_operations swap_aops = {
 	.writepage	= swap_writepage,
-	.sync_page	= block_sync_page,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.migratepage	= migrate_page,
 };
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = {
 static struct backing_dev_info swap_backing_dev_info = {
 	.name		= "swap",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
 struct address_space swapper_space = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 07a458d72fa8..7ceea78ceb20 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -94,39 +94,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
 	return ret;
 }
 
-/*
- * We need this because the bdev->unplug_fn can sleep and we cannot
- * hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a mutex.
- */
-static DECLARE_RWSEM(swap_unplug_sem);
-
-void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
-{
-	swp_entry_t entry;
-
-	down_read(&swap_unplug_sem);
-	entry.val = page_private(page);
-	if (PageSwapCache(page)) {
-		struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
-		struct backing_dev_info *bdi;
-
-		/*
-		 * If the page is removed from swapcache from under us (with a
-		 * racy try_to_unuse/swapoff) we need an additional reference
-		 * count to avoid reading garbage from page_private(page) above.
-		 * If the WARN_ON triggers during a swapoff it maybe the race
-		 * condition and it's harmless. However if it triggers without
-		 * swapoff it signals a problem.
-		 */
-		WARN_ON(page_count(page) <= 1);
-
-		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
-		blk_run_backing_dev(bdi, page);
-	}
-	up_read(&swap_unplug_sem);
-}
-
 /*
  * swapon tell device that all the old swap contents can be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -1643,10 +1610,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		goto out_dput;
 	}
 
-	/* wait for any unplug function to finish */
-	down_write(&swap_unplug_sem);
-	up_write(&swap_unplug_sem);
-
 	destroy_swap_extents(p);
 	if (p->flags & SWP_CONTINUED)
 		free_swap_count_continuations(p);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 17497d0cd8b9..251bed73ac03 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -358,7 +358,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
 static void handle_write_error(struct address_space *mapping,
 				struct page *page, int error)
 {
-	lock_page_nosync(page);
+	lock_page(page);
 	if (page_mapping(page) == mapping)
 		mapping_set_error(mapping, error);
 	unlock_page(page);

From 55602dd66f5353981b53f734e26c307f310ced37 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 24 Jun 2010 15:05:37 +0200
Subject: [PATCH 39/65] fs: make generic file read/write functions plug

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 mm/filemap.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mm/filemap.c b/mm/filemap.c
index 380776c2a9ac..f9a29c87a2cf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1243,12 +1243,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
+	struct blk_plug plug;
 
 	count = 0;
 	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
 	if (retval)
 		return retval;
 
+	blk_start_plug(&plug);
+
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
 		loff_t size;
@@ -1321,6 +1324,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			break;
 	}
 out:
+	blk_finish_plug(&plug);
 	return retval;
 }
 EXPORT_SYMBOL(generic_file_aio_read);
@@ -2432,11 +2436,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
@@ -2447,6 +2453,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_aio_write);

From 5b417b1873694ece3291d7f64a943304559a817b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 19 Apr 2010 10:04:38 +0200
Subject: [PATCH 40/65] read-ahead: use plugging

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 mm/readahead.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mm/readahead.c b/mm/readahead.c
index cbddc3e17246..2c0cc489e288 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages);
 static int read_pages(struct address_space *mapping, struct file *filp,
 		struct list_head *pages, unsigned nr_pages)
 {
+	struct blk_plug plug;
 	unsigned page_idx;
 	int ret;
 
+	blk_start_plug(&plug);
+
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 		/* Clean up the remaining pages */
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 		page_cache_release(page);
 	}
 	ret = 0;
+
 out:
+	blk_finish_plug(&plug);
+
 	return ret;
 }
 

From 2ed1a6bcf97a7ed787e6799bf2e80b9e6f51dca7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Tue, 22 Jun 2010 12:52:14 +0200
Subject: [PATCH 41/65] fs: make mpage read/write_pages() plug

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/mpage.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/mpage.c b/fs/mpage.c
index d78455a81ec9..0afc809e46e0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -364,6 +364,9 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	sector_t last_block_in_bio = 0;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
 
 	map_bh.b_state = 0;
 	map_bh.b_size = 0;
@@ -385,6 +388,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		mpage_bio_submit(READ, bio);
+	blk_finish_plug(&plug);
 	return 0;
 }
 EXPORT_SYMBOL(mpage_readpages);
@@ -666,8 +670,11 @@ int
 mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block)
 {
+	struct blk_plug plug;
 	int ret;
 
+	blk_start_plug(&plug);
+
 	if (!get_block)
 		ret = generic_writepages(mapping, wbc);
 	else {
@@ -682,6 +689,7 @@ mpage_writepages(struct address_space *mapping,
 		if (mpd.bio)
 			mpage_bio_submit(WRITE, mpd.bio);
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(mpage_writepages);

From 9f5b9425468c85a901d863d241ba5c5dff9b23b8 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 1 Jul 2010 07:55:01 +0200
Subject: [PATCH 42/65] fs: make aio plug

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/aio.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/aio.c b/fs/aio.c
index c5ea494ea9e2..1476bed1c5fb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1660,6 +1660,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 	long ret = 0;
 	int i;
 	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
+	struct blk_plug plug;
 
 	if (unlikely(nr < 0))
 		return -EINVAL;
@@ -1676,6 +1677,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 		return -EINVAL;
 	}
 
+	blk_start_plug(&plug);
+
 	/*
 	 * AKPM: should this return a partial result if some of the IOs were
 	 * successfully submitted?
@@ -1698,6 +1701,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 		if (ret)
 			break;
 	}
+	blk_finish_plug(&plug);
 	aio_batch_free(batch_hash);
 
 	put_ioctx(ctx);

From cf15900e1209d5b46ec2d24643adbf561830935f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 2 Mar 2011 20:12:18 -0500
Subject: [PATCH 43/65] aio: remove request submission batching

This should be useless now that we have on-stack plugging. So lets just
kill it.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/aio.c | 75 +++-----------------------------------------------------
 1 file changed, 3 insertions(+), 72 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 1476bed1c5fb..020de5cb4a67 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -34,8 +34,6 @@
 #include <linux/security.h>
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
-#include <linux/mempool.h>
-#include <linux/hash.h>
 #include <linux/compat.h>
 
 #include <asm/kmap_types.h>
@@ -65,14 +63,6 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
-#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
-#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
-struct aio_batch_entry {
-	struct hlist_node list;
-	struct address_space *mapping;
-};
-mempool_t *abe_pool;
-
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
 
@@ -86,8 +76,7 @@ static int __init aio_setup(void)
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	aio_wq = create_workqueue("aio");
-	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
-	BUG_ON(!aio_wq || !abe_pool);
+	BUG_ON(!aio_wq);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -1512,59 +1501,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
 	return 0;
 }
 
-static void aio_batch_add(struct address_space *mapping,
-			  struct hlist_head *batch_hash)
-{
-	struct aio_batch_entry *abe;
-	struct hlist_node *pos;
-	unsigned bucket;
-
-	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
-	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
-		if (abe->mapping == mapping)
-			return;
-	}
-
-	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-
-	/*
-	 * we should be using igrab here, but
-	 * we don't want to hammer on the global
-	 * inode spinlock just to take an extra
-	 * reference on a file that we must already
-	 * have a reference to.
-	 *
-	 * When we're called, we always have a reference
-	 * on the file, so we must always have a reference
-	 * on the inode, so ihold() is safe here.
-	 */
-	ihold(mapping->host);
-	abe->mapping = mapping;
-	hlist_add_head(&abe->list, &batch_hash[bucket]);
-	return;
-}
-
-static void aio_batch_free(struct hlist_head *batch_hash)
-{
-	struct aio_batch_entry *abe;
-	struct hlist_node *pos, *n;
-	int i;
-
-	/*
-	 * TODO: kill this
-	 */
-	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
-		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
-			iput(abe->mapping->host);
-			hlist_del(&abe->list);
-			mempool_free(abe, abe_pool);
-		}
-	}
-}
-
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, struct hlist_head *batch_hash,
-			 bool compat)
+			 struct iocb *iocb, bool compat)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1638,11 +1576,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			;
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
-	if (req->ki_opcode == IOCB_CMD_PREAD ||
-	    req->ki_opcode == IOCB_CMD_PREADV ||
-	    req->ki_opcode == IOCB_CMD_PWRITE ||
-	    req->ki_opcode == IOCB_CMD_PWRITEV)
-		aio_batch_add(file->f_mapping, batch_hash);
 
 	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
@@ -1659,7 +1592,6 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 	struct kioctx *ctx;
 	long ret = 0;
 	int i;
-	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 	struct blk_plug plug;
 
 	if (unlikely(nr < 0))
@@ -1697,12 +1629,11 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
+		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
 		if (ret)
 			break;
 	}
 	blk_finish_plug(&plug);
-	aio_batch_free(batch_hash);
 
 	put_ioctx(ctx);
 	return i ? i : ret;

From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 9 Mar 2011 11:56:30 +0100
Subject: [PATCH 44/65] block: kill off REQ_UNPLUG

With the plugging now being explicitly controlled by the
submitter, callers need not pass down unplugging hints
to the block layer. If they want to unplug, it's because they
manually plugged on their own - in which case, they should just
unplug at will.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c                   |  2 +-
 drivers/block/drbd/drbd_actlog.c   |  2 +-
 drivers/block/drbd/drbd_int.h      |  2 +-
 drivers/block/drbd/drbd_main.c     |  3 +--
 drivers/block/drbd/drbd_receiver.c |  9 +--------
 drivers/md/bitmap.c                |  2 +-
 drivers/md/dm-io.c                 |  2 +-
 drivers/md/dm-kcopyd.c             |  5 +----
 drivers/md/md.c                    |  5 ++---
 fs/btrfs/extent_io.c               |  2 +-
 fs/buffer.c                        | 14 ++++----------
 fs/direct-io.c                     |  2 +-
 fs/ext4/page-io.c                  |  3 +--
 fs/gfs2/log.c                      |  4 ++--
 fs/gfs2/lops.c                     | 12 ++++++------
 fs/gfs2/meta_io.c                  |  2 +-
 fs/jbd/commit.c                    |  2 +-
 fs/jbd2/commit.c                   |  6 +++---
 fs/nilfs2/segbuf.c                 |  2 +-
 fs/xfs/linux-2.6/xfs_aops.c        |  3 +--
 include/linux/blk_types.h          |  2 --
 include/linux/fs.h                 | 28 +++++++++-------------------
 kernel/power/block_io.c            |  2 +-
 mm/page_io.c                       |  2 +-
 24 files changed, 43 insertions(+), 75 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 82a45898ba76..7e9715ae18c8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1290,7 +1290,7 @@ get_rq:
 	}
 
 	plug = current->plug;
-	if (plug && !sync) {
+	if (plug) {
 		if (!plug->should_sort && !list_empty(&plug->list)) {
 			struct request *__rq;
 
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 2096628d6e65..aca302492ff2 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -80,7 +80,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 
 	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
 		rw |= REQ_FUA;
-	rw |= REQ_UNPLUG | REQ_SYNC;
+	rw |= REQ_SYNC;
 
 	bio = bio_alloc(GFP_NOIO, 1);
 	bio->bi_bdev = bdev->md_bdev;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 0b5718e19586..b0bd27dfc1e8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -377,7 +377,7 @@ union p_header {
 #define DP_HARDBARRIER	      1 /* depricated */
 #define DP_RW_SYNC	      2 /* equals REQ_SYNC    */
 #define DP_MAY_SET_IN_SYNC    4
-#define DP_UNPLUG             8 /* equals REQ_UNPLUG  */
+#define DP_UNPLUG             8 /* not used anymore   */
 #define DP_FUA               16 /* equals REQ_FUA     */
 #define DP_FLUSH             32 /* equals REQ_FLUSH   */
 #define DP_DISCARD           64 /* equals REQ_DISCARD */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 6049cb85310d..8a43ce0edeed 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2477,12 +2477,11 @@ static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
 {
 	if (mdev->agreed_pro_version >= 95)
 		return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
-			(bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
 			(bi_rw & REQ_FUA ? DP_FUA : 0) |
 			(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
 			(bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
 	else
-		return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
+		return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
 }
 
 /* Used to send write requests
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 84132f8bf8a4..8e68be939deb 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1100,8 +1100,6 @@ next_bio:
 	/* > e->sector, unless this is the first bio */
 	bio->bi_sector = sector;
 	bio->bi_bdev = mdev->ldev->backing_bdev;
-	/* we special case some flags in the multi-bio case, see below
-	 * (REQ_UNPLUG) */
 	bio->bi_rw = rw;
 	bio->bi_private = e;
 	bio->bi_end_io = drbd_endio_sec;
@@ -1130,10 +1128,6 @@ next_bio:
 		bios = bios->bi_next;
 		bio->bi_next = NULL;
 
-		/* strip off REQ_UNPLUG unless it is the last bio */
-		if (bios)
-			bio->bi_rw &= ~REQ_UNPLUG;
-
 		drbd_generic_make_request(mdev, fault_type, bio);
 	} while (bios);
 	return 0;
@@ -1621,12 +1615,11 @@ static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
 {
 	if (mdev->agreed_pro_version >= 95)
 		return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
-			(dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
 			(dpf & DP_FUA ? REQ_FUA : 0) |
 			(dpf & DP_FLUSH ? REQ_FUA : 0) |
 			(dpf & DP_DISCARD ? REQ_DISCARD : 0);
 	else
-		return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
+		return dpf & DP_RW_SYNC ? REQ_SYNC : 0;
 }
 
 /* mirrored write */
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 54bfc274b39a..ca203cb23f3c 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -347,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 			atomic_inc(&bitmap->pending_writes);
 			set_buffer_locked(bh);
 			set_buffer_mapped(bh);
-			submit_bh(WRITE | REQ_UNPLUG | REQ_SYNC, bh);
+			submit_bh(WRITE | REQ_SYNC, bh);
 			bh = bh->b_this_page;
 		}
 
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 136d4f71a116..76a5af00a26b 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -352,7 +352,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	BUG_ON(num_regions > DM_IO_MAX_REGIONS);
 
 	if (sync)
-		rw |= REQ_SYNC | REQ_UNPLUG;
+		rw |= REQ_SYNC;
 
 	/*
 	 * For multiple regions we need to be careful to rewind
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 400cf35094a4..1bb73a13ca40 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -356,11 +356,8 @@ static int run_io_job(struct kcopyd_job *job)
 
 	if (job->rw == READ)
 		r = dm_io(&io_req, 1, &job->source, NULL);
-	else {
-		if (job->num_dests > 1)
-			io_req.bi_rw |= REQ_UNPLUG;
+	else
 		r = dm_io(&io_req, job->num_dests, job->dests, NULL);
-	}
 
 	return r;
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ca0d79c264b9..28f9c1ee4e3a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -777,8 +777,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	bio->bi_end_io = super_written;
 
 	atomic_inc(&mddev->pending_writes);
-	submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
-		   bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
 }
 
 void md_super_wait(mddev_t *mddev)
@@ -806,7 +805,7 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
 	struct completion event;
 	int ret;
 
-	rw |= REQ_SYNC | REQ_UNPLUG;
+	rw |= REQ_SYNC;
 
 	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
 		rdev->meta_bdev : rdev->bdev;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 92ac5192c518..b76f7cd47401 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2182,7 +2182,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	unsigned long nr_written = 0;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		write_flags = WRITE_SYNC_PLUG;
+		write_flags = WRITE_SYNC;
 	else
 		write_flags = WRITE;
 
diff --git a/fs/buffer.c b/fs/buffer.c
index f903f2e5b4fe..42534f67d71b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -767,7 +767,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * still in flight on potentially older
 				 * contents.
 				 */
-				write_dirty_buffer(bh, WRITE_SYNC_PLUG);
+				write_dirty_buffer(bh, WRITE_SYNC);
 
 				/*
 				 * Kick off IO for the previous mapping. Note
@@ -1602,14 +1602,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
  * prevents this contention from occurring.
  *
  * If block_write_full_page() is called with wbc->sync_mode ==
- * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
- * causes the writes to be flagged as synchronous writes, but the
- * block device queue will NOT be unplugged, since usually many pages
- * will be pushed to the out before the higher-level caller actually
- * waits for the writes to be completed.  The various wait functions,
- * such as wait_on_writeback_range() will ultimately call sync_page()
- * which will ultimately call blk_run_backing_dev(), which will end up
- * unplugging the device queue.
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * causes the writes to be flagged as synchronous writes.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
 			get_block_t *get_block, struct writeback_control *wbc,
@@ -1622,7 +1616,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC_PLUG : WRITE);
+			WRITE_SYNC : WRITE);
 
 	BUG_ON(!PageLocked(page));
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index df709b3b860a..426083136099 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1173,7 +1173,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct dio *dio;
 
 	if (rw & WRITE)
-		rw = WRITE_ODIRECT_PLUG;
+		rw = WRITE_ODIRECT;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 955cc309142f..e2cd90e4bb7c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -310,8 +310,7 @@ static int io_submit_init(struct ext4_io_submit *io,
 	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
 
 	io->io_bio = bio;
-	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC_PLUG : WRITE);
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
 	io->io_next_block = bh->b_blocknr;
 	return 0;
 }
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eb01f3575e10..7f1c11202342 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -121,7 +121,7 @@ __acquires(&sdp->sd_log_lock)
 			lock_buffer(bh);
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
-				submit_bh(WRITE_SYNC_PLUG, bh);
+				submit_bh(WRITE_SYNC, bh);
 			} else {
 				unlock_buffer(bh);
 				brelse(bh);
@@ -647,7 +647,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
 		lock_buffer(bh);
 		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
 			bh->b_end_io = end_buffer_write_sync;
-			submit_bh(WRITE_SYNC_PLUG, bh);
+			submit_bh(WRITE_SYNC, bh);
 		} else {
 			unlock_buffer(bh);
 			brelse(bh);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index bf33f822058d..48b545a1979a 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -200,7 +200,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 		}
 
 		gfs2_log_unlock(sdp);
-		submit_bh(WRITE_SYNC_PLUG, bh);
+		submit_bh(WRITE_SYNC, bh);
 		gfs2_log_lock(sdp);
 
 		n = 0;
@@ -210,7 +210,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 			gfs2_log_unlock(sdp);
 			lock_buffer(bd2->bd_bh);
 			bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
-			submit_bh(WRITE_SYNC_PLUG, bh);
+			submit_bh(WRITE_SYNC, bh);
 			gfs2_log_lock(sdp);
 			if (++n >= num)
 				break;
@@ -352,7 +352,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 		sdp->sd_log_num_revoke--;
 
 		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
-			submit_bh(WRITE_SYNC_PLUG, bh);
+			submit_bh(WRITE_SYNC, bh);
 
 			bh = gfs2_log_get_buf(sdp);
 			mh = (struct gfs2_meta_header *)bh->b_data;
@@ -369,7 +369,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	}
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
 
-	submit_bh(WRITE_SYNC_PLUG, bh);
+	submit_bh(WRITE_SYNC, bh);
 }
 
 static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -571,7 +571,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	ptr = bh_log_ptr(bh);
 	
 	get_bh(bh);
-	submit_bh(WRITE_SYNC_PLUG, bh);
+	submit_bh(WRITE_SYNC, bh);
 	gfs2_log_lock(sdp);
 	while(!list_empty(list)) {
 		bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -597,7 +597,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
 		} else {
 			bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
 		}
-		submit_bh(WRITE_SYNC_PLUG, bh1);
+		submit_bh(WRITE_SYNC, bh1);
 		gfs2_log_lock(sdp);
 		ptr += 2;
 	}
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index a566331db4e1..867b713cba92 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
 	int write_op = REQ_META |
-		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE);
+		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!page_has_buffers(page));
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 34a4861c14b8..66be299acb1b 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -333,7 +333,7 @@ void journal_commit_transaction(journal_t *journal)
 	 * instead we rely on sync_buffer() doing the unplug for us.
 	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC_PLUG;
+		write_op = WRITE_SYNC;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f3ad1598b201..3da1cc4346d5 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -137,9 +137,9 @@ static int journal_submit_commit_record(journal_t *journal,
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
+		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
 	else
-		ret = submit_bh(WRITE_SYNC_PLUG, bh);
+		ret = submit_bh(WRITE_SYNC, bh);
 
 	*cbh = bh;
 	return ret;
@@ -369,7 +369,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * instead we rely on sync_buffer() doing the unplug for us.
 	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC_PLUG;
+		write_op = WRITE_SYNC;
 	trace_jbd2_commit_locking(journal, commit_transaction);
 	stats.run.rs_wait = commit_transaction->t_max_wait;
 	stats.run.rs_locked = jiffies;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 0f83e93935b2..2853ff20f85a 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -509,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 		 * Last BIO is always sent through the following
 		 * submission.
 		 */
-		rw |= REQ_SYNC | REQ_UNPLUG;
+		rw |= REQ_SYNC;
 		res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 83c1c20d145a..6bbb0ee33253 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -413,8 +413,7 @@ xfs_submit_ioend_bio(
 	if (xfs_ioend_new_eof(ioend))
 		xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
 
-	submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
-		   WRITE_SYNC_PLUG : WRITE, bio);
+	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
 }
 
 STATIC struct bio *
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 16b286473042..be50d9e70a7d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -128,7 +128,6 @@ enum rq_flag_bits {
 	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
 
 	/* bio only flags */
-	__REQ_UNPLUG,		/* unplug the immediately after submission */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_THROTTLED,	/* This bio has already been subjected to
 				 * throttling rules. Don't do it again. */
@@ -172,7 +171,6 @@ enum rq_flag_bits {
 	 REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
-#define REQ_UNPLUG		(1 << __REQ_UNPLUG)
 #define REQ_RAHEAD		(1 << __REQ_RAHEAD)
 #define REQ_THROTTLED		(1 << __REQ_THROTTLED)
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9f2cf69911b8..543e226ea6a3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -135,16 +135,10 @@ struct inodes_stat_t {
  *			block layer could (in theory) choose to ignore this
  *			request if it runs into resource problems.
  * WRITE		A normal async write. Device will be plugged.
- * WRITE_SYNC_PLUG	Synchronous write. Identical to WRITE, but passes down
+ * WRITE_SYNC		Synchronous write. Identical to WRITE, but passes down
  *			the hint that someone will be waiting on this IO
- *			shortly. The device must still be unplugged explicitly,
- *			WRITE_SYNC_PLUG does not do this as we could be
- *			submitting more writes before we actually wait on any
- *			of them.
- * WRITE_SYNC		Like WRITE_SYNC_PLUG, but also unplugs the device
- *			immediately after submission. The write equivalent
- *			of READ_SYNC.
- * WRITE_ODIRECT_PLUG	Special case write for O_DIRECT only.
+ *			shortly. The write equivalent of READ_SYNC.
+ * WRITE_ODIRECT	Special case write for O_DIRECT only.
  * WRITE_FLUSH		Like WRITE_SYNC but with preceding cache flush.
  * WRITE_FUA		Like WRITE_SYNC but data is guaranteed to be on
  *			non-volatile media on completion.
@@ -160,18 +154,14 @@ struct inodes_stat_t {
 #define WRITE			RW_MASK
 #define READA			RWA_MASK
 
-#define READ_SYNC		(READ | REQ_SYNC | REQ_UNPLUG)
+#define READ_SYNC		(READ | REQ_SYNC)
 #define READ_META		(READ | REQ_META)
-#define WRITE_SYNC_PLUG		(WRITE | REQ_SYNC | REQ_NOIDLE)
-#define WRITE_SYNC		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
-#define WRITE_ODIRECT_PLUG	(WRITE | REQ_SYNC)
+#define WRITE_SYNC		(WRITE | REQ_SYNC | REQ_NOIDLE)
+#define WRITE_ODIRECT		(WRITE | REQ_SYNC)
 #define WRITE_META		(WRITE | REQ_META)
-#define WRITE_FLUSH		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-				 REQ_FLUSH)
-#define WRITE_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-				 REQ_FUA)
-#define WRITE_FLUSH_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-				 REQ_FLUSH | REQ_FUA)
+#define WRITE_FLUSH		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
+#define WRITE_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
+#define WRITE_FLUSH_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
 
 #define SEL_IN		1
 #define SEL_OUT		2
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
 static int submit(int rw, struct block_device *bdev, sector_t sector,
 		struct page *page, struct bio **bio_chain)
 {
-	const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
+	const int bio_rw = rw | REQ_SYNC;
 	struct bio *bio;
 
 	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/mm/page_io.c b/mm/page_io.c
index 2dee975bf469..dc76b4d0611e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		goto out;
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		rw |= REQ_SYNC | REQ_UNPLUG;
+		rw |= REQ_SYNC;
 	count_vm_event(PSWPOUT);
 	set_page_writeback(page);
 	unlock_page(page);

From 69d60eb96ae8a73cf9b79cf28051caf973006011 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 9 Mar 2011 08:27:37 +0100
Subject: [PATCH 45/65] blk-throttle: Use blk_plug in throttle dispatch

Use plug in throttle dispatch also as we are dispatching a bunch of
bios in throttle context and some of them might merge.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b8dcdc2663a1..658ee505315b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -770,6 +770,7 @@ static int throtl_dispatch(struct request_queue *q)
 	unsigned int nr_disp = 0;
 	struct bio_list bio_list_on_stack;
 	struct bio *bio;
+	struct blk_plug plug;
 
 	spin_lock_irq(q->queue_lock);
 
@@ -798,8 +799,10 @@ out:
 	 * immediate dispatch
 	 */
 	if (nr_disp) {
+		blk_start_plug(&plug);
 		while((bio = bio_list_pop(&bio_list_on_stack)))
 			generic_make_request(bio);
+		blk_finish_plug(&plug);
 	}
 	return nr_disp;
 }

From 805f6b5e1cbfedfb9b3d354013e7f4b13a79270f Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Fri, 11 Mar 2011 20:11:59 +0100
Subject: [PATCH 46/65] blktrace: Use rq->cmd_flags directly in
 blk_add_trace_rq.

In blk_add_trace_rq, we only chose the minor 2 bits from
request's cmd_flags and did some check for discard.
so most of other flags(e.g, REQ_SYNC) are missing.

For example, with a sync write after blkparse we get:
  8,16   1        1     0.001776503  7509  A  WS 1349632 + 1024 <- (8,17) 1347584
  8,16   1        2     0.001776813  7509  Q  WS 1349632 + 1024 [dd]
  8,16   1        3     0.001780395  7509  G  WS 1349632 + 1024 [dd]
  8,16   1        5     0.001783186  7509  I   W 1349632 + 1024 [dd]
  8,16   1       11     0.001816987  7509  D   W 1349632 + 1024 [dd]
  8,16   0        2     0.006218192     0  C   W 1349632 + 1024 [0]

Since now we have integrated the flags of both bio and request,
it is safe to pass rq->cmd_flags directly to __blk_add_trace.

With this patch, after a sync write we get:
  8,16   1        1     0.001776900  5425  A  WS 1189888 + 1024 <- (8,17) 1187840
  8,16   1        2     0.001777179  5425  Q  WS 1189888 + 1024 [dd]
  8,16   1        3     0.001780797  5425  G  WS 1189888 + 1024 [dd]
  8,16   1        5     0.001783402  5425  I  WS 1189888 + 1024 [dd]
  8,16   1       11     0.001817468  5425  D  WS 1189888 + 1024 [dd]
  8,16   0        2     0.005640709     0  C  WS 1189888 + 1024 [0]

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cbafed7d4f38..7aa40f8e182d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
  *
  **/
 static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
+			     u32 what)
 {
 	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
 
 	if (likely(!bt))
 		return;
 
-	if (rq->cmd_flags & REQ_DISCARD)
-		rw |= REQ_DISCARD;
-
-	if (rq->cmd_flags & REQ_SECURE)
-		rw |= REQ_SECURE;
-
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
 				what, rq->errors, rq->cmd_len, rq->cmd);
 	} else  {
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
-				what, rq->errors, 0, NULL);
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+				rq->cmd_flags, what, rq->errors, 0, NULL);
 	}
 }
 

From eba2ed9c9636d9e7ed203a2498ed230b48341709 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Fri, 11 Mar 2011 20:13:54 +0100
Subject: [PATCH 47/65] block: remove obsolete comments for
 blkdev_issue_zeroout.

barrier is already removed, so remove the obsolete comments
in blkdev_issue_zeroout.

Cc: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-lib.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index eec78becb355..cb56c4e5b494 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -140,8 +140,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
  *
  * Description:
  *  Generate and issue number of bios with zerofiled pages.
- *  Send barrier at the beginning and at the end if requested. This guarantie
- *  correct request ordering. Empty barrier allow us to avoid post queue flush.
  */
 
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,

From 1f940bdfc0d03265d178d9dfd840d854819f797d Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Fri, 11 Mar 2011 20:17:08 +0100
Subject: [PATCH 48/65] block: fixup plugging stubs for !CONFIG_BLOCK

They used an older prototype, fix it up.

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 91fa428fa2c1..16a902f099ac 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1297,15 +1297,18 @@ static inline long nr_blockdev_pages(void)
 	return 0;
 }
 
-static inline void blk_start_plug(struct list_head *list)
+struct blk_plug {
+};
+
+static inline void blk_start_plug(struct blk_plug *plug)
 {
 }
 
-static inline void blk_finish_plug(struct list_head *list)
+static inline void blk_finish_plug(struct blk_plug *plug)
 {
 }
 
-static inline void blk_flush_plug(struct task_struct *tsk)
+static inline void blk_flush_plug(struct task_struct *task)
 {
 }
 

From 167400d34070ebbc408dc0f447c4ddb4bf837360 Mon Sep 17 00:00:00 2001
From: Justin TerAvest <teravest@google.com>
Date: Sat, 12 Mar 2011 16:54:00 +0100
Subject: [PATCH 49/65] blk-cgroup: Add unaccounted time to timeslice_used.

There are two kind of times that tasks are not charged for: the first
seek and the extra time slice used over the allocated timeslice. Both
of these exported as a new unaccounted_time stat.

I think it would be good to have this reported in 'time' as well, but
that is probably a separate discussion.

Signed-off-by: Justin TerAvest <teravest@google.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c  | 16 +++++++++++++++-
 block/blk-cgroup.h  | 12 ++++++++++--
 block/cfq-iosched.c | 21 +++++++++++++--------
 block/cfq.h         |  6 +++---
 4 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a3eb9e..77ee3c1ec1a7 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -371,12 +371,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
 
-void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
+				unsigned long unaccounted_time)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&blkg->stats_lock, flags);
 	blkg->stats.time += time;
+	blkg->stats.unaccounted_time += unaccounted_time;
 	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
@@ -603,6 +605,9 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 	if (type == BLKIO_STAT_SECTORS)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 					blkg->stats.sectors, cb, dev);
+	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.unaccounted_time, cb, dev);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
 		uint64_t sum = blkg->stats.avg_queue_size_sum;
@@ -1106,6 +1111,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
 		case BLKIO_PROP_sectors:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_SECTORS, 0);
+		case BLKIO_PROP_unaccounted_time:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_UNACCOUNTED_TIME, 0);
 		case BLKIO_PROP_io_service_bytes:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_SERVICE_BYTES, 1);
@@ -1261,6 +1269,12 @@ struct cftype blkio_files[] = {
 				BLKIO_PROP_sectors),
 		.read_map = blkiocg_file_read_map,
 	},
+	{
+		.name = "unaccounted_time",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_unaccounted_time),
+		.read_map = blkiocg_file_read_map,
+	},
 	{
 		.name = "io_service_bytes",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 57e7234c5ae5..10919fae2d3a 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -49,6 +49,8 @@ enum stat_type {
 	/* All the single valued stats go below this */
 	BLKIO_STAT_TIME,
 	BLKIO_STAT_SECTORS,
+	/* Time not charged to this cgroup */
+	BLKIO_STAT_UNACCOUNTED_TIME,
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	BLKIO_STAT_AVG_QUEUE_SIZE,
 	BLKIO_STAT_IDLE_TIME,
@@ -81,6 +83,7 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_io_serviced,
 	BLKIO_PROP_time,
 	BLKIO_PROP_sectors,
+	BLKIO_PROP_unaccounted_time,
 	BLKIO_PROP_io_service_time,
 	BLKIO_PROP_io_wait_time,
 	BLKIO_PROP_io_merged,
@@ -114,6 +117,8 @@ struct blkio_group_stats {
 	/* total disk time and nr sectors dispatched by this group */
 	uint64_t time;
 	uint64_t sectors;
+	/* Time not charged to this cgroup */
+	uint64_t unaccounted_time;
 	uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	/* Sum of number of IOs queued across all samples */
@@ -293,7 +298,8 @@ extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
 						void *key);
 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-					unsigned long time);
+					unsigned long time,
+					unsigned long unaccounted_time);
 void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
 						bool direction, bool sync);
 void blkiocg_update_completion_stats(struct blkio_group *blkg,
@@ -319,7 +325,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
 static inline struct blkio_group *
 blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
 static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
-						unsigned long time) {}
+						unsigned long time,
+						unsigned long unaccounted_time)
+{}
 static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
 				uint64_t bytes, bool direction, bool sync) {}
 static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c826ef81c679..89e0d1cc14ba 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -899,7 +899,8 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
 }
 
-static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
+static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
+						unsigned int *unaccounted_time)
 {
 	unsigned int slice_used;
 
@@ -918,8 +919,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 					1);
 	} else {
 		slice_used = jiffies - cfqq->slice_start;
-		if (slice_used > cfqq->allocated_slice)
+		if (slice_used > cfqq->allocated_slice) {
+			*unaccounted_time = slice_used - cfqq->allocated_slice;
 			slice_used = cfqq->allocated_slice;
+		}
+		if (time_after(cfqq->slice_start, cfqq->dispatch_start))
+			*unaccounted_time += cfqq->slice_start -
+					cfqq->dispatch_start;
 	}
 
 	return slice_used;
@@ -929,12 +935,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 				struct cfq_queue *cfqq)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	unsigned int used_sl, charge;
+	unsigned int used_sl, charge, unaccounted_sl = 0;
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
 
 	BUG_ON(nr_sync < 0);
-	used_sl = charge = cfq_cfqq_slice_usage(cfqq);
+	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
 
 	if (iops_mode(cfqd))
 		charge = cfqq->slice_dispatch;
@@ -960,7 +966,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
 			iops_mode(cfqd), cfqq->nr_sectors);
-	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
+	cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
+					  unaccounted_sl);
 	cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
 }
 
@@ -3296,9 +3303,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 
 	cfq_service_tree_add(cfqd, cfqq, 1);
-
-	cfqq->slice_end = 0;
-	cfq_mark_cfqq_slice_new(cfqq);
+	__cfq_set_active_queue(cfqd, cfqq);
 }
 
 /*
diff --git a/block/cfq.h b/block/cfq.h
index 54a6d90f8e8c..2a155927e37c 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 }
 
 static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			unsigned long time)
+			unsigned long time, unsigned long unaccounted_time)
 {
-	blkiocg_update_timeslice_used(blkg, time);
+	blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
 }
 
 static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
@@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 			unsigned long dequeue) {}
 
 static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
-			unsigned long time) {}
+			unsigned long time, unsigned long unaccounted_time) {}
 static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
 static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
 				bool direction, bool sync) {}

From 9b6096a65f99a89dfd8328c4e469e7b53b3ae04a Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 17 Mar 2011 10:47:06 +0100
Subject: [PATCH 50/65] mm: make generic_writepages() use plugging

This recovers a performance regression caused by the removal
of the per-device plugging.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 mm/page-writeback.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cc0ede169e41..24b7ac2bc36b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1039,11 +1039,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
 int generic_writepages(struct address_space *mapping,
 		       struct writeback_control *wbc)
 {
+	struct blk_plug plug;
+	int ret;
+
 	/* deal with chardevs and other special file */
 	if (!mapping->a_ops->writepage)
 		return 0;
 
-	return write_cache_pages(mapping, wbc, __writepage, mapping);
+	blk_start_plug(&plug);
+	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+	blk_finish_plug(&plug);
+	return ret;
 }
 
 EXPORT_SYMBOL(generic_writepages);

From 4ee2491ed8569f370bf4c1a4c046a6efb8032bd2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 17 Mar 2011 10:51:40 +0100
Subject: [PATCH 51/65] fs: make fsync_buffers_list() plug

It used WRITE_SYNC_PLUG before and potentially submits a batch
of IO, so lets enable plugging for this case.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/buffer.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/buffer.c b/fs/buffer.c
index 42534f67d71b..2e6b1a387b7e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -743,8 +743,10 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 	struct list_head tmp;
 	struct address_space *mapping;
 	int err = 0, err2;
+	struct blk_plug plug;
 
 	INIT_LIST_HEAD(&tmp);
+	blk_start_plug(&plug);
 
 	spin_lock(lock);
 	while (!list_empty(list)) {
@@ -781,6 +783,10 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 		}
 	}
 
+	spin_unlock(lock);
+	blk_finish_plug(&plug);
+	spin_lock(lock);
+
 	while (!list_empty(&tmp)) {
 		bh = BH_ENTRY(tmp.prev);
 		get_bh(bh);

From 65ab80279d7c323ce151e858d951e2684df72a97 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 17 Mar 2011 10:56:45 +0100
Subject: [PATCH 52/65] jbd: finish conversion from WRITE_SYNC_PLUG to
 WRITE_SYNC and explicit plugging

'write_op' was still used, even though it was always WRITE_SYNC now.
Add plugging around the cases where it submits IO, and flush them
before we end up waiting for that IO.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd/commit.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 66be299acb1b..da871ee084d3 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -294,7 +295,7 @@ void journal_commit_transaction(journal_t *journal)
 	int first_tag = 0;
 	int tag_flag;
 	int i;
-	int write_op = WRITE_SYNC;
+	struct blk_plug plug;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -327,13 +328,6 @@ void journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
-	/*
-	 * Use plugged writes here, since we want to submit several before
-	 * we unplug the device. We don't do explicit unplugging in here,
-	 * instead we rely on sync_buffer() doing the unplug for us.
-	 */
-	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
@@ -418,8 +412,10 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
+	blk_start_plug(&plug);
 	err = journal_submit_data_buffers(journal, commit_transaction,
-					  write_op);
+					  WRITE_SYNC);
+	blk_finish_plug(&plug);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
@@ -480,7 +476,9 @@ void journal_commit_transaction(journal_t *journal)
 		err = 0;
 	}
 
-	journal_write_revoke_records(journal, commit_transaction, write_op);
+	blk_start_plug(&plug);
+
+	journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
 
 	/*
 	 * If we found any dirty or locked buffers, then we should have
@@ -650,7 +648,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(write_op, bh);
+				submit_bh(WRITE_SYNC, bh);
 			}
 			cond_resched();
 
@@ -661,6 +659,8 @@ start_journal_io:
 		}
 	}
 
+	blk_finish_plug(&plug);
+
 	/* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the

From 82f04ab47e1d94d78503591a7460b2cad9601ede Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 17 Mar 2011 11:01:52 +0100
Subject: [PATCH 53/65] jbd2: finish conversion from WRITE_SYNC_PLUG to
 WRITE_SYNC and explicit plugging

'write_op' was still used, even though it was always WRITE_SYNC now.
Add plugging around the cases where it submits IO, and flush them
before we end up waiting for that IO.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd2/commit.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3da1cc4346d5..fa36d7662b21 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -329,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
-	int write_op = WRITE_SYNC;
+	struct blk_plug plug;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -363,13 +363,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	write_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
-	/*
-	 * Use plugged writes here, since we want to submit several before
-	 * we unplug the device. We don't do explicit unplugging in here,
-	 * instead we rely on sync_buffer() doing the unplug for us.
-	 */
-	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
 	trace_jbd2_commit_locking(journal, commit_transaction);
 	stats.run.rs_wait = commit_transaction->t_max_wait;
 	stats.run.rs_locked = jiffies;
@@ -469,8 +462,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	if (err)
 		jbd2_journal_abort(journal, err);
 
+	blk_start_plug(&plug);
 	jbd2_journal_write_revoke_records(journal, commit_transaction,
-					  write_op);
+					  WRITE_SYNC);
+	blk_finish_plug(&plug);
 
 	jbd_debug(3, "JBD: commit phase 2\n");
 
@@ -497,6 +492,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	err = 0;
 	descriptor = NULL;
 	bufs = 0;
+	blk_start_plug(&plug);
 	while (commit_transaction->t_buffers) {
 
 		/* Find the next buffer to be journaled... */
@@ -658,7 +654,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(write_op, bh);
+				submit_bh(WRITE_SYNC, bh);
 			}
 			cond_resched();
 			stats.run.rs_blocks_logged += bufs;
@@ -699,6 +695,8 @@ start_journal_io:
 			__jbd2_journal_abort_hard(journal);
 	}
 
+	blk_finish_plug(&plug);
+
 	/* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the

From a91a2785b200864aef2270ed6a3babac7a253a20 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Thu, 17 Mar 2011 11:11:05 +0100
Subject: [PATCH 54/65] block: Require subsystems to explicitly allocate
 bio_set integrity mempool

MD and DM create a new bio_set for every metadevice. Each bio_set has an
integrity mempool attached regardless of whether the metadevice is
capable of passing integrity metadata. This is a waste of memory.

Instead we defer the allocation decision to MD and DM since we know at
metadevice creation time whether integrity passthrough is needed or not.

Automatic integrity mempool allocation can then be removed from
bioset_create() and we make an explicit integrity allocation for the
fs_bio_set.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Reported-by: Zdenek Kabelac <zkabelac@redhat.com>
Acked-by: Mike Snitzer <snizer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm-table.c  |  7 +++++--
 drivers/md/dm.c        | 12 +++++++++---
 drivers/md/dm.h        |  2 +-
 drivers/md/linear.c    |  3 +--
 drivers/md/md.c        |  8 ++++++--
 drivers/md/multipath.c |  7 +++++--
 drivers/md/raid0.c     |  3 +--
 drivers/md/raid1.c     |  5 ++---
 drivers/md/raid10.c    |  7 +++++--
 fs/bio-integrity.c     |  3 +++
 fs/bio.c               |  6 +++---
 11 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f50a7b952257..416d4e258df6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -55,6 +55,7 @@ struct dm_table {
 	struct dm_target *targets;
 
 	unsigned discards_supported:1;
+	unsigned integrity_supported:1;
 
 	/*
 	 * Indicates the rw permissions for the new logical
@@ -859,7 +860,7 @@ int dm_table_alloc_md_mempools(struct dm_table *t)
 		return -EINVAL;
 	}
 
-	t->mempools = dm_alloc_md_mempools(type);
+	t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
 	if (!t->mempools)
 		return -ENOMEM;
 
@@ -935,8 +936,10 @@ static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device
 	struct dm_dev_internal *dd;
 
 	list_for_each_entry(dd, devices, list)
-		if (bdev_get_integrity(dd->dm_dev.bdev))
+		if (bdev_get_integrity(dd->dm_dev.bdev)) {
+			t->integrity_supported = 1;
 			return blk_integrity_register(dm_disk(md), NULL);
+		}
 
 	return 0;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d22b9905c168..88820704a191 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2620,9 +2620,10 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
 {
 	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
+	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
 
 	if (!pools)
 		return NULL;
@@ -2639,13 +2640,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
 	if (!pools->tio_pool)
 		goto free_io_pool_and_out;
 
-	pools->bs = (type == DM_TYPE_BIO_BASED) ?
-		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
+	pools->bs = bioset_create(pool_size, 0);
 	if (!pools->bs)
 		goto free_tio_pool_and_out;
 
+	if (integrity && bioset_integrity_create(pools->bs, pool_size))
+		goto free_bioset_and_out;
+
 	return pools;
 
+free_bioset_and_out:
+	bioset_free(pools->bs);
+
 free_tio_pool_and_out:
 	mempool_destroy(pools->tio_pool);
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0c2dd5f4af76..1aaf16746da8 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -149,7 +149,7 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 #endif
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 338804f8fb3b..abfb59a61ede 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -210,8 +210,7 @@ static int linear_run (mddev_t *mddev)
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->backing_dev_info.congested_fn = linear_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static void free_conf(struct rcu_head *head)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 86ba66c0b28a..1876761f1828 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1803,8 +1803,12 @@ int md_integrity_register(mddev_t *mddev)
 			mdname(mddev));
 		return -EINVAL;
 	}
-	printk(KERN_NOTICE "md: data integrity on %s enabled\n",
-		mdname(mddev));
+	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
+	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
+		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
 	return 0;
 }
 EXPORT_SYMBOL(md_integrity_register);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 5e694b151c30..c35890990985 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -315,7 +315,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -489,7 +489,10 @@ static int multipath_run (mddev_t *mddev)
 
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
+
+	if (md_integrity_register(mddev))
+		goto out_free_conf;
+
 	return 0;
 
 out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 95916fd6394a..e86bf3682e1e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -379,8 +379,7 @@ static int raid0_run(mddev_t *mddev)
 
 	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
 	dump_zones(mddev);
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static int raid0_stop(mddev_t *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8f34ad5c478b..c2a21ae56d97 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1132,7 +1132,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -2017,8 +2017,7 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-	md_integrity_register(mddev);
-	return 0;
+	return md_integrity_register(mddev);
 }
 
 static int stop(mddev_t *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c0d0f5f7e407..f7b62370b374 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1188,7 +1188,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
 			p->rdev = rdev;
 			goto abort;
 		}
-		md_integrity_register(mddev);
+		err = md_integrity_register(mddev);
 	}
 abort:
 
@@ -2343,7 +2343,10 @@ static int run(mddev_t *mddev)
 
 	if (conf->near_copies < conf->raid_disks)
 		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-	md_integrity_register(mddev);
+
+	if (md_integrity_register(mddev))
+		goto out_free_conf;
+
 	return 0;
 
 out_free_conf:
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e49cce234c65..9c5e6b2cd11a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -761,6 +761,9 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
 	unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
 
+	if (bs->bio_integrity_pool)
+		return 0;
+
 	bs->bio_integrity_pool =
 		mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
 
diff --git a/fs/bio.c b/fs/bio.c
index 5694b756ed01..85e2eabb1f81 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1636,9 +1636,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (bioset_integrity_create(bs, pool_size))
-		goto bad;
-
 	if (!biovec_create_pools(bs, pool_size))
 		return bs;
 
@@ -1682,6 +1679,9 @@ static int __init init_bio(void)
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
+	if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
+		panic("bio: can't create integrity pool\n");
+
 	bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
 						     sizeof(struct bio_pair));
 	if (!bio_split_pool)

From 95f28604a65b1c40b6c6cd95e58439cd7ded3add Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 17 Mar 2011 11:13:12 +0100
Subject: [PATCH 55/65] fs: assign sb->s_bdi to default_backing_dev_info if the
 bdi is going away

We don't have proper reference counting for this yet, so we run into
cases where the device is pulled and we OOPS on flushing the fs data.
This happens even though the dirty inodes have already been
migrated to the default_backing_dev_info.

Reported-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
Tested-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/super.c       | 2 ++
 fs/sync.c        | 4 ++--
 mm/backing-dev.c | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 7e9dd4cc2c01..0d89e93f654e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -71,6 +71,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 #else
 		INIT_LIST_HEAD(&s->s_files);
 #endif
+		s->s_bdi = &default_backing_dev_info;
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_BL_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -1003,6 +1004,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	}
 	BUG_ON(!mnt->mnt_sb);
 	WARN_ON(!mnt->mnt_sb->s_bdi);
+	WARN_ON(mnt->mnt_sb->s_bdi == &default_backing_dev_info);
 	mnt->mnt_sb->s_flags |= MS_BORN;
 
 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
diff --git a/fs/sync.c b/fs/sync.c
index ba76b9623e7e..412dc89163d3 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -33,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	 * This should be safe, as we require bdi backing to actually
 	 * write out data in the first place
 	 */
-	if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
+	if (sb->s_bdi == &noop_backing_dev_info)
 		return 0;
 
 	if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
 
 static void sync_one_sb(struct super_block *sb, void *arg)
 {
-	if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
+	if (!(sb->s_flags & MS_RDONLY))
 		__sync_filesystem(sb, *(int *)arg);
 }
 /*
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c91e139a652e..8fe9d3407921 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -598,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (sb->s_bdi == bdi)
-			sb->s_bdi = NULL;
+			sb->s_bdi = &default_backing_dev_info;
 	}
 	spin_unlock(&sb_lock);
 }

From 8184f93eced1e304721c2a55c00d87d5a14f8907 Mon Sep 17 00:00:00 2001
From: Justin TerAvest <teravest@google.com>
Date: Thu, 17 Mar 2011 16:12:36 +0100
Subject: [PATCH 56/65] cfq-iosched: Don't update group weights when on service
 tree

Version 3 is updated to apply to for-2.6.39/core.

For version 2, I took Vivek's advice and made sure we update the group
weight from cfq_group_service_tree_add().

If a weight was updated while a group is on the service tree, the
calculation for the total weight of the service tree can be adjusted
improperly, which either leads to bad service tree weights, or
potentially crashes (if total_weight becomes 0).

This patch defers updates to the weight until a group is off the service
tree.

Signed-off-by: Justin TerAvest <teravest@google.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 55 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 89e0d1cc14ba..12e380b2c4e4 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -178,6 +178,8 @@ struct cfq_group {
 	/* group service_tree key */
 	u64 vdisktime;
 	unsigned int weight;
+	unsigned int new_weight;
+	bool needs_update;
 
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
@@ -853,7 +855,27 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 }
 
 static void
-cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
+cfq_update_group_weight(struct cfq_group *cfqg)
+{
+	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
+	if (cfqg->needs_update) {
+		cfqg->weight = cfqg->new_weight;
+		cfqg->needs_update = false;
+	}
+}
+
+static void
+cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
+{
+	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
+
+	cfq_update_group_weight(cfqg);
+	__cfq_group_service_tree_add(st, cfqg);
+	st->total_weight += cfqg->weight;
+}
+
+static void
+cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_group *__cfqg;
@@ -874,13 +896,19 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
 	} else
 		cfqg->vdisktime = st->min_vdisktime;
-
-	__cfq_group_service_tree_add(st, cfqg);
-	st->total_weight += cfqg->weight;
+	cfq_group_service_tree_add(st, cfqg);
 }
 
 static void
-cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
+cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
+{
+	st->total_weight -= cfqg->weight;
+	if (!RB_EMPTY_NODE(&cfqg->rb_node))
+		cfq_rb_erase(&cfqg->rb_node, st);
+}
+
+static void
+cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 
@@ -892,9 +920,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 		return;
 
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	st->total_weight -= cfqg->weight;
-	if (!RB_EMPTY_NODE(&cfqg->rb_node))
-		cfq_rb_erase(&cfqg->rb_node, st);
+	cfq_group_service_tree_del(st, cfqg);
 	cfqg->saved_workload_slice = 0;
 	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
 }
@@ -948,9 +974,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 		charge = cfqq->allocated_slice;
 
 	/* Can't update vdisktime while group is on service tree */
-	cfq_rb_erase(&cfqg->rb_node, st);
+	cfq_group_service_tree_del(st, cfqg);
 	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
-	__cfq_group_service_tree_add(st, cfqg);
+	/* If a new weight was requested, update now, off tree */
+	cfq_group_service_tree_add(st, cfqg);
 
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
@@ -982,7 +1009,9 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
 					unsigned int weight)
 {
-	cfqg_of_blkg(blkg)->weight = weight;
+	struct cfq_group *cfqg = cfqg_of_blkg(blkg);
+	cfqg->new_weight = weight;
+	cfqg->needs_update = true;
 }
 
 static struct cfq_group *
@@ -1255,7 +1284,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	service_tree->count++;
 	if ((add_front || !new_cfqq) && !group_changed)
 		return;
-	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
+	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
 }
 
 static struct cfq_queue *
@@ -1368,7 +1397,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		cfqq->p_root = NULL;
 	}
 
-	cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
 	if (cfq_cfqq_sync(cfqq))

From 4345caba340f051e10847924fc078ae18ed6695c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 19 Mar 2011 13:53:31 +0100
Subject: [PATCH 57/65] block: NULL dereference on error path in __blkdev_get()

"disk" is always NULL when we goto out.  There was a check for this
before, but it was removed in 69e02c59a7d9 "block: Don't check events
while open is in progress".

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@carl>
---
 fs/block_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fbe05cbdd692..7d02afb2b7f4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1181,9 +1181,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
- out:
 	module_put(disk->fops->owner);
 	put_disk(disk);
+ out:
 	bdput(bdev);
 
 	return ret;

From 5e84ea3a9c662dc2d7a48703a4468fad954a3b7f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 21 Mar 2011 10:14:27 +0100
Subject: [PATCH 58/65] block: attempt to merge with existing requests on plug
 flush

One of the disadvantages of on-stack plugging is that we potentially
lose out on merging since all pending IO isn't always visible to
everybody. When we flush the on-stack plugs, right now we don't do
any checks to see if potential merge candidates could be utilized.

Correct this by adding a new insert variant, ELEVATOR_INSERT_SORT_MERGE.
It works just ELEVATOR_INSERT_SORT, but first checks whether we can
merge with an existing request before doing the insertion (if we fail
merging).

This fixes a regression with multiple processes issuing IO that
can be merged.

Thanks to Shaohua Li <shaohua.li@intel.com> for testing and fixing
an accounting bug.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c         |  2 +-
 block/blk-merge.c        |  6 +++++
 block/blk.h              |  2 ++
 block/elevator.c         | 52 +++++++++++++++++++++++++++++++++++++---
 include/linux/elevator.h |  1 +
 5 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index e1fcf7a24668..525693237a4a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2685,7 +2685,7 @@ static void flush_plug_list(struct blk_plug *plug)
 		/*
 		 * rq is already accounted, so use raw insert
 		 */
-		__elv_add_request(q, rq, ELEVATOR_INSERT_SORT);
+		__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
 	}
 
 	if (q) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ea85e20d5e94..cfcc37cb222b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -465,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq)
 
 	return 0;
 }
+
+int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+			  struct request *next)
+{
+	return attempt_merge(q, rq, next);
+}
diff --git a/block/blk.h b/block/blk.h
index 49d21af81d07..c8db371a921d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -103,6 +103,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		      struct bio *bio);
 int attempt_back_merge(struct request_queue *q, struct request *rq);
 int attempt_front_merge(struct request_queue *q, struct request *rq);
+int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+				struct request *next);
 void blk_recalc_rq_segments(struct request *rq);
 void blk_rq_set_mixed_merge(struct request *rq);
 
diff --git a/block/elevator.c b/block/elevator.c
index 542ce826b401..c387d3168734 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -521,6 +521,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 	return ELEVATOR_NO_MERGE;
 }
 
+/*
+ * Attempt to do an insertion back merge. Only check for the case where
+ * we can append 'rq' to an existing request, so we can throw 'rq' away
+ * afterwards.
+ *
+ * Returns true if we merged, false otherwise
+ */
+static bool elv_attempt_insert_merge(struct request_queue *q,
+				     struct request *rq)
+{
+	struct request *__rq;
+
+	if (blk_queue_nomerges(q))
+		return false;
+
+	/*
+	 * First try one-hit cache.
+	 */
+	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
+		return true;
+
+	if (blk_queue_noxmerges(q))
+		return false;
+
+	/*
+	 * See if our hash lookup can find a potential backmerge.
+	 */
+	__rq = elv_rqhash_find(q, blk_rq_pos(rq));
+	if (__rq && blk_attempt_req_merge(q, __rq, rq))
+		return true;
+
+	return false;
+}
+
 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
 	struct elevator_queue *e = q->elevator;
@@ -538,14 +572,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 			     struct request *next)
 {
 	struct elevator_queue *e = q->elevator;
+	const int next_sorted = next->cmd_flags & REQ_SORTED;
 
-	if (e->ops->elevator_merge_req_fn)
+	if (next_sorted && e->ops->elevator_merge_req_fn)
 		e->ops->elevator_merge_req_fn(q, rq, next);
 
 	elv_rqhash_reposition(q, rq);
-	elv_rqhash_del(q, next);
 
-	q->nr_sorted--;
+	if (next_sorted) {
+		elv_rqhash_del(q, next);
+		q->nr_sorted--;
+	}
+
 	q->last_merge = rq;
 }
 
@@ -647,6 +685,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		__blk_run_queue(q, false);
 		break;
 
+	case ELEVATOR_INSERT_SORT_MERGE:
+		/*
+		 * If we succeed in merging this request with one in the
+		 * queue already, we are done - rq has now been freed,
+		 * so no need to do anything further.
+		 */
+		if (elv_attempt_insert_merge(q, rq))
+			break;
 	case ELEVATOR_INSERT_SORT:
 		BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
 		       !(rq->cmd_flags & REQ_DISCARD));
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ec6f72b84477..d93efcc44570 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -166,6 +166,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_SORT	3
 #define ELEVATOR_INSERT_REQUEUE	4
 #define ELEVATOR_INSERT_FLUSH	5
+#define ELEVATOR_INSERT_SORT_MERGE	6
 
 /*
  * return values from elevator_may_queue_fn

From 1e9bb8808ac11094d711d20d580e7b45a4992d0c Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 22 Mar 2011 08:35:35 +0100
Subject: [PATCH 59/65] block: fix non-atomic access to genhd inflight
 structures

After the stack plugging introduction, these are called lockless.
Ensure that the counters are updated atomically.

Signed-off-by: Shaohua Li<shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm.c       |  7 ++++---
 fs/partitions/check.c |  3 ++-
 include/linux/genhd.h | 12 ++++++------
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 88820704a191..0cf68b478878 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -477,7 +477,8 @@ static void start_io_acct(struct dm_io *io)
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
 	part_stat_unlock();
-	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
+	atomic_set(&dm_disk(md)->part0.in_flight[rw],
+		atomic_inc_return(&md->pending[rw]));
 }
 
 static void end_io_acct(struct dm_io *io)
@@ -497,8 +498,8 @@ static void end_io_acct(struct dm_io *io)
 	 * After this is decremented the bio must not be touched if it is
 	 * a flush.
 	 */
-	dm_disk(md)->part0.in_flight[rw] = pending =
-		atomic_dec_return(&md->pending[rw]);
+	pending = atomic_dec_return(&md->pending[rw]);
+	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9c21119512b9..ac546975031f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -290,7 +290,8 @@ ssize_t part_inflight_show(struct device *dev,
 {
 	struct hd_struct *p = dev_to_part(dev);
 
-	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+	return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
+		atomic_read(&p->in_flight[1]));
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c0d5f6945c1e..d764a426e9fd 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -109,7 +109,7 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	int in_flight[2];
+	atomic_t in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats __percpu *dkstats;
 #else
@@ -370,21 +370,21 @@ static inline void free_part_stats(struct hd_struct *part)
 
 static inline void part_inc_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight[rw]++;
+	atomic_inc(&part->in_flight[rw]);
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight[rw]++;
+		atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
 }
 
 static inline void part_dec_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight[rw]--;
+	atomic_dec(&part->in_flight[rw]);
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight[rw]--;
+		atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
 }
 
 static inline int part_in_flight(struct hd_struct *part)
 {
-	return part->in_flight[0] + part->in_flight[1];
+	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
 }
 
 static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)

From eda5e0c91fed2d2a38a341b0957263406d330274 Mon Sep 17 00:00:00 2001
From: Justin TerAvest <teravest@google.com>
Date: Tue, 22 Mar 2011 21:26:49 +0100
Subject: [PATCH 60/65] cfq-iosched: Don't set active queue in preempt

Commit "Add unaccounted time to timeslice_used" changed the behavior of
cfq_preempt_queue to set cfqq active. Vivek pointed out that other
preemption rules might get involved, so we shouldn't manually set which
queue is active.

This cleans up the code to just clear the queue stats at preemption
time.

Signed-off-by: Justin TerAvest <teravest@google.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 12e380b2c4e4..69208d732903 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1620,27 +1620,33 @@ static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
 }
 
+static void cfq_clear_queue_stats(struct cfq_data *cfqd,
+				  struct cfq_queue *cfqq)
+{
+	cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
+	cfqq->slice_start = 0;
+	cfqq->dispatch_start = jiffies;
+	cfqq->allocated_slice = 0;
+	cfqq->slice_end = 0;
+	cfqq->slice_dispatch = 0;
+	cfqq->nr_sectors = 0;
+
+	cfq_clear_cfqq_wait_request(cfqq);
+	cfq_clear_cfqq_must_dispatch(cfqq);
+	cfq_clear_cfqq_must_alloc_slice(cfqq);
+	cfq_clear_cfqq_fifo_expire(cfqq);
+	cfq_mark_cfqq_slice_new(cfqq);
+
+	cfq_del_timer(cfqd, cfqq);
+}
+
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
 				   struct cfq_queue *cfqq)
 {
 	if (cfqq) {
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
 				cfqd->serving_prio, cfqd->serving_type);
-		cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
-		cfqq->slice_start = 0;
-		cfqq->dispatch_start = jiffies;
-		cfqq->allocated_slice = 0;
-		cfqq->slice_end = 0;
-		cfqq->slice_dispatch = 0;
-		cfqq->nr_sectors = 0;
-
-		cfq_clear_cfqq_wait_request(cfqq);
-		cfq_clear_cfqq_must_dispatch(cfqq);
-		cfq_clear_cfqq_must_alloc_slice(cfqq);
-		cfq_clear_cfqq_fifo_expire(cfqq);
-		cfq_mark_cfqq_slice_new(cfqq);
-
-		cfq_del_timer(cfqd, cfqq);
+		cfq_clear_queue_stats(cfqd, cfqq);
 	}
 
 	cfqd->active_queue = cfqq;
@@ -3332,7 +3338,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 
 	cfq_service_tree_add(cfqd, cfqq, 1);
-	__cfq_set_active_queue(cfqd, cfqq);
+
+	cfq_clear_queue_stats(cfqd, cfqq);
 }
 
 /*

From 9026e521c0da0731eb31f9f9022dd00cc3cd8885 Mon Sep 17 00:00:00 2001
From: Justin TerAvest <teravest@google.com>
Date: Tue, 22 Mar 2011 21:26:54 +0100
Subject: [PATCH 61/65] blk-cgroup: Only give unaccounted_time under debug

This change moves unaccounted_time to only be reported when
CONFIG_DEBUG_BLK_CGROUP is true.

Signed-off-by: Justin TerAvest <teravest@google.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 77ee3c1ec1a7..2bef5705ce24 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -605,10 +605,10 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 	if (type == BLKIO_STAT_SECTORS)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 					blkg->stats.sectors, cb, dev);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
 	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
 		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
 					blkg->stats.unaccounted_time, cb, dev);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
 	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
 		uint64_t sum = blkg->stats.avg_queue_size_sum;
 		uint64_t samples = blkg->stats.avg_queue_size_samples;
@@ -1111,9 +1111,6 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
 		case BLKIO_PROP_sectors:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_SECTORS, 0);
-		case BLKIO_PROP_unaccounted_time:
-			return blkio_read_blkg_stats(blkcg, cft, cb,
-						BLKIO_STAT_UNACCOUNTED_TIME, 0);
 		case BLKIO_PROP_io_service_bytes:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_SERVICE_BYTES, 1);
@@ -1133,6 +1130,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_QUEUED, 1);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
+		case BLKIO_PROP_unaccounted_time:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_UNACCOUNTED_TIME, 0);
 		case BLKIO_PROP_dequeue:
 			return blkio_read_blkg_stats(blkcg, cft, cb,
 						BLKIO_STAT_DEQUEUE, 0);
@@ -1269,12 +1269,6 @@ struct cftype blkio_files[] = {
 				BLKIO_PROP_sectors),
 		.read_map = blkiocg_file_read_map,
 	},
-	{
-		.name = "unaccounted_time",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
-				BLKIO_PROP_unaccounted_time),
-		.read_map = blkiocg_file_read_map,
-	},
 	{
 		.name = "io_service_bytes",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
@@ -1396,6 +1390,12 @@ struct cftype blkio_files[] = {
 				BLKIO_PROP_dequeue),
 		.read_map = blkiocg_file_read_map,
 	},
+	{
+		.name = "unaccounted_time",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_unaccounted_time),
+		.read_map = blkiocg_file_read_map,
+	},
 #endif
 };
 

From 04521db04e9a11e74b0252d222051cb194487f4d Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 22 Mar 2011 21:54:29 +0100
Subject: [PATCH 62/65] blk-throttle: Reset group slice when limits are changed

Lina reported that if throttle limits are initially very high and then
dropped, then no new bio might be dispatched for a long time. And the
reason being that after dropping the limits we don't reset the existing
slice and do the rate calculation with new low rate and account the bios
dispatched at high rate. To fix it, reset the slice upon rate change.

https://lkml.org/lkml/2011/3/10/298

Another problem with very high limit is that we never queued the
bio on throtl service tree. That means we kept on extending the
group slice but never trimmed it. Fix that also by regulary
trimming the slice even if bio is not being queued up.

Reported-by: Lina Lu <lulina_nuaa@foxmail.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 37abbfc68590..5352bdafbcf0 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -756,6 +756,15 @@ static void throtl_process_limit_change(struct throtl_data *td)
 			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
 			tg->iops[READ], tg->iops[WRITE]);
 
+		/*
+		 * Restart the slices for both READ and WRITES. It
+		 * might happen that a group's limit are dropped
+		 * suddenly and we don't want to account recently
+		 * dispatched IO with new low rate
+		 */
+		throtl_start_new_slice(td, tg, 0);
+		throtl_start_new_slice(td, tg, 1);
+
 		if (throtl_tg_on_rr(tg))
 			tg_update_disptime(td, tg);
 	}
@@ -821,7 +830,8 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 
 	struct delayed_work *dwork = &td->throtl_work;
 
-	if (total_nr_queued(td) > 0) {
+	/* schedule work if limits changed even if no bio is queued */
+	if (total_nr_queued(td) > 0 || td->limits_changed) {
 		/*
 		 * We might have a work scheduled to be executed in future.
 		 * Cancel that and schedule a new one.
@@ -1002,6 +1012,19 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 	/* Bio is with-in rate limit of group */
 	if (tg_may_dispatch(td, tg, bio, NULL)) {
 		throtl_charge_bio(tg, bio);
+
+		/*
+		 * We need to trim slice even when bios are not being queued
+		 * otherwise it might happen that a bio is not queued for
+		 * a long time and slice keeps on extending and trim is not
+		 * called for a long time. Now if limits are reduced suddenly
+		 * we take into account all the IO dispatched so far at new
+		 * low rate and * newly queued IO gets a really long dispatch
+		 * time.
+		 *
+		 * So keep on trimming slice even if bio is not queued.
+		 */
+		throtl_trim_slice(td, tg, rw);
 		goto out;
 	}
 

From 62a37f6badd1ac97ba07d543b5d4be2f9cb17217 Mon Sep 17 00:00:00 2001
From: Justin TerAvest <teravest@google.com>
Date: Wed, 23 Mar 2011 08:25:44 +0100
Subject: [PATCH 63/65] cfq-iosched: Don't clear queue stats when preempt.

For v2, I added back lines to cfq_preempt_queue() that were removed
during updates for accounting unaccounted_time. Thanks for pointing out
that I'd missed these, Vivek.

Previous commit "cfq-iosched: Don't set active queue in preempt" wrongly
cleared stats for preempting queues when it shouldn't have, because when
we choose a queue to preempt, it still isn't necessarily scheduled next.

Thanks to Vivek Goyal for figuring this out and understanding how the
preemption code works.

Signed-off-by: Justin TerAvest <teravest@google.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 39 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 69208d732903..fea1b5a9b7e9 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1620,33 +1620,27 @@ static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
 }
 
-static void cfq_clear_queue_stats(struct cfq_data *cfqd,
-				  struct cfq_queue *cfqq)
-{
-	cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
-	cfqq->slice_start = 0;
-	cfqq->dispatch_start = jiffies;
-	cfqq->allocated_slice = 0;
-	cfqq->slice_end = 0;
-	cfqq->slice_dispatch = 0;
-	cfqq->nr_sectors = 0;
-
-	cfq_clear_cfqq_wait_request(cfqq);
-	cfq_clear_cfqq_must_dispatch(cfqq);
-	cfq_clear_cfqq_must_alloc_slice(cfqq);
-	cfq_clear_cfqq_fifo_expire(cfqq);
-	cfq_mark_cfqq_slice_new(cfqq);
-
-	cfq_del_timer(cfqd, cfqq);
-}
-
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
 				   struct cfq_queue *cfqq)
 {
 	if (cfqq) {
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
 				cfqd->serving_prio, cfqd->serving_type);
-		cfq_clear_queue_stats(cfqd, cfqq);
+		cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
+		cfqq->slice_start = 0;
+		cfqq->dispatch_start = jiffies;
+		cfqq->allocated_slice = 0;
+		cfqq->slice_end = 0;
+		cfqq->slice_dispatch = 0;
+		cfqq->nr_sectors = 0;
+
+		cfq_clear_cfqq_wait_request(cfqq);
+		cfq_clear_cfqq_must_dispatch(cfqq);
+		cfq_clear_cfqq_must_alloc_slice(cfqq);
+		cfq_clear_cfqq_fifo_expire(cfqq);
+		cfq_mark_cfqq_slice_new(cfqq);
+
+		cfq_del_timer(cfqd, cfqq);
 	}
 
 	cfqd->active_queue = cfqq;
@@ -3339,7 +3333,8 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 	cfq_service_tree_add(cfqd, cfqq, 1);
 
-	cfq_clear_queue_stats(cfqd, cfqq);
+	cfqq->slice_end = 0;
+	cfq_mark_cfqq_slice_new(cfqq);
 }
 
 /*

From c4ade94fc00f8b34589719d8a347f658b6d3951e Mon Sep 17 00:00:00 2001
From: "Li, Shaohua" <shaohua.li@intel.com>
Date: Wed, 23 Mar 2011 08:30:34 +0100
Subject: [PATCH 64/65] cfq-iosched: removing unnecessary think time checking

Removing think time checking. A high thinktime queue might means the queue
dispatches several requests and then do away. Limitting such queue seems
meaningless. And also this can simplify code. This is suggested by Vivek.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index fea1b5a9b7e9..7785169f3c8f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2417,19 +2417,14 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 			return false;
 
 		/*
-		 * If there is only one sync queue, and its think time is
-		 * small, we can ignore async queue here and give the sync
+		 * If there is only one sync queue
+		 * we can ignore async queue here and give the sync
 		 * queue no dispatch limit. The reason is a sync queue can
 		 * preempt async queue, limiting the sync queue doesn't make
 		 * sense. This is useful for aiostress test.
 		 */
-		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) {
-			struct cfq_io_context *cic = RQ_CIC(cfqq->next_rq);
-
-			if (sample_valid(cic->ttime_samples) &&
-				cic->ttime_mean < cfqd->cfq_slice_idle)
-				promote_sync = true;
-		}
+		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
+			promote_sync = true;
 
 		/*
 		 * We have other queues, don't allow more IO from this one

From 9d2e157d970a73b3f270b631828e03eb452d525e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 23 Mar 2011 20:44:18 +0100
Subject: [PATCH 65/65] Documentation/iostats.txt: bit-size reference etc.

- correction that disk stats values are native-word-sized
  32-bit or 64-bit values, not always 32-bi values
- drop "Last modified" entry; use git for that
- fix a few typos
- change "cpu" to "CPU"

Reported-by: Linda Walsh <lkml@tlinx.org>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/iostats.txt | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
index f6dece5b7014..c76c21d87e85 100644
--- a/Documentation/iostats.txt
+++ b/Documentation/iostats.txt
@@ -1,8 +1,6 @@
 I/O statistics fields
 ---------------
 
-Last modified Sep 30, 2003
-
 Since 2.4.20 (and some versions before, with patches), and 2.5.45,
 more extensive disk statistics have been introduced to help measure disk
 activity. Tools such as sar and iostat typically interpret these and do
@@ -46,11 +44,12 @@ the above example, the first field of statistics would be 446216.
 By contrast, in 2.6 if you look at /sys/block/hda/stat, you'll
 find just the eleven fields, beginning with 446216.  If you look at
 /proc/diskstats, the eleven fields will be preceded by the major and
-minor device numbers, and device name.  Each of these formats provide
+minor device numbers, and device name.  Each of these formats provides
 eleven fields of statistics, each meaning exactly the same things.
 All fields except field 9 are cumulative since boot.  Field 9 should
-go to zero as I/Os complete; all others only increase.  Yes, these are
-32 bit unsigned numbers, and on a very busy or long-lived system they
+go to zero as I/Os complete; all others only increase (unless they
+overflow and wrap).  Yes, these are (32-bit or 64-bit) unsigned long
+(native word size) numbers, and on a very busy or long-lived system they
 may wrap. Applications should be prepared to deal with that; unless
 your observations are measured in large numbers of minutes or hours,
 they should not wrap twice before you notice them.
@@ -96,11 +95,11 @@ introduced when changes collide, so (for instance) adding up all the
 read I/Os issued per partition should equal those made to the disks ...
 but due to the lack of locking it may only be very close.
 
-In 2.6, there are counters for each cpu, which made the lack of locking
-almost a non-issue.  When the statistics are read, the per-cpu counters
-are summed (possibly overflowing the unsigned 32-bit variable they are
+In 2.6, there are counters for each CPU, which make the lack of locking
+almost a non-issue.  When the statistics are read, the per-CPU counters
+are summed (possibly overflowing the unsigned long variable they are
 summed to) and the result given to the user.  There is no convenient
-user interface for accessing the per-cpu counters themselves.
+user interface for accessing the per-CPU counters themselves.
 
 Disks vs Partitions
 -------------------