blk-mq: don't allocate driver tag upfront for flush rq

The idea behind it is simple: 1) for none scheduler, driver tag has to be borrowed for flush rq, otherwise we may run out of tag, and that causes an IO hang. And get/put driver tag is actually noop for none, so reordering tags isn't necessary at all. 2) for a real I/O scheduler, we need not allocate a driver tag upfront for flush rq. It works just fine to follow the same approach as normal requests: allocate driver tag for each rq just before calling ->queue_rq(). One driver visible change is that the driver tag isn't shared in the flush request sequence. That won't be a problem, since we always do that in legacy path. Then flush rq need not be treated specially wrt. get/put driver tag. This cleans up the code - for instance, reorder_tags_to_front() can be removed, and we needn't worry about request ordering in dispatch list for avoiding I/O deadlock. Also we have to put the driver tag before requeueing. Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-11-02 23:24:38 +08:00 · 2017-11-02 23:24:38 +08:00 · 923218f616
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 		/* release the tag's ownership to the req cloned from */
 		spin_lock_irqsave(&fq->mq_flush_lock, flags);
 		hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
-		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
-		flush_rq->tag = -1;
+		if (!q->elevator) {
+			blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
+			flush_rq->tag = -1;
+		} else {
+			blk_mq_put_driver_tag_hctx(hctx, flush_rq);
+			flush_rq->internal_tag = -1;
+		}
 	}

 	running = &fq->flush_queue[fq->flush_running_idx];
@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	blk_rq_init(q, flush_rq);

 	/*
-	 * Borrow tag from the first request since they can't
-	 * be in flight at the same time. And acquire the tag's
-	 * ownership for flush req.
+	 * In case of none scheduler, borrow tag from the first request
+	 * since they can't be in flight at the same time. And acquire
+	 * the tag's ownership for flush req.
+	 *
+	 * In case of IO scheduler, flush rq need to borrow scheduler tag
+	 * just for cheating put/get driver tag.
 	 */
 	if (q->mq_ops) {
 		struct blk_mq_hw_ctx *hctx;

 		flush_rq->mq_ctx = first_rq->mq_ctx;
-		flush_rq->tag = first_rq->tag;
-		fq->orig_rq = first_rq;

-		hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
-		blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
+		if (!q->elevator) {
+			fq->orig_rq = first_rq;
+			flush_rq->tag = first_rq->tag;
+			hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
+			blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
+		} else {
+			flush_rq->internal_tag = first_rq->internal_tag;
+		}
 	}

 	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)

 	hctx = blk_mq_map_queue(q, ctx->cpu);

+	if (q->elevator) {
+		WARN_ON(rq->tag < 0);
+		blk_mq_put_driver_tag_hctx(hctx, rq);
+	}
+
 	/*
 	 * After populating an empty queue, kick it to avoid stall.  Read
 	 * the comment in flush_end_io().
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@ -356,29 +356,12 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 		return true;
 	}

-	if (has_sched) {
+	if (has_sched)
 		rq->rq_flags |= RQF_SORTED;
-		WARN_ON(rq->tag != -1);
-	}

 	return false;
 }

-/*
- * Add flush/fua to the queue. If we fail getting a driver tag, then
- * punt to the requeue list. Requeue will re-invoke us from a context
- * that's safe to block from.
- */
-static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
-				      struct request *rq, bool can_block)
-{
-	if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
-		blk_insert_flush(rq);
-		blk_mq_run_hw_queue(hctx, true);
-	} else
-		blk_mq_add_to_requeue_list(rq, false, true);
-}
-
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 				 bool run_queue, bool async, bool can_block)
 {
@ -389,10 +372,12 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,

 	/* flush rq in flush machinery need to be dispatched directly */
 	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
-		blk_mq_sched_insert_flush(hctx, rq, can_block);
-		return;
+		blk_insert_flush(rq);
+		goto run;
 	}

+	WARN_ON(e && (rq->tag != -1));
+
 	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
 		goto run;

@ -419,23 +404,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 	struct elevator_queue *e = hctx->queue->elevator;

-	if (e) {
-		struct request *rq, *next;
-
-		/*
-		 * We bypass requests that already have a driver tag assigned,
-		 * which should only be flushes. Flushes are only ever inserted
-		 * as single requests, so we shouldn't ever hit the
-		 * WARN_ON_ONCE() below (but let's handle it just in case).
-		 */
-		list_for_each_entry_safe(rq, next, list, queuelist) {
-			if (WARN_ON_ONCE(rq->tag != -1)) {
-				list_del_init(&rq->queuelist);
-				blk_mq_sched_bypass_insert(hctx, true, rq);
-			}
-		}
-	}
-
 	if (e && e->type->ops.mq.insert_requests)
 		e->type->ops.mq.insert_requests(hctx, list, false);
 	else
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@ -653,6 +653,8 @@ static void __blk_mq_requeue_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;

+	blk_mq_put_driver_tag(rq);
+
 	trace_block_rq_requeue(q, rq);
 	wbt_requeue(q->rq_wb, &rq->issue_stat);
 	blk_mq_sched_requeue_request(rq);
@ -996,30 +998,6 @@ done:
 	return rq->tag != -1;
 }

-/*
- * If we fail getting a driver tag because all the driver tags are already
- * assigned and on the dispatch list, BUT the first entry does not have a
- * tag, then we could deadlock. For that case, move entries with assigned
- * driver tags to the front, leaving the set of tagged requests in the
- * same order, and the untagged set in the same order.
- */
-static bool reorder_tags_to_front(struct list_head *list)
-{
-	struct request *rq, *tmp, *first = NULL;
-
-	list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
-		if (rq == first)
-			break;
-		if (rq->tag != -1) {
-			list_move(&rq->queuelist, list);
-			if (!first)
-				first = rq;
-		}
-	}
-
-	return first != NULL;
-}
-
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 				void *key)
 {
@ -1080,9 +1058,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,

 		rq = list_first_entry(list, struct request, queuelist);
 		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
-			if (!queued && reorder_tags_to_front(list))
-				continue;
-
 			/*
 			 * The initial allocation attempt failed, so we need to
 			 * rerun the hardware queue when a tag is freed.
@ -1133,7 +1108,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 				nxt = list_first_entry(list, struct request, queuelist);
 				blk_mq_put_driver_tag(nxt);
 			}
-			blk_mq_put_driver_tag_hctx(hctx, rq);
 			list_add(&rq->queuelist, list);
 			__blk_mq_requeue_request(rq);
 			break;
@ -1698,13 +1672,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	if (unlikely(is_flush_fua)) {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
-		if (q->elevator) {
-			blk_mq_sched_insert_request(rq, false, true, true,
-					true);
-		} else {
-			blk_insert_flush(rq);
-			blk_mq_run_hw_queue(data.hctx, true);
-		}
+
+		/* bypass scheduler for flush rq */
+		blk_insert_flush(rq);
+		blk_mq_run_hw_queue(data.hctx, true);
 	} else if (plug && q->nr_hw_queues == 1) {
 		struct request *last = NULL;