2008-01-29 16:51:59 +03:00
|
|
|
#ifndef BLK_INTERNAL_H
|
|
|
|
#define BLK_INTERNAL_H
|
|
|
|
|
2011-12-14 03:33:37 +04:00
|
|
|
#include <linux/idr.h>
|
2014-09-25 19:23:47 +04:00
|
|
|
#include <linux/blk-mq.h>
|
|
|
|
#include "blk-mq.h"
|
2011-12-14 03:33:37 +04:00
|
|
|
|
2008-01-29 16:53:40 +03:00
|
|
|
/* Amount of time in which a process may batch requests */
|
|
|
|
#define BLK_BATCH_TIME (HZ/50UL)
|
|
|
|
|
|
|
|
/* Number of requests a "batching" process may submit */
|
|
|
|
#define BLK_BATCH_REQ 32
|
|
|
|
|
2014-05-14 01:10:52 +04:00
|
|
|
/* Max future timer expiry for timeouts */
|
|
|
|
#define BLK_MAX_TIMEOUT (5 * HZ)
|
|
|
|
|
2014-09-25 19:23:43 +04:00
|
|
|
struct blk_flush_queue {
|
|
|
|
unsigned int flush_queue_delayed:1;
|
|
|
|
unsigned int flush_pending_idx:1;
|
|
|
|
unsigned int flush_running_idx:1;
|
|
|
|
unsigned long flush_pending_since;
|
|
|
|
struct list_head flush_queue[2];
|
|
|
|
struct list_head flush_data_in_flight;
|
|
|
|
struct request *flush_rq;
|
2015-08-09 10:41:51 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* flush_rq shares tag with this rq, both can't be active
|
|
|
|
* at the same time
|
|
|
|
*/
|
|
|
|
struct request *orig_rq;
|
2014-09-25 19:23:43 +04:00
|
|
|
spinlock_t mq_flush_lock;
|
|
|
|
};
|
|
|
|
|
2008-01-29 16:51:59 +03:00
|
|
|
extern struct kmem_cache *blk_requestq_cachep;
|
blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices:
- The classic request_fn based approach, where drivers use struct
request units for IO. The block layer provides various helper
functionalities to let drivers share code, things like tag
management, timeout handling, queueing, etc.
- The "stacked" approach, where a driver squeezes in between the
block layer and IO submitter. Since this bypasses the IO stack,
driver generally have to manage everything themselves.
With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS was rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.
The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and along with that we get all the problems again
that the shared approach solved.
This commit introduces blk-mq, block multi queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into x number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.
blk-mq provides various helper functions, which include:
- Scalable support for request tagging. Most devices need to
be able to uniquely identify a request both in the driver and
to the hardware. The tagging uses per-cpu caches for freed
tags, to enable cache hot reuse.
- Timeout handling without tracking request on a per-device
basis. Basically the driver should be able to get a notification,
if a request happens to fail.
- Optional support for non 1:1 mappings between issue and
submission queues. blk-mq can redirect IO completions to the
desired location.
- Support for per-request payloads. Drivers almost always need
to associate a request structure with some driver private
command structure. Drivers can tell blk-mq this at init time,
and then any request handed to the driver will have the
required size of memory associated with it.
- Support for merging of IO, and plugging. The stacked model
gets neither of these. Even for high IOPS devices, merging
sequential IO reduces per-command overhead and thus
increases bandwidth.
For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked model. That would get us back to having just 1 real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).
Contributions in this patch from the following people:
Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2013-10-24 12:20:05 +04:00
|
|
|
extern struct kmem_cache *request_cachep;
|
2008-01-29 16:51:59 +03:00
|
|
|
extern struct kobj_type blk_queue_ktype;
|
2011-12-14 03:33:37 +04:00
|
|
|
extern struct ida blk_queue_ida;
|
2008-01-29 16:51:59 +03:00
|
|
|
|
2014-09-25 19:23:43 +04:00
|
|
|
static inline struct blk_flush_queue *blk_get_flush_queue(
|
2014-09-25 19:23:46 +04:00
|
|
|
struct request_queue *q, struct blk_mq_ctx *ctx)
|
2014-09-25 19:23:43 +04:00
|
|
|
{
|
2016-09-14 17:18:54 +03:00
|
|
|
if (q->mq_ops)
|
|
|
|
return blk_mq_map_queue(q, ctx->cpu)->fq;
|
|
|
|
return q->fq;
|
2014-09-25 19:23:43 +04:00
|
|
|
}
|
|
|
|
|
2011-12-14 03:33:38 +04:00
|
|
|
static inline void __blk_get_queue(struct request_queue *q)
|
|
|
|
{
|
|
|
|
kobject_get(&q->kobj);
|
|
|
|
}
|
|
|
|
|
2014-09-25 19:23:47 +04:00
|
|
|
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
|
|
|
|
int node, int cmd_size);
|
|
|
|
void blk_free_flush_queue(struct blk_flush_queue *q);
|
2014-09-25 19:23:40 +04:00
|
|
|
|
2012-06-05 07:40:59 +04:00
|
|
|
int blk_init_rl(struct request_list *rl, struct request_queue *q,
|
|
|
|
gfp_t gfp_mask);
|
|
|
|
void blk_exit_rl(struct request_list *rl);
|
2008-01-29 16:53:40 +03:00
|
|
|
void init_request_from_bio(struct request *req, struct bio *bio);
|
|
|
|
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
|
|
|
|
struct bio *bio);
|
2012-03-06 01:14:58 +04:00
|
|
|
void blk_queue_bypass_start(struct request_queue *q);
|
|
|
|
void blk_queue_bypass_end(struct request_queue *q);
|
2009-05-08 06:54:16 +04:00
|
|
|
void blk_dequeue_request(struct request *rq);
|
2008-01-29 16:51:59 +03:00
|
|
|
void __blk_queue_free_tags(struct request_queue *q);
|
block: fix flush machinery for stacking drivers with differring flush flags
Commit ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae, block: reimplement
FLUSH/FUA to support merge, introduced a performance regression when
running any sort of fsyncing workload using dm-multipath and certain
storage (in our case, an HP EVA). The test I ran was fs_mark, and it
dropped from ~800 files/sec on ext4 to ~100 files/sec. It turns out
that dm-multipath always advertised flush+fua support, and passed
commands on down the stack, where those flags used to get stripped off.
The above commit changed that behavior:
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
while (1) {
- while (!list_empty(&q->queue_head)) {
+ if (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
- if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
- (rq->cmd_flags & REQ_FLUSH_SEQ))
- return rq;
- rq = blk_do_flush(q, rq);
- if (rq)
- return rq;
+ return rq;
}
Note that previously, a command would come in here, have
REQ_FLUSH|REQ_FUA set, and then get handed off to blk_do_flush:
struct request *blk_do_flush(struct request_queue *q, struct request *rq)
{
unsigned int fflags = q->flush_flags; /* may change, cache it */
bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
bool do_postflush = has_flush && !has_fua && (rq->cmd_flags &
REQ_FUA);
unsigned skip = 0;
...
if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
rq->cmd_flags &= ~REQ_FLUSH;
if (!has_fua)
rq->cmd_flags &= ~REQ_FUA;
return rq;
}
So, the flush machinery was bypassed in such cases (q->flush_flags == 0
&& rq->cmd_flags & (REQ_FLUSH|REQ_FUA)).
Now, however, we don't get into the flush machinery at all. Instead,
__elv_next_request just hands a request with flush and fua bits set to
the scsi_request_fn, even if the underlying request_queue does not
support flush or fua.
The agreed upon approach is to fix the flush machinery to allow
stacking. While this isn't used in practice (since there is only one
request-based dm target, and that target will now reflect the flush
flags of the underlying device), it does future-proof the solution, and
make it function as designed.
In order to make this work, I had to add a field to the struct request,
inside the flush structure (to store the original req->end_io). Shaohua
had suggested overloading the union with rb_node and completion_data,
but the completion data is used by device mapper and can also be used by
other drivers. So, I didn't see a way around the additional field.
I tested this patch on an HP EVA with both ext4 and xfs, and it recovers
the lost performance. Comments and other testers, as always, are
appreciated.
Cheers,
Jeff
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2011-08-15 23:37:25 +04:00
|
|
|
bool __blk_end_bidi_request(struct request *rq, int error,
|
|
|
|
unsigned int nr_bytes, unsigned int bidi_bytes);
|
2015-10-21 20:20:12 +03:00
|
|
|
void blk_freeze_queue(struct request_queue *q);
|
|
|
|
|
|
|
|
static inline void blk_queue_enter_live(struct request_queue *q)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Given that running in generic_make_request() context
|
|
|
|
* guarantees that a live reference against q_usage_counter has
|
|
|
|
* been established, further references under that same context
|
|
|
|
* need not check that the queue has been frozen (marked dead).
|
|
|
|
*/
|
|
|
|
percpu_ref_get(&q->q_usage_counter);
|
|
|
|
}
|
2008-01-29 16:51:59 +03:00
|
|
|
|
2015-10-21 20:20:23 +03:00
|
|
|
/*
 * blk_flush_integrity() is implemented elsewhere when
 * CONFIG_BLK_DEV_INTEGRITY is defined; otherwise it is an empty stub so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
#else
static inline void blk_flush_integrity(void)
{
}
#endif
|
2008-01-29 16:51:59 +03:00
|
|
|
|
2015-10-30 15:57:30 +03:00
|
|
|
void blk_timeout_work(struct work_struct *work);
|
2014-05-14 01:10:52 +04:00
|
|
|
unsigned long blk_rq_timeout(unsigned long timeout);
|
2014-04-24 18:51:47 +04:00
|
|
|
void blk_add_timer(struct request *req);
|
2008-09-14 16:55:09 +04:00
|
|
|
void blk_delete_timer(struct request *);
|
|
|
|
|
blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices:
- The classic request_fn based approach, where drivers use struct
request units for IO. The block layer provides various helper
functionalities to let drivers share code, things like tag
management, timeout handling, queueing, etc.
- The "stacked" approach, where a driver squeezes in between the
block layer and IO submitter. Since this bypasses the IO stack,
driver generally have to manage everything themselves.
With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS was rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.
The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and along with that we get all the problems again
that the shared approach solved.
This commit introduces blk-mq, block multi queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into x number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.
blk-mq provides various helper functions, which include:
- Scalable support for request tagging. Most devices need to
be able to uniquely identify a request both in the driver and
to the hardware. The tagging uses per-cpu caches for freed
tags, to enable cache hot reuse.
- Timeout handling without tracking request on a per-device
basis. Basically the driver should be able to get a notification,
if a request happens to fail.
- Optional support for non 1:1 mappings between issue and
submission queues. blk-mq can redirect IO completions to the
desired location.
- Support for per-request payloads. Drivers almost always need
to associate a request structure with some driver private
command structure. Drivers can tell blk-mq this at init time,
and then any request handed to the driver will have the
required size of memory associated with it.
- Support for merging of IO, and plugging. The stacked model
gets neither of these. Even for high IOPS devices, merging
sequential IO reduces per-command overhead and thus
increases bandwidth.
For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked model. That would get us back to having just 1 real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).
Contributions in this patch from the following people:
Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2013-10-24 12:20:05 +04:00
|
|
|
|
|
|
|
bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
|
|
|
|
struct bio *bio);
|
|
|
|
bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
|
|
|
|
struct bio *bio);
|
|
|
|
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
|
2015-05-08 20:51:33 +03:00
|
|
|
unsigned int *request_count,
|
|
|
|
struct request **same_queue_rq);
|
2015-10-20 18:13:51 +03:00
|
|
|
unsigned int blk_plug_queued_count(struct request_queue *q);
|
blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices:
- The classic request_fn based approach, where drivers use struct
request units for IO. The block layer provides various helper
functionalities to let drivers share code, things like tag
management, timeout handling, queueing, etc.
- The "stacked" approach, where a driver squeezes in between the
block layer and IO submitter. Since this bypasses the IO stack,
driver generally have to manage everything themselves.
With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS was rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.
The stacked approach is then most often selected as the model
for the driver. But this means that everybody has to re-invent
everything, and along with that we get all the problems again
that the shared approach solved.
This commit introduces blk-mq, block multi queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into x number of hardware submission queues.
We might have a 1:1 mapping between the two, or it might be
an N:M mapping. That all depends on what the hardware supports.
blk-mq provides various helper functions, which include:
- Scalable support for request tagging. Most devices need to
be able to uniquely identify a request both in the driver and
to the hardware. The tagging uses per-cpu caches for freed
tags, to enable cache hot reuse.
- Timeout handling without tracking request on a per-device
basis. Basically the driver should be able to get a notification,
if a request happens to fail.
- Optional support for non 1:1 mappings between issue and
submission queues. blk-mq can redirect IO completions to the
desired location.
- Support for per-request payloads. Drivers almost always need
to associate a request structure with some driver private
command structure. Drivers can tell blk-mq this at init time,
and then any request handed to the driver will have the
required size of memory associated with it.
- Support for merging of IO, and plugging. The stacked model
gets neither of these. Even for high IOPS devices, merging
sequential IO reduces per-command overhead and thus
increases bandwidth.
For now, this is provided as a potential 3rd queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked model. That would get us back to having just 1 real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).
Contributions in this patch from the following people:
Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2013-10-24 12:20:05 +04:00
|
|
|
|
|
|
|
void blk_account_io_start(struct request *req, bool new_io);
|
|
|
|
void blk_account_io_completion(struct request *req, unsigned int bytes);
|
|
|
|
void blk_account_io_done(struct request *req);
|
|
|
|
|
2008-09-14 16:55:09 +04:00
|
|
|
/*
 * Internal atomic flags for request handling.
 *
 * The values are bit numbers, not masks: blk_mark_rq_complete() and
 * blk_clear_rq_complete() pass REQ_ATOM_COMPLETE to the atomic bitops
 * operating on rq->atomic_flags.
 */
enum rq_atomic_flags {
	REQ_ATOM_COMPLETE = 0,
	REQ_ATOM_STARTED,
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* EH timer and IO completion will both attempt to 'grab' the request, make
|
2011-03-31 05:57:33 +04:00
|
|
|
* sure that only one of them succeeds
|
2008-09-14 16:55:09 +04:00
|
|
|
*/
|
|
|
|
static inline int blk_mark_rq_complete(struct request *rq)
|
|
|
|
{
|
|
|
|
return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void blk_clear_rq_complete(struct request *rq)
|
|
|
|
{
|
|
|
|
clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
|
|
|
|
}
|
2008-01-29 16:53:40 +03:00
|
|
|
|
2009-04-23 06:05:18 +04:00
|
|
|
/*
|
|
|
|
* Internal elevator interface
|
|
|
|
*/
|
2014-04-10 06:27:01 +04:00
|
|
|
#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED)
|
2009-04-23 06:05:18 +04:00
|
|
|
|
2011-01-25 14:43:54 +03:00
|
|
|
void blk_insert_flush(struct request *rq);
|
2010-09-03 13:56:16 +04:00
|
|
|
|
2009-04-23 06:05:18 +04:00
|
|
|
static inline struct request *__elv_next_request(struct request_queue *q)
|
|
|
|
{
|
|
|
|
struct request *rq;
|
2014-09-25 19:23:46 +04:00
|
|
|
struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
|
2009-04-23 06:05:18 +04:00
|
|
|
|
|
|
|
while (1) {
|
2011-01-25 14:43:54 +03:00
|
|
|
if (!list_empty(&q->queue_head)) {
|
2009-04-23 06:05:18 +04:00
|
|
|
rq = list_entry_rq(q->queue_head.next);
|
2011-01-25 14:43:54 +03:00
|
|
|
return rq;
|
2009-04-23 06:05:18 +04:00
|
|
|
}
|
|
|
|
|
block: hold queue if flush is running for non-queueable flush drive
In some drives, flush requests are non-queueable. When flush request is
running, normal read/write requests can't run. If block layer dispatches
such request, driver can't handle it and requeue it. Tejun suggested we
can hold the queue when flush is running. This can avoid unnecessary
requeue. Also this can improve performance. For example, we have
request flush1, write1, flush 2. flush1 is dispatched, then queue is
hold, write1 isn't inserted to queue. After flush1 is finished, flush2
will be dispatched. Since disk cache is already clean, flush2 will be
finished very soon, so looks like flush2 is folded to flush1.
In my test, the queue holding completely solves a regression introduced by
commit 53d63e6b0dfb95882ec0219ba6bbd50cde423794:
block: make the flush insertion use the tail of the dispatch list
It's not a preempt type request, in fact we have to insert it
behind requests that do specify INSERT_FRONT.
which causes about 20% regression running a sysbench fileio
workload.
Stable: 2.6.39 only
Cc: stable@kernel.org
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2011-05-06 21:34:41 +04:00
|
|
|
/*
|
|
|
|
* Flush request is running and flush request isn't queueable
|
|
|
|
* in the drive, we can hold the queue till flush request is
|
|
|
|
* finished. Even we don't do this, driver can't dispatch next
|
|
|
|
* requests and will requeue them. And this can improve
|
|
|
|
* throughput too. For example, we have request flush1, write1,
|
|
|
|
* flush 2. flush1 is dispatched, then queue is hold, write1
|
|
|
|
* isn't inserted to queue. After flush1 is finished, flush2
|
|
|
|
* will be dispatched. Since disk cache is already clean,
|
|
|
|
* flush2 will be finished very soon, so looks like flush2 is
|
|
|
|
* folded to flush1.
|
|
|
|
* Since the queue is hold, a flag is set to indicate the queue
|
|
|
|
* should be restarted later. Please see flush_end_io() for
|
|
|
|
* details.
|
|
|
|
*/
|
2014-09-25 19:23:43 +04:00
|
|
|
if (fq->flush_pending_idx != fq->flush_running_idx &&
|
block: hold queue if flush is running for non-queueable flush drive
In some drives, flush requests are non-queueable. When flush request is
running, normal read/write requests can't run. If block layer dispatches
such request, driver can't handle it and requeue it. Tejun suggested we
can hold the queue when flush is running. This can avoid unnecessary
requeue. Also this can improve performance. For example, we have
request flush1, write1, flush 2. flush1 is dispatched, then queue is
hold, write1 isn't inserted to queue. After flush1 is finished, flush2
will be dispatched. Since disk cache is already clean, flush2 will be
finished very soon, so looks like flush2 is folded to flush1.
In my test, the queue holding completely solves a regression introduced by
commit 53d63e6b0dfb95882ec0219ba6bbd50cde423794:
block: make the flush insertion use the tail of the dispatch list
It's not a preempt type request, in fact we have to insert it
behind requests that do specify INSERT_FRONT.
which causes about 20% regression running a sysbench fileio
workload.
Stable: 2.6.39 only
Cc: stable@kernel.org
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2011-05-06 21:34:41 +04:00
|
|
|
!queue_flush_queueable(q)) {
|
2014-09-25 19:23:43 +04:00
|
|
|
fq->flush_queue_delayed = 1;
|
block: hold queue if flush is running for non-queueable flush drive
In some drives, flush requests are non-queueable. When flush request is
running, normal read/write requests can't run. If block layer dispatches
such request, driver can't handle it and requeue it. Tejun suggested we
can hold the queue when flush is running. This can avoid unnecessary
requeue. Also this can improve performance. For example, we have
request flush1, write1, flush 2. flush1 is dispatched, then queue is
hold, write1 isn't inserted to queue. After flush1 is finished, flush2
will be dispatched. Since disk cache is already clean, flush2 will be
finished very soon, so looks like flush2 is folded to flush1.
In my test, the queue holding completely solves a regression introduced by
commit 53d63e6b0dfb95882ec0219ba6bbd50cde423794:
block: make the flush insertion use the tail of the dispatch list
It's not a preempt type request, in fact we have to insert it
behind requests that do specify INSERT_FRONT.
which causes about 20% regression running a sysbench fileio
workload.
Stable: 2.6.39 only
Cc: stable@kernel.org
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2011-05-06 21:34:41 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
2014-01-30 01:56:16 +04:00
|
|
|
if (unlikely(blk_queue_bypass(q)) ||
|
2011-12-14 03:33:41 +04:00
|
|
|
!q->elevator->type->ops.elevator_dispatch_fn(q, 0))
|
2009-04-23 06:05:18 +04:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
|
|
|
|
{
|
|
|
|
struct elevator_queue *e = q->elevator;
|
|
|
|
|
2011-12-14 03:33:41 +04:00
|
|
|
if (e->type->ops.elevator_activate_req_fn)
|
|
|
|
e->type->ops.elevator_activate_req_fn(q, rq);
|
2009-04-23 06:05:18 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
|
|
|
|
{
|
|
|
|
struct elevator_queue *e = q->elevator;
|
|
|
|
|
2011-12-14 03:33:41 +04:00
|
|
|
if (e->type->ops.elevator_deactivate_req_fn)
|
|
|
|
e->type->ops.elevator_deactivate_req_fn(q, rq);
|
2009-04-23 06:05:18 +04:00
|
|
|
}
|
|
|
|
|
2008-09-14 16:56:33 +04:00
|
|
|
/*
 * Timeout fault-injection hooks.  When CONFIG_FAIL_IO_TIMEOUT is defined
 * the implementations live elsewhere; otherwise blk_should_fake_timeout()
 * is a stub that always returns 0 and the part_timeout_* handlers are not
 * declared at all.
 */
#ifdef CONFIG_FAIL_IO_TIMEOUT
int blk_should_fake_timeout(struct request_queue *);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
				const char *, size_t);
#else
static inline int blk_should_fake_timeout(struct request_queue *q)
{
	return 0;
}
#endif
|
|
|
|
|
2008-01-29 16:04:06 +03:00
|
|
|
int ll_back_merge_fn(struct request_queue *q, struct request *req,
|
|
|
|
struct bio *bio);
|
|
|
|
int ll_front_merge_fn(struct request_queue *q, struct request *req,
|
|
|
|
struct bio *bio);
|
|
|
|
int attempt_back_merge(struct request_queue *q, struct request *rq);
|
|
|
|
int attempt_front_merge(struct request_queue *q, struct request *rq);
|
2011-03-21 12:14:27 +03:00
|
|
|
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
|
|
|
|
struct request *next);
|
2008-01-29 16:04:06 +03:00
|
|
|
void blk_recalc_rq_segments(struct request *rq);
|
2009-07-03 12:48:17 +04:00
|
|
|
void blk_rq_set_mixed_merge(struct request *rq);
|
2012-02-08 12:19:38 +04:00
|
|
|
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
|
|
|
|
int blk_try_merge(struct request *rq, struct bio *bio);
|
2008-01-29 16:04:06 +03:00
|
|
|
|
2008-01-29 16:51:59 +03:00
|
|
|
void blk_queue_congestion_threshold(struct request_queue *q);
|
|
|
|
|
2008-03-04 13:23:45 +03:00
|
|
|
int blk_dev_init(void);
|
|
|
|
|
2010-10-25 00:06:02 +04:00
|
|
|
|
2008-01-29 16:51:59 +03:00
|
|
|
/*
|
|
|
|
* Return the threshold (number of used requests) at which the queue is
|
|
|
|
* considered to be congested. It include a little hysteresis to keep the
|
|
|
|
* context switch rate down.
|
|
|
|
*/
|
|
|
|
static inline int queue_congestion_on_threshold(struct request_queue *q)
|
|
|
|
{
|
|
|
|
return q->nr_congestion_on;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The threshold at which a queue is considered to be uncongested
|
|
|
|
*/
|
|
|
|
static inline int queue_congestion_off_threshold(struct request_queue *q)
|
|
|
|
{
|
|
|
|
return q->nr_congestion_off;
|
|
|
|
}
|
|
|
|
|
2014-05-20 21:49:02 +04:00
|
|
|
extern int blk_update_nr_requests(struct request_queue *, unsigned int);
|
|
|
|
|
2009-04-24 10:10:11 +04:00
|
|
|
/*
|
|
|
|
* Contribute to IO statistics IFF:
|
|
|
|
*
|
|
|
|
* a) it's attached to a gendisk, and
|
|
|
|
* b) the queue had IO stats enabled when this request was started, and
|
2012-09-18 20:19:25 +04:00
|
|
|
* c) it's a file system request
|
2009-04-24 10:10:11 +04:00
|
|
|
*/
|
2009-03-27 12:31:51 +03:00
|
|
|
static inline int blk_do_io_stat(struct request *rq)
|
2009-02-02 10:42:32 +03:00
|
|
|
{
|
2010-08-07 20:17:56 +04:00
|
|
|
return rq->rq_disk &&
|
|
|
|
(rq->cmd_flags & REQ_IO_STAT) &&
|
2012-09-18 20:19:25 +04:00
|
|
|
(rq->cmd_type == REQ_TYPE_FS);
|
2009-02-02 10:42:32 +03:00
|
|
|
}
|
|
|
|
|
2011-12-14 03:33:40 +04:00
|
|
|
/*
|
|
|
|
* Internal io_context interface
|
|
|
|
*/
|
|
|
|
void get_io_context(struct io_context *ioc);
|
2011-12-14 03:33:42 +04:00
|
|
|
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
|
2012-03-06 01:15:24 +04:00
|
|
|
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
|
|
|
|
gfp_t gfp_mask);
|
2011-12-14 03:33:42 +04:00
|
|
|
void ioc_clear_queue(struct request_queue *q);
|
2011-12-14 03:33:40 +04:00
|
|
|
|
2012-03-06 01:15:24 +04:00
|
|
|
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
|
2011-12-14 03:33:40 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* create_io_context - try to create task->io_context
|
|
|
|
* @gfp_mask: allocation mask
|
|
|
|
* @node: allocation node
|
|
|
|
*
|
2012-03-06 01:15:24 +04:00
|
|
|
* If %current->io_context is %NULL, allocate a new io_context and install
|
|
|
|
* it. Returns the current %current->io_context which may be %NULL if
|
|
|
|
* allocation failed.
|
2011-12-14 03:33:40 +04:00
|
|
|
*
|
|
|
|
* Note that this function can't be called with IRQ disabled because
|
2012-03-06 01:15:24 +04:00
|
|
|
* task_lock which protects %current->io_context is IRQ-unsafe.
|
2011-12-14 03:33:40 +04:00
|
|
|
*/
|
2012-03-06 01:15:24 +04:00
|
|
|
static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
|
2011-12-14 03:33:40 +04:00
|
|
|
{
|
|
|
|
/*
 * Per the kernel-doc for this function, task_lock is IRQ-unsafe, so
 * warn (once) about any caller that arrives with IRQs disabled.
 */
WARN_ON_ONCE(irqs_disabled());
|
2012-03-06 01:15:24 +04:00
|
|
|
/*
 * Only attempt creation when no io_context exists yet. The return value
 * of create_task_io_context() is not checked here: the function simply
 * returns whatever current->io_context holds afterwards, which the
 * kernel-doc says may be NULL if allocation failed.
 */
if (unlikely(!current->io_context))
|
|
|
|
create_task_io_context(current, gfp_mask, node);
|
|
|
|
return current->io_context;
|
2011-12-14 03:33:40 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Internal throttling interface
|
|
|
|
*/
|
2011-10-19 16:31:18 +04:00
|
|
|
#ifdef CONFIG_BLK_DEV_THROTTLING
|
block: fix request_queue lifetime handling by making blk_queue_cleanup() properly shutdown
request_queue is refcounted but actually depends on lifetime
management from the queue owner - on blk_cleanup_queue(), block layer
expects that there's no request passing through request_queue and no
new one will.
This is fundamentally broken. The queue owner (e.g. SCSI layer)
doesn't have a way to know whether there are other active users before
calling blk_cleanup_queue() and other users (e.g. bsg) don't have any
guarantee that the queue is and would stay valid while it's holding a
reference.
With delay added in blk_queue_bio() before queue_lock is grabbed, the
following oops can be easily triggered when a device is removed with
in-flight IOs.
sd 0:0:1:0: [sdb] Stopping disk
ata1.01: disabled
general protection fault: 0000 [#1] PREEMPT SMP
CPU 2
Modules linked in:
Pid: 648, comm: test_rawio Not tainted 3.1.0-rc3-work+ #56 Bochs Bochs
RIP: 0010:[<ffffffff8137d651>] [<ffffffff8137d651>] elv_rqhash_find+0x61/0x100
...
Process test_rawio (pid: 648, threadinfo ffff880019efa000, task ffff880019ef8a80)
...
Call Trace:
[<ffffffff8137d774>] elv_merge+0x84/0xe0
[<ffffffff81385b54>] blk_queue_bio+0xf4/0x400
[<ffffffff813838ea>] generic_make_request+0xca/0x100
[<ffffffff81383994>] submit_bio+0x74/0x100
[<ffffffff811c53ec>] dio_bio_submit+0xbc/0xc0
[<ffffffff811c610e>] __blockdev_direct_IO+0x92e/0xb40
[<ffffffff811c39f7>] blkdev_direct_IO+0x57/0x60
[<ffffffff8113b1c5>] generic_file_aio_read+0x6d5/0x760
[<ffffffff8118c1ca>] do_sync_read+0xda/0x120
[<ffffffff8118ce55>] vfs_read+0xc5/0x180
[<ffffffff8118cfaa>] sys_pread64+0x9a/0xb0
[<ffffffff81afaf6b>] system_call_fastpath+0x16/0x1b
This happens because blk_queue_cleanup() destroys the queue and
elevator whether IOs are in progress or not and DEAD tests are
sprinkled in the request processing path without proper
synchronization.
Similar problem exists for blk-throtl. On queue cleanup, blk-throtl
is shutdown whether it has requests in it or not. Depending on
timing, it either oopses or throttled bios are lost putting tasks
which are waiting for bio completion into eternal D state.
The way it should work is having the usual clear distinction between
shutdown and release. Shutdown drains all currently pending requests,
marks the queue dead, and performs partial teardown of the now
unnecessary part of the queue. Even after shutdown is complete,
reference holders are still allowed to issue requests to the queue
although they will be immediately failed. The rest of teardown
happens on release.
This patch makes the following changes to make blk_queue_cleanup()
behave as proper shutdown.
* QUEUE_FLAG_DEAD is now set while holding both q->exit_mutex and
queue_lock.
* Unsynchronized DEAD check in generic_make_request_checks() removed.
This couldn't make any meaningful difference as the queue could die
after the check.
* blk_drain_queue() updated such that it can drain all requests and is
now called during cleanup.
* blk_throtl updated such that it checks DEAD on grabbing queue_lock,
drains all throttled bios during cleanup and free td when queue is
released.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2011-10-19 16:42:16 +04:00
|
|
|
/*
 * Declarations used when CONFIG_BLK_DEV_THROTTLING is set. The #else
 * branch of this conditional supplies inline stubs with the same
 * signatures, so the three names exist in either configuration.
 */
extern void blk_throtl_drain(struct request_queue *q);
|
2011-10-19 16:31:18 +04:00
|
|
|
extern int blk_throtl_init(struct request_queue *q);
|
|
|
|
extern void blk_throtl_exit(struct request_queue *q);
|
|
|
|
#else /* CONFIG_BLK_DEV_THROTTLING */
|
block: fix request_queue lifetime handling by making blk_queue_cleanup() properly shutdown
request_queue is refcounted but actually depends on lifetime
management from the queue owner - on blk_cleanup_queue(), block layer
expects that there's no request passing through request_queue and no
new one will.
This is fundamentally broken. The queue owner (e.g. SCSI layer)
doesn't have a way to know whether there are other active users before
calling blk_cleanup_queue() and other users (e.g. bsg) don't have any
guarantee that the queue is and would stay valid while it's holding a
reference.
With delay added in blk_queue_bio() before queue_lock is grabbed, the
following oops can be easily triggered when a device is removed with
in-flight IOs.
sd 0:0:1:0: [sdb] Stopping disk
ata1.01: disabled
general protection fault: 0000 [#1] PREEMPT SMP
CPU 2
Modules linked in:
Pid: 648, comm: test_rawio Not tainted 3.1.0-rc3-work+ #56 Bochs Bochs
RIP: 0010:[<ffffffff8137d651>] [<ffffffff8137d651>] elv_rqhash_find+0x61/0x100
...
Process test_rawio (pid: 648, threadinfo ffff880019efa000, task ffff880019ef8a80)
...
Call Trace:
[<ffffffff8137d774>] elv_merge+0x84/0xe0
[<ffffffff81385b54>] blk_queue_bio+0xf4/0x400
[<ffffffff813838ea>] generic_make_request+0xca/0x100
[<ffffffff81383994>] submit_bio+0x74/0x100
[<ffffffff811c53ec>] dio_bio_submit+0xbc/0xc0
[<ffffffff811c610e>] __blockdev_direct_IO+0x92e/0xb40
[<ffffffff811c39f7>] blkdev_direct_IO+0x57/0x60
[<ffffffff8113b1c5>] generic_file_aio_read+0x6d5/0x760
[<ffffffff8118c1ca>] do_sync_read+0xda/0x120
[<ffffffff8118ce55>] vfs_read+0xc5/0x180
[<ffffffff8118cfaa>] sys_pread64+0x9a/0xb0
[<ffffffff81afaf6b>] system_call_fastpath+0x16/0x1b
This happens because blk_queue_cleanup() destroys the queue and
elevator whether IOs are in progress or not and DEAD tests are
sprinkled in the request processing path without proper
synchronization.
Similar problem exists for blk-throtl. On queue cleanup, blk-throtl
is shutdown whether it has requests in it or not. Depending on
timing, it either oopses or throttled bios are lost putting tasks
which are waiting for bio completion into eternal D state.
The way it should work is having the usual clear distinction between
shutdown and release. Shutdown drains all currently pending requests,
marks the queue dead, and performs partial teardown of the now
unnecessary part of the queue. Even after shutdown is complete,
reference holders are still allowed to issue requests to the queue
although they will be immediately failed. The rest of teardown
happens on release.
This patch makes the following changes to make blk_queue_cleanup()
behave as proper shutdown.
* QUEUE_FLAG_DEAD is now set while holding both q->exit_mutex and
queue_lock.
* Unsynchronized DEAD check in generic_make_request_checks() removed.
This couldn't make any meaningful difference as the queue could die
after the check.
* blk_drain_queue() updated such that it can drain all requests and is
now called during cleanup.
* blk_throtl updated such that it checks DEAD on grabbing queue_lock,
drains all throttled bios during cleanup and free td when queue is
released.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2011-10-19 16:42:16 +04:00
|
|
|
/*
 * Stubs for builds without CONFIG_BLK_DEV_THROTTLING: drain and exit have
 * empty bodies and init returns 0, so callers can invoke all three
 * unconditionally.
 */
static inline void blk_throtl_drain(struct request_queue *q) { }
|
2011-10-19 16:31:18 +04:00
|
|
|
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
|
|
|
|
static inline void blk_throtl_exit(struct request_queue *q) { }
|
|
|
|
#endif /* CONFIG_BLK_DEV_THROTTLING */
|
|
|
|
|
|
|
|
#endif /* BLK_INTERNAL_H */
|