blk-mq: init hctx sched after update ctx and hctx mapping

Currently, when updating nr_hw_queues, the IO scheduler's init_hctx is
invoked before the mapping between ctx and hctx has been updated
correctly by blk_mq_map_swqueue. The IO scheduler's init_hctx (kyber)
may depend on this mapping, get a wrong result, and eventually panic.
A simple way to fix this is to switch the IO scheduler to 'none'
before updating nr_hw_queues, and then switch it back after
nr_hw_queues has been updated. blk_mq_sched_init_/exit_hctx are
removed because nobody uses them any more.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Jianchao Wang 2018-08-21 15:15:03 +08:00 коммит произвёл Jens Axboe
Родитель fcedba42d9
Коммит d48ece209f
5 изменённых файлов: 98 добавлений и 65 удалений

Просмотреть файл

@ -462,50 +462,6 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
blk_mq_sched_free_tags(set, hctx, i);
}
/*
 * Set up the scheduler side of a single hardware context.
 *
 * Does nothing and returns 0 when the queue has no elevator attached.
 * Otherwise calls blk_mq_sched_alloc_tags() for the hctx, invokes the
 * elevator's optional ->init_hctx() hook, and finally registers the
 * scheduler debugfs entries for the hctx.
 *
 * Returns 0 on success, or the error reported by the failing step. If the
 * ->init_hctx() hook fails, the tags obtained above are released again.
 */
int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			   unsigned int hctx_idx)
{
	struct elevator_queue *eq = q->elevator;
	int err;

	/* No scheduler on this queue: nothing to initialise. */
	if (!eq)
		return 0;

	err = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
	if (err)
		return err;

	/* The per-hctx init hook is optional for an elevator. */
	if (eq->type->ops.mq.init_hctx) {
		err = eq->type->ops.mq.init_hctx(hctx, hctx_idx);
		if (err)
			goto free_tags;
	}

	blk_mq_debugfs_register_sched_hctx(q, hctx);
	return 0;

free_tags:
	blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
	return err;
}
/*
 * Tear down the scheduler side of a single hardware context.
 *
 * Does nothing when the queue has no elevator attached. Otherwise the
 * scheduler debugfs entries of the hctx are unregistered, the elevator's
 * optional ->exit_hctx() hook is run if the hctx still carries scheduler
 * data, and the hctx's scheduler tags are freed.
 */
void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			    unsigned int hctx_idx)
{
	struct elevator_queue *eq = q->elevator;

	if (eq) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);

		/* Only call the hook when it exists and there is data to drop. */
		if (eq->type->ops.mq.exit_hctx && hctx->sched_data) {
			eq->type->ops.mq.exit_hctx(hctx, hctx_idx);
			/* Clear the pointer so a repeated exit skips the hook. */
			hctx->sched_data = NULL;
		}

		blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
	}
}
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
struct blk_mq_hw_ctx *hctx;

Просмотреть файл

@ -28,11 +28,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx);
void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx);
static inline bool
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{

Просмотреть файл

@ -2147,8 +2147,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (set->ops->exit_request)
set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@ -2216,12 +2214,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
goto free_bitmap;
if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
goto exit_hctx;
hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
if (!hctx->fq)
goto sched_exit_hctx;
goto exit_hctx;
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
goto free_fq;
@ -2235,8 +2230,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
free_fq:
kfree(hctx->fq);
sched_exit_hctx:
blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@ -2898,10 +2891,81 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
/*
 * A (request_queue, elevator_type) pair.
 *
 * Used only while __blk_mq_update_nr_hw_queues runs: one entry is queued
 * for every request_queue whose IO scheduler was switched to 'none', so
 * that the elevator_type it had before can be switched back afterwards.
 */
struct blk_mq_qe_pair {
struct list_head node; /* link in the caller's list of cached pairs */
struct request_queue *q; /* queue whose elevator was switched away */
struct elevator_type *type; /* elevator type the queue used before */
};
/*
 * Cache the elevator_type of @q in the qe pair list @head and switch the
 * io scheduler of @q to 'none'.
 *
 * Returns true when the queue has no elevator (nothing to do) or when the
 * elevator type was recorded and the switch to 'none' was issued. Returns
 * false only when the bookkeeping entry could not be allocated; in that
 * case the queue's elevator is left untouched.
 */
static bool blk_mq_elv_switch_none(struct list_head *head,
		struct request_queue *q)
{
	struct blk_mq_qe_pair *qe;
	bool ret = true;

	/*
	 * elevator_switch_mq() requires q->sysfs_lock to be held while it
	 * replaces q->elevator, so q->elevator is only stable under that
	 * lock. Test and snapshot it with the lock held; otherwise a
	 * concurrent switch could leave us caching a stale type or
	 * dereferencing a NULL elevator.
	 */
	mutex_lock(&q->sysfs_lock);

	if (!q->elevator)
		goto unlock;

	qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
	if (!qe) {
		ret = false;
		goto unlock;
	}

	INIT_LIST_HEAD(&qe->node);
	qe->q = q;
	qe->type = q->elevator->type;
	list_add(&qe->node, head);

	/*
	 * After elevator_switch_mq, the previous elevator_queue will be
	 * released by elevator_release. The reference to the io scheduler
	 * module obtained by elevator_get will also be put. So we need to
	 * take a reference to the io scheduler module here to prevent it
	 * from being removed before we switch back.
	 */
	__module_get(qe->type->elevator_owner);
	elevator_switch_mq(q, NULL);

unlock:
	mutex_unlock(&q->sysfs_lock);
	return ret;
}
static void blk_mq_elv_switch_back(struct list_head *head,
struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
struct elevator_type *t = NULL;
list_for_each_entry(qe, head, node)
if (qe->q == q) {
t = qe->type;
break;
}
if (!t)
return;
list_del(&qe->node);
kfree(qe);
mutex_lock(&q->sysfs_lock);
elevator_switch_mq(q, t);
mutex_unlock(&q->sysfs_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
LIST_HEAD(head);
lockdep_assert_held(&set->tag_list_lock);
@ -2912,6 +2976,14 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue(q);
/*
* Switch IO scheduler to 'none', cleaning up the data associated
* with the previous scheduler. We will switch back once we are done
* updating the new sw to hw queue mappings.
*/
list_for_each_entry(q, &set->tag_list, tag_set_list)
if (!blk_mq_elv_switch_none(&head, q))
goto switch_back;
set->nr_hw_queues = nr_hw_queues;
blk_mq_update_queue_map(set);
@ -2920,6 +2992,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
blk_mq_queue_reinit(q);
}
switch_back:
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_elv_switch_back(&head, q);
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue(q);
}

Просмотреть файл

@ -234,6 +234,8 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
int elevator_init(struct request_queue *);
int elevator_init_mq(struct request_queue *q);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
void elevator_exit(struct request_queue *, struct elevator_queue *);
int elv_register_queue(struct request_queue *q);
void elv_unregister_queue(struct request_queue *q);

Просмотреть файл

@ -933,16 +933,13 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
static int elevator_switch_mq(struct request_queue *q,
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e)
{
int ret;
lockdep_assert_held(&q->sysfs_lock);
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
if (q->elevator) {
if (q->elevator->registered)
elv_unregister_queue(q);
@ -968,8 +965,6 @@ static int elevator_switch_mq(struct request_queue *q,
blk_add_trace_msg(q, "elv switch: none");
out:
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
return ret;
}
@ -1021,8 +1016,17 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
lockdep_assert_held(&q->sysfs_lock);
if (q->mq_ops)
return elevator_switch_mq(q, new_e);
if (q->mq_ops) {
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
err = elevator_switch_mq(q, new_e);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
return err;
}
/*
* Turn on BYPASS and drain all requests w/ elevator private data.