md/raid10: fix task hung in raid10d
commit fe630de009 ("md/raid10: avoid deadlock on recovery.") allowed
normal io and sync io to exist at the same time. A task hang will occur
as shown below:

T1                      T2                      T3                        T4
raid10d
 handle_read_error
  allow_barrier
   conf->nr_pending--
    -> 0
                        //submit sync io
                        raid10_sync_request
                         raise_barrier
                          ->will not be blocked
                          ...
  //submit to drivers
  raid10_read_request
   wait_barrier
    conf->nr_pending++
     -> 1
                                                //retry read fail
                                                raid10_end_read_request
                                                 reschedule_retry
                                                  add to retry_list
                                                  conf->nr_queued++
                                                   -> 1
                                                                          //sync io fail
                                                                          end_sync_read
                                                                           __end_sync_read
                                                                            reschedule_retry
                                                                             add to retry_list
                                                                             conf->nr_queued++
                                                                              -> 2
 ...
 handle_read_error
  get from retry_list
  conf->nr_queued--
   freeze_array
    wait nr_pending == nr_queued+1
              ->1          ->2
    //task hung

The retried read and the sync io will both be added to retry_list
(nr_queued -> 2) if they fail. raid10d() called handle_read_error() and
hung in freeze_array(). nr_queued will not decrease because raid10d is
blocked, and nr_pending will not increase because conf->barrier is not
released.

Fix it by moving allow_barrier() after raid10_read_request().
raise_barrier() will wait for nr_waiting to become 0. Therefore, sync io
and regular io will not be issued at the same time.

Also remove the check of nr_queued in stop_waiting_barrier(). It can be 0
but doesn't need to block. Remove the check for MD_RECOVERY_RUNNING as
the check is redundant.

Fixes: fe630de009 ("md/raid10: avoid deadlock on recovery.")
Signed-off-by: Li Nan <linan122@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230222041000.3341651-2-linan666@huaweicloud.com
This commit is contained in:
Родитель
bb4c19e030
Коммит
72c215ed87
|
@ -995,11 +995,15 @@ static bool stop_waiting_barrier(struct r10conf *conf)
|
||||||
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
|
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
/* move on if recovery thread is blocked by us */
|
/*
|
||||||
if (conf->mddev->thread->tsk == current &&
|
* move on if io is issued from raid10d(), nr_pending is not released
|
||||||
test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
|
* from original io(see handle_read_error()). All raise barrier is
|
||||||
conf->nr_queued > 0)
|
* blocked until this io is done.
|
||||||
|
*/
|
||||||
|
if (conf->mddev->thread->tsk == current) {
|
||||||
|
WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
|
||||||
return true;
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -2978,9 +2982,13 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
|
||||||
md_error(mddev, rdev);
|
md_error(mddev, rdev);
|
||||||
|
|
||||||
rdev_dec_pending(rdev, mddev);
|
rdev_dec_pending(rdev, mddev);
|
||||||
allow_barrier(conf);
|
|
||||||
r10_bio->state = 0;
|
r10_bio->state = 0;
|
||||||
raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
|
raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
|
||||||
|
/*
|
||||||
|
* allow_barrier after re-submit to ensure no sync io
|
||||||
|
* can be issued while regular io pending.
|
||||||
|
*/
|
||||||
|
allow_barrier(conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
|
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
|
||||||
|
|
Загрузка…
Ссылка в новой задаче