From f0cc9a057151892b885be21a1d19b0185568281d Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 22 Sep 2014 10:06:23 +1000
Subject: [PATCH 1/8] md/raid1: initialise start_next_window for READ case to avoid hang

r1_bio->start_next_window is not initialised in the READ case, so
allow_barrier may incorrectly decrement conf->current_window_requests,
which can cause raise_barrier() to block forever.

Fixes: 79ef3a8aa1cb1523cc231c9a90a278333c21f761
cc: stable@vger.kernel.org (v3.13+)
Reported-by: Brassow Jonathan
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d7690f86fdb9..9ebccf624571 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1186,6 +1186,7 @@ read_again:
 				   atomic_read(&bitmap->behind_writes) == 0);
 		}
 		r1_bio->read_disk = rdisk;
+		r1_bio->start_next_window = 0;
 
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,

From c6d119cf1b5a778e9ed60a006e2a434fcc4471a2 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 9 Sep 2014 13:49:46 +1000
Subject: [PATCH 2/8] md/raid1: be more cautious where we read-balance during resync.

commit 79ef3a8aa1cb1523cc231c9a90a278333c21f761 made it possible for
reads to happen concurrently with resync. This means that we need to
be more careful about where read_balancing is allowed during resync -
we can no longer be sure that any resync that has already started
will definitely finish.

So keep read_balancing to before recovery_cp, which is conservative
but safe.

This bug makes it possible to read from a device that doesn't have
up-to-date data, so it can cause data corruption. So it is suitable
for any kernel since 3.11.

Fixes: 79ef3a8aa1cb1523cc231c9a90a278333c21f761
cc: stable@vger.kernel.org (v3.13+)
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9ebccf624571..ad0468c42d23 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -540,11 +540,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	has_nonrot_disk = 0;
 	choose_next_idle = 0;
 
-	if (conf->mddev->recovery_cp < MaxSector &&
-	    (this_sector + sectors >= conf->next_resync))
-		choose_first = 1;
-	else
-		choose_first = 0;
+	choose_first = (conf->mddev->recovery_cp < this_sector + sectors);
 
 	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
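To make the effect of the new choose_first test in patch 2 concrete, here is a
minimal stand-alone C sketch contrasting the two predicates. It models only the
decision; the field names echo the kernel's, but the functions and values are
simplified stand-ins, not the kernel code.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;
#define MaxSector (~(sector_t)0)

/* Old test: read-balance freely unless the read reaches past next_resync. */
static int choose_first_old(sector_t recovery_cp, sector_t next_resync,
                            sector_t this_sector, sector_t sectors)
{
	return recovery_cp < MaxSector && this_sector + sectors >= next_resync;
}

/* New test: be conservative for anything at or beyond recovery_cp,
 * the only region guaranteed to be fully resynced. */
static int choose_first_new(sector_t recovery_cp,
                            sector_t this_sector, sector_t sectors)
{
	return recovery_cp < this_sector + sectors;
}

int main(void)
{
	/* Resync is complete up to sector 1000 but has issued requests up
	 * to 5000; a read at 2000 may overlap a resync that never ran. */
	sector_t recovery_cp = 1000, next_resync = 5000;

	printf("old: choose_first=%d (0 means read-balance: unsafe here)\n",
	       choose_first_old(recovery_cp, next_resync, 2000, 8));
	printf("new: choose_first=%d (1 means use the first in-sync disk)\n",
	       choose_first_new(recovery_cp, 2000, 8));
	return 0;
}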
From 669cc7ba77864e7b1ac39c9f2b2afb8730f341f4 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 4 Sep 2014 16:30:38 +1000
Subject: [PATCH 3/8] md/raid1: clean up request counts properly in close_sync()

If there are outstanding writes when close_sync is called, the change
to ->start_next_window might cause them to decrement the wrong counter
when they complete. Fix this by merging the two counters into the one
that will be decremented.

Having an incorrect value in a counter can cause raise_barrier() to
hang, so this is suitable for -stable.

Fixes: 79ef3a8aa1cb1523cc231c9a90a278333c21f761
cc: stable@vger.kernel.org (v3.13+)
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ad0468c42d23..a31c92bbcfc9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1545,8 +1545,13 @@ static void close_sync(struct r1conf *conf)
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
 
+	spin_lock_irq(&conf->resync_lock);
 	conf->next_resync = 0;
 	conf->start_next_window = MaxSector;
+	conf->current_window_requests +=
+		conf->next_window_requests;
+	conf->next_window_requests = 0;
+	spin_unlock_irq(&conf->resync_lock);
 }
 
 static int raid1_spare_active(struct mddev *mddev)

From 2f73d3c55d09ce60647b96ad2a9b539c95a530ee Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Wed, 10 Sep 2014 15:01:49 +1000
Subject: [PATCH 4/8] md/raid1: make sure resync waits for conflicting writes to complete.

The resync/recovery process for raid1 was recently changed so that
writes could happen in parallel with resync provided they were in
different regions of the device.

There is a problem though: while a write request will always wait for
conflicting resync to complete, a resync request will *not* always
wait for conflicting writes to complete.

Two changes are needed to fix this:

1/ raise_barrier (which waits until it is safe to do resync) must
   wait until current_window_requests is zero.
2/ wait_barrier (which waits at the start of a new write request)
   must update current_window_requests if the request could possibly
   conflict with a concurrent resync.

As concurrent writes and resync can lead to data loss, this patch is
suitable for -stable.

Fixes: 79ef3a8aa1cb1523cc231c9a90a278333c21f761
Cc: stable@vger.kernel.org (v3.13+)
Cc: majianpeng
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a31c92bbcfc9..b13c218997da 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -845,10 +845,12 @@
 	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
 	 *    next resync will reach to the window which normal bios are
 	 *    handling.
+	 * D: while there are any active requests in the current window.
 	 */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->array_frozen &&
 			    conf->barrier < RESYNC_DEPTH &&
+			    conf->current_window_requests == 0 &&
 			    (conf->start_next_window >=
 			     conf->next_resync + RESYNC_SECTORS),
 			    conf->resync_lock);
@@ -915,8 +917,8 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
 	}
 
 	if (bio && bio_data_dir(bio) == WRITE) {
-		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
-		    <= bio->bi_iter.bi_sector) {
+		if (bio->bi_iter.bi_sector >=
+		    conf->next_resync) {
 			if (conf->start_next_window == MaxSector)
 				conf->start_next_window =
 					conf->next_resync +
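For readers following the window accounting, the stand-alone C sketch below
models just the two counters that patches 3 and 4 manipulate. The field and
function names echo struct r1conf and the kernel functions, but the bodies are
drastically simplified illustrations, not the kernel code.

#include <stdio.h>

struct conf {
	int current_window_requests;	/* writes issued in the current window */
	int next_window_requests;	/* writes issued in the next window */
};

/* Patch 3: when resync ends, writes counted in the "next" window will
 * later decrement the "current" counter, so fold them in now; otherwise
 * a counter can go negative and raise_barrier() can hang. */
static void close_sync(struct conf *c)
{
	c->current_window_requests += c->next_window_requests;
	c->next_window_requests = 0;
}

/* Patch 4: resync must not start while any write in the current
 * window is still in flight. */
static int safe_to_start_resync(const struct conf *c)
{
	return c->current_window_requests == 0;
}

int main(void)
{
	struct conf c = { 0, 2 };	/* two writes pending in the next window */

	close_sync(&c);
	printf("current=%d next=%d safe_to_start_resync=%d\n",
	       c.current_window_requests, c.next_window_requests,
	       safe_to_start_resync(&c));
	return 0;
}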
From 235549605eb7f1c5a37cef8b09d12e6d412c5cd6 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Wed, 10 Sep 2014 15:56:57 +1000
Subject: [PATCH 5/8] md/raid1: Don't use next_resync to determine how far resync has progressed

next_resync is (approximately) the location for the next resync
request. However, it does *not* reliably determine the earliest
location at which resync might be happening. This is because resync
requests can complete out of order, and we only limit the number of
current requests, not the distance from the earliest pending request
to the latest.

mddev->curr_resync_completed is a reliable indicator of the earliest
position at which resync could be happening. It is updated less
frequently, but is actually reliable, which is more important.

So use it to determine if a write request is before the region being
resynced and so safe from conflict.

This error can allow resync IO to interfere with normal IO, which
could lead to data corruption. Hence: stable.

Fixes: 79ef3a8aa1cb1523cc231c9a90a278333c21f761
cc: stable@vger.kernel.org (v3.13+)
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b13c218997da..3b14a495c424 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -875,12 +875,10 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
 	if (conf->array_frozen || !bio)
 		wait = true;
 	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
-		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
-			wait = true;
-		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
-			  >= bio_end_sector(bio)) ||
-			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
-			  <= bio->bi_iter.bi_sector))
+		if ((conf->mddev->curr_resync_completed
+		     >= bio_end_sector(bio)) ||
+		    (conf->next_resync + NEXT_NORMALIO_DISTANCE
+		     <= bio->bi_iter.bi_sector))
 			wait = false;
 		else
 			wait = true;
@@ -918,7 +916,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
 
 	if (bio && bio_data_dir(bio) == WRITE) {
 		if (bio->bi_iter.bi_sector >=
-		    conf->next_resync) {
+		    conf->mddev->curr_resync_completed) {
 			if (conf->start_next_window == MaxSector)
 				conf->start_next_window =
 					conf->next_resync +

From c2fd4c94deedb89ac1746c4a53219be499372c06 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Wed, 10 Sep 2014 16:01:24 +1000
Subject: [PATCH 6/8] md/raid1: update next_resync under resync_lock.

raise_barrier() uses next_resync as part of its calculations, so it
really should be updated first, instead of afterwards.

next_resync is always used under resync_lock, so update it under
resync_lock too, just before it is used. That is safest.

This bug could cause normal IO and resync IO to interact badly, so
it is suitable for -stable.

Fixes: 79ef3a8aa1cb1523cc231c9a90a278333c21f761
cc: stable@vger.kernel.org (v3.13+)
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3b14a495c424..acfccec9ea3a 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -827,7 +827,7 @@ static void flush_pending_writes(struct r1conf *conf)
  * there is no normal IO happeing. It must arrange to call
  * lower_barrier when the particular background IO completes.
  */
-static void raise_barrier(struct r1conf *conf)
+static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 {
 	spin_lock_irq(&conf->resync_lock);
 
@@ -837,6 +837,7 @@ static void raise_barrier(struct r1conf *conf)
 
 	/* block any new IO from starting */
 	conf->barrier++;
+	conf->next_resync = sector_nr;
 
 	/* For these conditions we must wait:
 	 * A: while the array is in frozen state
@@ -2543,9 +2544,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 
 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
 	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
-	raise_barrier(conf);
 
-	conf->next_resync = sector_nr;
+	raise_barrier(conf, sector_nr);
 
 	rcu_read_lock();
 	/*
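The shape of the patch 6 change can be shown with an ordinary pthread mutex
standing in for the kernel spinlock. This is only a sketch of the ordering
(write next_resync under the lock, before anything reads it); the wait
conditions and wake-ups of the real raise_barrier() are omitted.

#include <pthread.h>
#include <stdint.h>

typedef uint64_t sector_t;

struct conf {
	pthread_mutex_t resync_lock;
	sector_t next_resync;
	int barrier;
};

/* next_resync is now written under resync_lock, and before any of the
 * barrier conditions that consume it would be evaluated. */
static void raise_barrier(struct conf *conf, sector_t sector_nr)
{
	pthread_mutex_lock(&conf->resync_lock);
	conf->barrier++;
	conf->next_resync = sector_nr;
	/* ... the real code waits here on conditions A-D ... */
	pthread_mutex_unlock(&conf->resync_lock);
}

int main(void)
{
	struct conf c = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	/* The caller now passes the sector in, rather than storing
	 * next_resync after raise_barrier() returns. */
	raise_barrier(&c, 4096);
	return c.next_resync == 4096 ? 0 : 1;
}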
From 34e97f170149bfa14979581c4c748bc9b4b79d5b Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 16 Sep 2014 12:14:14 +1000
Subject: [PATCH 7/8] md/raid1: count resync requests in nr_pending.

Both normal IO and resync IO can be retried with reschedule_retry()
and so be counted into ->nr_queued, but only normal IO gets counted
in ->nr_pending.

Before the recent improvement to RAID1 resync there could only
possibly have been one or the other on the queue. When handling a
read failure it could only be normal IO. So when handle_read_error()
called freeze_array(), the fact that freeze_array only compares
->nr_queued against ->nr_pending was safe.

But now that these two types can interleave, we can have both normal
and resync IO requests queued, so we need to count them both in
nr_pending.

This error can lead to freeze_array() hanging if there is a read
error, so it is suitable for -stable.

Fixes: 79ef3a8aa1cb1523cc231c9a90a278333c21f761
cc: stable@vger.kernel.org (v3.13+)
Reported-by: Brassow Jonathan
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index acfccec9ea3a..35649ddac9bc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -856,6 +856,7 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
 			     conf->next_resync + RESYNC_SECTORS),
 			    conf->resync_lock);
 
+	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -865,6 +866,7 @@ static void lower_barrier(struct r1conf *conf)
 	BUG_ON(conf->barrier <= 0);
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->barrier--;
+	conf->nr_pending--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
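Here is a toy model of the accounting that patch 7 repairs. The condition
imitates the kernel's "nr_pending == nr_queued + extra" freeze test in spirit
only; the real freeze_array() sleeps on a wait queue, and "extra" accounts for
the caller's own request.

#include <stdio.h>

struct conf {
	int nr_pending;	/* requests counted as "in flight" */
	int nr_queued;	/* requests parked on the retry queue */
};

/* The array can freeze once every in-flight request is accounted for
 * on the retry queue (plus the caller's own request, "extra"). */
static int can_freeze(const struct conf *c, int extra)
{
	return c->nr_pending == c->nr_queued + extra;
}

int main(void)
{
	/* One failed normal write and one failed resync request are both
	 * queued for retry. Before patch 7 the resync request was never
	 * added to nr_pending, so the freeze condition never became true. */
	struct conf before = { 1, 2 };	/* resync missing from nr_pending */
	struct conf after  = { 2, 2 };	/* patch 7 counts it */

	printf("before patch 7: can_freeze=%d (freeze_array() would hang)\n",
	       can_freeze(&before, 0));
	printf("after patch 7:  can_freeze=%d\n", can_freeze(&after, 0));
	return 0;
}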
From b8cb6b4c121e1bf1963c16ed69e7adcb1bc301cd Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 18 Sep 2014 11:09:04 +1000
Subject: [PATCH 8/8] md/raid1: fix_read_error should act on all non-faulty devices.

If a device is being recovered it is not InSync and is not Faulty.

If a read error is experienced on that device, fix_read_error() will
be called, but it ignores non-InSync devices. So it will neither fix
the error nor fail the device.

It is incorrect that fix_read_error() ignores non-InSync devices. It
should only ignore Faulty devices. So fix it.

This became a bug when we allowed reading from a device that was
being recovered. It is suitable for any subsequent -stable kernel.

Fixes: da8840a747c0dbf49506ec906757a6b87b9741e9
Cc: stable@vger.kernel.org (v3.5+)
Reported-by: Alexander Lyakas
Tested-by: Alexander Lyakas
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 35649ddac9bc..55de4f6f7eaf 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2155,7 +2155,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 			d--;
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
-			    test_bit(In_sync, &rdev->flags))
+			    !test_bit(Faulty, &rdev->flags))
 				r1_sync_page_io(rdev, sect, s,
 						conf->tmppage, WRITE);
 		}
@@ -2167,7 +2167,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 			d--;
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
-			    test_bit(In_sync, &rdev->flags)) {
+			    !test_bit(Faulty, &rdev->flags)) {
 				if (r1_sync_page_io(rdev, sect, s,
 						    conf->tmppage, READ)) {
 					atomic_add(s, &rdev->corrected_errors);
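Finally, the predicate change in patch 8, modelled with plain flag bits: a
device under recovery is neither In_sync nor Faulty, so the old test skipped
it while the new test repairs it. The flag values here are arbitrary stand-ins
for the kernel's rdev flag bits.

#include <stdio.h>

#define IN_SYNC	(1u << 0)	/* stand-in for the kernel's In_sync bit */
#define FAULTY	(1u << 1)	/* stand-in for the kernel's Faulty bit */

/* Old: only repair devices that are fully in sync. */
static int should_repair_old(unsigned int flags)
{
	return (flags & IN_SYNC) != 0;
}

/* New: repair every device that has not failed. */
static int should_repair_new(unsigned int flags)
{
	return (flags & FAULTY) == 0;
}

int main(void)
{
	unsigned int recovering = 0;	/* neither In_sync nor Faulty */

	printf("recovering device, old test: %d (skipped - the bug)\n",
	       should_repair_old(recovering));
	printf("recovering device, new test: %d (repaired)\n",
	       should_repair_new(recovering));
	return 0;
}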