From 9092c02d943515b3c9ffd5d0003527f8cc1dd77b Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 2 May 2013 14:19:24 -0500 Subject: [PATCH 01/14] DM RAID: Add ability to restore transiently failed devices on resume DM RAID: Add ability to restore transiently failed devices on resume This patch adds code to the resume function to check over the devices in the RAID array. If any are found to be marked as failed and their superblocks can be read, an attempt is made to reintegrate them into the array. This allows the user to refresh the array with a simple suspend and resume of the array - rather than having to load a completely new table, allocate and initialize all the structures and throw away the old instantiation. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- Documentation/device-mapper/dm-raid.txt | 1 + drivers/md/dm-raid.c | 44 ++++++++++++++++++++++++- drivers/md/raid1.c | 7 ++-- drivers/md/raid10.c | 10 +++--- 4 files changed, 54 insertions(+), 8 deletions(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index e9192283e5a5..2bb3a6823dc7 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -222,3 +222,4 @@ Version History 1.4.2 Add RAID10 "far" and "offset" algorithm support. 1.5.0 Add message interface to allow manipulation of the sync_action. New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. +1.5.1 Add ability to restore transiently failed devices on resume. diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1d3fe1a40a9b..facaf9142d5a 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1574,12 +1574,54 @@ static void raid_postsuspend(struct dm_target *ti) static void raid_resume(struct dm_target *ti) { + int i; + uint64_t failed_devices, cleared_failed_devices = 0; + unsigned long flags; + struct dm_raid_superblock *sb; struct raid_set *rs = ti->private; + struct md_rdev *r; set_bit(MD_CHANGE_DEVS, &rs->md.flags); if (!rs->bitmap_loaded) { bitmap_load(&rs->md); rs->bitmap_loaded = 1; + } else { + /* + * A secondary resume while the device is active. + * Take this opportunity to check whether any failed + * devices are reachable again. + */ + for (i = 0; i < rs->md.raid_disks; i++) { + r = &rs->dev[i].rdev; + if (test_bit(Faulty, &r->flags) && r->sb_page && + sync_page_io(r, 0, r->sb_size, + r->sb_page, READ, 1)) { + DMINFO("Faulty device #%d has readable super" + "block. 
Attempting to revive it.", i); + r->raid_disk = i; + r->saved_raid_disk = i; + flags = r->flags; + clear_bit(Faulty, &r->flags); + clear_bit(WriteErrorSeen, &r->flags); + clear_bit(In_sync, &r->flags); + if (r->mddev->pers->hot_add_disk(r->mddev, r)) { + r->raid_disk = -1; + r->saved_raid_disk = -1; + r->flags = flags; + } else { + r->recovery_offset = 0; + cleared_failed_devices |= 1 << i; + } + } + } + if (cleared_failed_devices) { + rdev_for_each(r, &rs->md) { + sb = page_address(r->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + failed_devices &= ~cleared_failed_devices; + sb->failed_devices = cpu_to_le64(failed_devices); + } + } } clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); @@ -1588,7 +1630,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 5, 0}, + .version = {1, 5, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 6e17f8181c4b..ec734588a1c6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1519,8 +1519,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) p = conf->mirrors+mirror; if (!p->rdev) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); p->head_position = 0; rdev->raid_disk = mirror; @@ -1559,7 +1560,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6ddae2501b9a..3c6b193cefd5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1819,15 +1819,17 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) set_bit(Replacement, &rdev->flags); rdev->raid_disk = mirror; err = 0; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); conf->fullsync = 1; rcu_assign_pointer(p->replacement, rdev); break; } - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); p->head_position = 0; p->recovery_disabled = mddev->recovery_disabled - 1; From f381e71b042af910fbe5f8222792cc5092750993 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Wed, 8 May 2013 17:57:13 -0500 Subject: [PATCH 02/14] DM RAID: Break-up untidy function DM RAID: Break-up untidy function Clean-up excessive indentation by moving some code in raid_resume() into its own function. 
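The block being moved is the device-revival path introduced in the previous patch. Its on-disk bookkeeping follows a simple accumulate-then-mask pattern over a 64-bit bitmap: each revived slot sets its bit in cleared_failed_devices, and that mask is then removed from every member's failed_devices field. A minimal stand-alone sketch of the idea (demo_sb and the numbers are hypothetical, not the kernel structures):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the dm-raid superblock's failed_devices field. */
    struct demo_sb {
        uint64_t failed_devices;
    };

    int main(void)
    {
        /* Four members, each recording devices #1 and #2 as failed (bits 1 and 2). */
        struct demo_sb sb[4] = { { 0x6 }, { 0x6 }, { 0x6 }, { 0x6 } };
        uint64_t cleared_failed_devices = 0;
        int i;

        /* Suppose devices #1 and #2 came back and were revived on resume. */
        cleared_failed_devices |= 1ULL << 1;
        cleared_failed_devices |= 1ULL << 2;

        /* Drop the revived devices from every member's failure bitmap. */
        for (i = 0; i < 4; i++)
            sb[i].failed_devices &= ~cleared_failed_devices;

        printf("failed_devices is now 0x%llx\n",
               (unsigned long long)sb[0].failed_devices);   /* prints 0x0 */
        return 0;
    }
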
Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 72 ++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index facaf9142d5a..59d15ec0ba81 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1572,15 +1572,51 @@ static void raid_postsuspend(struct dm_target *ti) mddev_suspend(&rs->md); } -static void raid_resume(struct dm_target *ti) +static void attempt_restore_of_faulty_devices(struct raid_set *rs) { int i; uint64_t failed_devices, cleared_failed_devices = 0; unsigned long flags; struct dm_raid_superblock *sb; - struct raid_set *rs = ti->private; struct md_rdev *r; + for (i = 0; i < rs->md.raid_disks; i++) { + r = &rs->dev[i].rdev; + if (test_bit(Faulty, &r->flags) && r->sb_page && + sync_page_io(r, 0, r->sb_size, r->sb_page, READ, 1)) { + DMINFO("Faulty %s device #%d has readable super block." + " Attempting to revive it.", + rs->raid_type->name, i); + r->raid_disk = i; + r->saved_raid_disk = i; + flags = r->flags; + clear_bit(Faulty, &r->flags); + clear_bit(WriteErrorSeen, &r->flags); + clear_bit(In_sync, &r->flags); + if (r->mddev->pers->hot_add_disk(r->mddev, r)) { + r->raid_disk = -1; + r->saved_raid_disk = -1; + r->flags = flags; + } else { + r->recovery_offset = 0; + cleared_failed_devices |= 1 << i; + } + } + } + if (cleared_failed_devices) { + rdev_for_each(r, &rs->md) { + sb = page_address(r->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + failed_devices &= ~cleared_failed_devices; + sb->failed_devices = cpu_to_le64(failed_devices); + } + } +} + +static void raid_resume(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + set_bit(MD_CHANGE_DEVS, &rs->md.flags); if (!rs->bitmap_loaded) { bitmap_load(&rs->md); @@ -1591,37 +1627,7 @@ static void raid_resume(struct dm_target *ti) * Take this opportunity to check whether any failed * devices are reachable again. */ - for (i = 0; i < rs->md.raid_disks; i++) { - r = &rs->dev[i].rdev; - if (test_bit(Faulty, &r->flags) && r->sb_page && - sync_page_io(r, 0, r->sb_size, - r->sb_page, READ, 1)) { - DMINFO("Faulty device #%d has readable super" - "block. Attempting to revive it.", i); - r->raid_disk = i; - r->saved_raid_disk = i; - flags = r->flags; - clear_bit(Faulty, &r->flags); - clear_bit(WriteErrorSeen, &r->flags); - clear_bit(In_sync, &r->flags); - if (r->mddev->pers->hot_add_disk(r->mddev, r)) { - r->raid_disk = -1; - r->saved_raid_disk = -1; - r->flags = flags; - } else { - r->recovery_offset = 0; - cleared_failed_devices |= 1 << i; - } - } - } - if (cleared_failed_devices) { - rdev_for_each(r, &rs->md) { - sb = page_address(r->sb_page); - failed_devices = le64_to_cpu(sb->failed_devices); - failed_devices &= ~cleared_failed_devices; - sb->failed_devices = cpu_to_le64(failed_devices); - } - } + attempt_restore_of_faulty_devices(rs); } clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); From a4dc163a55964d683f92742705c90c78c0f56c0c Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Wed, 8 May 2013 18:00:54 -0500 Subject: [PATCH 03/14] DM RAID: Fix raid_resume not reviving failed devices in all cases DM RAID: Fix raid_resume not reviving failed devices in all cases When a device fails in a RAID array, it is marked as Faulty. Later, md_check_recovery is called which (through the call chain) calls 'hot_remove_disk' in order to have the personalities remove the device from use in the array. 
Sometimes, it is possible for the array to be suspended before the personalities get their chance to perform 'hot_remove_disk'. This is normally not an issue. If the array is deactivated, then the failed device will be noticed when the array is reinstantiated. If the array is resumed and the disk is still missing, md_check_recovery will be called upon resume and 'hot_remove_disk' will be called at that time. However, (for dm-raid) if the device has been restored, a resume on the array would cause it to attempt to revive the device by calling 'hot_add_disk'. If 'hot_remove_disk' had not been called, a situation is then created where the device is thought to concurrently be the replacement and the device to be replaced. Thus, the device is first sync'ed with the rest of the array (because it is the replacement device) and then marked Faulty and removed from the array (because it is also the device being replaced). The solution is to check and see if the device had properly been removed before the array was suspended. This is done by seeing whether the device's 'raid_disk' field is -1 - a condition that implies that 'md_check_recovery -> remove_and_add_spares (where raid_disk is set to -1) -> hot_remove_disk' has been called. If 'raid_disk' is not -1, then 'hot_remove_disk' must be called to complete the removal of the previously faulty device before it can be revived via 'hot_add_disk'. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 59d15ec0ba81..49f0bd510fb9 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1587,6 +1587,21 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) DMINFO("Faulty %s device #%d has readable super block." " Attempting to revive it.", rs->raid_type->name, i); + + /* + * Faulty bit may be set, but sometimes the array can + * be suspended before the personalities can respond + * by removing the device from the array (i.e. calling + * 'hot_remove_disk'). If they haven't yet removed + * the failed device, its 'raid_disk' number will be + * '>= 0' - meaning we must call this function + * ourselves. + */ + if ((r->raid_disk >= 0) && + (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0)) + /* Failed to revive this device, try next */ + continue; + r->raid_disk = i; r->saved_raid_disk = i; flags = r->flags; From 3f6bbd3ffd7b733dd705e494663e5761aa2cb9c1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 9 May 2013 10:27:49 +1000 Subject: [PATCH 04/14] dm-raid: silence compiler warning on rebuilds_per_group. This doesn't really need to be initialised, but it doesn't hurt, silences the compiler, and as it is a counter it makes sense for it to start at zero. 
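For context, rebuilds_per_group is a per-mirror-group counter: too many rebuild targets inside one group of mirrored copies would leave data with no intact copy. The warning arises because the counter is only (re)set inside the loop, which gcc cannot always prove happens before the first use, so initialising it at declaration is free and keeps the build quiet. A stand-alone sketch of the pattern (illustrative only, not the dm-raid code):

    #include <stdio.h>

    /*
     * 'per_group' is reset whenever 'i' crosses a group boundary, which
     * happens to include i == 0, so it is never really used uninitialized,
     * but the compiler cannot always prove that and may warn.
     */
    static int too_many_rebuilds(int groups, int copies, const int *rebuild)
    {
        int per_group = 0;  /* initialised up front, as the patch does */
        int i;

        for (i = 0; i < groups * copies; i++) {
            if (!(i % copies))
                per_group = 0;
            if (rebuild[i] && ++per_group >= copies)
                return 1;   /* a whole group would be rebuilt: not redundant */
        }
        return 0;
    }

    int main(void)
    {
        int rebuild[6] = { 1, 0, 0, 0, 1, 1 };

        printf("%d\n", too_many_rebuilds(3, 2, rebuild));  /* prints 1 */
        return 0;
    }
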
Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 49f0bd510fb9..0f5a6fb5978f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -380,7 +380,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) static int validate_raid_redundancy(struct raid_set *rs) { unsigned i, rebuild_cnt = 0; - unsigned rebuilds_per_group, copies, d; + unsigned rebuilds_per_group = 0, copies, d; unsigned group_size, last_group_start; for (i = 0; i < rs->md.raid_disks; i++) From 90f5f7ad4f38d67626b0c220a0ac390603e0bded Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 2 Apr 2013 08:38:55 +0200 Subject: [PATCH 05/14] md: Wait for md_check_recovery before attempting device removal. When a device has failed, it needs to be removed from the personality module before it can be removed from the array as a whole. The first step is performed by md_check_recovery() which is called from the raid management thread. So when a HOT_REMOVE ioctl arrives, wait briefly for md_check_recovery to have run. This increases the chance that the ioctl will succeed. Signed-off-by: Hannes Reinecke Signed-off-by: Neil Brown --- drivers/md/md.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9b82377a833b..3b25b72ef0b6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6405,6 +6405,12 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, /* need to ensure md_delayed_delete() has completed */ flush_workqueue(md_misc_wq); + if (cmd == HOT_REMOVE_DISK) + /* need to ensure recovery thread has run */ + wait_event_interruptible_timeout(mddev->sb_wait, + !test_bit(MD_RECOVERY_NEEDED, + &mddev->flags), + msecs_to_jiffies(5000)); err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -7892,6 +7898,8 @@ void md_check_recovery(struct mddev *mddev) md_new_event(mddev); } unlock: + wake_up(&mddev->sb_wait); + if (!mddev->sync_thread) { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (test_and_clear_bit(MD_RECOVERY_RECOVER, From b29bebd66dbd492105668ec3515a5ffb0b25e4c1 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Sat, 1 Jun 2013 16:15:16 +0900 Subject: [PATCH 06/14] md: replace strict_strto*() with kstrto*() The usage of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. 
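The conversion is mechanical: kstrtoul()/kstrtoull()/kstrtol() keep the same argument order (string, base, result pointer) as the strict_strto*() helpers they replace, and return 0 on success or a negative errno. A representative sysfs store handler after the change, sketched for illustration (the handler name and bound are made up, not code from this series):

    static ssize_t example_store(struct mddev *mddev, const char *buf, size_t len)
    {
        unsigned long val;
        int rv = kstrtoul(buf, 10, &val);   /* was: strict_strtoul(buf, 10, &val) */

        if (rv)
            return rv;                      /* kstrtoul() returns -EINVAL or -ERANGE */
        if (val > 1024)                     /* hypothetical bound for the example */
            return -EINVAL;
        /* ... apply 'val' to the relevant mddev field ... */
        return len;
    }
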
Signed-off-by: Jingoo Han Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 8 ++++---- drivers/md/dm-raid.c | 8 ++++---- drivers/md/md.c | 16 ++++++++-------- drivers/md/raid5.c | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5a2c75499824..a7fd82133b12 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -2002,9 +2002,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } else { int rv; if (buf[0] == '+') - rv = strict_strtoll(buf+1, 10, &offset); + rv = kstrtoll(buf+1, 10, &offset); else - rv = strict_strtoll(buf, 10, &offset); + rv = kstrtoll(buf, 10, &offset); if (rv) return rv; if (offset == 0) @@ -2139,7 +2139,7 @@ static ssize_t backlog_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long backlog; - int rv = strict_strtoul(buf, 10, &backlog); + int rv = kstrtoul(buf, 10, &backlog); if (rv) return rv; if (backlog > COUNTER_MAX) @@ -2165,7 +2165,7 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len) unsigned long csize; if (mddev->bitmap) return -EBUSY; - rv = strict_strtoul(buf, 10, &csize); + rv = kstrtoul(buf, 10, &csize); if (rv) return rv; if (csize < 512 || diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 0f5a6fb5978f..21e8e4660c59 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -504,7 +504,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, * First, parse the in-order required arguments * "chunk_size" is the only argument of this type. */ - if ((strict_strtoul(argv[0], 10, &value) < 0)) { + if ((kstrtoul(argv[0], 10, &value) < 0)) { rs->ti->error = "Bad chunk size"; return -EINVAL; } else if (rs->raid_type->level == 1) { @@ -585,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, continue; } - if (strict_strtoul(argv[i], 10, &value) < 0) { + if (kstrtoul(argv[i], 10, &value) < 0) { rs->ti->error = "Bad numerical argument given in raid params"; return -EINVAL; } @@ -1181,7 +1181,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) argv++; /* number of RAID parameters */ - if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { + if (kstrtoul(argv[0], 10, &num_raid_params) < 0) { ti->error = "Cannot understand number of RAID parameters"; return -EINVAL; } @@ -1194,7 +1194,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } - if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || + if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || (num_raid_devs >= INT_MAX)) { ti->error = "Cannot understand number of raid devices"; return -EINVAL; diff --git a/drivers/md/md.c b/drivers/md/md.c index 3b25b72ef0b6..26f9452ea61c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2867,7 +2867,7 @@ static ssize_t offset_store(struct md_rdev *rdev, const char *buf, size_t len) { unsigned long long offset; - if (strict_strtoull(buf, 10, &offset) < 0) + if (kstrtoull(buf, 10, &offset) < 0) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; @@ -2895,7 +2895,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev, unsigned long long new_offset; struct mddev *mddev = rdev->mddev; - if (strict_strtoull(buf, 10, &new_offset) < 0) + if (kstrtoull(buf, 10, &new_offset) < 0) return -EINVAL; if (mddev->sync_thread) @@ -2961,7 +2961,7 @@ static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) unsigned long long blocks; sector_t new; - if (strict_strtoull(buf, 10, &blocks) < 
0) + if (kstrtoull(buf, 10, &blocks) < 0) return -EINVAL; if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) @@ -3069,7 +3069,7 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_ if (cmd_match(buf, "none")) recovery_start = MaxSector; - else if (strict_strtoull(buf, 10, &recovery_start)) + else if (kstrtoull(buf, 10, &recovery_start)) return -EINVAL; if (rdev->mddev->pers && @@ -3497,7 +3497,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (clevel[len-1] == '\n') len--; clevel[len] = 0; - if (strict_strtol(clevel, 10, &level)) + if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; if (request_module("md-%s", clevel) != 0) @@ -4356,7 +4356,7 @@ sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) { long n; - if (strict_strtol(buf, 10, &n)) + if (kstrtol(buf, 10, &n)) return -EINVAL; if (n != 0 && n != 1) @@ -4424,7 +4424,7 @@ static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; - if (strict_strtoull(buf, 10, &min)) + if (kstrtoull(buf, 10, &min)) return -EINVAL; if (min > mddev->resync_max) return -EINVAL; @@ -4461,7 +4461,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len) mddev->resync_max = MaxSector; else { unsigned long long max; - if (strict_strtoull(buf, 10, &max)) + if (kstrtoull(buf, 10, &max)) return -EINVAL; if (max < mddev->resync_min) return -EINVAL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 05e4a105b9c7..cd9aab9329fd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4924,7 +4924,7 @@ raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) if (!conf) return -ENODEV; - if (strict_strtoul(page, 10, &new)) + if (kstrtoul(page, 10, &new)) return -EINVAL; err = raid5_set_cache_size(mddev, new); if (err) @@ -4957,7 +4957,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) if (!conf) return -ENODEV; - if (strict_strtoul(page, 10, &new)) + if (kstrtoul(page, 10, &new)) return -EINVAL; if (new > conf->max_nr_stripes) return -EINVAL; From 635f6416a2e983adac8ccf90bffbeed0c1a76454 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 11 Jun 2013 14:57:09 +1000 Subject: [PATCH 07/14] md/raid10: locking changes for 'enough()'. As 'enough' accesses conf->prev and conf->geo, which can change spontanously, it should guard against changes. This can be done with device_lock as start_reshape holds device_lock while updating 'geo' and end_reshape holds it while updating 'prev'. So 'error' needs to hold 'device_lock'. On the other hand, raid10_end_read_request knows which of the two it really wants to access, and as it is an active request on that one, the value cannot change underneath it. So change _enough to take flag rather than a pointer, pass the appropriate flag from raid10_end_read_request(), and remove the locking. All other calls to 'enough' are made with reconfig_mutex held, so neither 'prev' nor 'geo' can change. 
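For readers unfamiliar with the helper being reworked: for the plain 'near' layout, _enough() asks whether every group of near_copies consecutive slots still has at least one usable member. A stand-alone sketch of that test (simplified to the near-only case; the kernel version iterates conf->copies slots and, after the next patch, also checks In_sync under RCU):

    #include <stdio.h>

    /*
     * Simplified model: with a 'near' layout, copies of each block live on
     * near_copies consecutive devices, so the array survives only if every
     * such group keeps at least one usable member.  Illustrative only.
     */
    static int enough_demo(const int *usable, int disks, int ncopies, int ignore)
    {
        int first = 0;

        do {
            int n = ncopies, cnt = 0, this = first;

            while (n--) {
                if (this != ignore && usable[this])
                    cnt++;
                this = (this + 1) % disks;
            }
            if (cnt == 0)
                return 0;
            first = (first + ncopies) % disks;
        } while (first != 0);
        return 1;
    }

    int main(void)
    {
        int all_ok[4]    = { 1, 1, 1, 1 };
        int pair_lost[4] = { 0, 0, 1, 1 };  /* both copies of the same blocks gone */

        printf("%d %d\n", enough_demo(all_ok, 4, 2, -1),
               enough_demo(pair_lost, 4, 2, -1));   /* prints "1 0" */
        return 0;
    }
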
Signed-off-by: NeilBrown --- drivers/md/raid10.c | 49 ++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3c6b193cefd5..5169ed2a9156 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -97,7 +97,7 @@ static int max_queued_requests = 1024; static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); -static int enough(struct r10conf *conf, int ignore); +static int _enough(struct r10conf *conf, int previous, int ignore); static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped); static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); @@ -392,11 +392,9 @@ static void raid10_end_read_request(struct bio *bio, int error) * than fail the last device. Here we redefine * "uptodate" to mean "Don't want to retry" */ - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - if (!enough(conf, rdev->raid_disk)) + if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state), + rdev->raid_disk)) uptodate = 1; - spin_unlock_irqrestore(&conf->device_lock, flags); } if (uptodate) { raid_end_bio_io(r10_bio); @@ -1632,9 +1630,17 @@ static void status(struct seq_file *seq, struct mddev *mddev) * Don't consider the device numbered 'ignore' * as we might be about to remove it. */ -static int _enough(struct r10conf *conf, struct geom *geo, int ignore) +static int _enough(struct r10conf *conf, int previous, int ignore) { int first = 0; + int disks, ncopies; + if (previous) { + disks = conf->prev.raid_disks; + ncopies = conf->prev.near_copies; + } else { + disks = conf->geo.raid_disks; + ncopies = conf->geo.near_copies; + } do { int n = conf->copies; @@ -1644,25 +1650,31 @@ static int _enough(struct r10conf *conf, struct geom *geo, int ignore) if (conf->mirrors[this].rdev && this != ignore) cnt++; - this = (this+1) % geo->raid_disks; + this = (this+1) % disks; } if (cnt == 0) return 0; - first = (first + geo->near_copies) % geo->raid_disks; + first = (first + ncopies) % disks; } while (first != 0); return 1; } static int enough(struct r10conf *conf, int ignore) { - return _enough(conf, &conf->geo, ignore) && - _enough(conf, &conf->prev, ignore); + /* when calling 'enough', both 'prev' and 'geo' must + * be stable. + * This is ensured if ->reconfig_mutex or ->device_lock + * is held. + */ + return _enough(conf, 0, ignore) && + _enough(conf, 1, ignore); } static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r10conf *conf = mddev->private; + unsigned long flags; /* * If it is not operational, then we have already marked it as dead @@ -1670,18 +1682,18 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) * next level up know. * else mark the drive as failed */ + spin_lock_irqsave(&conf->device_lock, flags); if (test_bit(In_sync, &rdev->flags) - && !enough(conf, rdev->raid_disk)) + && !enough(conf, rdev->raid_disk)) { /* * Don't fail the drive, just return an IO error. */ - return; - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; spin_unlock_irqrestore(&conf->device_lock, flags); - /* + return; + } + if (test_and_clear_bit(In_sync, &rdev->flags)) { + mddev->degraded++; + /* * if recovery is running, make sure it aborts. 
*/ set_bit(MD_RECOVERY_INTR, &mddev->recovery); @@ -1689,6 +1701,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" "md/raid10:%s: Operation continuing on %d devices.\n", @@ -1791,7 +1804,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) * very different from resync */ return -EBUSY; - if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) + if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; if (rdev->raid_disk >= 0) From 725d6e579f06360744fc101b1af2f82d8aa282f1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 11 Jun 2013 15:08:03 +1000 Subject: [PATCH 08/14] md/raid10: check In_sync flag in 'enough()'. It isn't really enough to check that the rdev is present, we need to also be sure that the device is still In_sync. Doing this requires using rcu_dereference to access the rdev, and holding the rcu_read_lock() to ensure the rdev doesn't disappear while we look at it. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5169ed2a9156..aa8ba0760cac 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1633,6 +1633,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) static int _enough(struct r10conf *conf, int previous, int ignore) { int first = 0; + int has_enough = 0; int disks, ncopies; if (previous) { disks = conf->prev.raid_disks; @@ -1642,21 +1643,27 @@ static int _enough(struct r10conf *conf, int previous, int ignore) ncopies = conf->geo.near_copies; } + rcu_read_lock(); do { int n = conf->copies; int cnt = 0; int this = first; while (n--) { - if (conf->mirrors[this].rdev && - this != ignore) + struct md_rdev *rdev; + if (this != ignore && + (rdev = rcu_dereference(conf->mirrors[this].rdev)) && + test_bit(In_sync, &rdev->flags)) cnt++; this = (this+1) % disks; } if (cnt == 0) - return 0; + goto out; first = (first + ncopies) % disks; } while (first != 0); - return 1; + has_enough = 1; +out: + rcu_read_unlock(); + return has_enough; } static int enough(struct r10conf *conf, int ignore) From eea136d69f9facb2d3807386bbac1e6b1161795f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 26 Jun 2013 11:55:20 +1000 Subject: [PATCH 09/14] md: fix buglet in RAID5 -> RAID0 conversion. RAID5 uses a 'per-array' value for the 'size' of each device. RAID0 uses a 'per-device' value - it can be different for each device. When converting a RAID5 to a RAID0 we must ensure that the per-device size of each device matches the per-array size for the RAID5, else the array will change size. If the metadata cannot record a changed per-device size (as is the case with v0.90 metadata) the array could get bigger on restart. This does not cause data corruption, so it not a big issue and is mainly yet another a reason to not use 0.90. 
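To see the size change concretely: RAID0 sums each member's per-device size, while the RAID5 exposed only dev_sectors of every member. If a member's recorded per-device size reverts to the raw device size (as can happen across a restart with 0.90 metadata), the takeover result grows. A small worked example with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical sizes: members are 1,048,576 sectors raw, but the
         * RAID5 only used dev_sectors = 1,000,000 sectors of each. */
        unsigned long long raw_sectors = 1048576ULL;  /* per-device value */
        unsigned long long dev_sectors = 1000000ULL;  /* per-array value  */
        unsigned long long members = 3;               /* data members after takeover */

        printf("RAID0 capacity without the fix: %llu sectors\n",
               members * raw_sectors);                /* 3145728 */
        printf("RAID0 capacity with rdev->sectors = dev_sectors: %llu sectors\n",
               members * dev_sectors);                /* 3000000 */
        return 0;
    }
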
Signed-off-by: NeilBrown --- drivers/md/raid0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index fcf65e512cf5..c4d420b7d2f4 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -597,6 +597,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) mdname(mddev)); return ERR_PTR(-EINVAL); } + rdev->sectors = mddev->dev_sectors; } /* Set new parameters */ From c4a39551451666229b4ea5e8aae8ca0131d00665 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Tue, 25 Jun 2013 01:23:59 -0500 Subject: [PATCH 10/14] MD: Remember the last sync operation that was performed MD: Remember the last sync operation that was performed This patch adds a field to the mddev structure to track the last sync operation that was performed. This is especially useful when it comes to what is recorded in mismatch_cnt in sysfs. If the last operation was "data-check", then it reports the number of descrepancies found by the user-initiated check. If it was a "repair" operation, then it is reporting the number of descrepancies repaired. etc. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- Documentation/device-mapper/dm-raid.txt | 1 + drivers/md/dm-raid.c | 3 ++- drivers/md/md.c | 29 ++++++++++++++++++------- drivers/md/md.h | 8 +++++++ 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 2bb3a6823dc7..ef8ba9fa58c4 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -223,3 +223,4 @@ Version History 1.5.0 Add message interface to allow manipulation of the sync_action. New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. 1.5.1 Add ability to restore transiently failed devices on resume. +1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 21e8e4660c59..4880b69e2e9e 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1388,6 +1388,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * performing a "check" of the array. */ DMEMIT(" %llu", + (strcmp(rs->md.last_sync_action, "check")) ? 
0 : (unsigned long long) atomic64_read(&rs->md.resync_mismatches)); break; @@ -1651,7 +1652,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 5, 1}, + .version = {1, 5, 2}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/md.c b/drivers/md/md.c index 26f9452ea61c..dddc87bcf64a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -521,6 +521,7 @@ void mddev_init(struct mddev *mddev) init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; + mddev->last_sync_action = "none"; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; @@ -4272,6 +4273,17 @@ action_store(struct mddev *mddev, const char *page, size_t len) return len; } +static struct md_sysfs_entry md_scan_mode = +__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); + +static ssize_t +last_sync_action_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", mddev->last_sync_action); +} + +static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); + static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { @@ -4280,10 +4292,6 @@ mismatch_cnt_show(struct mddev *mddev, char *page) atomic64_read(&mddev->resync_mismatches)); } -static struct md_sysfs_entry md_scan_mode = -__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); - - static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); static ssize_t @@ -4686,6 +4694,7 @@ static struct attribute *md_default_attrs[] = { static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, + &md_last_scan_mode.attr, &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, @@ -7329,7 +7338,7 @@ void md_do_sync(struct md_thread *thread) sector_t last_check; int skipped = 0; struct md_rdev *rdev; - char *desc; + char *desc, *action = NULL; struct blk_plug plug; /* just incase thread restarts... */ @@ -7339,17 +7348,21 @@ void md_do_sync(struct md_thread *thread) return; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { desc = "data-check"; - else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + action = "check"; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { desc = "requested-resync"; - else + action = "repair"; + } else desc = "resync"; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) desc = "reshape"; else desc = "recovery"; + mddev->last_sync_action = action ?: desc; + /* we overload curr_resync somewhat here. * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync diff --git a/drivers/md/md.h b/drivers/md/md.h index 653f992b687a..20f02c0b5f2d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -268,6 +268,14 @@ struct mddev { struct md_thread *thread; /* management thread */ struct md_thread *sync_thread; /* doing resync or reconstruct */ + + /* 'last_sync_action' is initialized to "none". It is set when a + * sync operation (i.e "data-check", "requested-resync", "resync", + * "recovery", or "reshape") is started. It holds this value even + * when the sync thread is "frozen" (interrupted) or "idle" (stopped + * or finished). It is overwritten when a new sync operation is begun. 
+ */ + char *last_sync_action; sector_t curr_resync; /* last block scheduled */ /* As resync requests can complete out of order, we cannot easily track * how much resync has been completed. So we occasionally pause until From 0baac4db56b1895743391a348ee9ce4ae5076f9f Mon Sep 17 00:00:00 2001 From: CoolCold Date: Wed, 26 Jun 2013 19:46:58 +0400 Subject: [PATCH 11/14] md: remove doubled description for sync_max, merging it within sync_min/sync_max After last md.txt edits for sync_min/max, sync_max description became doubled. Removing 1st copy, merging details into common sync_min/sync_max section. Signed-off-by: Roman Ovchinnikov Signed-off-by: NeilBrown --- Documentation/md.txt | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Documentation/md.txt b/Documentation/md.txt index e0ddd327632d..fbb2fcbf16b6 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -566,13 +566,6 @@ also have when it reaches the current sync_max (below) and possibly at other times. - sync_max - This is a number of sectors at which point a resync/recovery - process will pause. When a resync is active, the value can - only ever be increased, never decreased. The value of 'max' - effectively disables the limit. - - sync_speed This shows the current actual speed, in K/sec, of the current sync_action. It is averaged over the last 30 seconds. @@ -593,6 +586,12 @@ also have that number to reach sync_max. Then you can either increase "sync_max", or can write 'idle' to "sync_action". + The value of 'max' for "sync_max" effectively disables the limit. + When a resync is active, the value can only ever be increased, + never decreased. + The value of '0' is the minimum for "sync_min". + + Each active md device may also have attributes specific to the personality module that manages it. From 78eaa0d4cbcdb345992fa3dd22b3bcbb473cc064 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 2 Jul 2013 15:58:05 +1000 Subject: [PATCH 12/14] md/raid10: fix two bugs affecting RAID10 reshape. 1/ If a RAID10 is being reshaped to a fewer number of devices and is stopped while this is ongoing, then when the array is reassembled the 'mirrors' array will be allocated too small. This will lead to an access error or memory corruption. 2/ A sanity test for a reshaping RAID10 array is restarted is slightly incorrect. Due to the first bug, this is suitable for any -stable kernel since 3.5 where this code was introduced. Cc: stable@vger.kernel.org (v3.5+) Signed-off-by: NeilBrown --- drivers/md/raid10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index aa8ba0760cac..3480bf7c20d4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3554,7 +3554,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) /* FIXME calc properly */ conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + - max(0,mddev->delta_disks)), + max(0,-mddev->delta_disks)), GFP_KERNEL); if (!conf->mirrors) goto out; @@ -3713,7 +3713,7 @@ static int run(struct mddev *mddev) conf->geo.far_offset == 0) goto out_free_conf; if (conf->prev.far_copies != 1 && - conf->geo.far_offset == 0) + conf->prev.far_offset == 0) goto out_free_conf; } From fdcfbbb653b27964c4daa4d2bcb364259c257e7d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 4 Jul 2013 16:38:16 +1000 Subject: [PATCH 13/14] md/raid5: allow 5-device RAID6 to be reshaped to 4-device. 
There is a bug in 'check_reshape' for raid5.c To checks that the new minimum number of devices is large enough (which is good), but it does so also after the reshape has started (bad). This is bad because - the calculation is now wrong as mddev->raid_disks has changed already, and - it is pointless because it is now too late to stop. So only perform that test when reshape has not been committed to. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cd9aab9329fd..2bf094a587cb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5914,7 +5914,7 @@ static int check_reshape(struct mddev *mddev) return 0; /* nothing to do */ if (has_failed(conf)) return -EINVAL; - if (mddev->delta_disks < 0) { + if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { /* We might be able to shrink, but the devices must * be made bigger first. * For raid6, 4 is the minimum size. From 1376512065b23f39d5f9a160948f313397dde972 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 4 Jul 2013 16:41:53 +1000 Subject: [PATCH 14/14] md/raid10: fix bug which causes all RAID10 reshapes to move no data. The recent comment: commit 7e83ccbecd608b971f340e951c9e84cd0343002f md/raid10: Allow skipping recovery when clean arrays are assembled Causes raid10 to skip a recovery in certain cases where it is safe to do so. Unfortunately it also causes a reshape to be skipped which is never safe. The result is that an attempt to reshape a RAID10 will appear to complete instantly, but no data will have been moves so the array will now contain garbage. (If nothing is written, you can recovery by simple performing the reverse reshape which will also complete instantly). Bug was introduced in 3.10, so this is suitable for 3.10-stable. Cc: stable@vger.kernel.org (3.10) Cc: Martin Wilck Signed-off-by: NeilBrown --- drivers/md/raid10.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3480bf7c20d4..cd066b63bdaf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2931,14 +2931,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, */ if (mddev->bitmap == NULL && mddev->recovery_cp == MaxSector && + mddev->reshape_position == MaxSector && + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && conf->fullsync == 0) { *skipped = 1; - max_sector = mddev->dev_sectors; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || - test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sector = mddev->resync_max_sectors; - return max_sector - sector_nr; + return mddev->dev_sectors - sector_nr; } skipped: