From 6076905b5ef39e0ea58db32583c9e0036c05e47b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:51:52 +0000 Subject: [PATCH 01/80] dm: avoid _hash_lock deadlock Fix a reported deadlock if there are still unprocessed multipath events on a device that is being removed. _hash_lock is held during dev_remove while trying to send the outstanding events. Sending the events requests the _hash_lock again in dm_copy_name_and_uuid. This patch introduces a separate lock around regions that modify the link to the hash table (dm_set_mdptr) or the name or uuid so that dm_copy_name_and_uuid no longer needs _hash_lock. Additionally, dm_copy_name_and_uuid can only be called if md exists so we can drop the dm_get() and dm_put() which can lead to a BUG() while md is being freed. The deadlock: #0 [ffff8106298dfb48] schedule at ffffffff80063035 #1 [ffff8106298dfc20] __down_read at ffffffff8006475d #2 [ffff8106298dfc60] dm_copy_name_and_uuid at ffffffff8824f740 #3 [ffff8106298dfc90] dm_send_uevents at ffffffff88252685 #4 [ffff8106298dfcd0] event_callback at ffffffff8824c678 #5 [ffff8106298dfd00] dm_table_event at ffffffff8824dd01 #6 [ffff8106298dfd10] __hash_remove at ffffffff882507ad #7 [ffff8106298dfd30] dev_remove at ffffffff88250865 #8 [ffff8106298dfd60] ctl_ioctl at ffffffff88250d80 #9 [ffff8106298dfee0] do_ioctl at ffffffff800418c4 #10 [ffff8106298dff00] vfs_ioctl at ffffffff8002fab9 #11 [ffff8106298dff40] sys_ioctl at ffffffff8004bdaf #12 [ffff8106298dff80] tracesys at ffffffff8005d28d (via system_call) Cc: stable@kernel.org Reported-by: guy keren Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 17 +++++++++++++---- drivers/md/dm-uevent.c | 9 ++++----- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a67942931582..d19854c98184 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices); */ static DECLARE_RWSEM(_hash_lock); +/* + * Protects use of mdptr to obtain hash cell name and uuid from mapped device. + */ +static DEFINE_MUTEX(dm_hash_cells_mutex); + static void init_buckets(struct list_head *buckets) { unsigned int i; @@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); } dm_get(md); + mutex_lock(&dm_hash_cells_mutex); dm_set_mdptr(md, cell); + mutex_unlock(&dm_hash_cells_mutex); up_write(&_hash_lock); return 0; @@ -224,7 +231,9 @@ static void __hash_remove(struct hash_cell *hc) /* remove from the dev hash */ list_del(&hc->uuid_list); list_del(&hc->name_list); + mutex_lock(&dm_hash_cells_mutex); dm_set_mdptr(hc->md, NULL); + mutex_unlock(&dm_hash_cells_mutex); table = dm_get_table(hc->md); if (table) { @@ -321,7 +330,9 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) */ list_del(&hc->name_list); old_name = hc->name; + mutex_lock(&dm_hash_cells_mutex); hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); list_add(&hc->name_list, _name_buckets + hash_str(new_name)); /* @@ -1582,8 +1593,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) if (!md) return -ENXIO; - dm_get(md); - down_read(&_hash_lock); + mutex_lock(&dm_hash_cells_mutex); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { r = -ENXIO; @@ -1596,8 +1606,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) strcpy(uuid, hc->uuid ? 
: ""); out: - up_read(&_hash_lock); - dm_put(md); + mutex_unlock(&dm_hash_cells_mutex); return r; } diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c index 6f65883aef12..c7c555a8c7b2 100644 --- a/drivers/md/dm-uevent.c +++ b/drivers/md/dm-uevent.c @@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj) list_del_init(&event->elist); /* - * Need to call dm_copy_name_and_uuid from here for now. - * Context of previous var adds and locking used for - * hash_cell not compatable. + * When a device is being removed this copy fails and we + * discard these unsent events. */ if (dm_copy_name_and_uuid(event->md, event->name, event->uuid)) { - DMERR("%s: dm_copy_name_and_uuid() failed", - __func__); + DMINFO("%s: skipping sending uevent for lost device", + __func__); goto uevent_free; } From 613978f8711c7fd4d0aa856872375d2abd7c92ff Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 10 Dec 2009 23:51:52 +0000 Subject: [PATCH 02/80] dm exception store: free tmp_store on persistent flag error Error handling code following a kmalloc should free the allocated data. Cc: stable@kernel.org Signed-off-by: Julia Lawall Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 7dbe652efb5a..205215968ff1 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -216,7 +216,8 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, type = get_type("N"); else { ti->error = "Persistent flag is not P or N"; - return -EINVAL; + r = -EINVAL; + goto bad_type; } if (!type) { From d2bb7df8cac647b92f51fb84ae735771e7adbfa7 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 10 Dec 2009 23:51:53 +0000 Subject: [PATCH 03/80] dm: sysfs add empty release function to avoid debug warning This patch just removes an unnecessary warning: kobject: 'dm': does not have a release() function, it is broken and must be fixed. The kobject is embedded in mapped device struct, so code does not need to release memory explicitly here. Cc: stable@kernel.org Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-sysfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index 4b045903a4e2..b000de38c99a 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -79,6 +79,13 @@ static struct sysfs_ops dm_sysfs_ops = { .show = dm_attr_show, }; +/* + * The sysfs structure is embedded in md struct, nothing to do here + */ +static void dm_sysfs_release(struct kobject *kobj) +{ +} + /* * dm kobject is embedded in mapped_device structure * no need to define release function here @@ -86,6 +93,7 @@ static struct sysfs_ops dm_sysfs_ops = { static struct kobj_type dm_ktype = { .sysfs_ops = &dm_sysfs_ops, .default_attrs = dm_attrs, + .release = dm_sysfs_release }; /* From 94e76572b5dd37b1f0f4b3742ee8a565daead932 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:51:53 +0000 Subject: [PATCH 04/80] dm snapshot: only take lock for statustype info not table Take snapshot lock only for STATUSTYPE_INFO, not STATUSTYPE_TABLE. 
Commit 4c6fff445d7aa753957856278d4d93bcad6e2c14 (dm-snapshot-lock-snapshot-while-supplying-status.patch) introduced this use of the lock, but userspace applications using libdevmapper have been found to request STATUSTYPE_TABLE while the device is suspended and the lock is already held, leading to deadlock. Since the lock is not necessary in this case, don't try to take it. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 3a3ba46e6d4b..d135212958f1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1152,10 +1152,11 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, unsigned sz = 0; struct dm_snapshot *snap = ti->private; - down_write(&snap->lock); - switch (type) { case STATUSTYPE_INFO: + + down_write(&snap->lock); + if (!snap->valid) DMEMIT("Invalid"); else { @@ -1171,6 +1172,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, else DMEMIT("Unknown"); } + + up_write(&snap->lock); + break; case STATUSTYPE_TABLE: @@ -1185,8 +1189,6 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, break; } - up_write(&snap->lock); - return 0; } From 8e87b9b81b3c370f7e53c1ab6e1c3519ef37a644 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:51:54 +0000 Subject: [PATCH 05/80] dm snapshot: cope with chunk size larger than origin Under some special conditions the snapshot hash_size is calculated as zero. This patch instead sets a minimum value of 64, the same as for the pending exception table. rounddown_pow_of_two(0) is an undefined operation (it expands to shift by -1). init_exception_table with an argument of 0 would fail with -ENOMEM. The way to trigger the problem is to create a snapshot with a chunk size that is larger than the origin device. Cc: stable@kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index d135212958f1..8a4a9c838afd 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -553,6 +553,8 @@ static int init_hash_tables(struct dm_snapshot *s) hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; hash_size = min(hash_size, max_buckets); + if (hash_size < 64) + hash_size = 64; hash_size = rounddown_pow_of_two(hash_size); if (init_exception_table(&s->complete, hash_size, DM_CHUNK_CONSECUTIVE_BITS)) From 0b4309581b5be8749afdd5a9087fd82a2a5c9932 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 10 Dec 2009 23:51:55 +0000 Subject: [PATCH 06/80] dm crypt: make wipe message also wipe tfm key The "wipe key" message is used to wipe a volume key from memory temporarily, for example when suspending to RAM. There are two instances of the key in memory (inside crypto tfm) but only one got wiped. This patch wipes them both. 
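As an illustrative sketch (distilled from the hunks below, not additional patch code): the crypto layer keeps its own expanded copy of the key inside the tfm, so zeroing the buffer held in crypt_config leaves live key material behind; re-keying the tfm with the already-zeroed buffer overwrites that second copy. Userspace triggers this path on a suspended device, e.g. with "dmsetup message <device> 0 key wipe".

	static int sketch_crypt_wipe_key(struct crypt_config *cc)
	{
		clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
		/* copy #1: the key buffer in our own config */
		memset(&cc->key, 0, cc->key_size * sizeof(u8));
		/*
		 * copy #2: the expanded key inside the tfm; setting the
		 * zeroed buffer as the new key wipes it
		 */
		return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
	}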
Cc: stable@kernel.org Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e412980763bd..f2c139305e13 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -934,14 +934,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key) set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - return 0; + return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); } static int crypt_wipe_key(struct crypt_config *cc) { clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return 0; + return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); } /* @@ -983,11 +983,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - if (crypt_set_key(cc, argv[1])) { - ti->error = "Error decoding key"; - goto bad_cipher; - } - /* Compatibility mode for old dm-crypt cipher strings */ if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { chainmode = "cbc"; @@ -1015,6 +1010,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) strcpy(cc->chainmode, chainmode); cc->tfm = tfm; + if (crypt_set_key(cc, argv[1]) < 0) { + ti->error = "Error decoding and setting key"; + goto bad_ivmode; + } + /* * Choose ivmode. Valid modes: "plain", "essiv:", "benbi". * See comments at iv code @@ -1085,11 +1085,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_bs; } - if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) { - ti->error = "Error setting key"; - goto bad_device; - } - if (sscanf(argv[2], "%llu", &tmpll) != 1) { ti->error = "Invalid iv_offset sector"; goto bad_device; From 6047359277517c4e56d8bfd6ea4966d7a3924151 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 10 Dec 2009 23:51:55 +0000 Subject: [PATCH 07/80] dm crypt: move private iv fields to structs Define private structures for IV so it's easy to add further attributes in a following patch which fixes the way key material is wiped from memory. Also move ESSIV destructor and remove unnecessary 'status' operation. There are no functional changes in this patch. Cc: stable@kernel.org Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f2c139305e13..bec5ac54e23e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -71,10 +71,17 @@ struct crypt_iv_operations { int (*ctr)(struct crypt_config *cc, struct dm_target *ti, const char *opts); void (*dtr)(struct crypt_config *cc); - const char *(*status)(struct crypt_config *cc); int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); }; +struct iv_essiv_private { + struct crypto_cipher *tfm; +}; + +struct iv_benbi_private { + int shift; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. 
@@ -102,8 +109,8 @@ struct crypt_config { struct crypt_iv_operations *iv_gen_ops; char *iv_mode; union { - struct crypto_cipher *essiv_tfm; - int benbi_shift; + struct iv_essiv_private essiv; + struct iv_benbi_private benbi; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; @@ -169,6 +176,14 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) return 0; } +static void crypt_iv_essiv_dtr(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + + crypto_free_cipher(essiv->tfm); + essiv->tfm = NULL; +} + static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { @@ -236,21 +251,15 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, } kfree(salt); - cc->iv_gen_private.essiv_tfm = essiv_tfm; + cc->iv_gen_private.essiv.tfm = essiv_tfm; return 0; } -static void crypt_iv_essiv_dtr(struct crypt_config *cc) -{ - crypto_free_cipher(cc->iv_gen_private.essiv_tfm); - cc->iv_gen_private.essiv_tfm = NULL; -} - static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) { memset(iv, 0, cc->iv_size); *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); + crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); return 0; } @@ -273,7 +282,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - cc->iv_gen_private.benbi_shift = 9 - log; + cc->iv_gen_private.benbi.shift = 9 - log; return 0; } @@ -288,7 +297,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ - val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); + val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); return 0; From 5861f1be00b3b70f8ab5e5a81392a6cf69666cd2 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 10 Dec 2009 23:51:56 +0000 Subject: [PATCH 08/80] dm crypt: restructure essiv error path Use kzfree for salt deallocation because it is derived from the volume key. Use a common error path in ESSIV constructor. Required by a later patch which fixes the way key material is wiped from memory. 
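The resulting error handling follows the usual kernel single-exit idiom, condensed here as a sketch (names as in the hunks below): every resource pointer starts out NULL, each failure sets err and jumps to one label, and the cleanup tolerates both NULL and ERR_PTR values so any intermediate failure can share it.

	struct crypto_cipher *essiv_tfm = NULL;
	struct crypto_hash *hash_tfm = NULL;
	u8 *salt = NULL;
	int err;
	...
	bad:
		if (essiv_tfm && !IS_ERR(essiv_tfm))
			crypto_free_cipher(essiv_tfm);
		if (hash_tfm && !IS_ERR(hash_tfm))
			crypto_free_hash(hash_tfm);
		kzfree(salt);	/* key-derived, so zero before freeing */
		return err;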
Cc: stable@kernel.org Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 46 ++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index bec5ac54e23e..2301d223f2ae 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -187,15 +187,15 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc) static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - struct crypto_cipher *essiv_tfm; - struct crypto_hash *hash_tfm; + struct crypto_cipher *essiv_tfm = NULL; + struct crypto_hash *hash_tfm = NULL; struct hash_desc desc; struct scatterlist sg; unsigned int saltsize; - u8 *salt; + u8 *salt = NULL; int err; - if (opts == NULL) { + if (!opts) { ti->error = "Digest algorithm missing for ESSIV mode"; return -EINVAL; } @@ -204,15 +204,16 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(hash_tfm)) { ti->error = "Error initializing ESSIV hash"; - return PTR_ERR(hash_tfm); + err = PTR_ERR(hash_tfm); + goto bad; } saltsize = crypto_hash_digestsize(hash_tfm); - salt = kmalloc(saltsize, GFP_KERNEL); - if (salt == NULL) { + salt = kzalloc(saltsize, GFP_KERNEL); + if (!salt) { ti->error = "Error kmallocing salt storage in ESSIV"; - crypto_free_hash(hash_tfm); - return -ENOMEM; + err = -ENOMEM; + goto bad; } sg_init_one(&sg, cc->key, cc->key_size); @@ -220,39 +221,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_hash_digest(&desc, &sg, cc->key_size, salt); crypto_free_hash(hash_tfm); + hash_tfm = NULL; if (err) { ti->error = "Error calculating hash in ESSIV"; - kfree(salt); - return err; + goto bad; } /* Setup the essiv_tfm with the given salt */ essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(essiv_tfm)) { ti->error = "Error allocating crypto tfm for ESSIV"; - kfree(salt); - return PTR_ERR(essiv_tfm); + err = PTR_ERR(essiv_tfm); + goto bad; } if (crypto_cipher_blocksize(essiv_tfm) != crypto_ablkcipher_ivsize(cc->tfm)) { ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; - crypto_free_cipher(essiv_tfm); - kfree(salt); - return -EINVAL; + err = -EINVAL; + goto bad; } err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); if (err) { ti->error = "Failed to set key for ESSIV cipher"; - crypto_free_cipher(essiv_tfm); - kfree(salt); - return err; + goto bad; } - kfree(salt); + kzfree(salt); cc->iv_gen_private.essiv.tfm = essiv_tfm; return 0; + +bad: + if (essiv_tfm && !IS_ERR(essiv_tfm)) + crypto_free_cipher(essiv_tfm); + if (hash_tfm && !IS_ERR(hash_tfm)) + crypto_free_hash(hash_tfm); + kzfree(salt); + return err; } static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) From b95bf2d3d5a48b095bffe2a0cd8c40453cf59557 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 10 Dec 2009 23:51:56 +0000 Subject: [PATCH 09/80] dm crypt: separate essiv allocation from initialisation This patch separates the construction of IV from its initialisation. (For ESSIV it is a hash calculation based on volume key.) Constructor code now preallocates hash tfm and salt array and saves it in a private IV structure. The next patch requires this to reinitialise the wiped IV without reallocating memory when resuming a suspended device. 
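The split gives the IV code a three-stage lifecycle: .ctr only allocates (hash tfm, salt buffer, ESSIV cipher tfm), .init only computes (hash the volume key into the salt and set the result as the ESSIV key), and .dtr frees. Because .init allocates nothing it may be called again at any time, which is what the following patch relies on to reinstate a wiped IV key. A sketch of the resulting call pattern, using the hooks as wired up in this series:

	/* construction: allocate, then derive the IV key */
	if (cc->iv_gen_ops && cc->iv_gen_ops->ctr &&
	    cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
		goto bad_ivmode;

	if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
	    cc->iv_gen_ops->init(cc) < 0)
		goto bad_slab_pool;

	/* later, e.g. handling a "key set" message: re-derive only */
	if (cc->iv_gen_ops && cc->iv_gen_ops->init)
		ret = cc->iv_gen_ops->init(cc);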
Cc: stable@kernel.org Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 69 +++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 2301d223f2ae..446153a071d6 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -71,11 +71,14 @@ struct crypt_iv_operations { int (*ctr)(struct crypt_config *cc, struct dm_target *ti, const char *opts); void (*dtr)(struct crypt_config *cc); + int (*init)(struct crypt_config *cc); int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); }; struct iv_essiv_private { struct crypto_cipher *tfm; + struct crypto_hash *hash_tfm; + u8 *salt; }; struct iv_benbi_private { @@ -176,12 +179,38 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) return 0; } +/* Initialise ESSIV - compute salt but no local memory allocations */ +static int crypt_iv_essiv_init(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + struct hash_desc desc; + struct scatterlist sg; + int err; + + sg_init_one(&sg, cc->key, cc->key_size); + desc.tfm = essiv->hash_tfm; + desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); + if (err) + return err; + + return crypto_cipher_setkey(essiv->tfm, essiv->salt, + crypto_hash_digestsize(essiv->hash_tfm)); +} + static void crypt_iv_essiv_dtr(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; crypto_free_cipher(essiv->tfm); essiv->tfm = NULL; + + crypto_free_hash(essiv->hash_tfm); + essiv->hash_tfm = NULL; + + kzfree(essiv->salt); + essiv->salt = NULL; } static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -189,9 +218,6 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, { struct crypto_cipher *essiv_tfm = NULL; struct crypto_hash *hash_tfm = NULL; - struct hash_desc desc; - struct scatterlist sg; - unsigned int saltsize; u8 *salt = NULL; int err; @@ -200,7 +226,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - /* Hash the cipher key with the given hash algorithm */ + /* Allocate hash algorithm */ hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(hash_tfm)) { ti->error = "Error initializing ESSIV hash"; @@ -208,27 +234,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, goto bad; } - saltsize = crypto_hash_digestsize(hash_tfm); - salt = kzalloc(saltsize, GFP_KERNEL); + salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); if (!salt) { ti->error = "Error kmallocing salt storage in ESSIV"; err = -ENOMEM; goto bad; } - sg_init_one(&sg, cc->key, cc->key_size); - desc.tfm = hash_tfm; - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - err = crypto_hash_digest(&desc, &sg, cc->key_size, salt); - crypto_free_hash(hash_tfm); - hash_tfm = NULL; - - if (err) { - ti->error = "Error calculating hash in ESSIV"; - goto bad; - } - - /* Setup the essiv_tfm with the given salt */ + /* Allocate essiv_tfm */ essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(essiv_tfm)) { ti->error = "Error allocating crypto tfm for ESSIV"; @@ -242,14 +255,11 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, err = -EINVAL; goto bad; } - err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); - if (err) { - ti->error = "Failed to set key for ESSIV cipher"; - goto bad; - } - 
kzfree(salt); + cc->iv_gen_private.essiv.salt = salt; cc->iv_gen_private.essiv.tfm = essiv_tfm; + cc->iv_gen_private.essiv.hash_tfm = hash_tfm; + return 0; bad: @@ -257,7 +267,7 @@ bad: crypto_free_cipher(essiv_tfm); if (hash_tfm && !IS_ERR(hash_tfm)) crypto_free_hash(hash_tfm); - kzfree(salt); + kfree(salt); return err; } @@ -323,6 +333,7 @@ static struct crypt_iv_operations crypt_iv_plain_ops = { static struct crypt_iv_operations crypt_iv_essiv_ops = { .ctr = crypt_iv_essiv_ctr, .dtr = crypt_iv_essiv_dtr, + .init = crypt_iv_essiv_init, .generator = crypt_iv_essiv_gen }; @@ -1054,6 +1065,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) goto bad_ivmode; + if (cc->iv_gen_ops && cc->iv_gen_ops->init && + cc->iv_gen_ops->init(cc) < 0) { + ti->error = "Error initialising IV"; + goto bad_slab_pool; + } + cc->iv_size = crypto_ablkcipher_ivsize(tfm); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ From 542da317668c35036e8471822a564b609d05af66 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 10 Dec 2009 23:51:57 +0000 Subject: [PATCH 10/80] dm crypt: make wipe message also wipe essiv key The "wipe key" message is used to wipe the volume key from memory temporarily, for example when suspending to RAM. But the initialisation vector in ESSIV mode is calculated from the hashed volume key, so the wipe message should wipe this IV key too and reinitialise it when the volume key is reinstated. This patch adds an IV wipe method called from a wipe message callback. ESSIV is then reinitialised using the init function added by the last patch. Cc: stable@kernel.org Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 446153a071d6..91e1bf91769f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2003 Christophe Saout * Copyright (C) 2004 Clemens Fruhwirth - * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. 
*/ @@ -72,6 +72,7 @@ struct crypt_iv_operations { const char *opts); void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); + int (*wipe)(struct crypt_config *cc); int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); }; @@ -199,6 +200,17 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) crypto_hash_digestsize(essiv->hash_tfm)); } +/* Wipe salt and reset key derived from volume key */ +static int crypt_iv_essiv_wipe(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + + memset(essiv->salt, 0, salt_size); + + return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); +} + static void crypt_iv_essiv_dtr(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; @@ -334,6 +346,7 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = { .ctr = crypt_iv_essiv_ctr, .dtr = crypt_iv_essiv_dtr, .init = crypt_iv_essiv_init, + .wipe = crypt_iv_essiv_wipe, .generator = crypt_iv_essiv_gen }; @@ -1305,6 +1318,7 @@ static void crypt_resume(struct dm_target *ti) static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) { struct crypt_config *cc = ti->private; + int ret = -EINVAL; if (argc < 2) goto error; @@ -1314,10 +1328,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) DMWARN("not suspended during key manipulation."); return -EINVAL; } - if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) - return crypt_set_key(cc, argv[2]); - if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) + if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) { + ret = crypt_set_key(cc, argv[2]); + if (ret) + return ret; + if (cc->iv_gen_ops && cc->iv_gen_ops->init) + ret = cc->iv_gen_ops->init(cc); + return ret; + } + if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) { + if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { + ret = cc->iv_gen_ops->wipe(cc); + if (ret) + return ret; + } return crypt_wipe_key(cc); + } } error: From 952b355760c196ec014dd0b6878f85a11496e3da Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:51:57 +0000 Subject: [PATCH 11/80] dm io: use slab for struct io Allocate "struct io" from a slab. This patch changes dm-io, so that "struct io" is allocated from a slab cache. It used to be allocated with kmalloc. Allocating from a slab will be needed for the next patch, because it requires a special alignment of "struct io" and kmalloc cannot meet this alignment. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-io.c | 21 ++++++++++++++++++++- drivers/md/dm.c | 2 ++ drivers/md/dm.h | 3 +++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 3a2e6a2f8bdd..b0d264e684fd 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -5,6 +5,8 @@ * This file is released under the GPL. */ +#include "dm.h" + #include #include @@ -30,6 +32,8 @@ struct io { void *context; }; +static struct kmem_cache *_dm_io_cache; + /* * io contexts are only dynamically allocated for asynchronous * io. 
Since async io is likely to be the majority of io we'll @@ -53,7 +57,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages) if (!client) return ERR_PTR(-ENOMEM); - client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); + client->pool = mempool_create_slab_pool(ios, _dm_io_cache); if (!client->pool) goto bad; @@ -472,3 +476,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, &dp, io_req->notify.fn, io_req->notify.context); } EXPORT_SYMBOL(dm_io); + +int __init dm_io_init(void) +{ + _dm_io_cache = KMEM_CACHE(io, 0); + if (!_dm_io_cache) + return -ENOMEM; + + return 0; +} + +void dm_io_exit(void) +{ + kmem_cache_destroy(_dm_io_cache); + _dm_io_cache = NULL; +} diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 724efc63904d..473f0c3c0192 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -275,6 +275,7 @@ static int (*_inits[])(void) __initdata = { dm_target_init, dm_linear_init, dm_stripe_init, + dm_io_init, dm_kcopyd_init, dm_interface_init, }; @@ -284,6 +285,7 @@ static void (*_exits[])(void) = { dm_target_exit, dm_linear_exit, dm_stripe_exit, + dm_io_exit, dm_kcopyd_exit, dm_interface_exit, }; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a7663eba17e2..4a95e8fa3607 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -118,6 +118,9 @@ int dm_lock_for_deletion(struct mapped_device *md); void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, unsigned cookie); +int dm_io_init(void); +void dm_io_exit(void); + int dm_kcopyd_init(void); void dm_kcopyd_exit(void); From f1e539874655ae9e74c1644fd54133b19f1b14e2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:51:58 +0000 Subject: [PATCH 12/80] dm io: remove extra bi_io_vec region hack Remove the hack where we allocate an extra bi_io_vec to store additional private data. This hack prevents us from supporting barriers in dm-raid1 without first making another little block layer change. Instead of doing that, this patch eliminates the bi_io_vec abuse by storing the region number directly in the low bits of bi_private. We need to store two things for each bio, the pointer to the main io structure and, if parallel writes were requested, an index indicating which of these writes this bio belongs to. There can be at most BITS_PER_LONG regions - 32 or 64. The index (region number) was stored in the last (hidden) bio vector and the pointer to struct io was stored in bi_private. This patch now aligns "struct io" on BITS_PER_LONG bytes and stores the region number in the low BITS_PER_LONG bits of bi_private. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-io.c | 89 ++++++++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index b0d264e684fd..f6a714c5aab0 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -16,12 +16,19 @@ #include #include +#define DM_MSG_PREFIX "io" + +#define DM_IO_MAX_REGIONS BITS_PER_LONG + struct dm_io_client { mempool_t *pool; struct bio_set *bios; }; -/* FIXME: can we shrink this ? */ +/* + * Aligning 'struct io' reduces the number of bits required to store + * its address. Refer to store_io_and_region_in_bio() below. 
+ */ struct io { unsigned long error_bits; unsigned long eopnotsupp_bits; @@ -30,7 +37,7 @@ struct io { struct dm_io_client *client; io_notify_fn callback; void *context; -}; +} __attribute__((aligned(DM_IO_MAX_REGIONS))); static struct kmem_cache *_dm_io_cache; @@ -92,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy); /*----------------------------------------------------------------- * We need to keep track of which region a bio is doing io for. - * In order to save a memory allocation we store this the last - * bvec which we know is unused (blech). - * XXX This is ugly and can OOPS with some configs... find another way. + * To avoid a memory allocation to store just 5 or 6 bits, we + * ensure the 'struct io' pointer is aligned so enough low bits are + * always zero and then combine it with the region number directly in + * bi_private. *---------------------------------------------------------------*/ -static inline void bio_set_region(struct bio *bio, unsigned region) +static void store_io_and_region_in_bio(struct bio *bio, struct io *io, + unsigned region) { - bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; + if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) { + DMCRIT("Unaligned struct io pointer %p", io); + BUG(); + } + + bio->bi_private = (void *)((unsigned long)io | region); } -static inline unsigned bio_get_region(struct bio *bio) +static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, + unsigned *region) { - return bio->bi_io_vec[bio->bi_max_vecs].bv_len; + unsigned long val = (unsigned long)bio->bi_private; + + *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS); + *region = val & (DM_IO_MAX_REGIONS - 1); } /*----------------------------------------------------------------- @@ -144,10 +162,8 @@ static void endio(struct bio *bio, int error) /* * The bio destructor in bio_put() may use the io object. */ - io = bio->bi_private; - region = bio_get_region(bio); + retrieve_io_and_region_from_bio(bio, &io, ®ion); - bio->bi_max_vecs++; bio_put(bio); dec_count(io, region, error); @@ -247,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data) static void dm_bio_destructor(struct bio *bio) { - struct io *io = bio->bi_private; + unsigned region; + struct io *io; + + retrieve_io_and_region_from_bio(bio, &io, ®ion); bio_free(bio, io->client->bios); } @@ -292,24 +311,17 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, while (remaining) { /* - * Allocate a suitably sized-bio: we add an extra - * bvec for bio_get/set_region() and decrement bi_max_vecs - * to hide it from bio_add_page(). + * Allocate a suitably sized-bio. */ num_bvecs = dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)); - num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), - num_bvecs); - if (unlikely(num_bvecs > BIO_MAX_PAGES)) - num_bvecs = BIO_MAX_PAGES; + num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; - bio->bi_private = io; bio->bi_destructor = dm_bio_destructor; - bio->bi_max_vecs--; - bio_set_region(bio, region); + store_io_and_region_in_bio(bio, io, region); /* * Try and add as many pages as possible. 
@@ -337,6 +349,8 @@ static void dispatch_io(int rw, unsigned int num_regions, int i; struct dpages old_pages = *dp; + BUG_ON(num_regions > DM_IO_MAX_REGIONS); + if (sync) rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); @@ -361,7 +375,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, int rw, struct dpages *dp, unsigned long *error_bits) { - struct io io; + /* + * gcc <= 4.3 can't do the alignment for stack variables, so we must + * align it on our own. + * volatile prevents the optimizer from removing or reusing + * "io_" field from the stack frame (allowed in ANSI C). + */ + volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; + struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); @@ -369,33 +390,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, } retry: - io.error_bits = 0; - io.eopnotsupp_bits = 0; - atomic_set(&io.count, 1); /* see dispatch_io() */ - io.sleeper = current; - io.client = client; + io->error_bits = 0; + io->eopnotsupp_bits = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->sleeper = current; + io->client = client; - dispatch_io(rw, num_regions, where, dp, &io, 1); + dispatch_io(rw, num_regions, where, dp, io, 1); while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!atomic_read(&io.count)) + if (!atomic_read(&io->count)) break; io_schedule(); } set_current_state(TASK_RUNNING); - if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { + if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { rw &= ~(1 << BIO_RW_BARRIER); goto retry; } if (error_bits) - *error_bits = io.error_bits; + *error_bits = io->error_bits; - return io.error_bits ? -EIO : 0; + return io->error_bits ? -EIO : 0; } static int async_io(struct dm_io_client *client, unsigned int num_regions, From 4184153f9e483f9bb63339ed316e059962fe9794 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:51:59 +0000 Subject: [PATCH 13/80] dm raid1: support flush Flush support for dm-raid1. When it receives an empty barrier, submit it to all the devices via dm-io. 
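An empty barrier is a bio with the barrier flag set and a zero-size payload: it carries no data and exists only to order and flush the writes around it (in 2.6.32 the block layer test is the bio_empty_barrier() macro, roughly "barrier flag set and no data"). Two consequences for the mirror target, both visible in the hunks below: such a bio maps to no region, so it must bypass region-hash accounting, and it must be replicated to every leg instead of being routed like an ordinary write. A sketch of the dispatch decision in do_writes():

	while ((bio = bio_list_pop(writes))) {
		if (unlikely(bio_empty_barrier(bio))) {
			/*
			 * No payload, no region: the sync path sends it
			 * to all legs via dm-io with WRITE_BARRIER set.
			 */
			bio_list_add(&sync, bio);
			continue;
		}
		region = dm_rh_bio_to_region(ms->rh, bio);
		...
	}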
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 13 +++++++++++-- drivers/md/dm-region-hash.c | 25 +++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index cc9dc79b0784..752a29e1855b 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -396,6 +396,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio) */ static sector_t map_sector(struct mirror *m, struct bio *bio) { + if (unlikely(!bio->bi_size)) + return 0; return m->offset + (bio->bi_sector - m->ms->ti->begin); } @@ -562,7 +564,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct dm_io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; struct dm_io_request io_req = { - .bi_rw = WRITE, + .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), .mem.type = DM_IO_BVEC, .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, .notify.fn = write_callback, @@ -603,6 +605,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { + if (unlikely(bio_empty_barrier(bio))) { + bio_list_add(&sync, bio); + continue; + } + region = dm_rh_bio_to_region(ms->rh, bio); if (log->type->is_remote_recovering && @@ -995,6 +1002,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = ms; ti->split_io = dm_rh_get_region_size(ms->rh); + ti->num_flush_requests = 1; ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); if (!ms->kmirrord_wq) { @@ -1122,7 +1130,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, * We need to dec pending if this was a write. */ if (rw == WRITE) { - dm_rh_dec(ms->rh, map_context->ll); + if (likely(!bio_empty_barrier(bio))) + dm_rh_dec(ms->rh, map_context->ll); return error; } diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 36dbe29f2fd6..00806b760ccd 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -79,6 +79,11 @@ struct dm_region_hash { struct list_head recovered_regions; struct list_head failed_recovered_regions; + /* + * If there was a barrier failure no regions can be marked clean. + */ + int barrier_failure; + void *context; sector_t target_begin; @@ -211,6 +216,7 @@ struct dm_region_hash *dm_region_hash_create( INIT_LIST_HEAD(&rh->quiesced_regions); INIT_LIST_HEAD(&rh->recovered_regions); INIT_LIST_HEAD(&rh->failed_recovered_regions); + rh->barrier_failure = 0; rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, sizeof(struct dm_region)); @@ -395,6 +401,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, region_t region = dm_rh_bio_to_region(rh, bio); int recovering = 0; + if (bio_empty_barrier(bio)) { + rh->barrier_failure = 1; + return; + } + /* We must inform the log that the sync count has changed. 
*/ log->type->set_region_sync(log, region, 0); @@ -515,8 +526,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) { struct bio *bio; - for (bio = bios->head; bio; bio = bio->bi_next) + for (bio = bios->head; bio; bio = bio->bi_next) { + if (bio_empty_barrier(bio)) + continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); + } } EXPORT_SYMBOL_GPL(dm_rh_inc_pending); @@ -544,7 +558,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region) */ /* do nothing for DM_RH_NOSYNC */ - if (reg->state == DM_RH_RECOVERING) { + if (unlikely(rh->barrier_failure)) { + /* + * If a write barrier failed some time ago, we + * don't know whether or not this write made it + * to the disk, so we must resync the device. + */ + reg->state = DM_RH_NOSYNC; + } else if (reg->state == DM_RH_RECOVERING) { list_add_tail(®->list, &rh->quiesced_regions); } else if (reg->state == DM_RH_DIRTY) { reg->state = DM_RH_CLEAN; From b09acf1aa79462bdacfe6744b469a17722a52702 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:51:59 +0000 Subject: [PATCH 14/80] dm raid1: split touched state into two Split the variable "touched" into two, "touched_dirtied" and "touched_cleaned", set when some region was dirtied or cleaned. This will be used to optimize flushes. After a transition from "dirty" to "clean" state we don't have flush hardware cache on the log device. After a transition from "clean" to "dirty" the cache must be flushed. Before a transition from "clean" to "dirty" state we don't have to flush all the raid legs. Before a transition from "dirty" to "clean" we must flush all the legs to make sure that they are really in sync. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 9443896ede07..31dc33df95c7 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -208,7 +208,8 @@ struct log_header { struct log_c { struct dm_target *ti; - int touched; + int touched_dirtied; + int touched_cleaned; uint32_t region_size; unsigned int region_count; region_t sync_count; @@ -253,14 +254,14 @@ static inline void log_set_bit(struct log_c *l, uint32_t *bs, unsigned bit) { ext2_set_bit(bit, (unsigned long *) bs); - l->touched = 1; + l->touched_cleaned = 1; } static inline void log_clear_bit(struct log_c *l, uint32_t *bs, unsigned bit) { ext2_clear_bit(bit, (unsigned long *) bs); - l->touched = 1; + l->touched_dirtied = 1; } /*---------------------------------------------------------------- @@ -378,7 +379,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, } lc->ti = ti; - lc->touched = 0; + lc->touched_dirtied = 0; + lc->touched_cleaned = 0; lc->region_size = region_size; lc->region_count = region_count; lc->sync = sync; @@ -660,14 +662,16 @@ static int disk_flush(struct dm_dirty_log *log) struct log_c *lc = (struct log_c *) log->context; /* only write if the log has changed */ - if (!lc->touched) + if (!lc->touched_cleaned && !lc->touched_dirtied) return 0; r = rw_header(lc, WRITE); if (r) fail_log_device(lc); - else - lc->touched = 0; + else { + lc->touched_dirtied = 0; + lc->touched_cleaned = 0; + } return r; } From 20a34a8ecc7d03eaa5054f58169ebff12f5f1f8c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:00 +0000 Subject: [PATCH 15/80] dm log: add flush_header function Introduce flush_header and use it to flush the log device. 
Note that we don't have to flush if all the regions transition from "dirty" to "clean" state. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 31dc33df95c7..6b23631db5b5 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -288,6 +288,19 @@ static int rw_header(struct log_c *lc, int rw) return dm_io(&lc->io_req, 1, &lc->header_location, NULL); } +static int flush_header(struct log_c *lc) +{ + struct dm_io_region null_location = { + .bdev = lc->header_location.bdev, + .sector = 0, + .count = 0, + }; + + lc->io_req.bi_rw = WRITE_BARRIER; + + return dm_io(&lc->io_req, 1, &null_location, NULL); +} + static int read_header(struct log_c *log) { int r; @@ -616,6 +629,8 @@ static int disk_resume(struct dm_dirty_log *log) /* write the new header */ r = rw_header(lc, WRITE); + if (!r) + r = flush_header(lc); if (r) { DMWARN("%s: Failed to write header on dirty region log device", lc->log_dev->name); @@ -669,7 +684,13 @@ static int disk_flush(struct dm_dirty_log *log) if (r) fail_log_device(lc); else { - lc->touched_dirtied = 0; + if (lc->touched_dirtied) { + r = flush_header(lc); + if (r) + fail_log_device(lc); + else + lc->touched_dirtied = 0; + } lc->touched_cleaned = 0; } From 5adc78d0d231b030405b31759f125f13404fdb64 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:00 +0000 Subject: [PATCH 16/80] dm log: introduce flush_failed variable Introduce "flush failed" variable. When a flush before clearing a bit in the log fails, we don't know anything about which which regions are in-sync and which not. So we need to set all regions as not-in-sync and set the variable "flush_failed" to prevent setting the in-sync bit in the future. A target reload is the only way to get out of this situation. The variable will be set in following patches. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 6b23631db5b5..d779f8c915dd 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -210,6 +210,7 @@ struct log_c { struct dm_target *ti; int touched_dirtied; int touched_cleaned; + int flush_failed; uint32_t region_size; unsigned int region_count; region_t sync_count; @@ -394,6 +395,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, lc->ti = ti; lc->touched_dirtied = 0; lc->touched_cleaned = 0; + lc->flush_failed = 0; lc->region_size = region_size; lc->region_count = region_count; lc->sync = sync; @@ -706,7 +708,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region) static void core_clear_region(struct dm_dirty_log *log, region_t region) { struct log_c *lc = (struct log_c *) log->context; - log_set_bit(lc, lc->clean_bits, region); + if (likely(!lc->flush_failed)) + log_set_bit(lc, lc->clean_bits, region); } static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) From 87a8f240e9bcf025ba45e4563c842b0d59c5e8ef Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:01 +0000 Subject: [PATCH 17/80] dm log: add flush callback fn Introduce a callback pointer from the log to dm-raid1 layer. Before some region is set as "in-sync", we need to flush hardware cache on all the disks. But the log module doesn't have access to the mirror_set structure. 
So it will use this callback. So far the callback is unused, it will be used in further patches. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 6 ++++-- drivers/md/dm-raid1.c | 2 +- include/linux/dm-dirty-log.h | 6 ++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index d779f8c915dd..666a80e3602e 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) EXPORT_SYMBOL(dm_dirty_log_type_unregister); struct dm_dirty_log *dm_dirty_log_create(const char *type_name, - struct dm_target *ti, - unsigned int argc, char **argv) + struct dm_target *ti, + int (*flush_callback_fn)(struct dm_target *ti), + unsigned int argc, char **argv) { struct dm_dirty_log_type *type; struct dm_dirty_log *log; @@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name, return NULL; } + log->flush_callback_fn = flush_callback_fn; log->type = type; if (type->ctr(log, ti, argc, argv)) { kfree(log); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 752a29e1855b..d44bc497dad9 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -896,7 +896,7 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, return NULL; } - dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); + dl = dm_dirty_log_create(argv[0], ti, NULL, param_count, argv + 2); if (!dl) { ti->error = "Error creating mirror dirty log"; return NULL; diff --git a/include/linux/dm-dirty-log.h b/include/linux/dm-dirty-log.h index 5e8b11d88f6f..7084503c3405 100644 --- a/include/linux/dm-dirty-log.h +++ b/include/linux/dm-dirty-log.h @@ -21,6 +21,7 @@ struct dm_dirty_log_type; struct dm_dirty_log { struct dm_dirty_log_type *type; + int (*flush_callback_fn)(struct dm_target *ti); void *context; }; @@ -136,8 +137,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type); * type->constructor/destructor() directly. */ struct dm_dirty_log *dm_dirty_log_create(const char *type_name, - struct dm_target *ti, - unsigned argc, char **argv); + struct dm_target *ti, + int (*flush_callback_fn)(struct dm_target *ti), + unsigned argc, char **argv); void dm_dirty_log_destroy(struct dm_dirty_log *log); #endif /* __KERNEL__ */ From 076010e2e6ea5b66dfd1f81a6133fb014c9b291d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:01 +0000 Subject: [PATCH 18/80] dm log: use flush callback fn Call the flush callback from the log. If flush failed, we have no alternative but to mark the whole log as dirty. Also we set the variable flush_failed to prevent any bits ever being marked as clean again. 
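To see why a failed flush must dirty every region, consider a region whose writes completed on both legs but still sit in the disks' volatile caches. Before its clean bit may reach stable log storage, the legs have to be flushed; if that flush fails, there is no way to tell which of the previously "clean" regions really reached the platters, so the only safe answer is "none of them". Condensed from the hunk below (clean_bits uses set = clean, so log_clear_bit() marks a region dirty):

	if (lc->touched_cleaned && log->flush_callback_fn &&
	    log->flush_callback_fn(lc->ti)) {
		/*
		 * No clean bit can be trusted any more: dirty everything
		 * and refuse further cleaning until a table reload.
		 */
		lc->flush_failed = 1;
		for (i = 0; i < lc->region_count; i++)
			log_clear_bit(lc, lc->clean_bits, i);
	}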
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 666a80e3602e..315e36a96b6f 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -677,13 +677,26 @@ static int core_flush(struct dm_dirty_log *log) static int disk_flush(struct dm_dirty_log *log) { - int r; - struct log_c *lc = (struct log_c *) log->context; + int r, i; + struct log_c *lc = log->context; /* only write if the log has changed */ if (!lc->touched_cleaned && !lc->touched_dirtied) return 0; + if (lc->touched_cleaned && log->flush_callback_fn && + log->flush_callback_fn(lc->ti)) { + /* + * At this point it is impossible to determine which + * regions are clean and which are dirty (without + * re-reading the log off disk). So mark all of them + * dirty. + */ + lc->flush_failed = 1; + for (i = 0; i < lc->region_count; i++) + log_clear_bit(lc, lc->clean_bits, i); + } + r = rw_header(lc, WRITE); if (r) fail_log_device(lc); From c0da3748b9a894b9f9b561ecc2d090a913988a0f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:02 +0000 Subject: [PATCH 19/80] dm raid1: implement mirror_flush Implement flush callee. It uses dm_io to send zero-size barrier synchronously and concurrently to all the mirror legs. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d44bc497dad9..751660b0c574 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -237,6 +237,40 @@ out: schedule_work(&ms->trigger_event); } +static int mirror_flush(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + unsigned long error_bits; + + unsigned int i; + struct dm_io_region io[ms->nr_mirrors]; + struct mirror *m; + struct dm_io_request io_req = { + .bi_rw = WRITE_BARRIER, + .mem.type = DM_IO_KMEM, + .mem.ptr.bvec = NULL, + .client = ms->io_client, + }; + + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) { + io[i].bdev = m->dev->bdev; + io[i].sector = 0; + io[i].count = 0; + } + + error_bits = -1; + dm_io(&io_req, ms->nr_mirrors, io, &error_bits); + if (unlikely(error_bits != 0)) { + for (i = 0; i < ms->nr_mirrors; i++) + if (test_bit(i, &error_bits)) + fail_mirror(ms->mirror + i, + DM_RAID1_WRITE_ERROR); + return -EIO; + } + + return 0; +} + /*----------------------------------------------------------------- * Recovery. * @@ -896,7 +930,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, return NULL; } - dl = dm_dirty_log_create(argv[0], ti, NULL, param_count, argv + 2); + dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, + argv + 2); if (!dl) { ti->error = "Error creating mirror dirty log"; return NULL; From 64b30c46e866bbff8a9e17883a18636adc358455 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:02 +0000 Subject: [PATCH 20/80] dm raid1: report flush errors separately in status Report flush errors as 'F' instead of 'D' for log and mirror devices. 
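For interpreting status output, the per-device health character becomes (see device_status_char() in the hunks below, with 'F' taking precedence over 'D'): 'A' alive, 'F' flush error, 'D' write error, 'S' sync error, 'R' read error, 'U' unknown. Purely as an illustration with hypothetical device numbers, a "dmsetup status" line for a two-leg mirror whose second leg and disk log both failed a flush would then read roughly:

	0 1024000 mirror 2 8:17 8:33 1024/1024 1 AF 3 disk 8:49 F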
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 16 ++++++++++++---- drivers/md/dm-raid1.c | 6 ++++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 315e36a96b6f..7035582786fb 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -237,6 +237,7 @@ struct log_c { * Disk log fields */ int log_dev_failed; + int log_dev_flush_failed; struct dm_dev *log_dev; struct log_header header; @@ -425,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, } else { lc->log_dev = dev; lc->log_dev_failed = 0; + lc->log_dev_flush_failed = 0; lc->header_location.bdev = lc->log_dev->bdev; lc->header_location.sector = 0; @@ -633,8 +635,11 @@ static int disk_resume(struct dm_dirty_log *log) /* write the new header */ r = rw_header(lc, WRITE); - if (!r) + if (!r) { r = flush_header(lc); + if (r) + lc->log_dev_flush_failed = 1; + } if (r) { DMWARN("%s: Failed to write header on dirty region log device", lc->log_dev->name); @@ -703,9 +708,10 @@ static int disk_flush(struct dm_dirty_log *log) else { if (lc->touched_dirtied) { r = flush_header(lc); - if (r) + if (r) { + lc->log_dev_flush_failed = 1; fail_log_device(lc); - else + } else lc->touched_dirtied = 0; } lc->touched_cleaned = 0; @@ -805,7 +811,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status, switch(status) { case STATUSTYPE_INFO: DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, - lc->log_dev_failed ? 'D' : 'A'); + lc->log_dev_flush_failed ? 'F' : + lc->log_dev_failed ? 'D' : + 'A'); break; case STATUSTYPE_TABLE: diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 751660b0c574..85c8704c67bb 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); *---------------------------------------------------------------*/ enum dm_raid1_error { DM_RAID1_WRITE_ERROR, + DM_RAID1_FLUSH_ERROR, DM_RAID1_SYNC_ERROR, DM_RAID1_READ_ERROR }; @@ -264,7 +265,7 @@ static int mirror_flush(struct dm_target *ti) for (i = 0; i < ms->nr_mirrors; i++) if (test_bit(i, &error_bits)) fail_mirror(ms->mirror + i, - DM_RAID1_WRITE_ERROR); + DM_RAID1_FLUSH_ERROR); return -EIO; } @@ -1288,7 +1289,8 @@ static char device_status_char(struct mirror *m) if (!atomic_read(&(m->error_count))) return 'A'; - return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : + return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' : + (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; } From 04788507686d184d8166918b70ef52311bc36dcb Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:03 +0000 Subject: [PATCH 21/80] dm raid1: add framework to hold bios during suspend Add framework to delay bios until a suspend and then resubmit them with either DM_ENDIO_REQUEUE (if the suspend was noflush) or complete them with -EIO. I/O barrier support will use this. 
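What makes holding safe is the contract with device-mapper core: a bio completed with DM_ENDIO_REQUEUE while a noflush suspend is in progress is pushed back onto dm core's deferred queue and reissued after resume, so held writes are replayed against the reloaded table instead of being lost; without a noflush suspend nothing sensible remains to be done with them, hence -EIO. The decision logic, distilled from hold_bio() in the hunks below:

	if (atomic_read(&ms->suspend)) {
		if (dm_noflush_suspending(ms->ti))
			bio_endio(bio, DM_ENDIO_REQUEUE); /* dm core requeues */
		else
			bio_endio(bio, -EIO);
		return;
	}
	/* no suspend yet: park the bio until presuspend drains ms->holds */
	spin_lock_irq(&ms->lock);
	bio_list_add(&ms->holds, bio);
	spin_unlock_irq(&ms->lock);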
Signed-off-by: Mikulas Patocka Reviewed-by: Takahiro Yasui Tested-by: Takahiro Yasui Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 85c8704c67bb..895cce75eee9 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -58,6 +58,7 @@ struct mirror_set { struct bio_list reads; struct bio_list writes; struct bio_list failures; + struct bio_list holds; /* bios are waiting until suspend */ struct dm_region_hash *rh; struct dm_kcopyd_client *kcopyd_client; @@ -450,6 +451,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m, io->count = bio->bi_size >> 9; } +static void hold_bio(struct mirror_set *ms, struct bio *bio) +{ + /* + * If device is suspended, complete the bio. + */ + if (atomic_read(&ms->suspend)) { + if (dm_noflush_suspending(ms->ti)) + bio_endio(bio, DM_ENDIO_REQUEUE); + else + bio_endio(bio, -EIO); + return; + } + + /* + * Hold bio until the suspend is complete. + */ + spin_lock_irq(&ms->lock); + bio_list_add(&ms->holds, bio); + spin_unlock_irq(&ms->lock); +} + /*----------------------------------------------------------------- * Reads *---------------------------------------------------------------*/ @@ -1225,6 +1247,9 @@ static void mirror_presuspend(struct dm_target *ti) struct mirror_set *ms = (struct mirror_set *) ti->private; struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + struct bio_list holds; + struct bio *bio; + atomic_set(&ms->suspend, 1); /* @@ -1247,6 +1272,22 @@ static void mirror_presuspend(struct dm_target *ti) * we know that all of our I/O has been pushed. */ flush_workqueue(ms->kmirrord_wq); + + /* + * Now set ms->suspend is set and the workqueue flushed, no more + * entries can be added to ms->hold list, so process it. + * + * Bios can still arrive concurrently with or after this + * presuspend function, but they cannot join the hold list + * because ms->suspend is set. + */ + spin_lock_irq(&ms->lock); + holds = ms->holds; + bio_list_init(&ms->holds); + spin_unlock_irq(&ms->lock); + + while ((bio = bio_list_pop(&holds))) + hold_bio(ms, bio); } static void mirror_postsuspend(struct dm_target *ti) From 0f398a8403e31c737b429fddc3850093d0bf58d0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:04 +0000 Subject: [PATCH 22/80] dm raid1: use hold framework in do_failures Use the hold framework in do_failures. This patch doesn't change the bio processing logic, it just simplifies failure handling and avoids periodically polling the failures list. Signed-off-by: Mikulas Patocka Reviewed-by: Takahiro Yasui Tested-by: Takahiro Yasui Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 895cce75eee9..0253130fa13a 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -745,20 +745,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) { struct bio *bio; - if (!failures->head) + if (likely(!failures->head)) return; - if (!ms->log_failure) { - while ((bio = bio_list_pop(failures))) { - ms->in_sync = 0; - dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); - } - return; - } - /* * If the log has failed, unattempted writes are being - * put on the failures list. We can't issue those writes + * put on the holds list. 
We can't issue those writes * until a log has been marked, so we must store them. * * If a 'noflush' suspend is in progress, we can requeue @@ -773,23 +765,15 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) * for us to treat them the same and requeue them * as well. */ - if (dm_noflush_suspending(ms->ti)) { - while ((bio = bio_list_pop(failures))) - bio_endio(bio, DM_ENDIO_REQUEUE); - return; + + while ((bio = bio_list_pop(failures))) { + if (ms->log_failure) + hold_bio(ms, bio); + else { + ms->in_sync = 0; + dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); + } } - - if (atomic_read(&ms->suspend)) { - while ((bio = bio_list_pop(failures))) - bio_endio(bio, -EIO); - return; - } - - spin_lock_irq(&ms->lock); - bio_list_merge(&ms->failures, failures); - spin_unlock_irq(&ms->lock); - - delayed_wake(ms); } static void trigger_event(struct work_struct *work) From 87968ddd2f3be1c21b932cac30157a83a1c4f935 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:04 +0000 Subject: [PATCH 23/80] dm raid1: abstract get_valid_mirror function Move the logic to get a valid mirror leg into a function for re-use in a later patch. Signed-off-by: Mikulas Patocka Reviewed-by: Takahiro Yasui Tested-by: Takahiro Yasui Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 0253130fa13a..d1a7f1a4789c 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -181,6 +181,17 @@ static void set_default_mirror(struct mirror *m) atomic_set(&ms->default_mirror, m - m0); } +static struct mirror *get_valid_mirror(struct mirror_set *ms) +{ + struct mirror *m; + + for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) + if (!atomic_read(&m->error_count)) + return m; + + return NULL; +} + /* fail_mirror * @m: mirror device to fail * @error_type: one of the enum's, DM_RAID1_*_ERROR @@ -226,13 +237,10 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) goto out; } - for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) - if (!atomic_read(&new->error_count)) { - set_default_mirror(new); - break; - } - - if (unlikely(new == ms->mirror + ms->nr_mirrors)) + new = get_valid_mirror(ms); + if (new) + set_default_mirror(new); + else DMWARN("All sides of mirror have failed."); out: From c58098be979509a54021e837a47fcad08db31f94 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:05 +0000 Subject: [PATCH 24/80] dm raid1: remove bio_endio from dm_rh_mark_nosync Move bio completion out of dm_rh_mark_nosync in preparation for the next patch. 
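For callers, completion becomes an explicit second step. A before/after sketch of a call site, mirroring the do_failures() change in the diff below:

    /* before: marking the region nosync also completed the bio */
    dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);

    /* after: completion is now the caller's responsibility */
    dm_rh_mark_nosync(ms->rh, bio);
    bio_endio(bio, 0);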
Signed-off-by: Mikulas Patocka Reviewed-by: Takahiro Yasui Tested-by: Takahiro Yasui Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 3 ++- drivers/md/dm-region-hash.c | 6 +----- include/linux/dm-region-hash.h | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d1a7f1a4789c..4f466ad75680 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -779,7 +779,8 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) hold_bio(ms, bio); else { ms->in_sync = 0; - dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); + dm_rh_mark_nosync(ms->rh, bio); + bio_endio(bio, 0); } } } diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 00806b760ccd..5f19ceb6fe91 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -383,8 +383,6 @@ static void complete_resync_work(struct dm_region *reg, int success) /* dm_rh_mark_nosync * @ms * @bio - * @done - * @error * * The bio was written on some mirror(s) but failed on other mirror(s). * We can successfully endio the bio but should avoid the region being @@ -392,8 +390,7 @@ static void complete_resync_work(struct dm_region *reg, int success) * * This function is _not_ safe in interrupt context! */ -void dm_rh_mark_nosync(struct dm_region_hash *rh, - struct bio *bio, unsigned done, int error) +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) { unsigned long flags; struct dm_dirty_log *log = rh->log; @@ -430,7 +427,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, BUG_ON(!list_empty(®->list)); spin_unlock_irqrestore(&rh->region_lock, flags); - bio_endio(bio, error); if (recovering) complete_resync_work(reg, 0); } diff --git a/include/linux/dm-region-hash.h b/include/linux/dm-region-hash.h index a9e652a41373..9e2a7a401df5 100644 --- a/include/linux/dm-region-hash.h +++ b/include/linux/dm-region-hash.h @@ -78,8 +78,7 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region); /* Delay bios on regions. */ void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio); -void dm_rh_mark_nosync(struct dm_region_hash *rh, - struct bio *bio, unsigned done, int error); +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio); /* * Region recovery control. From 60f355ead31e2be8d06ac8acb163df91a1c64e3b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:05 +0000 Subject: [PATCH 25/80] dm raid1: hold write bios when errors are handled Hold all write bios when errors are handled. Previously the failures list was used only when handling errors with a userspace daemon such as dmeventd. Now, it is always used for all bios. The regions where some writes failed must be marked as nosync. This can only be done in process context (i.e. in raid1 workqueue), not in the write_callback function. Previously the write would succeed if writing to at least one leg succeeded. This is wrong because data from the failed leg may be replicated to the correct leg. Now, if using a userspace daemon, the write with some failures will be held until the daemon has done its job and reconfigured the array. If not using a daemon, the write still succeeds if at least one leg succeeds. This is bad, but it is consistent with current behavior. 
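The resulting per-bio policy reduces to three outcomes. A condensed sketch of the reworked do_failures() loop from the diff below (illustration only):

    while ((bio = bio_list_pop(failures))) {
        if (!ms->log_failure) {
            /* Log still usable: mark the region out of sync. */
            ms->in_sync = 0;
            dm_rh_mark_nosync(ms->rh, bio);
        }

        if (!get_valid_mirror(ms))        /* every leg is dead */
            bio_endio(bio, -EIO);
        else if (errors_handled(ms))      /* wait for the daemon to act */
            hold_bio(ms, bio);
        else                              /* legacy: one good leg suffices */
            bio_endio(bio, 0);
    }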
Signed-off-by: Mikulas Patocka Reviewed-by: Takahiro Yasui Tested-by: Takahiro Yasui Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 63 ++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 4f466ad75680..e363335e8d81 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -578,7 +578,6 @@ static void write_callback(unsigned long error, void *context) unsigned i, ret = 0; struct bio *bio = (struct bio *) context; struct mirror_set *ms; - int uptodate = 0; int should_wake = 0; unsigned long flags; @@ -591,36 +590,27 @@ static void write_callback(unsigned long error, void *context) * This way we handle both writes to SYNC and NOSYNC * regions with the same code. */ - if (likely(!error)) - goto out; + if (likely(!error)) { + bio_endio(bio, ret); + return; + } for (i = 0; i < ms->nr_mirrors; i++) if (test_bit(i, &error)) fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); - else - uptodate = 1; - if (unlikely(!uptodate)) { - DMERR("All replicated volumes dead, failing I/O"); - /* None of the writes succeeded, fail the I/O. */ - ret = -EIO; - } else if (errors_handled(ms)) { - /* - * Need to raise event. Since raising - * events can block, we need to do it in - * the main thread. - */ - spin_lock_irqsave(&ms->lock, flags); - if (!ms->failures.head) - should_wake = 1; - bio_list_add(&ms->failures, bio); - spin_unlock_irqrestore(&ms->lock, flags); - if (should_wake) - wakeup_mirrord(ms); - return; - } -out: - bio_endio(bio, ret); + /* + * Need to raise event. Since raising + * events can block, we need to do it in + * the main thread. + */ + spin_lock_irqsave(&ms->lock, flags); + if (!ms->failures.head) + should_wake = 1; + bio_list_add(&ms->failures, bio); + spin_unlock_irqrestore(&ms->lock, flags); + if (should_wake) + wakeup_mirrord(ms); } static void do_write(struct mirror_set *ms, struct bio *bio) @@ -773,15 +763,26 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) * for us to treat them the same and requeue them * as well. */ - while ((bio = bio_list_pop(failures))) { - if (ms->log_failure) - hold_bio(ms, bio); - else { + if (!ms->log_failure) { ms->in_sync = 0; dm_rh_mark_nosync(ms->rh, bio); - bio_endio(bio, 0); } + + /* + * If all the legs are dead, fail the I/O. + * If we have been told to handle errors, hold the bio + * and wait for userspace to deal with the problem. + * Otherwise pretend that the I/O succeeded. (This would + * be wrong if the failed leg returned after reboot and + * got replicated back to the good legs.) + */ + if (!get_valid_mirror(ms)) + bio_endio(bio, -EIO); + else if (errors_handled(ms)) + hold_bio(ms, bio); + else + bio_endio(bio, 0); } } From 929be8fcb4b4b65d038e73d3bb34715851a95ca2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:06 +0000 Subject: [PATCH 26/80] dm raid1: hold all write bios when leg fails Hold all write bios when leg fails and errors are handled When using a userspace daemon such as dmeventd to handle errors, we must delay completing bios until it has done its job. 
This patch prevents the following race: - primary leg fails - write "1" fails, the write is held, and the secondary leg is set as the default - write "2" goes straight to the secondary leg Signed-off-by: Mikulas Patocka Reviewed-by: Takahiro Yasui Tested-by: Takahiro Yasui Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index e363335e8d81..f8d7b3aa46c6 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -69,6 +69,7 @@ struct mirror_set { region_t nr_regions; int in_sync; int log_failure; + int leg_failure; atomic_t suspend; atomic_t default_mirror; /* Default mirror */ @@ -211,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) struct mirror_set *ms = m->ms; struct mirror *new; + ms->leg_failure = 1; + /* * error_count is used for nothing more than a * simple way to tell if a device has encountered @@ -734,8 +737,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) dm_rh_delay(ms->rh, bio); while ((bio = bio_list_pop(&nosync))) { - map_bio(get_default_mirror(ms), bio); - generic_make_request(bio); + if (unlikely(ms->leg_failure) && errors_handled(ms)) + hold_bio(ms, bio); + else { + map_bio(get_default_mirror(ms), bio); + generic_make_request(bio); + } } } @@ -848,6 +855,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ms->nr_regions = dm_sector_div_up(ti->len, region_size); ms->in_sync = 0; ms->log_failure = 0; + ms->leg_failure = 0; atomic_set(&ms->suspend, 0); atomic_set(&ms->default_mirror, DEFAULT_MIRROR); From 5339fc2d47d1d720e027b9b832bf5aae8fba2ac0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:06 +0000 Subject: [PATCH 27/80] dm raid1: explicitly initialise bio_lists Explicitly initialize bio lists instead of relying on kzalloc. Signed-off-by: Mikulas Patocka Reviewed-by: Takahiro Yasui Tested-by: Takahiro Yasui Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index f8d7b3aa46c6..ad779bd13aec 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -849,6 +849,10 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, } spin_lock_init(&ms->lock); + bio_list_init(&ms->reads); + bio_list_init(&ms->writes); + bio_list_init(&ms->failures); + bio_list_init(&ms->holds); ms->ti = ti; ms->nr_mirrors = nr_mirrors; From a518b86d0b1b6a474f154697dc6f33e0a317ae72 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 10 Dec 2009 23:52:07 +0000 Subject: [PATCH 28/80] dm ioctl: prefer strlcpy over strncpy strlcpy() will always null terminate the string. The code should already guarantee this as the last bytes are already NULs and the string lengths were restricted before being stored in hc. Removing the '-1' becomes necessary so strlcpy() doesn't lose the last character of a maximum-length string. - agk Signed-off-by: Roel Kluin Signed-off-by: Andrew Morton Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index d19854c98184..99de0e4ce831 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -645,9 +645,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param) * Sneakily write in both the name and the uuid * while we have the cell.
*/ - strncpy(param->name, hc->name, sizeof(param->name)); + strlcpy(param->name, hc->name, sizeof(param->name)); if (hc->uuid) - strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); + strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); else param->uuid[0] = '\0'; From f5acc834287dd4f6cf712093ce0e779e38caad70 Mon Sep 17 00:00:00 2001 From: Jon Brassow Date: Thu, 10 Dec 2009 23:52:07 +0000 Subject: [PATCH 29/80] dm snapshot: avoid else clause in persistent_read_metadata Minor code touch-up. We don't need the 'else'. Signed-off-by: Jonathan Brassow Reviewed-by: Mikulas Patocka Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap-persistent.c | 48 +++++++++++++++------------------ 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 0c746420c008..7e855fbeb22e 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -552,35 +552,31 @@ static int persistent_read_metadata(struct dm_exception_store *store, ps->current_area = 0; zero_memory_area(ps); r = zero_disk_area(ps, 0); - if (r) { - DMWARN("zero_disk_area(0) failed"); - return r; - } - } else { - /* - * Sanity checks. - */ - if (ps->version != SNAPSHOT_DISK_VERSION) { - DMWARN("unable to handle snapshot disk version %d", - ps->version); - return -EINVAL; - } - - /* - * Metadata are valid, but snapshot is invalidated - */ - if (!ps->valid) - return 1; - - /* - * Read the metadata. - */ - r = read_exceptions(ps, callback, callback_context); if (r) - return r; + DMWARN("zero_disk_area(0) failed"); + return r; + } + /* + * Sanity checks. + */ + if (ps->version != SNAPSHOT_DISK_VERSION) { + DMWARN("unable to handle snapshot disk version %d", + ps->version); + return -EINVAL; } - return 0; + /* + * Metadata are valid, but snapshot is invalidated + */ + if (!ps->valid) + return 1; + + /* + * Read the metadata. + */ + r = read_exceptions(ps, callback, callback_context); + + return r; } static int persistent_prepare_exception(struct dm_exception_store *store, From 102c6ddb1d081a6a1fede38c43a42c9811313ec7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:08 +0000 Subject: [PATCH 30/80] dm snapshot: simplify sector_to_chunk expression Removed unnecessary 'and' masking: The right shift discards the lower bits so there is no need to clear them. (A later patch needs this change to support a 32-bit chunk_mask.) Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Reviewed-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 8a223a48802c..5f9315b32a42 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -162,7 +162,7 @@ static inline sector_t get_dev_size(struct block_device *bdev) static inline chunk_t sector_to_chunk(struct dm_exception_store *store, sector_t sector) { - return (sector & ~store->chunk_mask) >> store->chunk_shift; + return sector >> store->chunk_shift; } int dm_exception_store_type_register(struct dm_exception_store_type *type); From 7e201b35132a1f02c931a0a06760766c846bb49b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:08 +0000 Subject: [PATCH 31/80] dm snapshot: abstract minimum_chunk_size fn The origin needs to find minimum chunksize of all snapshots. 
This logic is moved to a separate function because it will be used at another place in the snapshot merge patches. Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Reviewed-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 8a4a9c838afd..48978ab42ae5 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -523,6 +523,25 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) return 0; } +#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) + +/* + * Return a minimum chunk size of all snapshots that have the specified origin. + * Return zero if the origin has no snapshots. + */ +static sector_t __minimum_chunk_size(struct origin *o) +{ + struct dm_snapshot *snap; + unsigned chunk_size = 0; + + if (o) + list_for_each_entry(snap, &o->snapshots, list) + chunk_size = min_not_zero(chunk_size, + snap->store->chunk_size); + + return chunk_size; +} + /* * Hard coded magic. */ @@ -1395,8 +1414,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio, return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; } -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - /* * Set the target "split_io" field to the minimum of all the snapshots' * chunk sizes. @@ -1404,19 +1421,12 @@ static int origin_map(struct dm_target *ti, struct bio *bio, static void origin_resume(struct dm_target *ti) { struct dm_dev *dev = ti->private; - struct dm_snapshot *snap; - struct origin *o; - unsigned chunk_size = 0; down_read(&_origins_lock); - o = __lookup_origin(dev->bdev); - if (o) - list_for_each_entry (snap, &o->snapshots, list) - chunk_size = min_not_zero(chunk_size, - snap->store->chunk_size); - up_read(&_origins_lock); - ti->split_io = chunk_size; + ti->split_io = __minimum_chunk_size(__lookup_origin(dev->bdev)); + + up_read(&_origins_lock); } static int origin_status(struct dm_target *ti, status_type_t type, char *result, From d32a6ea65fbc33621f9c790da3dff10201640b2a Mon Sep 17 00:00:00 2001 From: Jon Brassow Date: Thu, 10 Dec 2009 23:52:09 +0000 Subject: [PATCH 32/80] dm snapshot: consolidate insert exception functions Consolidate the insert_*exception functions. 'insert_completed_exception' already contains all the logic to handle 'insert_exception' (via check for a hash_shift of 0), so remove redundant function. 
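Both tables can share one helper because the consolidated function checks for a hash_shift of 0 (the pending table is created that way in init_hash_tables) and in that case skips the consecutive-chunk merging, degenerating to a plain list add. The two call sites after this patch look like:

    insert_exception(&s->complete, e);      /* hash_shift > 0: may merge
                                               consecutive chunks */
    insert_exception(&s->pending, &pe->e);  /* hash_shift == 0: plain add */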
Signed-off-by: Jonathan Brassow Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 48978ab42ae5..9135498213ef 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -390,13 +390,6 @@ static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) return (chunk >> et->hash_shift) & et->hash_mask; } -static void insert_exception(struct exception_table *eh, - struct dm_snap_exception *e) -{ - struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; - list_add(&e->hash_list, l); -} - static void remove_exception(struct dm_snap_exception *e) { list_del(&e->hash_list); @@ -457,10 +450,9 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) atomic_dec(&s->pending_exceptions_count); } -static void insert_completed_exception(struct dm_snapshot *s, - struct dm_snap_exception *new_e) +static void insert_exception(struct exception_table *eh, + struct dm_snap_exception *new_e) { - struct exception_table *eh = &s->complete; struct list_head *l; struct dm_snap_exception *e = NULL; @@ -518,7 +510,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) /* Consecutive_count is implicitly initialised to zero */ e->new_chunk = new; - insert_completed_exception(s, e); + insert_exception(&s->complete, e); return 0; } @@ -925,7 +917,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) * Add a proper exception, and remove the * in-flight exception from the list. */ - insert_completed_exception(s, e); + insert_exception(&s->complete, e); out: remove_exception(&pe->e); From 1d4989c858093bda0426be536fc7f9c415857836 Mon Sep 17 00:00:00 2001 From: Jon Brassow Date: Thu, 10 Dec 2009 23:52:10 +0000 Subject: [PATCH 33/80] dm snapshot: rename dm_snap_exception to dm_exception The exception structure is not necessarily just a snapshot element (especially after we pull it out of dm-snap.c). Renaming appropriately. Signed-off-by: Jonathan Brassow Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.h | 14 +++++++------- drivers/md/dm-snap-persistent.c | 4 ++-- drivers/md/dm-snap-transient.c | 4 ++-- drivers/md/dm-snap.c | 34 ++++++++++++++++----------------- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 5f9315b32a42..92749696e359 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -26,7 +26,7 @@ typedef sector_t chunk_t; * of chunks that follow contiguously. Remaining bits hold the number of the * chunk within the device. */ -struct dm_snap_exception { +struct dm_exception { struct list_head hash_list; chunk_t old_chunk; @@ -64,13 +64,13 @@ struct dm_exception_store_type { * Find somewhere to store the next exception. */ int (*prepare_exception) (struct dm_exception_store *store, - struct dm_snap_exception *e); + struct dm_exception *e); /* * Update the metadata with this exception. 
*/ void (*commit_exception) (struct dm_exception_store *store, - struct dm_snap_exception *e, + struct dm_exception *e, void (*callback) (void *, int success), void *callback_context); @@ -120,12 +120,12 @@ static inline chunk_t dm_chunk_number(chunk_t chunk) return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); } -static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) +static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) { return e->new_chunk >> DM_CHUNK_NUMBER_BITS; } -static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) +static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) { e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); @@ -140,12 +140,12 @@ static inline chunk_t dm_chunk_number(chunk_t chunk) return chunk; } -static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) +static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) { return 0; } -static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) +static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) { } diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 7e855fbeb22e..24b8acd1be83 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -580,7 +580,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, } static int persistent_prepare_exception(struct dm_exception_store *store, - struct dm_snap_exception *e) + struct dm_exception *e) { struct pstore *ps = get_info(store); uint32_t stride; @@ -607,7 +607,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store, } static void persistent_commit_exception(struct dm_exception_store *store, - struct dm_snap_exception *e, + struct dm_exception *e, void (*callback) (void *, int success), void *callback_context) { diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index cde5aa558e6d..267801b34ff6 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c @@ -36,7 +36,7 @@ static int transient_read_metadata(struct dm_exception_store *store, } static int transient_prepare_exception(struct dm_exception_store *store, - struct dm_snap_exception *e) + struct dm_exception *e) { struct transient_c *tc = store->context; sector_t size = get_dev_size(store->cow->bdev); @@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store, } static void transient_commit_exception(struct dm_exception_store *store, - struct dm_snap_exception *e, + struct dm_exception *e, void (*callback) (void *, int success), void *callback_context) { diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 9135498213ef..a7d60f644063 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -116,7 +116,7 @@ static int bdev_equal(struct block_device *lhs, struct block_device *rhs) } struct dm_snap_pending_exception { - struct dm_snap_exception e; + struct dm_exception e; /* * Origin buffers waiting for this to complete are held @@ -371,7 +371,7 @@ static int init_exception_table(struct exception_table *et, uint32_t size, static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) { struct list_head *slot; - struct dm_snap_exception *ex, *next; + struct dm_exception *ex, *next; int i, size; size = et->hash_mask + 1; @@ -390,7 +390,7 @@ static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) return (chunk >> et->hash_shift) & 
et->hash_mask; } -static void remove_exception(struct dm_snap_exception *e) +static void remove_exception(struct dm_exception *e) { list_del(&e->hash_list); } @@ -399,11 +399,11 @@ static void remove_exception(struct dm_snap_exception *e) * Return the exception data for a sector, or NULL if not * remapped. */ -static struct dm_snap_exception *lookup_exception(struct exception_table *et, +static struct dm_exception *lookup_exception(struct exception_table *et, chunk_t chunk) { struct list_head *slot; - struct dm_snap_exception *e; + struct dm_exception *e; slot = &et->table[exception_hash(et, chunk)]; list_for_each_entry (e, slot, hash_list) @@ -414,9 +414,9 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et, return NULL; } -static struct dm_snap_exception *alloc_exception(void) +static struct dm_exception *alloc_exception(void) { - struct dm_snap_exception *e; + struct dm_exception *e; e = kmem_cache_alloc(exception_cache, GFP_NOIO); if (!e) @@ -425,7 +425,7 @@ static struct dm_snap_exception *alloc_exception(void) return e; } -static void free_exception(struct dm_snap_exception *e) +static void free_exception(struct dm_exception *e) { kmem_cache_free(exception_cache, e); } @@ -451,10 +451,10 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) } static void insert_exception(struct exception_table *eh, - struct dm_snap_exception *new_e) + struct dm_exception *new_e) { struct list_head *l; - struct dm_snap_exception *e = NULL; + struct dm_exception *e = NULL; l = &eh->table[exception_hash(eh, new_e->old_chunk)]; @@ -499,7 +499,7 @@ out: static int dm_add_exception(void *context, chunk_t old, chunk_t new) { struct dm_snapshot *s = context; - struct dm_snap_exception *e; + struct dm_exception *e; e = alloc_exception(); if (!e) @@ -876,7 +876,7 @@ static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe) static void pending_complete(struct dm_snap_pending_exception *pe, int success) { - struct dm_snap_exception *e; + struct dm_exception *e; struct dm_snapshot *s = pe->snap; struct bio *origin_bios = NULL; struct bio *snapshot_bios = NULL; @@ -988,7 +988,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) static struct dm_snap_pending_exception * __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) { - struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); + struct dm_exception *e = lookup_exception(&s->pending, chunk); if (!e) return NULL; @@ -1034,7 +1034,7 @@ __find_pending_exception(struct dm_snapshot *s, return pe; } -static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, +static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, struct bio *bio, chunk_t chunk) { bio->bi_bdev = s->store->cow->bdev; @@ -1048,7 +1048,7 @@ static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, static int snapshot_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) { - struct dm_snap_exception *e; + struct dm_exception *e; struct dm_snapshot *s = ti->private; int r = DM_MAPIO_REMAPPED; chunk_t chunk; @@ -1221,7 +1221,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) { int r = DM_MAPIO_REMAPPED, first = 0; struct dm_snapshot *snap; - struct dm_snap_exception *e; + struct dm_exception *e; struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; chunk_t chunk; LIST_HEAD(pe_queue); @@ -1500,7 +1500,7 @@ static int __init dm_snapshot_init(void) goto bad2; } - exception_cache = 
KMEM_CACHE(dm_snap_exception, 0); + exception_cache = KMEM_CACHE(dm_exception, 0); if (!exception_cache) { DMERR("Couldn't create exception cache."); r = -ENOMEM; From 191437a53c8269df3a2c6199206781e742c57bb5 Mon Sep 17 00:00:00 2001 From: Jon Brassow Date: Thu, 10 Dec 2009 23:52:10 +0000 Subject: [PATCH 34/80] dm snapshot: rename exception_table to dm_exception_table Rename exception_table for broader use outside dm-snap.c Signed-off-by: Jonathan Brassow Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a7d60f644063..f40331cb1f6e 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -49,7 +49,7 @@ #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ (DM_TRACKED_CHUNK_HASH_SIZE - 1)) -struct exception_table { +struct dm_exception_table { uint32_t hash_mask; unsigned hash_shift; struct list_head *table; @@ -73,8 +73,8 @@ struct dm_snapshot { atomic_t pending_exceptions_count; - struct exception_table pending; - struct exception_table complete; + struct dm_exception_table pending; + struct dm_exception_table complete; /* * pe_lock protects all pending_exception operations and access @@ -351,7 +351,7 @@ static void unregister_snapshot(struct dm_snapshot *s) * The lowest hash_shift bits of the chunk number are ignored, allowing * some consecutive chunks to be grouped together. */ -static int init_exception_table(struct exception_table *et, uint32_t size, +static int init_exception_table(struct dm_exception_table *et, uint32_t size, unsigned hash_shift) { unsigned int i; @@ -368,7 +368,8 @@ static int init_exception_table(struct exception_table *et, uint32_t size, return 0; } -static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) +static void exit_exception_table(struct dm_exception_table *et, + struct kmem_cache *mem) { struct list_head *slot; struct dm_exception *ex, *next; @@ -385,7 +386,7 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache * vfree(et->table); } -static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) +static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) { return (chunk >> et->hash_shift) & et->hash_mask; } @@ -399,7 +400,7 @@ static void remove_exception(struct dm_exception *e) * Return the exception data for a sector, or NULL if not * remapped. */ -static struct dm_exception *lookup_exception(struct exception_table *et, +static struct dm_exception *lookup_exception(struct dm_exception_table *et, chunk_t chunk) { struct list_head *slot; @@ -450,7 +451,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) atomic_dec(&s->pending_exceptions_count); } -static void insert_exception(struct exception_table *eh, +static void insert_exception(struct dm_exception_table *eh, struct dm_exception *new_e) { struct list_head *l; From 3510cb94ff7b04b016bd22bfee913e2c1c05c066 Mon Sep 17 00:00:00 2001 From: Jon Brassow Date: Thu, 10 Dec 2009 23:52:11 +0000 Subject: [PATCH 35/80] dm snapshot: rename exception functions Rename exception functions. Preparing to pull them out of dm-snap.c for broader use. 
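For reference, the renames this patch performs are:

    init_exception_table  -> dm_exception_table_init
    exit_exception_table  -> dm_exception_table_exit
    insert_exception      -> dm_insert_exception
    remove_exception      -> dm_remove_exception
    lookup_exception      -> dm_lookup_exception
    alloc_exception       -> alloc_completed_exception
    free_exception        -> free_completed_exception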
Signed-off-by: Jonathan Brassow Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 66 ++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f40331cb1f6e..cb4c2c3a43f0 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -351,8 +351,8 @@ static void unregister_snapshot(struct dm_snapshot *s) * The lowest hash_shift bits of the chunk number are ignored, allowing * some consecutive chunks to be grouped together. */ -static int init_exception_table(struct dm_exception_table *et, uint32_t size, - unsigned hash_shift) +static int dm_exception_table_init(struct dm_exception_table *et, + uint32_t size, unsigned hash_shift) { unsigned int i; @@ -368,8 +368,8 @@ static int init_exception_table(struct dm_exception_table *et, uint32_t size, return 0; } -static void exit_exception_table(struct dm_exception_table *et, - struct kmem_cache *mem) +static void dm_exception_table_exit(struct dm_exception_table *et, + struct kmem_cache *mem) { struct list_head *slot; struct dm_exception *ex, *next; @@ -391,7 +391,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) return (chunk >> et->hash_shift) & et->hash_mask; } -static void remove_exception(struct dm_exception *e) +static void dm_remove_exception(struct dm_exception *e) { list_del(&e->hash_list); } @@ -400,8 +400,8 @@ static void remove_exception(struct dm_exception *e) * Return the exception data for a sector, or NULL if not * remapped. */ -static struct dm_exception *lookup_exception(struct dm_exception_table *et, - chunk_t chunk) +static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, + chunk_t chunk) { struct list_head *slot; struct dm_exception *e; @@ -415,7 +415,7 @@ static struct dm_exception *lookup_exception(struct dm_exception_table *et, return NULL; } -static struct dm_exception *alloc_exception(void) +static struct dm_exception *alloc_completed_exception(void) { struct dm_exception *e; @@ -426,7 +426,7 @@ static struct dm_exception *alloc_exception(void) return e; } -static void free_exception(struct dm_exception *e) +static void free_completed_exception(struct dm_exception *e) { kmem_cache_free(exception_cache, e); } @@ -451,8 +451,8 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) atomic_dec(&s->pending_exceptions_count); } -static void insert_exception(struct dm_exception_table *eh, - struct dm_exception *new_e) +static void dm_insert_exception(struct dm_exception_table *eh, + struct dm_exception *new_e) { struct list_head *l; struct dm_exception *e = NULL; @@ -471,7 +471,7 @@ static void insert_exception(struct dm_exception_table *eh, new_e->new_chunk == (dm_chunk_number(e->new_chunk) + dm_consecutive_chunk_count(e) + 1)) { dm_consecutive_chunk_count_inc(e); - free_exception(new_e); + free_completed_exception(new_e); return; } @@ -481,7 +481,7 @@ static void insert_exception(struct dm_exception_table *eh, dm_consecutive_chunk_count_inc(e); e->old_chunk--; e->new_chunk--; - free_exception(new_e); + free_completed_exception(new_e); return; } @@ -502,7 +502,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) struct dm_snapshot *s = context; struct dm_exception *e; - e = alloc_exception(); + e = alloc_completed_exception(); if (!e) return -ENOMEM; @@ -511,7 +511,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) /* Consecutive_count is implicitly initialised to zero */ 
e->new_chunk = new; - insert_exception(&s->complete, e); + dm_insert_exception(&s->complete, e); return 0; } @@ -568,8 +568,8 @@ static int init_hash_tables(struct dm_snapshot *s) if (hash_size < 64) hash_size = 64; hash_size = rounddown_pow_of_two(hash_size); - if (init_exception_table(&s->complete, hash_size, - DM_CHUNK_CONSECUTIVE_BITS)) + if (dm_exception_table_init(&s->complete, hash_size, + DM_CHUNK_CONSECUTIVE_BITS)) return -ENOMEM; /* @@ -580,8 +580,8 @@ static int init_hash_tables(struct dm_snapshot *s) if (hash_size < 64) hash_size = 64; - if (init_exception_table(&s->pending, hash_size, 0)) { - exit_exception_table(&s->complete, exception_cache); + if (dm_exception_table_init(&s->pending, hash_size, 0)) { + dm_exception_table_exit(&s->complete, exception_cache); return -ENOMEM; } @@ -716,8 +716,8 @@ bad_pending_pool: dm_kcopyd_client_destroy(s->kcopyd_client); bad_kcopyd: - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); + dm_exception_table_exit(&s->pending, pending_cache); + dm_exception_table_exit(&s->complete, exception_cache); bad_hash_tables: dm_put_device(ti, s->origin); @@ -737,8 +737,8 @@ static void __free_exceptions(struct dm_snapshot *s) dm_kcopyd_client_destroy(s->kcopyd_client); s->kcopyd_client = NULL; - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); + dm_exception_table_exit(&s->pending, pending_cache); + dm_exception_table_exit(&s->complete, exception_cache); } static void snapshot_dtr(struct dm_target *ti) @@ -891,7 +891,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) goto out; } - e = alloc_exception(); + e = alloc_completed_exception(); if (!e) { down_write(&s->lock); __invalidate_snapshot(s, -ENOMEM); @@ -902,7 +902,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) down_write(&s->lock); if (!s->valid) { - free_exception(e); + free_completed_exception(e); error = 1; goto out; } @@ -918,10 +918,10 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) * Add a proper exception, and remove the * in-flight exception from the list. 
*/ - insert_exception(&s->complete, e); + dm_insert_exception(&s->complete, e); out: - remove_exception(&pe->e); + dm_remove_exception(&pe->e); snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = put_pending_exception(pe); @@ -989,7 +989,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) static struct dm_snap_pending_exception * __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) { - struct dm_exception *e = lookup_exception(&s->pending, chunk); + struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); if (!e) return NULL; @@ -1030,7 +1030,7 @@ __find_pending_exception(struct dm_snapshot *s, } get_pending_exception(pe); - insert_exception(&s->pending, &pe->e); + dm_insert_exception(&s->pending, &pe->e); return pe; } @@ -1077,7 +1077,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, } /* If the block is already remapped - use that, else remap it */ - e = lookup_exception(&s->complete, chunk); + e = dm_lookup_exception(&s->complete, chunk); if (e) { remap_exception(s, e, bio, chunk); goto out_unlock; @@ -1101,7 +1101,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, goto out_unlock; } - e = lookup_exception(&s->complete, chunk); + e = dm_lookup_exception(&s->complete, chunk); if (e) { free_pending_exception(pe); remap_exception(s, e, bio, chunk); @@ -1254,7 +1254,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) * ref_count is initialised to 1 so pending_complete() * won't destroy the primary_pe while we're inside this loop. */ - e = lookup_exception(&snap->complete, chunk); + e = dm_lookup_exception(&snap->complete, chunk); if (e) goto next_snapshot; @@ -1269,7 +1269,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) goto next_snapshot; } - e = lookup_exception(&snap->complete, chunk); + e = dm_lookup_exception(&snap->complete, chunk); if (e) { free_pending_exception(pe); goto next_snapshot; From 985903bb3a6d98623360ab6c855417f638840029 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Dec 2009 23:52:11 +0000 Subject: [PATCH 36/80] dm snapshot: add allocated metadata to snapshot status Add number of sectors used by metadata to the end of the snapshot's status line. Renamed dm_exception_store_type's 'fraction_full' to 'usage'. Renamed arguments to be clearer about what is being returned. Also added 'metadata_sectors'. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.h | 6 +++--- drivers/md/dm-snap-persistent.c | 23 +++++++++++++++++------ drivers/md/dm-snap-transient.c | 15 +++++++++------ drivers/md/dm-snap.c | 21 ++++++++++++--------- 4 files changed, 41 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 92749696e359..366c8b1fca37 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -86,9 +86,9 @@ struct dm_exception_store_type { /* * Return how full the snapshot is. */ - void (*fraction_full) (struct dm_exception_store *store, - sector_t *numerator, - sector_t *denominator); + void (*usage) (struct dm_exception_store *store, + sector_t *total_sectors, sector_t *sectors_allocated, + sector_t *metadata_sectors); /* For internal device-mapper use only. 
*/ struct list_head list; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 24b8acd1be83..767065f6c5f3 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -489,11 +489,22 @@ static struct pstore *get_info(struct dm_exception_store *store) return (struct pstore *) store->context; } -static void persistent_fraction_full(struct dm_exception_store *store, - sector_t *numerator, sector_t *denominator) +static void persistent_usage(struct dm_exception_store *store, + sector_t *total_sectors, + sector_t *sectors_allocated, + sector_t *metadata_sectors) { - *numerator = get_info(store)->next_free * store->chunk_size; - *denominator = get_dev_size(store->cow->bdev); + struct pstore *ps = get_info(store); + + *sectors_allocated = ps->next_free * store->chunk_size; + *total_sectors = get_dev_size(store->cow->bdev); + + /* + * First chunk is the fixed header. + * Then there are (ps->current_area + 1) metadata chunks, each one + * separated from the next by ps->exceptions_per_area data chunks. + */ + *metadata_sectors = (ps->current_area + 2) * store->chunk_size; } static void persistent_dtr(struct dm_exception_store *store) @@ -738,7 +749,7 @@ static struct dm_exception_store_type _persistent_type = { .prepare_exception = persistent_prepare_exception, .commit_exception = persistent_commit_exception, .drop_snapshot = persistent_drop_snapshot, - .fraction_full = persistent_fraction_full, + .usage = persistent_usage, .status = persistent_status, }; @@ -751,7 +762,7 @@ static struct dm_exception_store_type _persistent_compat_type = { .prepare_exception = persistent_prepare_exception, .commit_exception = persistent_commit_exception, .drop_snapshot = persistent_drop_snapshot, - .fraction_full = persistent_fraction_full, + .usage = persistent_usage, .status = persistent_status, }; diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index 267801b34ff6..245a50c7337e 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c @@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store, callback(callback_context, 1); } -static void transient_fraction_full(struct dm_exception_store *store, - sector_t *numerator, sector_t *denominator) +static void transient_usage(struct dm_exception_store *store, + sector_t *total_sectors, + sector_t *sectors_allocated, + sector_t *metadata_sectors) { - *numerator = ((struct transient_c *) store->context)->next_free; - *denominator = get_dev_size(store->cow->bdev); + *sectors_allocated = ((struct transient_c *) store->context)->next_free; + *total_sectors = get_dev_size(store->cow->bdev); + *metadata_sectors = 0; } static int transient_ctr(struct dm_exception_store *store, @@ -106,7 +109,7 @@ static struct dm_exception_store_type _transient_type = { .read_metadata = transient_read_metadata, .prepare_exception = transient_prepare_exception, .commit_exception = transient_commit_exception, - .fraction_full = transient_fraction_full, + .usage = transient_usage, .status = transient_status, }; @@ -118,7 +121,7 @@ static struct dm_exception_store_type _transient_compat_type = { .read_metadata = transient_read_metadata, .prepare_exception = transient_prepare_exception, .commit_exception = transient_commit_exception, - .fraction_full = transient_fraction_full, + .usage = transient_usage, .status = transient_status, }; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index cb4c2c3a43f0..8bd77cbd7e45 100644 --- a/drivers/md/dm-snap.c +++ 
b/drivers/md/dm-snap.c @@ -1174,14 +1174,17 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, if (!snap->valid) DMEMIT("Invalid"); else { - if (snap->store->type->fraction_full) { - sector_t numerator, denominator; - snap->store->type->fraction_full(snap->store, - &numerator, - &denominator); - DMEMIT("%llu/%llu", - (unsigned long long)numerator, - (unsigned long long)denominator); + if (snap->store->type->usage) { + sector_t total_sectors, sectors_allocated, + metadata_sectors; + snap->store->type->usage(snap->store, + &total_sectors, + §ors_allocated, + &metadata_sectors); + DMEMIT("%llu/%llu %llu", + (unsigned long long)sectors_allocated, + (unsigned long long)total_sectors, + (unsigned long long)metadata_sectors); } else DMEMIT("Unknown"); @@ -1462,7 +1465,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, From fc56f6fbcca3672c63c93c65f45105faacfc13cb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Dec 2009 23:52:12 +0000 Subject: [PATCH 37/80] dm snapshot: move cow ref from exception store to snap core Store the reference to the snapshot cow device in the core snapshot code instead of each exception store. It can be accessed through the new function dm_snap_cow(). Exception stores should each now maintain a reference to their parent snapshot struct. This is cleaner and makes part of the forthcoming snapshot merge code simpler. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon Reviewed-by: Jonathan Brassow Cc: Mikulas Patocka --- drivers/md/dm-exception-store.c | 30 +++++-------- drivers/md/dm-exception-store.h | 12 ++++-- drivers/md/dm-snap-persistent.c | 12 +++--- drivers/md/dm-snap-transient.c | 7 ++- drivers/md/dm-snap.c | 76 ++++++++++++++++++++++----------- 5 files changed, 79 insertions(+), 58 deletions(-) diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 205215968ff1..2b7907b6dd09 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, } /* Validate the chunk size against the device block size */ - if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) { + if (chunk_size % + (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } @@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, } int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, unsigned *args_used, struct dm_exception_store **store) { @@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, struct dm_exception_store *tmp_store; char persistent; - if (argc < 3) { + if (argc < 2) { ti->error = "Insufficient exception store arguments"; return -EINVAL; } @@ -209,7 +211,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, return -ENOMEM; } - persistent = toupper(*argv[1]); + persistent = toupper(*argv[0]); if (persistent == 'P') type = get_type("P"); else if (persistent == 'N') @@ -227,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, } tmp_store->type = type; - tmp_store->ti = ti; + tmp_store->snap = snap; - r = dm_get_device(ti, argv[0], 0, 0, - FMODE_READ | 
FMODE_WRITE, &tmp_store->cow); - if (r) { - ti->error = "Cannot get COW device"; - goto bad_cow; - } - - r = set_chunk_size(tmp_store, argv[2], &ti->error); + r = set_chunk_size(tmp_store, argv[1], &ti->error); if (r) - goto bad_ctr; + goto bad; r = type->ctr(tmp_store, 0, NULL); if (r) { ti->error = "Exception store type constructor failed"; - goto bad_ctr; + goto bad; } - *args_used = 3; + *args_used = 2; *store = tmp_store; return 0; -bad_ctr: - dm_put_device(ti, tmp_store->cow); -bad_cow: +bad: put_type(type); bad_type: kfree(tmp_store); @@ -263,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create); void dm_exception_store_destroy(struct dm_exception_store *store) { store->type->dtr(store); - dm_put_device(store->ti, store->cow); put_type(store->type); kfree(store); } diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 366c8b1fca37..bb8874653de1 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -94,11 +94,11 @@ struct dm_exception_store_type { struct list_head list; }; +struct dm_snapshot; + struct dm_exception_store { struct dm_exception_store_type *type; - struct dm_target *ti; - - struct dm_dev *cow; + struct dm_snapshot *snap; /* Size of data blocks saved - must be a power of 2 */ unsigned chunk_size; @@ -108,6 +108,11 @@ struct dm_exception_store { void *context; }; +/* + * Obtain the cow device used by a given snapshot. + */ +struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); + /* * Funtions to manipulate consecutive chunks */ @@ -173,6 +178,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, char **error); int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, unsigned *args_used, struct dm_exception_store **store); void dm_exception_store_destroy(struct dm_exception_store *store); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 767065f6c5f3..157999ebd236 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -214,7 +214,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, int metadata) { struct dm_io_region where = { - .bdev = ps->store->cow->bdev, + .bdev = dm_snap_cow(ps->store->snap)->bdev, .sector = ps->store->chunk_size * chunk, .count = ps->store->chunk_size, }; @@ -294,7 +294,8 @@ static int read_header(struct pstore *ps, int *new_snapshot) */ if (!ps->store->chunk_size) { ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, - bdev_logical_block_size(ps->store->cow->bdev) >> 9); + bdev_logical_block_size(dm_snap_cow(ps->store->snap)-> + bdev) >> 9); ps->store->chunk_mask = ps->store->chunk_size - 1; ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; chunk_size_supplied = 0; @@ -497,7 +498,7 @@ static void persistent_usage(struct dm_exception_store *store, struct pstore *ps = get_info(store); *sectors_allocated = ps->next_free * store->chunk_size; - *total_sectors = get_dev_size(store->cow->bdev); + *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); /* * First chunk is the fixed header. @@ -596,7 +597,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store, struct pstore *ps = get_info(store); uint32_t stride; chunk_t next_free; - sector_t size = get_dev_size(store->cow->bdev); + sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); /* Is there enough room ? 
*/ if (size < ((ps->next_free + 1) * store->chunk_size)) @@ -733,8 +734,7 @@ static unsigned persistent_status(struct dm_exception_store *store, case STATUSTYPE_INFO: break; case STATUSTYPE_TABLE: - DMEMIT(" %s P %llu", store->cow->name, - (unsigned long long)store->chunk_size); + DMEMIT(" P %llu", (unsigned long long)store->chunk_size); } return sz; diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index 245a50c7337e..a0898a66a2f8 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c @@ -39,7 +39,7 @@ static int transient_prepare_exception(struct dm_exception_store *store, struct dm_exception *e) { struct transient_c *tc = store->context; - sector_t size = get_dev_size(store->cow->bdev); + sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); if (size < (tc->next_free + store->chunk_size)) return -1; @@ -65,7 +65,7 @@ static void transient_usage(struct dm_exception_store *store, sector_t *metadata_sectors) { *sectors_allocated = ((struct transient_c *) store->context)->next_free; - *total_sectors = get_dev_size(store->cow->bdev); + *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); *metadata_sectors = 0; } @@ -94,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store, case STATUSTYPE_INFO: break; case STATUSTYPE_TABLE: - DMEMIT(" %s N %llu", store->cow->name, - (unsigned long long)store->chunk_size); + DMEMIT(" N %llu", (unsigned long long)store->chunk_size); } return sz; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 8bd77cbd7e45..dc500a6f6232 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -59,6 +59,9 @@ struct dm_snapshot { struct rw_semaphore lock; struct dm_dev *origin; + struct dm_dev *cow; + + struct dm_target *ti; /* List of snapshots per Origin */ struct list_head list; @@ -97,6 +100,12 @@ struct dm_snapshot { struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; }; +struct dm_dev *dm_snap_cow(struct dm_snapshot *s) +{ + return s->cow; +} +EXPORT_SYMBOL(dm_snap_cow); + static struct workqueue_struct *ksnapd; static void flush_queued_bios(struct work_struct *work); @@ -558,7 +567,7 @@ static int init_hash_tables(struct dm_snapshot *s) * Calculate based on the size of the original volume or * the COW volume... 
*/ - cow_dev_size = get_dev_size(s->store->cow->bdev); + cow_dev_size = get_dev_size(s->cow->bdev); origin_dev_size = get_dev_size(s->origin->bdev); max_buckets = calc_max_buckets(); @@ -596,45 +605,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct dm_snapshot *s; int i; int r = -EINVAL; - char *origin_path; - struct dm_exception_store *store; + char *origin_path, *cow_path; unsigned args_used; if (argc != 4) { ti->error = "requires exactly 4 arguments"; r = -EINVAL; - goto bad_args; + goto bad; } origin_path = argv[0]; argv++; argc--; - r = dm_exception_store_create(ti, argc, argv, &args_used, &store); - if (r) { - ti->error = "Couldn't create exception store"; - r = -EINVAL; - goto bad_args; - } - - argv += args_used; - argc -= args_used; - s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) { ti->error = "Cannot allocate snapshot context private " "structure"; r = -ENOMEM; - goto bad_snap; + goto bad; } + cow_path = argv[0]; + argv++; + argc--; + + r = dm_get_device(ti, cow_path, 0, 0, + FMODE_READ | FMODE_WRITE, &s->cow); + if (r) { + ti->error = "Cannot get COW device"; + goto bad_cow; + } + + r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); + if (r) { + ti->error = "Couldn't create exception store"; + r = -EINVAL; + goto bad_store; + } + + argv += args_used; + argc -= args_used; + r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); if (r) { ti->error = "Cannot get origin device"; goto bad_origin; } - s->store = store; + s->ti = ti; s->valid = 1; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); @@ -723,12 +742,15 @@ bad_hash_tables: dm_put_device(ti, s->origin); bad_origin: + dm_exception_store_destroy(s->store); + +bad_store: + dm_put_device(ti, s->cow); + +bad_cow: kfree(s); -bad_snap: - dm_exception_store_destroy(store); - -bad_args: +bad: return r; } @@ -777,6 +799,8 @@ static void snapshot_dtr(struct dm_target *ti) dm_exception_store_destroy(s->store); + dm_put_device(ti, s->cow); + kfree(s); } @@ -839,7 +863,7 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) s->valid = 0; - dm_table_event(s->store->ti->table); + dm_table_event(s->ti->table); } static void get_pending_exception(struct dm_snap_pending_exception *pe) @@ -977,7 +1001,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) src.sector = chunk_to_sector(s->store, pe->e.old_chunk); src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); - dest.bdev = s->store->cow->bdev; + dest.bdev = s->cow->bdev; dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); dest.count = src.count; @@ -1038,7 +1062,7 @@ __find_pending_exception(struct dm_snapshot *s, static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, struct bio *bio, chunk_t chunk) { - bio->bi_bdev = s->store->cow->bdev; + bio->bi_bdev = s->cow->bdev; bio->bi_sector = chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + (chunk - e->old_chunk)) + @@ -1056,7 +1080,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, struct dm_snap_pending_exception *pe = NULL; if (unlikely(bio_empty_barrier(bio))) { - bio->bi_bdev = s->store->cow->bdev; + bio->bi_bdev = s->cow->bdev; return DM_MAPIO_REMAPPED; } @@ -1200,7 +1224,7 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, * to make private copies if the output is to * make sense. 
*/ - DMEMIT("%s", snap->origin->name); + DMEMIT("%s %s", snap->origin->name, snap->cow->name); snap->store->type->status(snap->store, type, result + sz, maxlen - sz); break; @@ -1240,7 +1264,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) goto next_snapshot; /* Nothing to do if writing beyond end of snapshot */ - if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) + if (bio->bi_sector >= dm_table_get_size(snap->ti->table)) goto next_snapshot; /* From c26655ca3ca7550740a63820ee981e5c7c797523 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Dec 2009 23:52:12 +0000 Subject: [PATCH 38/80] dm snapshot: track suspended state in target Keep track of whether or not the device is suspended within the snapshot target module, the same as we do in dm-raid1. We will use this later to enforce the correct sequence of ioctls to transfer the in-core exceptions from a snapshot target instance in one table to a replacement one capable of merging them back into the origin. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index dc500a6f6232..fd04caa90340 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -72,6 +72,9 @@ struct dm_snapshot { /* Origin writes don't trigger exceptions until this is set */ int active; + /* Whether or not owning mapped_device is suspended */ + int suspended; + mempool_t *pending_pool; atomic_t pending_exceptions_count; @@ -656,6 +659,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->ti = ti; s->valid = 1; s->active = 0; + s->suspended = 0; atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); spin_lock_init(&s->pe_lock); @@ -1175,12 +1179,22 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio, return 0; } +static void snapshot_postsuspend(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + + down_write(&s->lock); + s->suspended = 1; + up_write(&s->lock); +} + static void snapshot_resume(struct dm_target *ti) { struct dm_snapshot *s = ti->private; down_write(&s->lock); s->active = 1; + s->suspended = 0; up_write(&s->lock); } @@ -1489,12 +1503,13 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 8, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_map, .end_io = snapshot_end_io, + .postsuspend = snapshot_postsuspend, .resume = snapshot_resume, .status = snapshot_status, .iterate_devices = snapshot_iterate_devices, From 9ca170a3c0cbb0d5251cef6f5a3300fa436ba8ec Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:13 +0000 Subject: [PATCH 39/80] dm kcopyd: accept zero size jobs dm-kcopyd: accept zero-size jobs This patch changes dm-kcopyd so that it accepts zero-size jobs and completes them immediatelly via its completion thread. It is needed for multisnapshots snapshot resizing. When we are writing to a chunk beyond origin end, no copying is done. To simplify the code, we submit an empty request to kcopyd and let kcopyd complete it. If we didn't submit a request to kcopyd and called the completion routine immediatelly, it would violate the principle that completion is called only from one thread and it would need additional locking. 
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3e3fc06cb861..addf83475040 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job) { struct dm_kcopyd_client *kc = job->kc; atomic_inc(&kc->nr_jobs); - push(&kc->pages_jobs, job); + if (unlikely(!job->source.count)) + push(&kc->complete_jobs, job); + else + push(&kc->pages_jobs, job); wake(kc); } From 90abb8c4cec8f0aa4ce58790542e3cf13071601a Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:13 +0000 Subject: [PATCH 40/80] dm: abstract dm_in_flight function This patch adds md_in_flight() to get the number of in_flight I/Os. No functional change. This patch is a preparation for a later patch in this series, which changes the I/O counter from q->in_flight to md->pending in request-based dm. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 473f0c3c0192..73b89afd6565 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -452,6 +452,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info) mempool_free(info, info->tio->md->io_pool); } +static int md_in_flight(struct mapped_device *md) +{ + return atomic_read(&md->pending[READ]) + + atomic_read(&md->pending[WRITE]); +} + static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; @@ -2100,8 +2106,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) break; } spin_unlock_irqrestore(q->queue_lock, flags); - } else if (!atomic_read(&md->pending[0]) && - !atomic_read(&md->pending[1])) + } else if (!md_in_flight(md)) break; if (interruptible == TASK_INTERRUPTIBLE && From 598de40947909e6b948569710383661ecc0ddc8e Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:14 +0000 Subject: [PATCH 41/80] dm: use clone in map_request function This patch changes the argument of map_request() from the original request to the clone request. No functional change. This patch is a preparation for PATCH 9, which needs to use map_request() for clones sharing an original barrier request. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 73b89afd6565..cf0b455b21ef 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1495,11 +1495,10 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_OK; } -static void map_request(struct dm_target *ti, struct request *rq, +static void map_request(struct dm_target *ti, struct request *clone, struct mapped_device *md) { int r; - struct request *clone = rq->special; struct dm_rq_target_io *tio = clone->end_io_data; /* @@ -1576,7 +1575,7 @@ static void dm_request_fn(struct request_queue *q) blk_start_request(rq); spin_unlock(q->queue_lock); - map_request(ti, rq, md); + map_request(ti, rq->special, md); spin_lock_irq(q->queue_lock); } From 0888564393a1277ce2d0564d819e1bcff1120343 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:15 +0000 Subject: [PATCH 42/80] dm: pass gfp_mask to alloc_rq_tio This patch adds the gfp_mask argument to alloc_rq_tio().
No functional change. This patch is a preparation for a later patch in this series which needs to allocate tio (for barrier I/O) with a different allocation flag (GFP_NOIO) from the one used in the normal I/O code path. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index cf0b455b21ef..a42dfb7a718e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -432,9 +432,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) mempool_free(tio, md->tio_pool); } -static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) +static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, + gfp_t gfp_mask) { - return mempool_alloc(md->tio_pool, GFP_ATOMIC); + return mempool_alloc(md->tio_pool, gfp_mask); } static void free_rq_tio(struct dm_rq_target_io *tio) @@ -1471,7 +1472,7 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_KILL; } - tio = alloc_rq_tio(md); /* Only one for each original request */ + tio = alloc_rq_tio(md, GFP_ATOMIC); if (!tio) /* -ENOMEM */ return BLKPREP_DEFER; From 6facdaff229f2b25d0de82be9be99b9f562e72ba Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:15 +0000 Subject: [PATCH 43/80] dm: abstract clone_rq This patch factors out the request cloning code in dm_prep_fn() as clone_rq(). No functional change. This patch is a preparation for a later patch in this series which needs to make clones from an original barrier request. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a42dfb7a718e..30f5dc8e52bc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1445,6 +1445,32 @@ static int setup_clone(struct request *clone, struct request *rq, return 0; } +static struct request *clone_rq(struct request *rq, struct mapped_device *md, + gfp_t gfp_mask) +{ + struct request *clone; + struct dm_rq_target_io *tio; + + tio = alloc_rq_tio(md, gfp_mask); + if (!tio) + return NULL; + + tio->md = md; + tio->ti = NULL; + tio->orig = rq; + tio->error = 0; + memset(&tio->info, 0, sizeof(tio->info)); + + clone = &tio->clone; + if (setup_clone(clone, rq, tio)) { + /* -ENOMEM */ + free_rq_tio(tio); + return NULL; + } + + return clone; +} + static int dm_rq_flush_suspending(struct mapped_device *md) { return !md->suspend_rq.special; @@ -1456,7 +1482,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) { struct mapped_device *md = q->queuedata; - struct dm_rq_target_io *tio; struct request *clone; if (unlikely(rq == &md->suspend_rq)) { @@ -1472,24 +1497,10 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_KILL; } - tio = alloc_rq_tio(md, GFP_ATOMIC); - if (!tio) - /* -ENOMEM */ + clone = clone_rq(rq, md, GFP_ATOMIC); + if (!clone) return BLKPREP_DEFER; - tio->md = md; - tio->ti = NULL; - tio->orig = rq; - tio->error = 0; - memset(&tio->info, 0, sizeof(tio->info)); - - clone = &tio->clone; - if (setup_clone(clone, rq, tio)) { - /* -ENOMEM */ - free_rq_tio(tio); - return BLKPREP_DEFER; - } - rq->special = clone; rq->cmd_flags |= REQ_DONTPREP; From 9f518b27cf682dd5155a4c1679d52cd4b5be82f2 Mon Sep 17 00:00:00 2001 From: Kiyoshi
Ueda Date: Thu, 10 Dec 2009 23:52:16 +0000 Subject: [PATCH 44/80] dm: simplify request based suspend The semantics of bio-based dm were changed recently in the case of suspend with "--nolockfs" but without "--noflush". Before 2.6.30, I/Os submitted before the suspend invocation were always flushed. From 2.6.30 onwards, I/Os submitted before the suspend invocation might not be flushed. (For details, see http://marc.info/?t=123994433400003&r=1&w=2) This patch brings the behaviour of request-based dm into line with bio-based dm, simplifying the code and preparing for a subsequent patch that will wait for all in_flight I/Os to complete without stopping request_queue and use dm_wait_for_completion() for it. This change in semantics simplifies the suspend code as follows: o Suspend is implemented as stopping request_queue in request-based dm, and all I/Os are queued in the request_queue even after suspend is invoked. o In the old semantics, we had to track whether I/Os were queued before or after the suspend invocation, so a special barrier-like request called 'suspend marker' was introduced. o With the new semantics, we don't need to flush any I/O so we can remove the marker and the code related to the marker handling and I/O flushing. After removing this code, the suspend sequence is now: 1. Flush all I/Os by lock_fs() if needed. 2. Stop dispatching any I/O by stopping the request_queue. 3. Wait for all in-flight I/Os to be completed or requeued. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 158 +++++------------------------------------------- 1 file changed, 14 insertions(+), 144 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 30f5dc8e52bc..634b1daab2d4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -178,9 +178,6 @@ struct mapped_device { /* forced geometry settings */ struct hd_geometry geometry; - /* marker of flush suspend for request-based dm */ - struct request suspend_rq; - /* For saving the address of __make_request for request based dm */ make_request_fn *saved_make_request_fn; @@ -1471,11 +1468,6 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md, return clone; } -static int dm_rq_flush_suspending(struct mapped_device *md) -{ - return !md->suspend_rq.special; -} - /* * Called with the queue lock held. */ @@ -1484,14 +1476,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) { struct mapped_device *md = q->queuedata; struct request *clone; - if (unlikely(rq == &md->suspend_rq)) { - if (dm_rq_flush_suspending(md)) - return BLKPREP_OK; - else - /* The flush suspend was interrupted */ - return BLKPREP_KILL; - } - if (unlikely(rq->special)) { DMWARN("Already has something in rq->special."); return BLKPREP_KILL; @@ -1560,27 +1544,15 @@ static void dm_request_fn(struct request_queue *q) struct request *rq; /* - * For noflush suspend, check blk_queue_stopped() to immediately - * quit I/O dispatching. + * For suspend, check blk_queue_stopped() and don't increment + * the number of in-flight I/Os after the queue is stopped + * in dm_suspend(). */ while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { rq = blk_peek_request(q); if (!rq) goto plug_and_out; - if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */ - if (queue_in_flight(q)) - /* Not quiet yet.
Wait more */ - goto plug_and_out; - - /* This device should be quiet now */ - __stop_queue(q); - blk_start_request(rq); - __blk_end_request_all(rq, 0); - wake_up(&md->wait); - goto out; - } - ti = dm_table_find_target(map, blk_rq_pos(rq)); if (ti->type->busy && ti->type->busy(ti)) goto plug_and_out; @@ -2112,7 +2084,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) smp_mb(); if (dm_request_based(md)) { spin_lock_irqsave(q->queue_lock, flags); - if (!queue_in_flight(q) && blk_queue_stopped(q)) { + if (!queue_in_flight(q)) { spin_unlock_irqrestore(q->queue_lock, flags); break; } @@ -2245,67 +2217,6 @@ out: return r; } -static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) -{ - md->suspend_rq.special = (void *)0x1; -} - -static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) -{ - struct request_queue *q = md->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (!noflush) - dm_rq_invalidate_suspend_marker(md); - __start_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void dm_rq_start_suspend(struct mapped_device *md, int noflush) -{ - struct request *rq = &md->suspend_rq; - struct request_queue *q = md->queue; - - if (noflush) - stop_queue(q); - else { - blk_rq_init(q, rq); - blk_insert_request(q, rq, 0, NULL); - } -} - -static int dm_rq_suspend_available(struct mapped_device *md, int noflush) -{ - int r = 1; - struct request *rq = &md->suspend_rq; - struct request_queue *q = md->queue; - unsigned long flags; - - if (noflush) - return r; - - /* The marker must be protected by queue lock if it is in use */ - spin_lock_irqsave(q->queue_lock, flags); - if (unlikely(rq->ref_count)) { - /* - * This can happen, when the previous flush suspend was - * interrupted, the marker is still in the queue and - * this flush suspend has been invoked, because we don't - * remove the marker at the time of suspend interruption. - * We have only one marker per mapped_device, so we can't - * start another flush suspend while it is in use. - */ - BUG_ON(!rq->special); /* The marker should be invalidated */ - DMWARN("Invalidating the previous flush suspend is still in" - " progress. Please retry later."); - r = 0; - } - spin_unlock_irqrestore(q->queue_lock, flags); - - return r; -} - /* * Functions to lock and unlock any filesystem running on the * device. @@ -2348,49 +2259,11 @@ static void unlock_fs(struct mapped_device *md) /* * Suspend mechanism in request-based dm. * - * After the suspend starts, further incoming requests are kept in - * the request_queue and deferred. - * Remaining requests in the request_queue at the start of suspend are flushed - * if it is flush suspend. - * The suspend completes when the following conditions have been satisfied, - * so wait for it: - * 1. q->in_flight is 0 (which means no in_flight request) - * 2. queue has been stopped (which means no request dispatching) + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. * - * - * Noflush suspend - * --------------- - * Noflush suspend doesn't need to dispatch remaining requests. - * So stop the queue immediately. Then, wait for all in_flight requests - * to be completed or requeued. - * - * To abort noflush suspend, start the queue. - * - * - * Flush suspend - * ------------- - * Flush suspend needs to dispatch remaining requests. So stop the queue - * after the remaining requests are completed. 
(Requeued request must be also - * re-dispatched and completed. Until then, we can't stop the queue.) - * - * During flushing the remaining requests, further incoming requests are also - * inserted to the same queue. To distinguish which requests are to be - * flushed, we insert a marker request to the queue at the time of starting - * flush suspend, like a barrier. - * The dispatching is blocked when the marker is found on the top of the queue. - * And the queue is stopped when all in_flight requests are completed, since - * that means the remaining requests are completely flushed. - * Then, the marker is removed from the queue. - * - * To abort flush suspend, we also need to take care of the marker, not only - * starting the queue. - * We don't remove the marker forcibly from the queue since it's against - * the block-layer manner. Instead, we put a invalidated mark on the marker. - * When the invalidated marker is found on the top of the queue, it is - * immediately removed from the queue, so it doesn't block dispatching. - * Because we have only one marker per mapped_device, we can't start another - * flush suspend until the invalidated marker is removed from the queue. - * So fail and return with -EBUSY in such a case. + * To abort suspend, start the request_queue. */ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) { @@ -2406,11 +2279,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) goto out_unlock; } - if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { - r = -EBUSY; - goto out_unlock; - } - map = dm_get_table(md); /* @@ -2424,8 +2292,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) dm_table_presuspend_targets(map); /* - * Flush I/O to the device. noflush supersedes do_lockfs, - * because lock_fs() needs to flush I/Os. + * Flush I/O to the device. + * Any I/O submitted after lock_fs() may not be flushed. + * noflush takes precedence over do_lockfs. + * (lock_fs() flushes I/Os and waits for them to complete.) */ if (!noflush && do_lockfs) { r = lock_fs(md); @@ -2457,7 +2327,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) flush_workqueue(md->wq); if (dm_request_based(md)) - dm_rq_start_suspend(md, noflush); + stop_queue(md->queue); /* * At this point no more requests are entering target request routines. @@ -2476,7 +2346,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) dm_queue_flush(md); if (dm_request_based(md)) - dm_rq_abort_suspend(md, noflush); + start_queue(md->queue); unlock_fs(md); goto out; /* pushback list is already flushed, so skip flush */ From b4324feeae304ae39e631a254d238a7d63be004b Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:16 +0000 Subject: [PATCH 45/80] dm: use md pending for in flight IO counting This patch changes the counter for the number of in_flight I/Os to md->pending from q->in_flight in preparation for a later patch. No functional change. Request-based dm used q->in_flight to count the number of in-flight clones assuming the counter is always incremented for an in-flight original request and the original:clone relationship is 1:1. However, this is no longer true for barrier requests. So use md->pending to count the number of in-flight clones.
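As an illustrative sketch (not code from this patch), the counting pattern is: increment md->pending for each clone when it is dispatched, decrement it when the clone completes, and wake any suspend waiter once the count drops to zero. The _sketch function names are hypothetical:

static void dispatch_clone_sketch(struct mapped_device *md,
                                  struct request *clone)
{
        atomic_inc(&md->pending[rq_data_dir(clone)]);
        /* ... hand the clone to the target's map function ... */
}

static void clone_completed_sketch(struct mapped_device *md,
                                   struct request *clone)
{
        atomic_dec(&md->pending[rq_data_dir(clone)]);

        /* nudge anyone waiting in dm_wait_for_completion() */
        if (!md_in_flight(md))
                wake_up(&md->wait);
}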
Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 634b1daab2d4..01d741a0c079 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -727,23 +727,16 @@ static void end_clone_bio(struct bio *clone, int error) * the md may be freed in dm_put() at the end of this function. * Or do dm_get() before calling this function and dm_put() later. */ -static void rq_completed(struct mapped_device *md, int run_queue) +static void rq_completed(struct mapped_device *md, int rw, int run_queue) { - int wakeup_waiters = 0; - struct request_queue *q = md->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (!queue_in_flight(q)) - wakeup_waiters = 1; - spin_unlock_irqrestore(q->queue_lock, flags); + atomic_dec(&md->pending[rw]); /* nudge anyone waiting on suspend queue */ - if (wakeup_waiters) + if (!md_in_flight(md)) wake_up(&md->wait); if (run_queue) - blk_run_queue(q); + blk_run_queue(md->queue); /* * dm_put() must be at the end of this function. See the comment above @@ -774,6 +767,7 @@ static void dm_unprep_request(struct request *rq) */ void dm_requeue_unmapped_request(struct request *clone) { + int rw = rq_data_dir(clone); struct dm_rq_target_io *tio = clone->end_io_data; struct mapped_device *md = tio->md; struct request *rq = tio->orig; @@ -788,7 +782,7 @@ void dm_requeue_unmapped_request(struct request *clone) blk_requeue_request(q, rq); spin_unlock_irqrestore(q->queue_lock, flags); - rq_completed(md, 0); + rq_completed(md, rw, 0); } EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); @@ -827,6 +821,7 @@ static void start_queue(struct request_queue *q) */ static void dm_end_request(struct request *clone, int error) { + int rw = rq_data_dir(clone); struct dm_rq_target_io *tio = clone->end_io_data; struct mapped_device *md = tio->md; struct request *rq = tio->orig; @@ -848,7 +843,7 @@ static void dm_end_request(struct request *clone, int error) blk_end_request_all(rq, error); - rq_completed(md, 1); + rq_completed(md, rw, 1); } /* @@ -1541,12 +1536,13 @@ static void dm_request_fn(struct request_queue *q) struct mapped_device *md = q->queuedata; struct dm_table *map = dm_get_table(md); struct dm_target *ti; - struct request *rq; + struct request *rq, *clone; /* - * For suspend, check blk_queue_stopped() and don't increment - * the number of in-flight I/Os after the queue is stopped - * in dm_suspend(). + * For suspend, check blk_queue_stopped() and increment + * ->pending within a single queue_lock not to increment the + * number of in-flight I/Os after the queue is stopped in + * dm_suspend(). 
*/ while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { rq = blk_peek_request(q); @@ -1558,8 +1554,11 @@ goto plug_and_out; blk_start_request(rq); + clone = rq->special; + atomic_inc(&md->pending[rq_data_dir(clone)]); + spin_unlock(q->queue_lock); - map_request(ti, rq->special, md); + map_request(ti, clone, md); spin_lock_irq(q->queue_lock); } @@ -2071,8 +2070,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) { int r = 0; DECLARE_WAITQUEUE(wait, current); - struct request_queue *q = md->queue; - unsigned long flags; dm_unplug_all(md->queue); @@ -2082,14 +2079,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) set_current_state(interruptible); smp_mb(); - if (dm_request_based(md)) { - spin_lock_irqsave(q->queue_lock, flags); - if (!queue_in_flight(q)) { - spin_unlock_irqrestore(q->queue_lock, flags); - break; - } - spin_unlock_irqrestore(q->queue_lock, flags); - } else if (!md_in_flight(md)) + if (!md_in_flight(md)) break; if (interruptible == TASK_INTERRUPTIBLE && From 11a68244e16b0c35e122dd55b4e7c595e0fb67a1 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:17 +0000 Subject: [PATCH 46/80] dm: refactor request based completion functions This patch factors out the clone completion code, dm_done(), from dm_softirq_done() in preparation for a subsequent patch. No functional change. dm_done() will be used in barrier completion, which can't use and doesn't need softirq. The softirq_done callback needs to get a clone from an original request but it can't in the case of barrier, where an original request is shared by multiple clones. On the other hand, the completion of barrier clones doesn't involve re-submitting requests, which was the primary reason softirq was needed.
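For illustration, a request-based target's rq_end_io hook, whose result dm_done() interprets, has the following shape; the target name and the retry predicate are hypothetical, not code from this patch:

static int example_rq_end_io(struct dm_target *ti, struct request *clone,
                             int error, union map_info *map_context)
{
        /* Hypothetical: retry transient failures on another path/device. */
        if (error && error_is_transient(error))
                return DM_ENDIO_REQUEUE; /* dm core requeues the original */

        return error;   /* r <= 0: dm core completes the I/O with this value */
}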
Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 01d741a0c079..c65be45a4c42 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -846,32 +846,43 @@ static void dm_end_request(struct request *clone, int error) rq_completed(md, rw, 1); } +static void dm_done(struct request *clone, int error, bool mapped) +{ + int r = error; + struct dm_rq_target_io *tio = clone->end_io_data; + dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; + + if (mapped && rq_end_io) + r = rq_end_io(tio->ti, clone, error, &tio->info); + + if (r <= 0) + /* The target wants to complete the I/O */ + dm_end_request(clone, r); + else if (r == DM_ENDIO_INCOMPLETE) + /* The target will handle the I/O */ + return; + else if (r == DM_ENDIO_REQUEUE) + /* The target wants to requeue the I/O */ + dm_requeue_unmapped_request(clone); + else { + DMWARN("unimplemented target endio return value: %d", r); + BUG(); + } +} + /* * Request completion handler for request-based dm */ static void dm_softirq_done(struct request *rq) { + bool mapped = true; struct request *clone = rq->completion_data; struct dm_rq_target_io *tio = clone->end_io_data; - dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; - int error = tio->error; - if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) - error = rq_end_io(tio->ti, clone, error, &tio->info); + if (rq->cmd_flags & REQ_FAILED) + mapped = false; - if (error <= 0) - /* The target wants to complete the I/O */ - dm_end_request(clone, error); - else if (error == DM_ENDIO_INCOMPLETE) - /* The target will handle the I/O */ - return; - else if (error == DM_ENDIO_REQUEUE) - /* The target wants to requeue the I/O */ - dm_requeue_unmapped_request(clone); - else { - DMWARN("unimplemented target endio return value: %d", error); - BUG(); - } + dm_done(clone, tio->error, mapped); } /* From 980691e5f3a1b5ebbb2d34014e028fd7f1c6e4fb Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:17 +0000 Subject: [PATCH 47/80] dm: move dm_end_request This patch moves dm_end_request() to make the next patch more readable. No functional change. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 62 ++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c65be45a4c42..821a5dd6a8d1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -752,6 +752,37 @@ static void free_rq_clone(struct request *clone) free_rq_tio(tio); } +/* + * Complete the clone and the original request. + * Must be called without queue lock. + */ +static void dm_end_request(struct request *clone, int error) +{ + int rw = rq_data_dir(clone); + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + + if (blk_pc_request(rq)) { + rq->errors = clone->errors; + rq->resid_len = clone->resid_len; + + if (rq->sense) + /* + * We are using the sense buffer of the original + * request. + * So setting the length of the sense data is enough. 
+ */ + rq->sense_len = clone->sense_len; + } + + free_rq_clone(clone); + + blk_end_request_all(rq, error); + + rq_completed(md, rw, 1); +} + static void dm_unprep_request(struct request *rq) { struct request *clone = rq->special; @@ -815,37 +846,6 @@ static void start_queue(struct request_queue *q) spin_unlock_irqrestore(q->queue_lock, flags); } -/* - * Complete the clone and the original request. - * Must be called without queue lock. - */ -static void dm_end_request(struct request *clone, int error) -{ - int rw = rq_data_dir(clone); - struct dm_rq_target_io *tio = clone->end_io_data; - struct mapped_device *md = tio->md; - struct request *rq = tio->orig; - - if (blk_pc_request(rq)) { - rq->errors = clone->errors; - rq->resid_len = clone->resid_len; - - if (rq->sense) - /* - * We are using the sense buffer of the original - * request. - * So setting the length of the sense data is enough. - */ - rq->sense_len = clone->sense_len; - } - - free_rq_clone(clone); - - blk_end_request_all(rq, error); - - rq_completed(md, rw, 1); -} - static void dm_done(struct request *clone, int error, bool mapped) { int r = error; From d0bcb8786532b01206f04258eb6b7d4ac858436a Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:18 +0000 Subject: [PATCH 48/80] dm: add request based barrier support This patch adds barrier support for request-based dm. CORE DESIGN The design is basically the same as bio-based dm, which emulates a barrier by mapping empty barrier bios before/after a barrier I/O. But request-based dm has been using struct request_queue for I/O queueing, so the block-layer's barrier mechanism can be used. o Summary of the block-layer's behavior (on which dm-core depends) Request-based dm uses the QUEUE_ORDERED_DRAIN_FLUSH ordered mode for I/O barriers. It means that when an I/O requiring a barrier is found in the request_queue, the block-layer makes a pre-flush request and a post-flush request just before and just after the I/O respectively. After the ordered sequence starts, the block-layer waits for all in-flight I/Os to complete, then gives drivers the pre-flush request, the barrier I/O and the post-flush request one by one. It means that the request_queue is stopped automatically by the block-layer until drivers complete each sequence. o dm-core dm-core treats the barrier I/O as a normal I/O, so no additional code is needed there. For the pre/post-flush requests, dm-core flushes caches as follows: 1. Makes the number of empty barrier requests required by the target's num_flush_requests, and maps them (dm_rq_barrier()). 2. Waits for the mapped barriers to complete (dm_rq_barrier()). If an error has occurred, saves the error value to md->barrier_error (dm_end_request()). (*) Basically, the first reported error is taken. But -EOPNOTSUPP supersedes any error and DM_ENDIO_REQUEUE follows. 3. Requeues the pre/post-flush request if the error value is DM_ENDIO_REQUEUE. Otherwise, completes it with the error value (dm_rq_barrier_work()). The pre/post-flush work above is done in the kernel thread (kdmflush) context, since memory allocation which might sleep is needed in dm_rq_barrier() but sleep is not allowed in dm_request_fn(), which is an irq-disabled context. Also, clones of the pre/post-flush request share an original, so such clones can't be completed using the softirq context. Instead, they are completed in the context of the underlying device drivers. This should be safe since there is no I/O dispatching during the completion of such clones.
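The precedence rule marked (*) can be restated as the following equivalent sketch (illustration only; the patch implements it inside store_barrier_error() in the diff below):

static int merged_barrier_error(int stored, int latest)
{
        if (latest == -EOPNOTSUPP || stored == -EOPNOTSUPP)
                return -EOPNOTSUPP;     /* -EOPNOTSUPP beats everything */
        if (!stored || latest == DM_ENDIO_REQUEUE)
                return latest;          /* requeue beats plain I/O errors */
        return stored;                  /* otherwise the first error wins */
}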
For suspend, the workqueue of kdmflush needs to be flushed after the request_queue has been stopped. Otherwise, the next flush work can be kicked even after the suspend completes. TARGET INTERFACE No new interface is added. Just use the existing num_flush_requests in struct target_type, the same as in bio-based dm. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 214 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 196 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 821a5dd6a8d1..3de8d6d5b0b8 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -142,10 +142,20 @@ struct mapped_device { */ int barrier_error; + /* + * Protect barrier_error from concurrent endio processing + * in request-based dm. + */ + spinlock_t barrier_error_lock; + /* * Processing queue (flush/barriers) */ struct workqueue_struct *wq; + struct work_struct barrier_work; + + /* A pointer to the currently processing pre/post flush request */ + struct request *flush_request; /* * The current mapping. @@ -722,6 +732,23 @@ static void end_clone_bio(struct bio *clone, int error) blk_update_request(tio->orig, 0, nr_bytes); } +static void store_barrier_error(struct mapped_device *md, int error) +{ + unsigned long flags; + + spin_lock_irqsave(&md->barrier_error_lock, flags); + /* + * Basically, the first error is taken, but: + * -EOPNOTSUPP supersedes any I/O error. + * Requeue request supersedes any I/O error but -EOPNOTSUPP. + */ + if (!md->barrier_error || error == -EOPNOTSUPP || + (md->barrier_error != -EOPNOTSUPP && + error == DM_ENDIO_REQUEUE)) + md->barrier_error = error; + spin_unlock_irqrestore(&md->barrier_error_lock, flags); +} + /* * Don't touch any member of the md after calling this function because * the md may be freed in dm_put() at the end of this function. @@ -759,11 +786,13 @@ static void free_rq_clone(struct request *clone) static void dm_end_request(struct request *clone, int error) { int rw = rq_data_dir(clone); + int run_queue = 1; + bool is_barrier = blk_barrier_rq(clone); struct dm_rq_target_io *tio = clone->end_io_data; struct mapped_device *md = tio->md; struct request *rq = tio->orig; - if (blk_pc_request(rq)) { + if (blk_pc_request(rq) && !is_barrier) { rq->errors = clone->errors; rq->resid_len = clone->resid_len; @@ -778,9 +807,14 @@ static void dm_end_request(struct request *clone, int error) free_rq_clone(clone); - blk_end_request_all(rq, error); + if (unlikely(is_barrier)) { + if (unlikely(error)) + store_barrier_error(md, error); + run_queue = 0; + } else + blk_end_request_all(rq, error); - rq_completed(md, rw, 1); + rq_completed(md, rw, run_queue); } static void dm_unprep_request(struct request *rq) @@ -805,6 +839,16 @@ void dm_requeue_unmapped_request(struct request *clone) struct request_queue *q = rq->q; unsigned long flags; + if (unlikely(blk_barrier_rq(clone))) { + /* + * Barrier clones share an original request. + * Leave it to dm_end_request(), which handles this special + * case. + */ + dm_end_request(clone, DM_ENDIO_REQUEUE); + return; + } + dm_unprep_request(rq); spin_lock_irqsave(q->queue_lock, flags); @@ -894,6 +938,19 @@ static void dm_complete_request(struct request *clone, int error) struct dm_rq_target_io *tio = clone->end_io_data; struct request *rq = tio->orig; + if (unlikely(blk_barrier_rq(clone))) { + /* + * Barrier clones share an original request. So can't use + * softirq_done with the original. + * Pass the clone to dm_done() directly in this special case.
+ * It is safe (even if clone->q->queue_lock is held here) + * because there is no I/O dispatching during the completion + * of barrier clone. + */ + dm_done(clone, error, true); + return; + } + tio->error = error; rq->completion_data = clone; blk_complete_request(rq); @@ -910,6 +967,17 @@ void dm_kill_unmapped_request(struct request *clone, int error) struct dm_rq_target_io *tio = clone->end_io_data; struct request *rq = tio->orig; + if (unlikely(blk_barrier_rq(clone))) { + /* + * Barrier clones share an original request. + * Leave it to dm_end_request(), which handles this special + * case. + */ + BUG_ON(error > 0); + dm_end_request(clone, error); + return; + } + rq->cmd_flags |= REQ_FAILED; dm_complete_request(clone, error); } @@ -1364,11 +1432,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio) { struct mapped_device *md = q->queuedata; - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { - bio_endio(bio, -EOPNOTSUPP); - return 0; - } - return md->saved_make_request_fn(q, bio); /* call __make_request() */ } @@ -1387,6 +1450,25 @@ static int dm_request(struct request_queue *q, struct bio *bio) return _dm_request(q, bio); } +/* + * Mark this request as flush request, so that dm_request_fn() can + * recognize. + */ +static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq) +{ + rq->cmd_type = REQ_TYPE_LINUX_BLOCK; + rq->cmd[0] = REQ_LB_OP_FLUSH; +} + +static bool dm_rq_is_flush_request(struct request *rq) +{ + if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK && + rq->cmd[0] == REQ_LB_OP_FLUSH) + return true; + else + return false; +} + void dm_dispatch_request(struct request *rq) { int r; @@ -1432,16 +1514,24 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, static int setup_clone(struct request *clone, struct request *rq, struct dm_rq_target_io *tio) { - int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, - dm_rq_bio_constructor, tio); + int r; - if (r) - return r; + if (dm_rq_is_flush_request(rq)) { + blk_rq_init(NULL, clone); + clone->cmd_type = REQ_TYPE_FS; + clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); + } else { + r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + dm_rq_bio_constructor, tio); + if (r) + return r; + + clone->cmd = rq->cmd; + clone->cmd_len = rq->cmd_len; + clone->sense = rq->sense; + clone->buffer = rq->buffer; + } - clone->cmd = rq->cmd; - clone->cmd_len = rq->cmd_len; - clone->sense = rq->sense; - clone->buffer = rq->buffer; clone->end_io = end_clone_request; clone->end_io_data = tio; @@ -1482,6 +1572,9 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) struct mapped_device *md = q->queuedata; struct request *clone; + if (unlikely(dm_rq_is_flush_request(rq))) + return BLKPREP_OK; + if (unlikely(rq->special)) { DMWARN("Already has something in rq->special."); return BLKPREP_KILL; @@ -1560,6 +1653,14 @@ static void dm_request_fn(struct request_queue *q) if (!rq) goto plug_and_out; + if (unlikely(dm_rq_is_flush_request(rq))) { + BUG_ON(md->flush_request); + md->flush_request = rq; + blk_start_request(rq); + queue_work(md->wq, &md->barrier_work); + goto out; + } + ti = dm_table_find_target(map, blk_rq_pos(rq)); if (ti->type->busy && ti->type->busy(ti)) goto plug_and_out; @@ -1726,6 +1827,7 @@ out: static const struct block_device_operations dm_blk_dops; static void dm_wq_work(struct work_struct *work); +static void dm_rq_barrier_work(struct work_struct *work); /* * Allocate and initialise a blank device with a given minor. 
@@ -1755,6 +1857,7 @@ static struct mapped_device *alloc_dev(int minor) init_rwsem(&md->io_lock); mutex_init(&md->suspend_lock); spin_lock_init(&md->deferred_lock); + spin_lock_init(&md->barrier_error_lock); rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); @@ -1789,6 +1892,8 @@ static struct mapped_device *alloc_dev(int minor) blk_queue_softirq_done(md->queue, dm_softirq_done); blk_queue_prep_rq(md->queue, dm_prep_fn); blk_queue_lld_busy(md->queue, dm_lld_busy); + blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH, + dm_rq_prepare_flush); md->disk = alloc_disk(1); if (!md->disk) @@ -1798,6 +1903,7 @@ static struct mapped_device *alloc_dev(int minor) atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); + INIT_WORK(&md->barrier_work, dm_rq_barrier_work); init_waitqueue_head(&md->eventq); md->disk->major = _major; @@ -2185,6 +2291,73 @@ static void dm_queue_flush(struct mapped_device *md) queue_work(md->wq, &md->work); } +static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + + tio->info.flush_request = flush_nr; +} + +/* Issue barrier requests to targets and wait for their completion. */ +static int dm_rq_barrier(struct mapped_device *md) +{ + int i, j; + struct dm_table *map = dm_get_table(md); + unsigned num_targets = dm_table_get_num_targets(map); + struct dm_target *ti; + struct request *clone; + + md->barrier_error = 0; + + for (i = 0; i < num_targets; i++) { + ti = dm_table_get_target(map, i); + for (j = 0; j < ti->num_flush_requests; j++) { + clone = clone_rq(md->flush_request, md, GFP_NOIO); + dm_rq_set_flush_nr(clone, j); + atomic_inc(&md->pending[rq_data_dir(clone)]); + map_request(ti, clone, md); + } + } + + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); + dm_table_put(map); + + return md->barrier_error; +} + +static void dm_rq_barrier_work(struct work_struct *work) +{ + int error; + struct mapped_device *md = container_of(work, struct mapped_device, + barrier_work); + struct request_queue *q = md->queue; + struct request *rq; + unsigned long flags; + + /* + * Hold the md reference here and leave it at the last part so that + * the md can't be deleted by device opener when the barrier request + * completes. + */ + dm_get(md); + + error = dm_rq_barrier(md); + + rq = md->flush_request; + md->flush_request = NULL; + + if (error == DM_ENDIO_REQUEUE) { + spin_lock_irqsave(q->queue_lock, flags); + blk_requeue_request(q, rq); + spin_unlock_irqrestore(q->queue_lock, flags); + } else + blk_end_request_all(rq, error); + + blk_run_queue(q); + + dm_put(md); +} + /* * Swap in a new table (destroying old one). */ @@ -2325,11 +2498,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); up_write(&md->io_lock); - flush_workqueue(md->wq); - + /* + * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which + * can be kicked until md->queue is stopped. So stop md->queue before + * flushing md->wq. + */ if (dm_request_based(md)) stop_queue(md->queue); + flush_workqueue(md->wq); + /* * At this point no more requests are entering target request routines. * We call dm_wait_for_completion to wait for all existing requests From 7c6664114b7342a36f14a6836564e865669b3cea Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 10 Dec 2009 23:52:19 +0000 Subject: [PATCH 49/80] dm: rename dm_get_table to dm_get_live_table Rename dm_get_table to dm_get_live_table. 
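The renamed function still returns the live (active) table, not a table that has merely been loaded, and the existing reference discipline is unchanged; a hypothetical helper showing typical use:

static sector_t live_table_size(struct mapped_device *md)
{
        struct dm_table *map = dm_get_live_table(md);
        sector_t size = 0;

        if (map) {
                size = dm_table_get_size(map);
                dm_table_put(map);      /* every get is paired with a put */
        }

        return size;
}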
Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 14 +++++++------- drivers/md/dm.c | 24 ++++++++++++------------ include/linux/device-mapper.h | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 99de0e4ce831..bf3d19a0fb78 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -235,7 +235,7 @@ static void __hash_remove(struct hash_cell *hc) dm_set_mdptr(hc->md, NULL); mutex_unlock(&dm_hash_cells_mutex); - table = dm_get_table(hc->md); + table = dm_get_live_table(hc->md); if (table) { dm_table_event(table); dm_table_put(table); @@ -338,7 +338,7 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) /* * Wake up any dm event waiters. */ - table = dm_get_table(hc->md); + table = dm_get_live_table(hc->md); if (table) { dm_table_event(table); dm_table_put(table); @@ -564,7 +564,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) param->event_nr = dm_get_event_nr(md); - table = dm_get_table(md); + table = dm_get_live_table(md); if (table) { param->flags |= DM_ACTIVE_PRESENT_FLAG; param->target_count = dm_table_get_num_targets(table); @@ -993,7 +993,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) if (r) goto out; - table = dm_get_table(md); + table = dm_get_live_table(md); if (table) { retrieve_status(table, param, param_size); dm_table_put(table); @@ -1226,7 +1226,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) if (r) goto out; - table = dm_get_table(md); + table = dm_get_live_table(md); if (table) { retrieve_deps(table, param, param_size); dm_table_put(table); @@ -1255,7 +1255,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size) if (r) goto out; - table = dm_get_table(md); + table = dm_get_live_table(md); if (table) { retrieve_status(table, param, param_size); dm_table_put(table); @@ -1299,7 +1299,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) goto out; } - table = dm_get_table(md); + table = dm_get_live_table(md); if (!table) goto out_argv; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3de8d6d5b0b8..233a2e9156a3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -397,7 +397,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = dm_get_live_table(md); struct dm_target *tgt; int r = -ENOTTY; @@ -528,7 +528,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio) * function to access the md->map field, and make sure they call * dm_table_put() when finished. 
*/ -struct dm_table *dm_get_table(struct mapped_device *md) +struct dm_table *dm_get_live_table(struct mapped_device *md) { struct dm_table *t; unsigned long flags; @@ -1294,7 +1294,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) struct clone_info ci; int error = 0; - ci.map = dm_get_table(md); + ci.map = dm_get_live_table(md); if (unlikely(!ci.map)) { if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) bio_io_error(bio); @@ -1335,7 +1335,7 @@ static int dm_merge_bvec(struct request_queue *q, struct bio_vec *biovec) { struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = dm_get_live_table(md); struct dm_target *ti; sector_t max_sectors; int max_size = 0; @@ -1638,7 +1638,7 @@ static void map_request(struct dm_target *ti, struct request *clone, static void dm_request_fn(struct request_queue *q) { struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = dm_get_live_table(md); struct dm_target *ti; struct request *rq, *clone; @@ -1697,7 +1697,7 @@ static int dm_lld_busy(struct request_queue *q) { int r; struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = dm_get_live_table(md); if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) r = 1; @@ -1712,7 +1712,7 @@ static int dm_lld_busy(struct request_queue *q) static void dm_unplug_all(struct request_queue *q) { struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = dm_get_live_table(md); if (map) { if (dm_request_based(md)) @@ -1730,7 +1730,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) struct dm_table *map; if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - map = dm_get_table(md); + map = dm_get_live_table(md); if (map) { /* * Request-based dm cares about only own queue for @@ -2166,7 +2166,7 @@ void dm_put(struct mapped_device *md) BUG_ON(test_bit(DMF_FREEING, &md->flags)); if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { - map = dm_get_table(md); + map = dm_get_live_table(md); idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); @@ -2302,7 +2302,7 @@ static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr) static int dm_rq_barrier(struct mapped_device *md) { int i, j; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = dm_get_live_table(md); unsigned num_targets = dm_table_get_num_targets(map); struct dm_target *ti; struct request *clone; @@ -2453,7 +2453,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) goto out_unlock; } - map = dm_get_table(md); + map = dm_get_live_table(md); /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2558,7 +2558,7 @@ int dm_resume(struct mapped_device *md) if (!dm_suspended(md)) goto out; - map = dm_get_table(md); + map = dm_get_live_table(md); if (!map || !dm_table_get_size(map)) goto out; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index df7607e6dce8..fc16a351ef7a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -276,7 +276,7 @@ void dm_table_unplug_all(struct dm_table *t); /* * Table reference counting. 
*/ -struct dm_table *dm_get_table(struct mapped_device *md); +struct dm_table *dm_get_live_table(struct mapped_device *md); void dm_table_get(struct dm_table *t); void dm_table_put(struct dm_table *t); From 6df400ab64c68fc4072a13019fc20fd7e3d51303 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:19 +0000 Subject: [PATCH 50/80] dm mpath: flush workqueues before suspend completes This patch stops the remaining dm-mpath activity during the suspend sequence by flushing workqueues in postsuspend function. The current dm-mpath target may not be quiet even after suspend completes because some workqueues (e.g. device_handler's work, event handling) are not flushed during the suspend sequence, even though suspended devices/targets are supposed to be quiet in this state. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index dce971dbdfa3..5b23f2df9bde 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -885,13 +885,18 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, return r; } -static void multipath_dtr(struct dm_target *ti) +static void flush_multipath_work(void) { - struct multipath *m = (struct multipath *) ti->private; - flush_workqueue(kmpath_handlerd); flush_workqueue(kmultipathd); flush_scheduled_work(); +} + +static void multipath_dtr(struct dm_target *ti) +{ + struct multipath *m = ti->private; + + flush_multipath_work(); free_multipath(m); } @@ -1261,6 +1266,11 @@ static void multipath_presuspend(struct dm_target *ti) queue_if_no_path(m, 0, 1); } +static void multipath_postsuspend(struct dm_target *ti) +{ + flush_multipath_work(); +} + /* * Restore the queue_if_no_path setting. */ @@ -1567,13 +1577,14 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 1, 0}, + .version = {1, 1, 1}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, .map_rq = multipath_map, .rq_end_io = multipath_end_io, .presuspend = multipath_presuspend, + .postsuspend = multipath_postsuspend, .resume = multipath_resume, .status = multipath_status, .message = multipath_message, From 432a212c0dd0f4ca386cf37c5b740ac9dbda4479 Mon Sep 17 00:00:00 2001 From: Mike Anderson Date: Thu, 10 Dec 2009 23:52:20 +0000 Subject: [PATCH 51/80] dm: add dm_deleting_md function Add dm_deleting_md to check whether or not a given mapped device is currently being deleted. 
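For illustration, a hypothetical caller would use it to refuse new work once deletion has begun:

static int example_start_md_work(struct mapped_device *md)
{
        if (dm_deleting_md(md))
                return -ENXIO;  /* device is going away: start nothing new */

        /* ... safe to proceed against md here ... */
        return 0;
}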
Signed-off-by: Mike Anderson Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 9 +++++++-- drivers/md/dm.h | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 233a2e9156a3..16f759fe04fe 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -329,6 +329,11 @@ static void __exit dm_exit(void) /* * Block device functions */ +int dm_deleting_md(struct mapped_device *md) +{ + return test_bit(DMF_DELETING, &md->flags); +} + static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; @@ -340,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) goto out; if (test_bit(DMF_FREEING, &md->flags) || - test_bit(DMF_DELETING, &md->flags)) { + dm_deleting_md(md)) { md = NULL; goto out; } @@ -2659,7 +2664,7 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) return NULL; if (test_bit(DMF_FREEING, &md->flags) || - test_bit(DMF_DELETING, &md->flags)) + dm_deleting_md(md)) return NULL; dm_get(md); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 4a95e8fa3607..604a5f2a2383 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -88,6 +88,11 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt, int dm_split_args(int *argc, char ***argvp, char *input); +/* + * Is this mapped_device being deleted? + */ +int dm_deleting_md(struct mapped_device *md); + /* * The device-mapper can be driven through one of two interfaces; * ioctl or filesystem, depending which patch you have applied. From c50abeb38026ea721a812cf8a9b2fac5d3b7684b Mon Sep 17 00:00:00 2001 From: Mike Anderson Date: Thu, 10 Dec 2009 23:52:20 +0000 Subject: [PATCH 52/80] dm ioctl: forbid messages to devices being deleted Once we begin deleting a device, prevent any further messages being sent to targets of its table (to avoid races). Signed-off-by: Mike Anderson Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index bf3d19a0fb78..d06dd39856f3 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1303,6 +1303,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (!table) goto out_argv; + if (dm_deleting_md(md)) { + r = -ENXIO; + goto out_table; + } + ti = dm_table_find_target(table, tmsg->sector); if (!dm_target_is_valid(ti)) { DMWARN("Target message sector outside device."); @@ -1314,6 +1319,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) r = -EINVAL; } + out_table: dm_table_put(table); out_argv: kfree(argv); From 6380f26f0424034345461cabaab9a7030d905b59 Mon Sep 17 00:00:00 2001 From: Mike Anderson Date: Thu, 10 Dec 2009 23:52:21 +0000 Subject: [PATCH 53/80] dm mpath: add mutex to synchronize adding and flushing work Add a mutex to allow possible creators of new work to synchronize with flushing work queues. Signed-off-by: Mike Anderson Acked-by: Kiyoshi Ueda Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 59 ++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 5b23f2df9bde..700154e21483 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -93,6 +93,8 @@ struct multipath { * can resubmit bios on error. 
*/ mempool_t *mpio_pool; + + struct mutex work_mutex; }; /* @@ -198,6 +200,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) m->queue_io = 1; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); + mutex_init(&m->work_mutex); m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); if (!m->mpio_pool) { kfree(m); @@ -1268,7 +1271,11 @@ static void multipath_presuspend(struct dm_target *ti) static void multipath_postsuspend(struct dm_target *ti) { + struct multipath *m = ti->private; + + mutex_lock(&m->work_mutex); flush_multipath_work(); + mutex_unlock(&m->work_mutex); } /* @@ -1407,51 +1414,61 @@ static int multipath_status(struct dm_target *ti, status_type_t type, static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) { - int r; + int r = -EINVAL; struct dm_dev *dev; struct multipath *m = (struct multipath *) ti->private; action_fn action; + mutex_lock(&m->work_mutex); + if (argc == 1) { - if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) - return queue_if_no_path(m, 1, 0); - else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) - return queue_if_no_path(m, 0, 0); + if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { + r = queue_if_no_path(m, 1, 0); + goto out; + } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { + r = queue_if_no_path(m, 0, 0); + goto out; + } } - if (argc != 2) - goto error; + if (argc != 2) { + DMWARN("Unrecognised multipath message received."); + goto out; + } - if (!strnicmp(argv[0], MESG_STR("disable_group"))) - return bypass_pg_num(m, argv[1], 1); - else if (!strnicmp(argv[0], MESG_STR("enable_group"))) - return bypass_pg_num(m, argv[1], 0); - else if (!strnicmp(argv[0], MESG_STR("switch_group"))) - return switch_pg_num(m, argv[1]); - else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) + if (!strnicmp(argv[0], MESG_STR("disable_group"))) { + r = bypass_pg_num(m, argv[1], 1); + goto out; + } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { + r = bypass_pg_num(m, argv[1], 0); + goto out; + } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { + r = switch_pg_num(m, argv[1]); + goto out; + } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) action = reinstate_path; else if (!strnicmp(argv[0], MESG_STR("fail_path"))) action = fail_path; - else - goto error; + else { + DMWARN("Unrecognised multipath message received."); + goto out; + } r = dm_get_device(ti, argv[1], ti->begin, ti->len, dm_table_get_mode(ti->table), &dev); if (r) { DMWARN("message: error getting device %s", argv[1]); - return -EINVAL; + goto out; } r = action_dev(m, dev, action); dm_put_device(ti, dev); +out: + mutex_unlock(&m->work_mutex); return r; - -error: - DMWARN("Unrecognised multipath message received."); - return -EINVAL; } static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, From 67a46dad25ccc8910995d8671f7ec17a6bc823cb Mon Sep 17 00:00:00 2001 From: Mike Anderson Date: Thu, 10 Dec 2009 23:52:21 +0000 Subject: [PATCH 54/80] dm mpath: prevent io from work queue while suspended Reject messages that can generate I/O while the device itself is suspended. 
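A condensed sketch of the guard this patch adds (the real change to multipath_message() is in the diff below; the function name here is hypothetical):

static int message_guard_sketch(struct multipath *m)
{
        int r = 0;

        mutex_lock(&m->work_mutex);
        if (m->suspended) {
                r = -EBUSY;     /* refuse: handling it could create new I/O */
                goto out;
        }
        /* ... act on the message here ... */
out:
        mutex_unlock(&m->work_mutex);
        return r;
}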
Signed-off-by: Mike Anderson Acked-by: Kiyoshi Ueda Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 700154e21483..45d9bf14cc48 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -95,6 +95,8 @@ struct multipath { mempool_t *mpio_pool; struct mutex work_mutex; + + unsigned suspended; /* Don't create new I/O internally when set. */ }; /* @@ -1274,6 +1276,7 @@ static void multipath_postsuspend(struct dm_target *ti) struct multipath *m = ti->private; mutex_lock(&m->work_mutex); + m->suspended = 1; flush_multipath_work(); mutex_unlock(&m->work_mutex); } @@ -1286,6 +1289,10 @@ static void multipath_resume(struct dm_target *ti) struct multipath *m = (struct multipath *) ti->private; unsigned long flags; + mutex_lock(&m->work_mutex); + m->suspended = 0; + mutex_unlock(&m->work_mutex); + spin_lock_irqsave(&m->lock, flags); m->queue_if_no_path = m->saved_queue_if_no_path; spin_unlock_irqrestore(&m->lock, flags); @@ -1421,6 +1428,11 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) mutex_lock(&m->work_mutex); + if (m->suspended) { + r = -EBUSY; + goto out; + } + if (argc == 1) { if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { r = queue_if_no_path(m, 1, 0); From 12fc0f49dc994d8d90dcf3df13f5b1ee5441288d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:22 +0000 Subject: [PATCH 55/80] dm io: handle empty barriers Accept empty barriers in dm-io. dm-io will process empty write barrier requests just like the other read/write requests. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-io.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index f6a714c5aab0..10f457ca6af2 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -309,7 +309,11 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, unsigned num_bvecs; sector_t remaining = where->count; - while (remaining) { + /* + * where->count may be zero if rw holds a write barrier and we + * need to send a zero-sized barrier. + */ + do { /* * Allocate a suitably sized-bio. */ @@ -339,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, atomic_inc(&io->count); submit_bio(rw, bio); - } + } while (remaining); } static void dispatch_io(int rw, unsigned int num_regions, @@ -360,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions, */ for (i = 0; i < num_regions; i++) { *dp = old_pages; - if (where[i].count) + if (where[i].count || (rw & (1 << BIO_RW_BARRIER))) do_region(rw, i, where + i, dp, io); } From 1d0f3ce83200edc5d43723c77c62b09ad6560294 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Dec 2009 23:52:22 +0000 Subject: [PATCH 56/80] dm ioctl: retrieve status from inactive table Add the flag DM_QUERY_INACTIVE_TABLE_FLAG to the ioctls to return information about the loaded-but-not-yet-active table instead of the live table. Prior to this patch it was impossible to obtain this information until the device had been 'resumed'. Userspace dmsetup and libdevmapper support the flag as of version 1.02.40. e.g.
dmsetup info --inactive vg1-lv1 Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 70 ++++++++++++++++++++++++++++++++-------- include/linux/dm-ioctl.h | 13 ++++++-- 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index d06dd39856f3..a3d20265ffc1 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -523,8 +523,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size) return 0; } - - static int check_name(const char *name) { if (strchr(name, '/')) { @@ -535,6 +533,40 @@ static int check_name(const char *name) return 0; } +/* + * On successful return, the caller must not attempt to acquire + * _hash_lock without first calling dm_table_put, because dm_table_destroy + * waits for this dm_table_put and could be called under this lock. + */ +static struct dm_table *dm_get_inactive_table(struct mapped_device *md) +{ + struct hash_cell *hc; + struct dm_table *table = NULL; + + down_read(&_hash_lock); + hc = dm_get_mdptr(md); + if (!hc || hc->md != md) { + DMWARN("device has been removed from the dev hash table."); + goto out; + } + + table = hc->new_map; + if (table) + dm_table_get(table); + +out: + up_read(&_hash_lock); + + return table; +} + +static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, + struct dm_ioctl *param) +{ + return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? + dm_get_inactive_table(md) : dm_get_live_table(md); +} + /* * Fills in a dm_ioctl structure, ready for sending back to * userland. @@ -559,18 +591,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) */ param->open_count = dm_open_count(md); - if (get_disk_ro(disk)) - param->flags |= DM_READONLY_FLAG; - param->event_nr = dm_get_event_nr(md); + param->target_count = 0; table = dm_get_live_table(md); if (table) { - param->flags |= DM_ACTIVE_PRESENT_FLAG; - param->target_count = dm_table_get_num_targets(table); + if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { + if (get_disk_ro(disk)) + param->flags |= DM_READONLY_FLAG; + param->target_count = dm_table_get_num_targets(table); + } dm_table_put(table); - } else - param->target_count = 0; + + param->flags |= DM_ACTIVE_PRESENT_FLAG; + } + + if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { + table = dm_get_inactive_table(md); + if (table) { + if (!(dm_table_get_mode(table) & FMODE_WRITE)) + param->flags |= DM_READONLY_FLAG; + param->target_count = dm_table_get_num_targets(table); + dm_table_put(table); + } + } return 0; } @@ -993,7 +1037,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) if (r) goto out; - table = dm_get_live_table(md); + table = dm_get_live_or_inactive_table(md, param); if (table) { retrieve_status(table, param, param_size); dm_table_put(table); @@ -1226,7 +1270,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) if (r) goto out; - table = dm_get_live_table(md); + table = dm_get_live_or_inactive_table(md, param); if (table) { retrieve_deps(table, param, param_size); dm_table_put(table); @@ -1255,13 +1299,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size) if (r) goto out; - table = dm_get_live_table(md); + table = dm_get_live_or_inactive_table(md, param); if (table) { retrieve_status(table, param, param_size); dm_table_put(table); } - out: +out: dm_put(md); return r; } diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 2ab84c83c31a..aa95508d2f95 100644 --- a/include/linux/dm-ioctl.h +++ 
b/include/linux/dm-ioctl.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. - * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004 - 2009 Red Hat, Inc. All rights reserved. * * This file is released under the LGPL. */ @@ -266,9 +266,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 15 +#define DM_VERSION_MINOR 16 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2009-04-01)" +#define DM_VERSION_EXTRA "-ioctl (2009-11-05)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -309,4 +309,11 @@ enum { */ #define DM_NOFLUSH_FLAG (1 << 11) /* In */ +/* + * If set, any table information returned will relate to the inactive + * table instead of the live one. Always check DM_INACTIVE_PRESENT_FLAG + * is set before using the data returned. + */ +#define DM_QUERY_INACTIVE_TABLE_FLAG (1 << 12) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ From a794015597a2d9b437470c7692aac77e5fc08cd2 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 10 Dec 2009 23:52:23 +0000 Subject: [PATCH 57/80] dm: bind new table before destroying old When replacing a mapped device's table during a 'resume', delay the destruction of the old table until the new one is successfully in place. This will make it easier for a later patch to transfer internal state information from the old table to the new one (something we do not currently support) while giving us more options for reversion if a later part of the operation fails. Devices are always in the suspended state during dm_swap_table(). This patch reinforces the requirement that all I/O must have been flushed from the table targets while in this state (including any in workqueues). In the case of 'noflush' suspending, unprocessed I/O should have been 'pushed back' to the dm core prior to this point, for resubmission after the new table is in place. Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 3 +++ drivers/md/dm.c | 16 +++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 1a6cb3c7822e..3162b9c3c3f2 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -237,6 +237,9 @@ void dm_table_destroy(struct dm_table *t) { unsigned int i; + if (!t) + return; + while (atomic_read(&t->holders)) msleep(1); smp_mb(); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 16f759fe04fe..255b75c61544 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2077,19 +2077,23 @@ static int __bind(struct mapped_device *md, struct dm_table *t, return 0; } -static void __unbind(struct mapped_device *md) +/* + * Returns unbound table for the caller to free. 
+ */ +static struct dm_table *__unbind(struct mapped_device *md) { struct dm_table *map = md->map; unsigned long flags; if (!map) - return; + return NULL; dm_table_event_callback(map, NULL, NULL); write_lock_irqsave(&md->map_lock, flags); md->map = NULL; write_unlock_irqrestore(&md->map_lock, flags); - dm_table_destroy(map); + + return map; } /* @@ -2182,7 +2186,7 @@ void dm_put(struct mapped_device *md) } dm_sysfs_exit(md); dm_table_put(map); - __unbind(md); + dm_table_destroy(__unbind(md)); free_dev(md); } } @@ -2368,6 +2372,7 @@ static void dm_rq_barrier_work(struct work_struct *work) */ int dm_swap_table(struct mapped_device *md, struct dm_table *table) { + struct dm_table *map; struct queue_limits limits; int r = -EINVAL; @@ -2388,8 +2393,9 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) goto out; } - __unbind(md); + map = __unbind(md); r = __bind(md, table, &limits); + dm_table_destroy(map); out: mutex_unlock(&md->suspend_lock); From 042d2a9bcd80fe12d4b0871706aa9dd2231e8238 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 10 Dec 2009 23:52:24 +0000 Subject: [PATCH 58/80] dm: keep old table until after resume succeeded When swapping a new table into place, retain the old table until its replacement is in place. An old check for an empty table is removed because this is enforced in populate_table(). __unbind() becomes redundant when followed by __bind(). Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 10 ++++++---- drivers/md/dm.c | 34 +++++++++++++++++----------------- include/linux/device-mapper.h | 4 +++- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a3d20265ffc1..63fd4de25bda 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -855,7 +855,7 @@ static int do_resume(struct dm_ioctl *param) unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; struct hash_cell *hc; struct mapped_device *md; - struct dm_table *new_map; + struct dm_table *new_map, *old_map = NULL; down_write(&_hash_lock); @@ -884,11 +884,11 @@ static int do_resume(struct dm_ioctl *param) if (!dm_suspended(md)) dm_suspend(md, suspend_flags); - r = dm_swap_table(md, new_map); - if (r) { + old_map = dm_swap_table(md, new_map); + if (IS_ERR(old_map)) { dm_table_destroy(new_map); dm_put(md); - return r; + return PTR_ERR(old_map); } if (dm_table_get_mode(new_map) & FMODE_WRITE) @@ -900,6 +900,8 @@ static int do_resume(struct dm_ioctl *param) if (dm_suspended(md)) r = dm_resume(md); + if (old_map) + dm_table_destroy(old_map); if (!r) { dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 255b75c61544..c254c6cf69a1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2033,9 +2033,13 @@ static void __set_size(struct mapped_device *md, sector_t size) mutex_unlock(&md->bdev->bd_inode->i_mutex); } -static int __bind(struct mapped_device *md, struct dm_table *t, - struct queue_limits *limits) +/* + * Returns old map, which caller must destroy. 
+ */ +static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, + struct queue_limits *limits) { + struct dm_table *old_map; struct request_queue *q = md->queue; sector_t size; unsigned long flags; @@ -2050,11 +2054,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t, __set_size(md, size); - if (!size) { - dm_table_destroy(t); - return 0; - } - dm_table_event_callback(t, event_callback, md); /* @@ -2070,11 +2069,12 @@ static int __bind(struct mapped_device *md, struct dm_table *t, __bind_mempools(md, t); write_lock_irqsave(&md->map_lock, flags); + old_map = md->map; md->map = t; dm_table_set_restrictions(t, q, limits); write_unlock_irqrestore(&md->map_lock, flags); - return 0; + return old_map; } /* @@ -2368,13 +2368,13 @@ static void dm_rq_barrier_work(struct work_struct *work) } /* - * Swap in a new table (destroying old one). + * Swap in a new table, returning the old one for the caller to destroy. */ -int dm_swap_table(struct mapped_device *md, struct dm_table *table) +struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { - struct dm_table *map; + struct dm_table *map = ERR_PTR(-EINVAL); struct queue_limits limits; - int r = -EINVAL; + int r; mutex_lock(&md->suspend_lock); @@ -2383,8 +2383,10 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) goto out; r = dm_calculate_queue_limits(table, &limits); - if (r) + if (r) { + map = ERR_PTR(r); goto out; + } /* cannot change the device type, once a table is bound */ if (md->map && @@ -2393,13 +2395,11 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) goto out; } - map = __unbind(md); - r = __bind(md, table, &limits); - dm_table_destroy(map); + map = __bind(md, table, &limits); out: mutex_unlock(&md->suspend_lock); - return r; + return map; } /* diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index fc16a351ef7a..b9c6c8ca11be 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -295,8 +295,10 @@ void dm_table_event(struct dm_table *t); /* * The device must be suspended before calling this method. + * Returns the previous table, which the caller must destroy. */ -int dm_swap_table(struct mapped_device *md, struct dm_table *t); +struct dm_table *dm_swap_table(struct mapped_device *md, + struct dm_table *t); /* * A wrapper around vmalloc. From c1f0c183f6acc6d32c5a1d0249ec68bf783af7b1 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Dec 2009 23:52:24 +0000 Subject: [PATCH 59/80] dm snapshot: allow live exception store handover between tables Permit in-use snapshot exception data to be 'handed over' from one snapshot instance to another. This is a pre-requisite for patches that allow the changes made in a snapshot device to be merged back into its origin device and also allows device resizing. The basic call sequence is: dmsetup load new_snapshot (referencing the existing in-use cow device) - the ctr code detects that the cow is already in use and allows the two snapshot target instances to be linked together dmsetup suspend original_snapshot dmsetup resume new_snapshot - the new_snapshot becomes live, and if anything now tries to access the original one it will receive -EIO dmsetup remove original_snapshot (There can only be two snapshot targets referencing the same cow device simultaneously.) 
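The handover itself (__handover_exceptions() in the diff below) amounts to exchanging the exception state of the two instances while both are locked. A simplified sketch of that exchange, using the kernel's swap() helper where the patch open-codes it through a union:

static void handover_sketch(struct dm_snapshot *snap_src,
			    struct dm_snapshot *snap_dest)
{
	/* Caller holds both snapshot locks (see snapshot_resume). */
	swap(snap_dest->complete, snap_src->complete);	/* exception table */
	swap(snap_dest->store, snap_src->store);	/* exception store */

	/* Re-point each store at its new owner. */
	snap_dest->store->snap = snap_dest;
	snap_src->store->snap = snap_src;

	snap_dest->ti->split_io = snap_dest->store->chunk_size;
	snap_dest->valid = snap_src->valid;

	/* The old instance must receive no further I/O. */
	snap_src->valid = 0;
}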
Signed-off-by: Mike Snitzer Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 263 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 236 insertions(+), 27 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index fd04caa90340..b5b9118c0636 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -302,23 +302,117 @@ static void __insert_origin(struct origin *o) list_add_tail(&o->hash_list, sl); } +/* + * _origins_lock must be held when calling this function. + * Returns number of snapshots registered using the supplied cow device, plus: + * snap_src - a snapshot suitable for use as a source of exception handover + * snap_dest - a snapshot capable of receiving exception handover. + * + * Possible return values and states: + * 0: NULL, NULL - first new snapshot + * 1: snap_src, NULL - normal snapshot + * 2: snap_src, snap_dest - waiting for handover + * 2: snap_src, NULL - handed over, waiting for old to be deleted + * 1: NULL, snap_dest - source got destroyed without handover + */ +static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, + struct dm_snapshot **snap_src, + struct dm_snapshot **snap_dest) +{ + struct dm_snapshot *s; + struct origin *o; + int count = 0; + int active; + + o = __lookup_origin(snap->origin->bdev); + if (!o) + goto out; + + list_for_each_entry(s, &o->snapshots, list) { + if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) + continue; + + down_read(&s->lock); + active = s->active; + up_read(&s->lock); + + if (active) { + if (snap_src) + *snap_src = s; + } else if (snap_dest) + *snap_dest = s; + + count++; + } + +out: + return count; +} + +/* + * On success, returns 1 if this snapshot is a handover destination, + * otherwise returns 0. + */ +static int __validate_exception_handover(struct dm_snapshot *snap) +{ + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + /* Does snapshot need exceptions handed over to it? */ + if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest) == 2) || + snap_dest) { + snap->ti->error = "Snapshot cow pairing for exception " + "table handover failed"; + return -EINVAL; + } + + /* + * If no snap_src was found, snap cannot become a handover + * destination. + */ + if (!snap_src) + return 0; + + return 1; +} + +static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) +{ + struct dm_snapshot *l; + + /* Sort the list according to chunk size, largest-first smallest-last */ + list_for_each_entry(l, &o->snapshots, list) + if (l->store->chunk_size < s->store->chunk_size) + break; + list_add_tail(&s->list, &l->list); +} + /* * Make a note of the snapshot and its origin so we can look it * up when the origin has a write on it. + * + * Also validate snapshot exception store handovers. + * On success, returns 1 if this registration is a handover destination, + * otherwise returns 0. 
*/ static int register_snapshot(struct dm_snapshot *snap) { - struct dm_snapshot *l; - struct origin *o, *new_o; + struct origin *o, *new_o = NULL; struct block_device *bdev = snap->origin->bdev; + int r = 0; new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); if (!new_o) return -ENOMEM; down_write(&_origins_lock); - o = __lookup_origin(bdev); + r = __validate_exception_handover(snap); + if (r < 0) { + kfree(new_o); + goto out; + } + + o = __lookup_origin(bdev); if (o) kfree(new_o); else { @@ -332,14 +426,27 @@ static int register_snapshot(struct dm_snapshot *snap) __insert_origin(o); } - /* Sort the list according to chunk size, largest-first smallest-last */ - list_for_each_entry(l, &o->snapshots, list) - if (l->store->chunk_size < snap->store->chunk_size) - break; - list_add_tail(&snap->list, &l->list); + __insert_snapshot(o, snap); + +out: + up_write(&_origins_lock); + + return r; +} + +/* + * Move snapshot to correct place in list according to chunk size. + */ +static void reregister_snapshot(struct dm_snapshot *s) +{ + struct block_device *bdev = s->origin->bdev; + + down_write(&_origins_lock); + + list_del(&s->list); + __insert_snapshot(__lookup_origin(bdev), s); up_write(&_origins_lock); - return 0; } static void unregister_snapshot(struct dm_snapshot *s) @@ -350,7 +457,7 @@ static void unregister_snapshot(struct dm_snapshot *s) o = __lookup_origin(s->origin->bdev); list_del(&s->list); - if (list_empty(&o->snapshots)) { + if (o && list_empty(&o->snapshots)) { list_del(&o->hash_list); kfree(o); } @@ -662,6 +769,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->suspended = 0; atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); + INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); /* Allocate hash table for COW data */ @@ -696,39 +804,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); - /* Metadata must only be loaded into one table at once */ + bio_list_init(&s->queued_bios); + INIT_WORK(&s->queued_bios_work, flush_queued_bios); + + ti->private = s; + ti->num_flush_requests = 1; + + /* Add snapshot to the list of snapshots for this origin */ + /* Exceptions aren't triggered till snapshot_resume() is called */ + r = register_snapshot(s); + if (r == -ENOMEM) { + ti->error = "Snapshot origin struct allocation failed"; + goto bad_load_and_register; + } else if (r < 0) { + /* invalid handover, register_snapshot has set ti->error */ + goto bad_load_and_register; + } + + /* + * Metadata must only be loaded into one table at once, so skip this + * if metadata will be handed over during resume. + * Chunk size will be set during the handover - set it to zero to + * ensure it's ignored. 
+ */ + if (r > 0) { + s->store->chunk_size = 0; + return 0; + } + r = s->store->type->read_metadata(s->store, dm_add_exception, (void *)s); if (r < 0) { ti->error = "Failed to read snapshot metadata"; - goto bad_load_and_register; + goto bad_read_metadata; } else if (r > 0) { s->valid = 0; DMWARN("Snapshot is marked invalid."); } - bio_list_init(&s->queued_bios); - INIT_WORK(&s->queued_bios_work, flush_queued_bios); - if (!s->store->chunk_size) { ti->error = "Chunk size not set"; - goto bad_load_and_register; + goto bad_read_metadata; } - - /* Add snapshot to the list of snapshots for this origin */ - /* Exceptions aren't triggered till snapshot_resume() is called */ - if (register_snapshot(s)) { - r = -EINVAL; - ti->error = "Cannot register snapshot origin"; - goto bad_load_and_register; - } - - ti->private = s; ti->split_io = s->store->chunk_size; - ti->num_flush_requests = 1; return 0; +bad_read_metadata: + unregister_snapshot(s); + bad_load_and_register: mempool_destroy(s->tracked_chunk_pool); @@ -767,15 +891,58 @@ static void __free_exceptions(struct dm_snapshot *s) dm_exception_table_exit(&s->complete, exception_cache); } +static void __handover_exceptions(struct dm_snapshot *snap_src, + struct dm_snapshot *snap_dest) +{ + union { + struct dm_exception_table table_swap; + struct dm_exception_store *store_swap; + } u; + + /* + * Swap all snapshot context information between the two instances. + */ + u.table_swap = snap_dest->complete; + snap_dest->complete = snap_src->complete; + snap_src->complete = u.table_swap; + + u.store_swap = snap_dest->store; + snap_dest->store = snap_src->store; + snap_src->store = u.store_swap; + + snap_dest->store->snap = snap_dest; + snap_src->store->snap = snap_src; + + snap_dest->ti->split_io = snap_dest->store->chunk_size; + snap_dest->valid = snap_src->valid; + + /* + * Set source invalid to ensure it receives no further I/O. + */ + snap_src->valid = 0; +} + static void snapshot_dtr(struct dm_target *ti) { #ifdef CONFIG_DM_DEBUG int i; #endif struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; flush_workqueue(ksnapd); + down_read(&_origins_lock); + /* Check whether exception handover must be cancelled */ + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest); + if (snap_src && snap_dest && (s == snap_src)) { + down_write(&snap_dest->lock); + snap_dest->valid = 0; + up_write(&snap_dest->lock); + DMERR("Cancelling snapshot handover."); + } + up_read(&_origins_lock); + /* Prevent further origin writes from using this snapshot. */ /* After this returns there can be no new kcopyd jobs. 
*/ unregister_snapshot(s); @@ -1188,9 +1355,50 @@ static void snapshot_postsuspend(struct dm_target *ti) up_write(&s->lock); } +static int snapshot_preresume(struct dm_target *ti) +{ + int r = 0; + struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest); + if (snap_src && snap_dest) { + down_read(&snap_src->lock); + if (s == snap_src) { + DMERR("Unable to resume snapshot source until " + "handover completes."); + r = -EINVAL; + } else if (!snap_src->suspended) { + DMERR("Unable to perform snapshot handover until " + "source is suspended."); + r = -EINVAL; + } + up_read(&snap_src->lock); + } + up_read(&_origins_lock); + + return r; +} + static void snapshot_resume(struct dm_target *ti) { struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest); + if (snap_src && snap_dest) { + down_write(&snap_src->lock); + down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); + __handover_exceptions(snap_src, snap_dest); + up_write(&snap_dest->lock); + up_write(&snap_src->lock); + } + up_read(&_origins_lock); + + /* Now we have correct chunk size, reregister */ + reregister_snapshot(s); down_write(&s->lock); s->active = 1; @@ -1510,6 +1718,7 @@ static struct target_type snapshot_target = { .map = snapshot_map, .end_io = snapshot_end_io, .postsuspend = snapshot_postsuspend, + .preresume = snapshot_preresume, .resume = snapshot_resume, .status = snapshot_status, .iterate_devices = snapshot_iterate_devices, From 6db4ccd6357f28c9ef7058b3bc48904c4b2ac419 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Thu, 10 Dec 2009 23:52:25 +0000 Subject: [PATCH 60/80] dm: trace request based remapping This patch adds a remapping trace to request-based dm. BIO-based dm already has the equivalent tracepoint. For example, under this dm stack (linear LV on multipath): # dmsetup ls --tree -o ascii vg-lv0 (253:1) `-mpath0 (253:0) |- (8:160) |- (66:80) |- (65:176) `- (65:160) Trace of 'dd of=/dev/vg/lv0 bs=128k count=1 oflag=direct' looks like this: without the patch: dd-6674 [000] 539.727384: block_bio_queue: 253,1 WS 0 + 256 [dd] dd-6674 [000] 539.727392: block_remap: 253,0 WS 384 + 256 <- (253,1) 0 dd-6674 [000] 539.727394: block_bio_queue: 253,0 WS 384 + 256 [dd] dd-6674 [000] 539.727405: block_getrq: 253,0 WS 384 + 256 [dd] dd-6674 [000] 539.727409: block_plug: [dd] dd-6674 [000] 539.727410: block_rq_insert: 253,0 W 0 () 384 + 256 [dd] dd-6674 [000] 539.727416: block_rq_issue: 253,0 W 0 () 384 + 256 [dd] dd-6674 [000] 539.727426: block_rq_insert: 65,176 W 0 () 384 + 256 [dd] dd-6674 [000] 539.727427: block_rq_issue: 65,176 W 0 () 384 + 256 [dd] ... 
and with the patch: (the line with '**' is the trace added by this patch) dd-6617 [002] 162.914301: block_bio_queue: 253,1 WS 0 + 256 [dd] dd-6617 [002] 162.914314: block_remap: 253,0 WS 384 + 256 <- (253,1) 0 dd-6617 [002] 162.914316: block_bio_queue: 253,0 WS 384 + 256 [dd] dd-6617 [002] 162.914331: block_getrq: 253,0 WS 384 + 256 [dd] dd-6617 [002] 162.914335: block_plug: [dd] dd-6617 [002] 162.914337: block_rq_insert: 253,0 W 0 () 384 + 256 [dd] dd-6617 [002] 162.914347: block_rq_issue: 253,0 W 0 () 384 + 256 [dd] **dd-6617 [002] 162.914356: block_rq_remap: 65,176 W 384 + 256 <- (253,0) 384 dd-6617 [002] 162.914358: block_rq_insert: 65,176 W 0 () 384 + 256 [dd] dd-6617 [002] 162.914359: block_rq_issue: 65,176 W 0 () 384 + 256 [dd] ... Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Jens Axboe Cc: Li Zefan Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c254c6cf69a1..40e257fadac9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1618,6 +1618,8 @@ static void map_request(struct dm_target *ti, struct request *clone, break; case DM_MAPIO_REMAPPED: /* The target has remapped the I/O so dispatch it */ + trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), + blk_rq_pos(tio->orig)); dm_dispatch_request(clone); break; case DM_MAPIO_REQUEUE: From 61afef614b013ee1b767cdd10325acae1db1f4d2 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 10 Dec 2009 23:52:25 +0000 Subject: [PATCH 61/80] dm crypt: add plain64 iv The default plain IV is 32-bit only. This plain64 IV provides a compatible mode for encrypted devices bigger than 4TB. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 91e1bf91769f..a93637223c8d 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -158,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); * plain: the initial vector is the 32-bit little-endian version of the sector * number, padded with zeros if necessary. * + * plain64: the initial vector is the 64-bit little-endian version of the sector + * number, padded with zeros if necessary. + * * essiv: "encrypted sector|salt initial vector", the sector number is * encrypted with the bulk cipher using a salt as key. The salt * should be derived from the bulk cipher's key via hashing. 
@@ -180,6 +183,15 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) return 0; } +static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, + sector_t sector) +{ + memset(iv, 0, cc->iv_size); + *(u64 *)iv = cpu_to_le64(sector); + + return 0; +} + /* Initialise ESSIV - compute salt but no local memory allocations */ static int crypt_iv_essiv_init(struct crypt_config *cc) { @@ -342,6 +354,10 @@ static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; +static struct crypt_iv_operations crypt_iv_plain64_ops = { + .generator = crypt_iv_plain64_gen +}; + static struct crypt_iv_operations crypt_iv_essiv_ops = { .ctr = crypt_iv_essiv_ctr, .dtr = crypt_iv_essiv_dtr, @@ -1063,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->iv_gen_ops = NULL; else if (strcmp(ivmode, "plain") == 0) cc->iv_gen_ops = &crypt_iv_plain_ops; + else if (strcmp(ivmode, "plain64") == 0) + cc->iv_gen_ops = &crypt_iv_plain64_ops; else if (strcmp(ivmode, "essiv") == 0) cc->iv_gen_ops = &crypt_iv_essiv_ops; else if (strcmp(ivmode, "benbi") == 0) From 4d4471cb5c1ec426c0f24818b270dc7b3ad7e655 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:26 +0000 Subject: [PATCH 62/80] dm: swap target postsuspend call and setting suspended flag This patch moves DMF_SUSPENDED flag set before postsuspend. No one should care about the ordering, because the flag set and the postsuspend are protected by a single lock, md->suspend_lock, and all strict flag-checkers take the lock. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Mike Anderson Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 40e257fadac9..f2b993c43359 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2550,10 +2550,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * requests are being added to md->deferred list. */ - dm_table_postsuspend_targets(map); - set_bit(DMF_SUSPENDED, &md->flags); + dm_table_postsuspend_targets(map); + out: dm_table_put(map); From 4f186f8bbfa92bf1a2b39f7a8674348bfdba9437 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:26 +0000 Subject: [PATCH 63/80] dm: rename dm_suspended to dm_suspended_md This patch renames dm_suspended() to dm_suspended_md() and keeps it internal to dm. No functional change. 
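Together with the following patch, which exports a target-level helper, the suspended-state queries end up split as follows (signatures as they appear in this series):

/* Internal to the dm core (drivers/md/dm.h): */
int dm_suspended_md(struct mapped_device *md);

/* Exported to targets by the next patch (include/linux/device-mapper.h): */
int dm_suspended(struct dm_target *ti);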
Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Mike Anderson Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 8 ++++---- drivers/md/dm-sysfs.c | 2 +- drivers/md/dm.c | 12 ++++++------ drivers/md/dm.h | 5 +++++ include/linux/device-mapper.h | 1 - 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 63fd4de25bda..1d669322b27c 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -579,7 +579,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | DM_ACTIVE_PRESENT_FLAG); - if (dm_suspended(md)) + if (dm_suspended_md(md)) param->flags |= DM_SUSPEND_FLAG; param->dev = huge_encode_dev(disk_devt(disk)); @@ -839,7 +839,7 @@ static int do_suspend(struct dm_ioctl *param) if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended(md)) + if (!dm_suspended_md(md)) r = dm_suspend(md, suspend_flags); if (!r) @@ -881,7 +881,7 @@ static int do_resume(struct dm_ioctl *param) suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended(md)) + if (!dm_suspended_md(md)) dm_suspend(md, suspend_flags); old_map = dm_swap_table(md, new_map); @@ -897,7 +897,7 @@ static int do_resume(struct dm_ioctl *param) set_disk_ro(dm_disk(md), 1); } - if (dm_suspended(md)) + if (dm_suspended_md(md)) r = dm_resume(md); if (old_map) diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index b000de38c99a..f53392df7b97 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) { - sprintf(buf, "%d\n", dm_suspended(md)); + sprintf(buf, "%d\n", dm_suspended_md(md)); return strlen(buf); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f2b993c43359..e0702bf37935 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -415,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, tgt = dm_table_get_target(map, 0); - if (dm_suspended(md)) { + if (dm_suspended_md(md)) { r = -EAGAIN; goto out; } @@ -2182,7 +2182,7 @@ void dm_put(struct mapped_device *md) MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); - if (!dm_suspended(md)) { + if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); dm_table_postsuspend_targets(map); } @@ -2381,7 +2381,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) mutex_lock(&md->suspend_lock); /* device must be suspended */ - if (!dm_suspended(md)) + if (!dm_suspended_md(md)) goto out; r = dm_calculate_queue_limits(table, &limits); @@ -2461,7 +2461,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) mutex_lock(&md->suspend_lock); - if (dm_suspended(md)) { + if (dm_suspended_md(md)) { r = -EINVAL; goto out_unlock; } @@ -2568,7 +2568,7 @@ int dm_resume(struct mapped_device *md) struct dm_table *map = NULL; mutex_lock(&md->suspend_lock); - if (!dm_suspended(md)) + if (!dm_suspended_md(md)) goto out; map = dm_get_live_table(md); @@ -2679,7 +2679,7 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) return md; } -int dm_suspended(struct mapped_device *md) +int dm_suspended_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED, &md->flags); } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 
604a5f2a2383..8dadaa5bc396 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -93,6 +93,11 @@ int dm_split_args(int *argc, char ***argvp, char *input); */ int dm_deleting_md(struct mapped_device *md); +/* + * Is this mapped_device suspended? + */ +int dm_suspended_md(struct mapped_device *md); + /* * The device-mapper can be driven through one of two interfaces; * ioctl or filesystem, depending which patch you have applied. diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index b9c6c8ca11be..fca0d31bbf2d 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -235,7 +235,6 @@ void dm_uevent_add(struct mapped_device *md, struct list_head *elist); const char *dm_device_name(struct mapped_device *md); int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid); struct gendisk *dm_disk(struct mapped_device *md); -int dm_suspended(struct mapped_device *md); int dm_noflush_suspending(struct dm_target *ti); union map_info *dm_get_mapinfo(struct bio *bio); union map_info *dm_get_rq_mapinfo(struct request *rq); From 64dbce580d5a7e89e8de20b91f80c7267cdad91d Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:27 +0000 Subject: [PATCH 64/80] dm: export suspended state to targets This patch adds the exported dm_suspended() function so that targets can check whether or not they are suspended. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Mike Anderson Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 11 +++++++++++ include/linux/device-mapper.h | 1 + 2 files changed, 12 insertions(+) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e0702bf37935..3167480b532c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2684,6 +2684,17 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_suspended(struct dm_target *ti) +{ + struct mapped_device *md = dm_table_get_md(ti->table); + int r = dm_suspended_md(md); + + dm_put(md); + + return r; +} +EXPORT_SYMBOL_GPL(dm_suspended); + int dm_noflush_suspending(struct dm_target *ti) { struct mapped_device *md = dm_table_get_md(ti->table); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index fca0d31bbf2d..d4c9c0b88adc 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -235,6 +235,7 @@ void dm_uevent_add(struct mapped_device *md, struct list_head *elist); const char *dm_device_name(struct mapped_device *md); int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid); struct gendisk *dm_disk(struct mapped_device *md); +int dm_suspended(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); union map_info *dm_get_mapinfo(struct bio *bio); union map_info *dm_get_rq_mapinfo(struct request *rq); From c2f3d24b783fda20618b73d65678eb5dfae31a5d Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 10 Dec 2009 23:52:27 +0000 Subject: [PATCH 65/80] dm mpath: reject messages when device is suspended This patch rejects messages that can generate I/O while the device itself is suspended. 
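With dm_suspended() exported by the previous patch, any target can apply the same guard at the top of its message handler. A minimal sketch (example_message() and its target are hypothetical, shown only to illustrate the pattern):

static int example_message(struct dm_target *ti, unsigned argc, char **argv)
{
	/*
	 * Hypothetical target: refuse messages while suspended, since
	 * acting on one could generate I/O after the suspend has
	 * already flushed all outstanding work.
	 */
	if (dm_suspended(ti))
		return -EBUSY;

	/* ... parse argv and perform the requested action ... */
	return 0;
}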
Signed-off-by: Kiyoshi Ueda Cc: Mike Anderson Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 45d9bf14cc48..e81345a1d08f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1433,6 +1433,11 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) goto out; } + if (dm_suspended(ti)) { + r = -EBUSY; + goto out; + } + if (argc == 1) { if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { r = queue_if_no_path(m, 1, 0); From 9eaae8ffbc340fc034fed1e5d0dc9ca0e943f817 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:28 +0000 Subject: [PATCH 66/80] dm snapshot: make bio optional in __origin_write To support the merging of snapshots back into their origin we need to trigger exceptions in other snapshots not being merged without any incoming bio on the origin device. The bio parameter to __origin_write() becomes optional and the sector needs supplying separately. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index b5b9118c0636..5e553c50c215 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1467,7 +1467,19 @@ static int snapshot_iterate_devices(struct dm_target *ti, /*----------------------------------------------------------------- * Origin methods *---------------------------------------------------------------*/ -static int __origin_write(struct list_head *snapshots, struct bio *bio) + +/* + * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any + * supplied bio was ignored. The caller may submit it immediately. + * (No remapping actually occurs as the origin is always a direct linear + * map.) + * + * If further exceptions are required, DM_MAPIO_SUBMITTED is returned + * and any supplied bio is added to a list to be submitted once all + * the necessary exceptions exist. + */ +static int __origin_write(struct list_head *snapshots, sector_t sector, + struct bio *bio) { int r = DM_MAPIO_REMAPPED, first = 0; struct dm_snapshot *snap; @@ -1486,14 +1498,14 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) goto next_snapshot; /* Nothing to do if writing beyond end of snapshot */ - if (bio->bi_sector >= dm_table_get_size(snap->ti->table)) + if (sector >= dm_table_get_size(snap->ti->table)) goto next_snapshot; /* * Remember, different snapshots can have * different chunk sizes. 
*/ - chunk = sector_to_chunk(snap->store, bio->bi_sector); + chunk = sector_to_chunk(snap->store, sector); /* * Check exception table to see if block @@ -1543,7 +1555,8 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) first = 1; } - bio_list_add(&primary_pe->origin_bios, bio); + if (bio) + bio_list_add(&primary_pe->origin_bios, bio); r = DM_MAPIO_SUBMITTED; } @@ -1599,7 +1612,7 @@ static int do_origin(struct dm_dev *origin, struct bio *bio) down_read(&_origins_lock); o = __lookup_origin(origin->bdev); if (o) - r = __origin_write(&o->snapshots, bio); + r = __origin_write(&o->snapshots, bio->bi_sector, bio); up_read(&_origins_lock); return r; From 615d1eb9cad9b34ed17c18c604254e8db533ac6f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Dec 2009 23:52:29 +0000 Subject: [PATCH 67/80] dm snapshot: create function for chunk_is_tracked wait Move the __chunk_is_tracked() loop into a separate function as we will also need to call it from the write path in the rare case of conflicting writes to the same chunk. Originally introduced in commit a8d41b59f3f5a7ac19452ef442a7fc1b5fa17366 ("dm snapshot: fix race during exception creation"). Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 5e553c50c215..288994ee7142 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -233,6 +233,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) return found; } +/* + * This conflicting I/O is extremely improbable in the caller, + * so msleep(1) is sufficient and there is no need for a wait queue. + */ +static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) +{ + while (__chunk_is_tracked(s, chunk)) + msleep(1); +} + /* * One of these per registered origin, held in the snapshot_origins hash */ @@ -1102,12 +1112,8 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) goto out; } - /* - * Check for conflicting reads. This is extremely improbable, - * so msleep(1) is sufficient and there is no need for a wait queue. - */ - while (__chunk_is_tracked(s, pe->e.old_chunk)) - msleep(1); + /* Check for conflicting reads */ + __check_for_conflicting_io(s, pe->e.old_chunk); /* * Add a proper exception, and remove the From 4454a6216f75a9ef8c4bd0a65e34b101f725ef1e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:29 +0000 Subject: [PATCH 68/80] dm exception store: add merge specific methods Add functions that decide how many consecutive chunks of snapshot to merge back into the origin next and to update the metadata afterwards. prepare_merge provides a pointer to the most recent still-to-be-merged chunk and returns how many previous ones are consecutive and can be processed together. commit_merge removes the nr_merged most-recent chunks permanently from the exception store. The number must not exceed that returned by prepare_merge. Introduce NUM_SNAPSHOT_HDR_CHUNKS to show where the snapshot header chunk is accounted for. 
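A sketch of the loop a merging caller is expected to drive under this contract (the real consumer arrives with the snapshot-merge target later in this series; copy_chunks_to_origin() is a hypothetical stand-in for the data copy step):

/* Hypothetical helper that copies merged chunks back to the origin. */
int copy_chunks_to_origin(struct dm_exception_store *store,
			  chunk_t last_old_chunk, chunk_t last_new_chunk,
			  int nr_merged);

static int merge_loop_sketch(struct dm_exception_store *store)
{
	chunk_t old_chunk, new_chunk;
	int nr;

	while ((nr = store->type->prepare_merge(store, &old_chunk,
						&new_chunk)) > 0) {
		/*
		 * Copy the nr consecutive chunks ending at
		 * (old_chunk, new_chunk) back to the origin first...
		 */
		if (copy_chunks_to_origin(store, old_chunk, new_chunk, nr))
			return -EIO;

		/* ...and only then remove them from the exception store. */
		if (store->type->commit_merge(store, nr))
			return -EIO;
	}

	return nr;	/* 0 once the store is empty, < 0 on read error */
}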
Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.h | 17 +++++ drivers/md/dm-snap-persistent.c | 114 +++++++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index bb8874653de1..c53e08935b42 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -74,6 +74,23 @@ struct dm_exception_store_type { void (*callback) (void *, int success), void *callback_context); + /* + * Returns 0 if the exception store is empty. + * + * If there are exceptions still to be merged, sets + * *last_old_chunk and *last_new_chunk to the most recent + * still-to-be-merged chunk and returns the number of + * consecutive previous ones. + */ + int (*prepare_merge) (struct dm_exception_store *store, + chunk_t *last_old_chunk, chunk_t *last_new_chunk); + + /* + * Clear the last n exceptions. + * nr_merged must be <= the value returned by prepare_merge. + */ + int (*commit_merge) (struct dm_exception_store *store, int nr_merged); + /* * The snapshot is invalid, note this in the metadata. */ diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 157999ebd236..7d08879689ac 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -55,6 +55,8 @@ */ #define SNAPSHOT_DISK_VERSION 1 +#define NUM_SNAPSHOT_HDR_CHUNKS 1 + struct disk_header { uint32_t magic; @@ -120,7 +122,22 @@ struct pstore { /* * The next free chunk for an exception. + * + * When creating exceptions, all the chunks here and above are + * free. It holds the next chunk to be allocated. On rare + * occasions (e.g. after a system crash) holes can be left in + * the exception store because chunks can be committed out of + * order. + * + * When merging exceptions, it does not necessarily mean all the + * chunks here and above are free. It holds the value it would + * have held if all chunks had been committed in order of + * allocation. Consequently the value may occasionally be + * slightly too low, but since it's only used for 'status' and + * it can never reach its minimum value too early this doesn't + * matter. */ + chunk_t next_free; /* @@ -409,6 +426,15 @@ static void write_exception(struct pstore *ps, e->new_chunk = cpu_to_le64(de->new_chunk); } +static void clear_exception(struct pstore *ps, uint32_t index) +{ + struct disk_exception *e = get_exception(ps, index); + + /* clear it */ + e->old_chunk = 0; + e->new_chunk = 0; +} + /* * Registers the exceptions that are present in the current area. * 'full' is filled in to indicate if the area has been @@ -505,7 +531,8 @@ static void persistent_usage(struct dm_exception_store *store, * Then there are (ps->current_area + 1) metadata chunks, each one * separated from the next by ps->exceptions_per_area data chunks. */ - *metadata_sectors = (ps->current_area + 2) * store->chunk_size; + *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * + store->chunk_size; } static void persistent_dtr(struct dm_exception_store *store) @@ -680,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store, ps->callback_count = 0; } +static int persistent_prepare_merge(struct dm_exception_store *store, + chunk_t *last_old_chunk, + chunk_t *last_new_chunk) +{ + struct pstore *ps = get_info(store); + struct disk_exception de; + int nr_consecutive; + int r; + + /* + * When current area is empty, move back to preceding area. 
+ */ + if (!ps->current_committed) { + /* + * Have we finished? + */ + if (!ps->current_area) + return 0; + + ps->current_area--; + r = area_io(ps, READ); + if (r < 0) + return r; + ps->current_committed = ps->exceptions_per_area; + } + + read_exception(ps, ps->current_committed - 1, &de); + *last_old_chunk = de.old_chunk; + *last_new_chunk = de.new_chunk; + + /* + * Find number of consecutive chunks within the current area, + * working backwards. + */ + for (nr_consecutive = 1; nr_consecutive < ps->current_committed; + nr_consecutive++) { + read_exception(ps, ps->current_committed - 1 - nr_consecutive, + &de); + if (de.old_chunk != *last_old_chunk - nr_consecutive || + de.new_chunk != *last_new_chunk - nr_consecutive) + break; + } + + return nr_consecutive; +} + +static int persistent_commit_merge(struct dm_exception_store *store, + int nr_merged) +{ + int r, i; + struct pstore *ps = get_info(store); + + BUG_ON(nr_merged > ps->current_committed); + + for (i = 0; i < nr_merged; i++) + clear_exception(ps, ps->current_committed - 1 - i); + + r = area_io(ps, WRITE); + if (r < 0) + return r; + + ps->current_committed -= nr_merged; + + /* + * At this stage, only persistent_usage() uses ps->next_free, so + * we make no attempt to keep ps->next_free strictly accurate + * as exceptions may have been committed out-of-order originally. + * Once a snapshot has become merging, we set it to the value it + * would have held had all the exceptions been committed in order. + * + * ps->current_area does not get reduced by prepare_merge() until + * after commit_merge() has removed the nr_merged previous exceptions. + */ + ps->next_free = (area_location(ps, ps->current_area) - 1) + + (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS; + + return 0; +} + static void persistent_drop_snapshot(struct dm_exception_store *store) { struct pstore *ps = get_info(store); @@ -705,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store, ps->area = NULL; ps->zero_area = NULL; ps->header_area = NULL; - ps->next_free = 2; /* skipping the header and first area */ + ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */ ps->current_committed = 0; ps->callback_count = 0; @@ -748,6 +854,8 @@ static struct dm_exception_store_type _persistent_type = { .read_metadata = persistent_read_metadata, .prepare_exception = persistent_prepare_exception, .commit_exception = persistent_commit_exception, + .prepare_merge = persistent_prepare_merge, + .commit_merge = persistent_commit_merge, .drop_snapshot = persistent_drop_snapshot, .usage = persistent_usage, .status = persistent_status, @@ -761,6 +869,8 @@ static struct dm_exception_store_type _persistent_compat_type = { .read_metadata = persistent_read_metadata, .prepare_exception = persistent_prepare_exception, .commit_exception = persistent_commit_exception, + .prepare_merge = persistent_prepare_merge, + .commit_merge = persistent_commit_merge, .drop_snapshot = persistent_drop_snapshot, .usage = persistent_usage, .status = persistent_status, From d698aa4500aa3ca9559142060caf0f79da998744 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:30 +0000 Subject: [PATCH 69/80] dm snapshot: add merge target The snapshot-merge target allows a snapshot to be merged back into the snapshot's origin device. One anticipated use of snapshot merging is the rollback of filesystems to back out problematic system upgrades. This patch adds snapshot-merge target management to both dm_snapshot_init() and dm_snapshot_exit(). 
As an initial place-holder, snapshot-merge is identical to the snapshot target. Documentation is provided. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/snapshot.txt | 64 +++++++++++++++++++++--- drivers/md/dm-snap.c | 53 +++++++++++++++----- 2 files changed, 98 insertions(+), 19 deletions(-) diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt index a5009c8300f3..e3a77b215135 100644 --- a/Documentation/device-mapper/snapshot.txt +++ b/Documentation/device-mapper/snapshot.txt @@ -8,13 +8,19 @@ the block device which are also writable without interfering with the original content; *) To create device "forks", i.e. multiple different versions of the same data stream. +*) To merge a snapshot of a block device back into the snapshot's origin +device. + +In the first two cases, dm copies only the chunks of data that get +changed and uses a separate copy-on-write (COW) block device for +storage. + +For snapshot merge the contents of the COW storage are merged back into +the origin device. -In both cases, dm copies only the chunks of data that get changed and -uses a separate copy-on-write (COW) block device for storage. - - -There are two dm targets available: snapshot and snapshot-origin. +There are three dm targets available: +snapshot, snapshot-origin, and snapshot-merge. *) snapshot-origin <origin> @@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be saved on disk - they can be kept in memory by the kernel. -How this is used by LVM2 -======================== +* snapshot-merge <origin> <COW device> <persistent> <chunksize> + +takes the same table arguments as the snapshot target except it only +works with persistent snapshots. This target assumes the role of the +"snapshot-origin" target and must not be loaded if the "snapshot-origin" +is still present for <origin>. + +Creates a merging snapshot that takes control of the changed chunks +stored in the <COW device> of an existing snapshot, through a handover +procedure, and merges these chunks back into the <origin>. Once merging +has started (in the background) the <origin> may be opened and the merge +will continue while I/O is flowing to it. Changes to the <origin> are +deferred until the merging snapshot's corresponding chunk(s) have been +merged. Once merging has started the snapshot device, associated with +the "snapshot" target, will return -EIO when accessed. + + +How snapshot is used by LVM2 +============================ When you create the first LVM2 snapshot of a volume, four dm devices are used: 1) a device containing the original mapping table of the source volume; @@ -72,3 +95,30 @@ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base + +How snapshot-merge is used by LVM2 +================================== +A merging snapshot assumes the role of the "snapshot-origin" while +merging. As such the "snapshot-origin" is replaced with +"snapshot-merge". The "-real" device is not changed and the "-cow" +device is renamed to <origin name>-cow to aid LVM2's cleanup of the +merging snapshot after it completes. The "snapshot" that hands over its +COW device to the "snapshot-merge" is deactivated (unless using lvchange --refresh); but if it is left active it will simply return I/O errors.
+ +A snapshot will merge into its origin with the following command: + +lvconvert --merge volumeGroup/snap + +we'll now have this situation: + +# dmsetup table|grep volumeGroup + +volumeGroup-base-real: 0 2097152 linear 8:19 384 +volumeGroup-base-cow: 0 204800 linear 8:19 2097536 +volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 + +# ls -lL /dev/mapper/volumeGroup-* +brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real +brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow +brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 288994ee7142..446827f98236 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -25,6 +25,11 @@ #define DM_MSG_PREFIX "snapshots" +static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; + +#define dm_target_is_snapshot_merge(ti) \ + ((ti)->type->name == dm_snapshot_merge_target_name) + /* * The percentage increment we will wake up users at */ @@ -1743,6 +1748,21 @@ static struct target_type snapshot_target = { .iterate_devices = snapshot_iterate_devices, }; +static struct target_type merge_target = { + .name = dm_snapshot_merge_target_name, + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = snapshot_ctr, + .dtr = snapshot_dtr, + .map = snapshot_map, + .end_io = snapshot_end_io, + .postsuspend = snapshot_postsuspend, + .preresume = snapshot_preresume, + .resume = snapshot_resume, + .status = snapshot_status, + .iterate_devices = snapshot_iterate_devices, +}; + static int __init dm_snapshot_init(void) { int r; @@ -1754,7 +1774,7 @@ static int __init dm_snapshot_init(void) } r = dm_register_target(&snapshot_target); - if (r) { + if (r < 0) { DMERR("snapshot target register failed %d", r); goto bad_register_snapshot_target; } @@ -1762,34 +1782,40 @@ static int __init dm_snapshot_init(void) r = dm_register_target(&origin_target); if (r < 0) { DMERR("Origin target register failed %d", r); - goto bad1; + goto bad_register_origin_target; + } + + r = dm_register_target(&merge_target); + if (r < 0) { + DMERR("Merge target register failed %d", r); + goto bad_register_merge_target; } r = init_origin_hash(); if (r) { DMERR("init_origin_hash failed."); - goto bad2; + goto bad_origin_hash; } exception_cache = KMEM_CACHE(dm_exception, 0); if (!exception_cache) { DMERR("Couldn't create exception cache."); r = -ENOMEM; - goto bad3; + goto bad_exception_cache; } pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); if (!pending_cache) { DMERR("Couldn't create pending cache."); r = -ENOMEM; - goto bad4; + goto bad_pending_cache; } tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); if (!tracked_chunk_cache) { DMERR("Couldn't create cache to track chunks in use."); r = -ENOMEM; - goto bad5; + goto bad_tracked_chunk_cache; } ksnapd = create_singlethread_workqueue("ksnapd"); @@ -1803,19 +1829,21 @@ static int __init dm_snapshot_init(void) bad_pending_pool: kmem_cache_destroy(tracked_chunk_cache); -bad5: +bad_tracked_chunk_cache: kmem_cache_destroy(pending_cache); -bad4: +bad_pending_cache: kmem_cache_destroy(exception_cache); -bad3: +bad_exception_cache: exit_origin_hash(); -bad2: +bad_origin_hash: + dm_unregister_target(&merge_target); +bad_register_merge_target: dm_unregister_target(&origin_target); -bad1: +bad_register_origin_target: dm_unregister_target(&snapshot_target); - bad_register_snapshot_target: dm_exception_store_exit(); + return r; } @@ -1825,6 +1853,7 @@ static void __exit 
dm_snapshot_exit(void) dm_unregister_target(&snapshot_target); dm_unregister_target(&origin_target); + dm_unregister_target(&merge_target); exit_origin_hash(); kmem_cache_destroy(pending_cache); From 515ad66cc4c82f210d726340349c8f7c1ec6b125 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:30 +0000 Subject: [PATCH 70/80] dm snapshot: rework writing to origin To track the completion of exceptions relating to the same location on the device, the current code selects one exception as primary_pe, links the other exceptions to it and uses reference counting to wait until all the reallocations are complete. It is considered too complicated to extend this code to handle the new snapshot-merge target, where sets of non-overlapping chunks would also need to become linked. Instead, a simpler (but less efficient) approach is taken. Bios are linked to one exception. When it completes, bios are simply retried, and if other related exceptions are still outstanding, they'll get queued again to wait for another one. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 157 ++++++++++++++----------------------------- 1 file changed, 50 insertions(+), 107 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 446827f98236..c01e0dafec3c 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -142,28 +142,6 @@ struct dm_snap_pending_exception { struct bio_list origin_bios; struct bio_list snapshot_bios; - /* - * Short-term queue of pending exceptions prior to submission. - */ - struct list_head list; - - /* - * The primary pending_exception is the one that holds - * the ref_count and the list of origin_bios for a - * group of pending_exceptions. It is always last to get freed. - * These fields get set up when writing to the origin. - */ - struct dm_snap_pending_exception *primary_pe; - - /* - * Number of pending_exceptions processing this chunk. - * When this drops to zero we must complete the origin bios. - * If incrementing or decrementing this, hold pe->snap->lock for - * the sibling concerned and not pe->primary_pe->snap->lock unless - * they are the same. - */ - atomic_t ref_count; - /* Pointer back to snapshot context */ struct dm_snapshot *snap; @@ -1019,6 +997,26 @@ static void flush_queued_bios(struct work_struct *work) flush_bios(queued_bios); } +static int do_origin(struct dm_dev *origin, struct bio *bio); + +/* + * Flush a list of buffers. + */ +static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) +{ + struct bio *n; + int r; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + r = do_origin(s->origin, bio); + if (r == DM_MAPIO_REMAPPED) + generic_make_request(bio); + bio = n; + } +} + /* * Error a list of buffers. */ @@ -1052,39 +1050,6 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) dm_table_event(s->ti->table); } -static void get_pending_exception(struct dm_snap_pending_exception *pe) -{ - atomic_inc(&pe->ref_count); -} - -static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe) -{ - struct dm_snap_pending_exception *primary_pe; - struct bio *origin_bios = NULL; - - primary_pe = pe->primary_pe; - - /* - * If this pe is involved in a write to the origin and - * it is the last sibling to complete then release - * the bios for the original write to the origin. 
- */ - if (primary_pe && - atomic_dec_and_test(&primary_pe->ref_count)) { - origin_bios = bio_list_get(&primary_pe->origin_bios); - free_pending_exception(primary_pe); - } - - /* - * Free the pe if it's not linked to an origin write or if - * it's not itself a primary pe. - */ - if (!primary_pe || primary_pe != pe) - free_pending_exception(pe); - - return origin_bios; -} - static void pending_complete(struct dm_snap_pending_exception *pe, int success) { struct dm_exception *e; @@ -1129,7 +1094,8 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) out: dm_remove_exception(&pe->e); snapshot_bios = bio_list_get(&pe->snapshot_bios); - origin_bios = put_pending_exception(pe); + origin_bios = bio_list_get(&pe->origin_bios); + free_pending_exception(pe); up_write(&s->lock); @@ -1139,7 +1105,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) else flush_bios(snapshot_bios); - flush_bios(origin_bios); + retry_origin_bios(s, origin_bios); } static void commit_callback(void *context, int success) @@ -1226,8 +1192,6 @@ __find_pending_exception(struct dm_snapshot *s, pe->e.old_chunk = chunk; bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); - pe->primary_pe = NULL; - atomic_set(&pe->ref_count, 0); pe->started = 0; if (s->store->type->prepare_exception(s->store, &pe->e)) { @@ -1235,7 +1199,6 @@ __find_pending_exception(struct dm_snapshot *s, return NULL; } - get_pending_exception(pe); dm_insert_exception(&s->pending, &pe->e); return pe; @@ -1492,16 +1455,16 @@ static int snapshot_iterate_devices(struct dm_target *ti, static int __origin_write(struct list_head *snapshots, sector_t sector, struct bio *bio) { - int r = DM_MAPIO_REMAPPED, first = 0; + int r = DM_MAPIO_REMAPPED; struct dm_snapshot *snap; struct dm_exception *e; - struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; + struct dm_snap_pending_exception *pe; + struct dm_snap_pending_exception *pe_to_start_now = NULL; + struct dm_snap_pending_exception *pe_to_start_last = NULL; chunk_t chunk; - LIST_HEAD(pe_queue); /* Do all the snapshots on this origin */ list_for_each_entry (snap, snapshots, list) { - down_write(&snap->lock); /* Only deal with valid and active snapshots */ @@ -1522,9 +1485,6 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, * Check exception table to see if block * is already remapped in this snapshot * and trigger an exception if not. - * - * ref_count is initialised to 1 so pending_complete() - * won't destroy the primary_pe while we're inside this loop. */ e = dm_lookup_exception(&snap->complete, chunk); if (e) @@ -1554,60 +1514,43 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, } } - if (!primary_pe) { - /* - * Either every pe here has same - * primary_pe or none has one yet. - */ - if (pe->primary_pe) - primary_pe = pe->primary_pe; - else { - primary_pe = pe; - first = 1; + r = DM_MAPIO_SUBMITTED; + + /* + * If an origin bio was supplied, queue it to wait for the + * completion of this exception, and start this one last, + * at the end of the function. 
+ */ + if (bio) { + bio_list_add(&pe->origin_bios, bio); + bio = NULL; + + if (!pe->started) { + pe->started = 1; + pe_to_start_last = pe; } - - if (bio) - bio_list_add(&primary_pe->origin_bios, bio); - - r = DM_MAPIO_SUBMITTED; - } - - if (!pe->primary_pe) { - pe->primary_pe = primary_pe; - get_pending_exception(primary_pe); } if (!pe->started) { pe->started = 1; - list_add_tail(&pe->list, &pe_queue); + pe_to_start_now = pe; } next_snapshot: up_write(&snap->lock); - } - if (!primary_pe) - return r; - - /* - * If this is the first time we're processing this chunk and - * ref_count is now 1 it means all the pending exceptions - * got completed while we were in the loop above, so it falls to - * us here to remove the primary_pe and submit any origin_bios. - */ - - if (first && atomic_dec_and_test(&primary_pe->ref_count)) { - flush_bios(bio_list_get(&primary_pe->origin_bios)); - free_pending_exception(primary_pe); - /* If we got here, pe_queue is necessarily empty. */ - return r; + if (pe_to_start_now) { + start_copy(pe_to_start_now); + pe_to_start_now = NULL; + } } /* - * Now that we have a complete pe list we can start the copying. + * Submit the exception against which the bio is queued last, + * to give the other exceptions a head start. */ - list_for_each_entry_safe(pe, next_pe, &pe_queue, list) - start_copy(pe); + if (pe_to_start_last) + start_copy(pe_to_start_last); return r; } From 3452c2a1eb5b93c1b9fb0d22bd5b07c0cee4dc29 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:31 +0000 Subject: [PATCH 71/80] dm snapshot: avoid allocating exceptions in merge The snapshot-merge target should not allocate new exceptions because the intent is to merge all of its exceptions as quickly and safely as possible. This patch introduces the snapshot-merge mapping function and updates __origin_write() so that it doesn't allocate exceptions on any snapshots that are being merged. If a write request to a merging snapshot device is to be dispatched directly to the origin (because the chunk is not remapped or was already merged), snapshot_merge_map() must make exceptions in other snapshots, so it calls do_origin(). Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 57 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c01e0dafec3c..59d9ef6f5051 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1308,6 +1308,54 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, return r; } +/* + * A snapshot-merge target behaves like a combination of a snapshot + * target and a snapshot-origin target. It only generates new + * exceptions in other snapshots and not in the one that is being + * merged. + * + * For each chunk, if there is an existing exception, it is used to + * redirect I/O to the cow device. Otherwise I/O is sent to the origin, + * which in turn might generate exceptions in other snapshots.
+ */ +static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct dm_exception *e; + struct dm_snapshot *s = ti->private; + int r = DM_MAPIO_REMAPPED; + chunk_t chunk; + + chunk = sector_to_chunk(s->store, bio->bi_sector); + + down_read(&s->lock); + + /* Full snapshots are not usable */ + if (!s->valid) { + r = -EIO; + goto out_unlock; + } + + /* If the block is already remapped - use that */ + e = dm_lookup_exception(&s->complete, chunk); + if (e) { + remap_exception(s, e, bio, chunk); + goto out_unlock; + } + + bio->bi_bdev = s->origin->bdev; + + if (bio_rw(bio) == WRITE) { + up_read(&s->lock); + return do_origin(s->origin, bio); + } + +out_unlock: + up_read(&s->lock); + + return r; +} + static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error, union map_info *map_context) { @@ -1465,6 +1513,13 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, /* Do all the snapshots on this origin */ list_for_each_entry (snap, snapshots, list) { + /* + * Don't make new exceptions in a merging snapshot + * because it has effectively been deleted + */ + if (dm_target_is_snapshot_merge(snap->ti)) + continue; + down_write(&snap->lock); /* Only deal with valid and active snapshots */ @@ -1697,7 +1752,7 @@ static struct target_type merge_target = { .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, - .map = snapshot_map, + .map = snapshot_merge_map, .end_io = snapshot_end_io, .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, From 10b8106a70433e14153469ebbdd189776500e238 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Dec 2009 23:52:31 +0000 Subject: [PATCH 72/80] dm snapshot: support barriers in snapshot merge target Sets num_flush_requests=2 to support flushing both the origin and cow devices used by the snapshot-merge target. Also, snapshot_ctr() now gets the origin device using FMODE_WRITE if the target is snapshot-merge (which writes to the origin device). 
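To make the two flush destinations concrete, here is a minimal user-space sketch of the dispatch rule (all names here are illustrative, not part of the driver); it mirrors the bio_empty_barrier() branch that snapshot_merge_map() gains below, where flush request 0 is sent to the origin and flush request 1 to the cow device.

/*
 * Stand-alone model of the barrier dispatch rule, assuming
 * num_flush_requests == 2: request 0 -> origin, request 1 -> cow.
 */
#include <stdio.h>

enum merge_dev { MERGE_DEV_ORIGIN, MERGE_DEV_COW };

static enum merge_dev flush_destination(unsigned int flush_request)
{
	/* Mirrors: !flush_request ? origin->bdev : cow->bdev */
	return flush_request ? MERGE_DEV_COW : MERGE_DEV_ORIGIN;
}

int main(void)
{
	unsigned int rq;

	for (rq = 0; rq < 2; rq++)
		printf("flush request %u -> %s\n", rq,
		       flush_destination(rq) == MERGE_DEV_COW ?
		       "cow" : "origin");
	return 0;
}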
Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 59d9ef6f5051..23e3396e43b9 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -709,7 +709,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) int i; int r = -EINVAL; char *origin_path, *cow_path; - unsigned args_used; + unsigned args_used, num_flush_requests = 1; + fmode_t origin_mode = FMODE_READ; if (argc != 4) { ti->error = "requires exactly 4 arguments"; @@ -717,6 +718,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + if (dm_target_is_snapshot_merge(ti)) { + num_flush_requests = 2; + origin_mode = FMODE_WRITE; + } + origin_path = argv[0]; argv++; argc--; @@ -750,7 +756,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) argv += args_used; argc -= args_used; - r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); + r = dm_get_device(ti, origin_path, 0, ti->len, origin_mode, &s->origin); if (r) { ti->error = "Cannot get origin device"; goto bad_origin; @@ -801,7 +807,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_WORK(&s->queued_bios_work, flush_queued_bios); ti->private = s; - ti->num_flush_requests = 1; + ti->num_flush_requests = num_flush_requests; /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ @@ -1326,6 +1332,15 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, int r = DM_MAPIO_REMAPPED; chunk_t chunk; + if (unlikely(bio_empty_barrier(bio))) { + if (!map_context->flush_request) + bio->bi_bdev = s->origin->bdev; + else + bio->bi_bdev = s->cow->bdev; + map_context->ptr = NULL; + return DM_MAPIO_REMAPPED; + } + chunk = sector_to_chunk(s->store, bio->bi_sector); down_read(&s->lock); From 9d3b15c4c776b041f9ee81810cbd375275411829 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:32 +0000 Subject: [PATCH 73/80] dm snapshot: permit only one merge at once Merging more than one snapshot is not supported, so prevent this happening. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 23e3396e43b9..7ddee7c0c518 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -300,8 +300,10 @@ static void __insert_origin(struct origin *o) * Returns number of snapshots registered using the supplied cow device, plus: * snap_src - a snapshot suitable for use as a source of exception handover * snap_dest - a snapshot capable of receiving exception handover. + * snap_merge - an existing snapshot-merge target linked to the same origin. + * There can be at most one snapshot-merge target. The parameter is optional. * - * Possible return values and states: + * Possible return values and states of snap_src and snap_dest. 
* 0: NULL, NULL - first new snapshot * 1: snap_src, NULL - normal snapshot * 2: snap_src, snap_dest - waiting for handover @@ -310,7 +312,8 @@ static void __insert_origin(struct origin *o) */ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, struct dm_snapshot **snap_src, - struct dm_snapshot **snap_dest) + struct dm_snapshot **snap_dest, + struct dm_snapshot **snap_merge) { struct dm_snapshot *s; struct origin *o; @@ -322,6 +325,8 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, goto out; list_for_each_entry(s, &o->snapshots, list) { + if (dm_target_is_snapshot_merge(s->ti) && snap_merge) + *snap_merge = s; if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) continue; @@ -349,9 +354,11 @@ out: static int __validate_exception_handover(struct dm_snapshot *snap) { struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + struct dm_snapshot *snap_merge = NULL; /* Does snapshot need exceptions handed over to it? */ - if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest) == 2) || + if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, + &snap_merge) == 2) || snap_dest) { snap->ti->error = "Snapshot cow pairing for exception " "table handover failed"; @@ -365,6 +372,20 @@ static int __validate_exception_handover(struct dm_snapshot *snap) if (!snap_src) return 0; + /* + * Non-snapshot-merge handover? + */ + if (!dm_target_is_snapshot_merge(snap->ti)) + return 1; + + /* + * Do not allow more than one merging snapshot. + */ + if (snap_merge) { + snap->ti->error = "A snapshot is already merging."; + return -EINVAL; + } + return 1; } @@ -933,7 +954,7 @@ static void snapshot_dtr(struct dm_target *ti) down_read(&_origins_lock); /* Check whether exception handover must be cancelled */ - (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest && (s == snap_src)) { down_write(&snap_dest->lock); snap_dest->valid = 0; @@ -1399,7 +1420,7 @@ static int snapshot_preresume(struct dm_target *ti) struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; down_read(&_origins_lock); - (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { down_read(&snap_src->lock); if (s == snap_src) { @@ -1424,7 +1445,7 @@ static void snapshot_resume(struct dm_target *ti) struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; down_read(&_origins_lock); - (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { down_write(&snap_src->lock); down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); From 1e03f97e4301f75a2f3b649787f7876516764929 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:32 +0000 Subject: [PATCH 74/80] dm snapshot: add merging Merging is started when origin is resumed and it is stopped when origin is suspended or when the merging snapshot is destroyed or errors are detected. Merging is not yet interlocked with writes: this will be handled in subsequent patches. The code relies on callbacks from a private kcopyd thread. 
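Before the diff, a condensed user-space model may help: the sketch below approximates the start/stop protocol built on RUNNING_MERGE and SHUTDOWN_MERGE using pthreads (the kernel uses test_and_set_bit(), wake_up_bit() and wait_on_bit() on state_bits instead; all names here are illustrative).

/*
 * Toy model of start_merge()/stop_merge(): the worker stops either
 * when all chunks are merged or when shutdown is requested, clears
 * "running" and wakes any waiter; the stopper waits for that.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t stopped = PTHREAD_COND_INITIALIZER;
static int running, shutdown_requested;
static long chunks_left = 1000000;

static void *merge_worker(void *unused)
{
	(void)unused;
	for (;;) {
		pthread_mutex_lock(&lock);
		if (shutdown_requested || !chunks_left) {
			running = 0;	/* cf. merge_shutdown() */
			pthread_cond_broadcast(&stopped);
			pthread_mutex_unlock(&lock);
			return NULL;
		}
		chunks_left--;		/* one chunk copied back */
		pthread_mutex_unlock(&lock);
	}
}

int main(void)
{
	pthread_t t;

	running = 1;			/* cf. start_merge() */
	pthread_create(&t, NULL, merge_worker, NULL);

	pthread_mutex_lock(&lock);	/* cf. stop_merge() */
	shutdown_requested = 1;
	while (running)
		pthread_cond_wait(&stopped, &lock);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);

	printf("merge stopped, %ld chunks not yet merged\n", chunks_left);
	return 0;
}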
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-exception-store.h | 11 ++ drivers/md/dm-snap.c | 239 +++++++++++++++++++++++++++++++- 2 files changed, 244 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index c53e08935b42..e8dfa06af3ba 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -154,6 +154,13 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) BUG_ON(!dm_consecutive_chunk_count(e)); } +static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) +{ + BUG_ON(!dm_consecutive_chunk_count(e)); + + e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); +} + # else # define DM_CHUNK_CONSECUTIVE_BITS 0 @@ -171,6 +178,10 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) { } +static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) +{ +} + # endif /* diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 7ddee7c0c518..dc2412e6c5cf 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -106,8 +106,20 @@ struct dm_snapshot { mempool_t *tracked_chunk_pool; spinlock_t tracked_chunk_lock; struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; + + /* Wait for events based on state_bits */ + unsigned long state_bits; }; +/* + * state_bits: + * RUNNING_MERGE - Merge operation is in progress. + * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; + * cleared afterwards. + */ +#define RUNNING_MERGE 0 +#define SHUTDOWN_MERGE 1 + struct dm_dev *dm_snap_cow(struct dm_snapshot *s) { return s->cow; @@ -386,6 +398,13 @@ static int __validate_exception_handover(struct dm_snapshot *snap) return -EINVAL; } + if (!snap_src->store->type->prepare_merge || + !snap_src->store->type->commit_merge) { + snap->ti->error = "Snapshot exception store does not " + "support snapshot-merge."; + return -EINVAL; + } + return 1; } @@ -721,6 +740,178 @@ static int init_hash_tables(struct dm_snapshot *s) return 0; } +static void merge_shutdown(struct dm_snapshot *s) +{ + clear_bit_unlock(RUNNING_MERGE, &s->state_bits); + smp_mb__after_clear_bit(); + wake_up_bit(&s->state_bits, RUNNING_MERGE); +} + +/* + * Remove one chunk from the index of completed exceptions. + */ +static int __remove_single_exception_chunk(struct dm_snapshot *s, + chunk_t old_chunk) +{ + struct dm_exception *e; + + /* FIXME: interlock writes to this chunk */ + + e = dm_lookup_exception(&s->complete, old_chunk); + if (!e) { + DMERR("Corruption detected: exception for block %llu is " + "on disk but not in memory", + (unsigned long long)old_chunk); + return -EINVAL; + } + + /* + * If this is the only chunk using this exception, remove exception. + */ + if (!dm_consecutive_chunk_count(e)) { + dm_remove_exception(e); + free_completed_exception(e); + return 0; + } + + /* + * The chunk may be either at the beginning or the end of a + * group of consecutive chunks - never in the middle. We are + * removing chunks in the opposite order to that in which they + * were added, so this should always be true. + * Decrement the consecutive chunk counter and adjust the + * starting point if necessary. 
+ */ + if (old_chunk == e->old_chunk) { + e->old_chunk++; + e->new_chunk++; + } else if (old_chunk != e->old_chunk + + dm_consecutive_chunk_count(e)) { + DMERR("Attempt to merge block %llu from the " + "middle of a chunk range [%llu - %llu]", + (unsigned long long)old_chunk, + (unsigned long long)e->old_chunk, + (unsigned long long) + e->old_chunk + dm_consecutive_chunk_count(e)); + return -EINVAL; + } + + dm_consecutive_chunk_count_dec(e); + + return 0; +} + +static int remove_single_exception_chunk(struct dm_snapshot *s, + chunk_t old_chunk) +{ + int r = 0; + + down_write(&s->lock); + r = __remove_single_exception_chunk(s, old_chunk); + up_write(&s->lock); + + return r; +} + +static void merge_callback(int read_err, unsigned long write_err, + void *context); + +static void snapshot_merge_next_chunks(struct dm_snapshot *s) +{ + int r; + chunk_t old_chunk, new_chunk; + struct dm_io_region src, dest; + + BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); + if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) + goto shut; + + /* + * valid flag never changes during merge, so no lock required. + */ + if (!s->valid) { + DMERR("Snapshot is invalid: can't merge"); + goto shut; + } + + r = s->store->type->prepare_merge(s->store, &old_chunk, &new_chunk); + if (r <= 0) { + if (r < 0) + DMERR("Read error in exception store: " + "shutting down merge"); + goto shut; + } + + /* TODO: use larger I/O size once we verify that kcopyd handles it */ + + if (remove_single_exception_chunk(s, old_chunk) < 0) + goto shut; + + dest.bdev = s->origin->bdev; + dest.sector = chunk_to_sector(s->store, old_chunk); + dest.count = min((sector_t)s->store->chunk_size, + get_dev_size(dest.bdev) - dest.sector); + + src.bdev = s->cow->bdev; + src.sector = chunk_to_sector(s->store, new_chunk); + src.count = dest.count; + + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); + return; + +shut: + merge_shutdown(s); +} + +static void merge_callback(int read_err, unsigned long write_err, void *context) +{ + struct dm_snapshot *s = context; + + if (read_err || write_err) { + if (read_err) + DMERR("Read error: shutting down merge."); + else + DMERR("Write error: shutting down merge."); + goto shut; + } + + if (s->store->type->commit_merge(s->store, 1) < 0) { + DMERR("Write error in exception store: shutting down merge"); + goto shut; + } + + snapshot_merge_next_chunks(s); + + return; + +shut: + merge_shutdown(s); +} + +static void start_merge(struct dm_snapshot *s) +{ + if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) + snapshot_merge_next_chunks(s); +} + +static int wait_schedule(void *ptr) +{ + schedule(); + + return 0; +} + +/* + * Stop the merging process and wait until it finishes. + */ +static void stop_merge(struct dm_snapshot *s) +{ + set_bit(SHUTDOWN_MERGE, &s->state_bits); + wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, + TASK_UNINTERRUPTIBLE); + clear_bit(SHUTDOWN_MERGE, &s->state_bits); +} + /* * Construct a snapshot mapping:
* <origin_dev> <COW-dev> <p/n> <chunk-size>
*/ @@ -791,6 +982,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); + s->state_bits = 0; /* Allocate hash table for COW data */ if (init_hash_tables(s)) { @@ -963,6 +1155,9 @@ static void snapshot_dtr(struct dm_target *ti) } up_read(&_origins_lock); + if (dm_target_is_snapshot_merge(ti)) + stop_merge(s); + /* Prevent further origin writes from using this snapshot. */ /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); @@ -1404,6 +1599,13 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio, return 0; } +static void snapshot_merge_presuspend(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + + stop_merge(s); +} + static void snapshot_postsuspend(struct dm_target *ti) { struct dm_snapshot *s = ti->private; @@ -1464,6 +1666,34 @@ static void snapshot_resume(struct dm_target *ti) up_write(&s->lock); } +static sector_t get_origin_minimum_chunksize(struct block_device *bdev) +{ + sector_t min_chunksize; + + down_read(&_origins_lock); + min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); + up_read(&_origins_lock); + + return min_chunksize; +} + +static void snapshot_merge_resume(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + + /* + * Handover exceptions from existing snapshot. + */ + snapshot_resume(ti); + + /* + * snapshot-merge acts as an origin, so set ti->split_io + */ + ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); + + start_merge(s); +} + static int snapshot_status(struct dm_target *ti, status_type_t type, char *result, unsigned int maxlen) { @@ -1722,11 +1952,7 @@ static void origin_resume(struct dm_target *ti) { struct dm_dev *dev = ti->private; - down_read(&_origins_lock); - - ti->split_io = __minimum_chunk_size(__lookup_origin(dev->bdev)); - - up_read(&_origins_lock); + ti->split_io = get_origin_minimum_chunksize(dev->bdev); } static int origin_status(struct dm_target *ti, status_type_t type, char *result, @@ -1790,9 +2016,10 @@ static struct target_type merge_target = { .dtr = snapshot_dtr, .map = snapshot_merge_map, .end_io = snapshot_end_io, + .presuspend = snapshot_merge_presuspend, .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, - .resume = snapshot_resume, + .resume = snapshot_merge_resume, .status = snapshot_status, .iterate_devices = snapshot_iterate_devices, }; From 9fe862548821b0c206c58e8057b782530a173703 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:33 +0000 Subject: [PATCH 75/80] dm snapshot: queue writes to chunks being merged While a set of chunks is being merged, any overlapping writes need to be queued. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 91 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index dc2412e6c5cf..91a47c522b09 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -109,6 +109,16 @@ struct dm_snapshot { /* Wait for events based on state_bits */ unsigned long state_bits; + + /* Range of chunks currently being merged. */ + chunk_t first_merging_chunk; + int num_merging_chunks; + + /* + * Incoming bios that overlap with chunks being merged must wait + * for them to be committed. 
+ */ + struct bio_list bios_queued_during_merge; }; /* @@ -747,6 +757,14 @@ static void merge_shutdown(struct dm_snapshot *s) wake_up_bit(&s->state_bits, RUNNING_MERGE); } +static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) +{ + s->first_merging_chunk = 0; + s->num_merging_chunks = 0; + + return bio_list_get(&s->bios_queued_during_merge); +} + /* * Remove one chunk from the index of completed exceptions. */ @@ -755,8 +773,6 @@ static int __remove_single_exception_chunk(struct dm_snapshot *s, { struct dm_exception *e; - /* FIXME: interlock writes to this chunk */ - e = dm_lookup_exception(&s->complete, old_chunk); if (!e) { DMERR("Corruption detected: exception for block %llu is " @@ -801,14 +817,32 @@ static int __remove_single_exception_chunk(struct dm_snapshot *s, return 0; } -static int remove_single_exception_chunk(struct dm_snapshot *s, - chunk_t old_chunk) +static void flush_bios(struct bio *bio); + +static int remove_single_exception_chunk(struct dm_snapshot *s) { - int r = 0; + struct bio *b = NULL; + int r; + chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; down_write(&s->lock); - r = __remove_single_exception_chunk(s, old_chunk); + + /* + * Process chunks (and associated exceptions) in reverse order + * so that dm_consecutive_chunk_count_dec() accounting works. + */ + do { + r = __remove_single_exception_chunk(s, old_chunk); + if (r) + goto out; + } while (old_chunk-- > s->first_merging_chunk); + + b = __release_queued_bios_after_merge(s); + +out: up_write(&s->lock); + if (b) + flush_bios(b); return r; } @@ -844,9 +878,6 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) /* TODO: use larger I/O size once we verify that kcopyd handles it */ - if (remove_single_exception_chunk(s, old_chunk) < 0) - goto shut; - dest.bdev = s->origin->bdev; dest.sector = chunk_to_sector(s->store, old_chunk); dest.count = min((sector_t)s->store->chunk_size, @@ -856,6 +887,13 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) src.sector = chunk_to_sector(s->store, new_chunk); src.count = dest.count; + down_write(&s->lock); + s->first_merging_chunk = old_chunk; + s->num_merging_chunks = 1; + up_write(&s->lock); + + /* !!! 
FIXME: wait until writes to this chunk drain */ + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); return; @@ -863,9 +901,12 @@ shut: merge_shutdown(s); } +static void error_bios(struct bio *bio); + static void merge_callback(int read_err, unsigned long write_err, void *context) { struct dm_snapshot *s = context; + struct bio *b = NULL; if (read_err || write_err) { if (read_err) @@ -875,16 +916,25 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) goto shut; } - if (s->store->type->commit_merge(s->store, 1) < 0) { + if (s->store->type->commit_merge(s->store, + s->num_merging_chunks) < 0) { DMERR("Write error in exception store: shutting down merge"); goto shut; } + if (remove_single_exception_chunk(s) < 0) + goto shut; + snapshot_merge_next_chunks(s); return; shut: + down_write(&s->lock); + b = __release_queued_bios_after_merge(s); + up_write(&s->lock); + error_bios(b); + merge_shutdown(s); } @@ -983,6 +1033,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); s->state_bits = 0; + s->first_merging_chunk = 0; + s->num_merging_chunks = 0; + bio_list_init(&s->bios_queued_during_merge); /* Allocate hash table for COW data */ if (init_hash_tables(s)) { @@ -1539,6 +1592,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, * For each chunk, if there is an existing exception, it is used to * redirect I/O to the cow device. Otherwise I/O is sent to the origin, * which in turn might generate exceptions in other snapshots. + * If merging is currently taking place on the chunk in question, the + * I/O is deferred by adding it to s->bios_queued_during_merge. */ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) @@ -1559,7 +1614,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, chunk = sector_to_chunk(s->store, bio->bi_sector); - down_read(&s->lock); + down_write(&s->lock); /* Full snapshots are not usable */ if (!s->valid) { @@ -1570,6 +1625,16 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, /* If the block is already remapped - use that */ e = dm_lookup_exception(&s->complete, chunk); if (e) { + /* Queue writes overlapping with chunks being merged */ + if (bio_rw(bio) == WRITE && + chunk >= s->first_merging_chunk && + chunk < (s->first_merging_chunk + + s->num_merging_chunks)) { + bio->bi_bdev = s->origin->bdev; + bio_list_add(&s->bios_queued_during_merge, bio); + r = DM_MAPIO_SUBMITTED; + goto out_unlock; + } remap_exception(s, e, bio, chunk); goto out_unlock; } @@ -1577,12 +1642,12 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, bio->bi_bdev = s->origin->bdev; if (bio_rw(bio) == WRITE) { - up_read(&s->lock); + up_write(&s->lock); return do_origin(s->origin, bio); } out_unlock: - up_read(&s->lock); + up_write(&s->lock); return r; } From 17aa03326d40614db94bc51fbbc92df628a5c97c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:33 +0000 Subject: [PATCH 76/80] dm snapshot: delay merging a chunk until writes to it complete Track writes to chunks that are currently being merged and delay merging a chunk until all writes to that chunk finish. 
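The drain logic can be pictured with a small user-space sketch; the per-chunk counters and condition variable below stand in for the kernel's tracked-chunk hash and wait queue, and every name is illustrative rather than taken from the driver.

/*
 * Toy model of "wait until writes to this chunk drain":
 * check_for_conflicting_io() blocks while a chunk has writes in
 * flight; write_completed() drops the count and wakes waiters.
 */
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int writes_in_flight[16];	/* toy per-chunk write counters */

static void check_for_conflicting_io(unsigned int chunk)
{
	pthread_mutex_lock(&lock);
	while (writes_in_flight[chunk])
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

static void write_completed(unsigned int chunk)
{
	pthread_mutex_lock(&lock);
	if (!--writes_in_flight[chunk])
		pthread_cond_broadcast(&drained);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	writes_in_flight[3] = 1;	/* one write outstanding on chunk 3 */
	write_completed(3);		/* it finishes... */
	check_for_conflicting_io(3);	/* ...so merging may proceed */
	return 0;
}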
Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 91a47c522b09..bc52776c69cc 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -892,7 +892,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) s->num_merging_chunks = 1; up_write(&s->lock); - /* !!! FIXME: wait until writes to this chunk drain */ + __check_for_conflicting_io(s, old_chunk); dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); return; @@ -1635,7 +1635,11 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, r = DM_MAPIO_SUBMITTED; goto out_unlock; } + remap_exception(s, e, bio, chunk); + + if (bio_rw(bio) == WRITE) + map_context->ptr = track_chunk(s, chunk); goto out_unlock; } From 73dfd078cf8bfee4018fb22f1e2a24f2e05b69dc Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:34 +0000 Subject: [PATCH 77/80] dm snapshot: trigger exceptions in remaining snapshots during merge When there is one merging snapshot and other non-merging snapshots, snapshot_merge_next_chunks() must make exceptions in the non-merging snapshots. Use a sequence count to resolve the race between I/O to chunks that are about to be merged. The count increases each time an exception reallocation finishes. Use wait_event() to wait until the count changes. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 83 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index bc52776c69cc..1498704467a7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -270,6 +270,10 @@ struct origin { static struct list_head *_origins; static struct rw_semaphore _origins_lock; +static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); +static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); +static uint64_t _pending_exceptions_done_count; + static int init_origin_hash(void) { int i; @@ -847,14 +851,38 @@ out: return r; } +static int origin_write_extent(struct dm_snapshot *merging_snap, + sector_t sector, unsigned chunk_size); + static void merge_callback(int read_err, unsigned long write_err, void *context); +static uint64_t read_pending_exceptions_done_count(void) +{ + uint64_t pending_exceptions_done; + + spin_lock(&_pending_exceptions_done_spinlock); + pending_exceptions_done = _pending_exceptions_done_count; + spin_unlock(&_pending_exceptions_done_spinlock); + + return pending_exceptions_done; +} + +static void increment_pending_exceptions_done_count(void) +{ + spin_lock(&_pending_exceptions_done_spinlock); + _pending_exceptions_done_count++; + spin_unlock(&_pending_exceptions_done_spinlock); + + wake_up_all(&_pending_exceptions_done); +} + static void snapshot_merge_next_chunks(struct dm_snapshot *s) { int r; chunk_t old_chunk, new_chunk; struct dm_io_region src, dest; + uint64_t previous_count; BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) @@ -887,6 +915,24 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) src.sector = chunk_to_sector(s->store, new_chunk); src.count = dest.count; + /* + * Reallocate any exceptions needed in other snapshots then + * wait for the pending exceptions to complete.
+ * Each time any pending exception (globally on the system) + * completes we are woken and repeat the process to find out + * if we can proceed. While this may not seem a particularly + * efficient algorithm, it is not expected to have any + * significant impact on performance. + */ + previous_count = read_pending_exceptions_done_count(); + while (origin_write_extent(s, dest.sector, s->store->chunk_size)) { + wait_event(_pending_exceptions_done, + (read_pending_exceptions_done_count() != + previous_count)); + /* Retry after the wait, until all exceptions are done. */ + previous_count = read_pending_exceptions_done_count(); + } + down_write(&s->lock); s->first_merging_chunk = old_chunk; s->num_merging_chunks = 1; @@ -1372,6 +1418,8 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) origin_bios = bio_list_get(&pe->origin_bios); free_pending_exception(pe); + increment_pending_exceptions_done_count(); + up_write(&s->lock); /* Submit any pending write bios */ @@ -1962,6 +2010,41 @@ static int do_origin(struct dm_dev *origin, struct bio *bio) return r; } +/* + * Trigger exceptions in all non-merging snapshots. + * + * The chunk size of the merging snapshot may be larger than the chunk + * size of some other snapshot so we may need to reallocate multiple + * chunks in other snapshots. + * + * We scan all the overlapping exceptions in the other snapshots. + * Returns 1 if anything was reallocated and must be waited for, + * otherwise returns 0. + * + * size must be a multiple of merging_snap's chunk_size. + */ +static int origin_write_extent(struct dm_snapshot *merging_snap, + sector_t sector, unsigned size) +{ + int must_wait = 0; + sector_t n; + struct origin *o; + + /* + * The origin's __minimum_chunk_size() got stored in split_io + * by snapshot_merge_resume(). + */ + down_read(&_origins_lock); + o = __lookup_origin(merging_snap->origin->bdev); + for (n = 0; n < size; n += merging_snap->ti->split_io) + if (__origin_write(&o->snapshots, sector + n, NULL) == + DM_MAPIO_SUBMITTED) + must_wait = 1; + up_read(&_origins_lock); + + return must_wait; +} + /* * Origin: maps a linear range of a device, with hooks for snapshotting. */ From 8a2d528620e228ddfd0df9cec0a16e034ff8db1d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Dec 2009 23:52:34 +0000 Subject: [PATCH 78/80] dm snapshot: merge consecutive chunks together s->store->type->prepare_merge returns the number of chunks that can be copied linearly working backwards from the returned chunk number. For example, if it returns 3 chunks with old_chunk == 10 and new_chunk == 20, then chunk 20 can be copied to 10, chunk 19 to 9 and 18 to 8. Until now kcopyd only copied one chunk at a time. This patch now copies the full set at once. Consequently, snapshot_merge_next_chunks() needs to delay the merging of all chunks if any have writes in progress, not just the first chunk in the region that is to be merged. snapshot-merge's performance is now comparable to the original snapshot-origin target.
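The rewind arithmetic can be checked with a short stand-alone program using the numbers from the example above (a sketch only; chunk_t is redefined locally and nothing here is taken from the driver beyond the two assignments being modelled):

/*
 * prepare_merge() reports linear_chunks counting backwards from the
 * returned chunk numbers, so the copy region is rewound to its start
 * before one large kcopyd I/O is issued.
 */
#include <stdio.h>

typedef unsigned long long chunk_t;

int main(void)
{
	chunk_t old_chunk = 10, new_chunk = 20;	/* as in the example */
	int i, linear_chunks = 3;

	/* Adjust old_chunk and new_chunk to the start of the region */
	old_chunk = old_chunk + 1 - linear_chunks;
	new_chunk = new_chunk + 1 - linear_chunks;

	for (i = 0; i < linear_chunks; i++)
		printf("cow chunk %llu -> origin chunk %llu\n",
		       new_chunk + i, old_chunk + i);
	return 0;
}

This prints the same mapping as the commit message (20 -> 10, 19 -> 9, 18 -> 8), only issued as a single copy of three consecutive chunks.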
Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 1498704467a7..bb4b733697b3 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -879,9 +879,10 @@ static void increment_pending_exceptions_done_count(void) static void snapshot_merge_next_chunks(struct dm_snapshot *s) { - int r; + int i, linear_chunks; chunk_t old_chunk, new_chunk; struct dm_io_region src, dest; + sector_t io_size; uint64_t previous_count; BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); @@ -896,20 +897,28 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) goto shut; } - r = s->store->type->prepare_merge(s->store, &old_chunk, &new_chunk); - if (r <= 0) { - if (r < 0) + linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, + &new_chunk); + if (linear_chunks <= 0) { + if (linear_chunks < 0) DMERR("Read error in exception store: " "shutting down merge"); goto shut; } - /* TODO: use larger I/O size once we verify that kcopyd handles it */ + /* Adjust old_chunk and new_chunk to reflect start of linear region */ + old_chunk = old_chunk + 1 - linear_chunks; + new_chunk = new_chunk + 1 - linear_chunks; + + /* + * Use one (potentially large) I/O to copy all 'linear_chunks' + * from the exception store to the origin + */ + io_size = linear_chunks * s->store->chunk_size; dest.bdev = s->origin->bdev; dest.sector = chunk_to_sector(s->store, old_chunk); - dest.count = min((sector_t)s->store->chunk_size, - get_dev_size(dest.bdev) - dest.sector); + dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); src.bdev = s->cow->bdev; src.sector = chunk_to_sector(s->store, new_chunk); @@ -925,7 +934,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) * significant impact on performance. */ previous_count = read_pending_exceptions_done_count(); - while (origin_write_extent(s, dest.sector, s->store->chunk_size)) { + while (origin_write_extent(s, dest.sector, io_size)) { wait_event(_pending_exceptions_done, (read_pending_exceptions_done_count() != previous_count)); @@ -935,10 +944,12 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) down_write(&s->lock); s->first_merging_chunk = old_chunk; - s->num_merging_chunks = 1; + s->num_merging_chunks = linear_chunks; up_write(&s->lock); - __check_for_conflicting_io(s, old_chunk); + /* Wait until writes to all 'linear_chunks' drain */ + for (i = 0; i < linear_chunks; i++) + __check_for_conflicting_io(s, old_chunk + i); dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); return; From d8ddb1cfff0070479c1f4a07c1d4a48ef8cb188e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 10 Dec 2009 23:52:35 +0000 Subject: [PATCH 79/80] dm snapshot: report merge failure in status Set 'merge_failed' flag if a snapshot fails to merge. Update snapshot_status() to report "Merge failed" if 'merge_failed' is set. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index bb4b733697b3..4c80e82f941c 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -71,7 +71,10 @@ struct dm_snapshot { /* List of snapshots per Origin */ struct list_head list; - /* You can't use a snapshot if this is 0 (e.g. if full) */ + /* + * You can't use a snapshot if this is 0 (e.g. 
if full). + * A snapshot-merge target never clears this. + */ int valid; /* Origin writes don't trigger exceptions until this is set */ @@ -107,6 +110,21 @@ struct dm_snapshot { spinlock_t tracked_chunk_lock; struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; + /* + * The merge operation failed if this flag is set. + * Failure modes are handled as follows: + * - I/O error reading the header + * => don't load the target; abort. + * - Header does not have "valid" flag set + * => use the origin; forget about the snapshot. + * - I/O error when reading exceptions + * => don't load the target; abort. + * (We can't use the intermediate origin state.) + * - I/O error while merging + * => stop merging; set merge_failed; process I/O normally. + */ + int merge_failed; + /* Wait for events based on state_bits */ unsigned long state_bits; @@ -900,9 +918,13 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, &new_chunk); if (linear_chunks <= 0) { - if (linear_chunks < 0) + if (linear_chunks < 0) { DMERR("Read error in exception store: " "shutting down merge"); + down_write(&s->lock); + s->merge_failed = 1; + up_write(&s->lock); + } goto shut; } @@ -988,6 +1010,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) shut: down_write(&s->lock); + s->merge_failed = 1; b = __release_queued_bios_after_merge(s); up_write(&s->lock); error_bios(b); @@ -1090,6 +1113,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); s->state_bits = 0; + s->merge_failed = 0; s->first_merging_chunk = 0; s->num_merging_chunks = 0; bio_list_init(&s->bios_queued_during_merge); @@ -1835,6 +1859,8 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, if (!snap->valid) DMEMIT("Invalid"); + else if (snap->merge_failed) + DMEMIT("Merge failed"); else { if (snap->store->type->usage) { sector_t total_sectors, sectors_allocated, From d2fdb776e08d4231d7e86a879cc663a93913c202 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 10 Dec 2009 23:52:36 +0000 Subject: [PATCH 80/80] dm snapshot: use merge origin if snapshot invalid If the snapshot we are merging became invalid (e.g. it ran out of space) redirect all I/O directly to the origin device. Signed-off-by: Mikulas Patocka Reviewed-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 4c80e82f941c..ee8eb283650d 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1699,11 +1699,9 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, down_write(&s->lock); - /* Full snapshots are not usable */ - if (!s->valid) { - r = -EIO; - goto out_unlock; - } + /* Full merging snapshots are redirected to the origin */ + if (!s->valid) + goto redirect_to_origin; /* If the block is already remapped - use that */ e = dm_lookup_exception(&s->complete, chunk); @@ -1726,6 +1724,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, goto out_unlock; } +redirect_to_origin: bio->bi_bdev = s->origin->bdev; if (bio_rw(bio) == WRITE) {