From cfa7a9ccda711ac6ab8f0d17c3a9b540092d305a Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 17 Dec 2012 06:38:51 +0000 Subject: [PATCH 01/36] Btrfs: fix memory leak in name_cache_insert() We should free name_cache_entry before returning from the error handling code. Signed-off-by: Tsutomu Itoh --- fs/btrfs/send.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 54454542ad40..321b7fb4e441 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx, (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); - if (!nce_head) + if (!nce_head) { + kfree(nce); return -ENOMEM; + } INIT_LIST_HEAD(nce_head); ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); From cc975eb4605c5765a5d5e7a51d24ba5a1cda269e Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 7 Dec 2012 10:09:19 +0000 Subject: [PATCH 02/36] btrfs: get the device in write mode when deleting it When we're deleting the device we should get it in write mode since we're going to re-write the super block magic on that device. And it should fail if the device is read-only. Signed-off-by: Lukas Czerner --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa74012..86279c37de64 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } else { ret = btrfs_get_bdev_and_sb(device_path, - FMODE_READ | FMODE_EXCL, + FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder, 0, &bdev, &bh); if (ret) From d86e56cf7d3669dd292012ac82b986bd1573b6cc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 15 Nov 2012 11:35:41 +0000 Subject: [PATCH 03/36] Btrfs: disable qgroup id 0 Qgroup id 0 is a special number, we should set the id of a qgroup to 0. Fix it. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7624212ae926..dd8e3448fe8f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3698,6 +3698,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto drop_write; } + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); From 5c39da5b6ca23e68e7acea7f4c01470383475214 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 22 Oct 2012 11:39:53 +0000 Subject: [PATCH 04/36] Btrfs: do not delete a subvolume which is in a R/O subvolume Step to reproduce: # mkfs.btrfs # mount # btrfs sub create /subv0 # btrfs sub snap /subv0/snap0 # change /subv0 from R/W to R/O # btrfs sub del /subv0/snap0 We deleted the snapshot successfully. I think we should not be able to delete the snapshot since the parent subvolume is R/O. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dd8e3448fe8f..5a72896bd769 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2095,13 +2095,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = inode_permission(inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; - - /* check if subvolume may be deleted by a non-root user */ - err = btrfs_may_delete(dir, dentry, 1); - if (err) - goto out_dput; } + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; From dba60f3f5d564167118cad151a7d41dfe8d2a5f7 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 09:19:51 +0000 Subject: [PATCH 05/36] Btrfs: fix resize a readonly device We should not resize a readonly device, fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a72896bd769..0de21213d05d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1362,6 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", @@ -1369,9 +1370,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - if (device->fs_devices && device->fs_devices->seeding) { + + if (!device->writeable) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", + "readonly device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_free; From 97547676570b3bd908560741315bf4b7d635bcf5 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 10:38:50 +0000 Subject: [PATCH 06/36] Btrfs: fix missing write access release in btrfs_ioctl_resize() We forget to give up the write access after we find some device operation is going on. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de21213d05d..982c0b9ceea5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1339,6 +1339,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINPROGRESS; } From 72bcd99d450cb1dde8bf13c3b65fc5883b2a3893 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 15:16:34 -0500 Subject: [PATCH 07/36] Btrfs: set flushing if we're limited flushing We still need to say we're flushing if we're limit flushing to keep somebody from coming in and stealing our reservation. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d133edfcd449..61fefda74ff5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3997,7 +3997,7 @@ again: * We make the other tasks wait for the flush only when we can flush * all things. */ - if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; } From f3fe820c20a1a36c790545184e734e78d61cd68d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 7 Jan 2013 17:03:21 -0500 Subject: [PATCH 08/36] Btrfs: add orphan before truncating pagecache Running xfstests 83 in a loop would sometimes fail the fsck. This happens because if we invalidate a page that already has an ordered extent setup for it we will complete the ordered extent ourselves, assuming that the truncate will clean everything up. The problem with this is there is plenty of time for the truncate to fail after we've done this work. So to fix this we need to add the orphan item first to make sure the cleanup gets done properly, and then we can truncate the pagecache and all that stuff and be safe. This fixes the btrfsck failures I was seeing while running 83 in a loop. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 53 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67ed24ae86bb..4ddcf79e7894 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + + /* 1 for the orphan item deletion. */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + goto out; + ret = btrfs_truncate(inode); } else { nr_unlink++; @@ -3783,9 +3795,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags); + /* + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * We need to do this in case we fail at _any_ point during the + * actual truncate. Once we do the truncate_setsize we could + * invalidate pages which forces any outstanding ordered io to + * be instantly completed which will give us extents that need + * to be truncated. If we fail to get an orphan inode down we + * could have left over extents that were never meant to live, + * so we need to garuntee from this point on that everything + * will be consistent. + */ + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); ret = btrfs_truncate(inode); + if (ret && inode->i_nlink) + btrfs_orphan_del(NULL, inode); } return ret; @@ -6929,11 +6966,9 @@ static int btrfs_truncate(struct inode *inode) /* * 1 for the truncate slack space - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion * 1 for updating the inode. */ - trans = btrfs_start_transaction(root, 4); + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; @@ -6944,12 +6979,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_end_transaction(trans, root); - goto out; - } - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -7018,12 +7047,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; - } else if (ret && inode->i_nlink > 0) { - /* - * Failed to do the truncate, remove us from the in memory - * orphan list. - */ - ret = btrfs_orphan_del(NULL, inode); } if (trans) { From ac5c93005b7073732e268606688fb6c821d5310e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:24 +0000 Subject: [PATCH 09/36] Btrfs: let allocation start from the right raid type This'd avoid us empty looping. Say we have only one disk and the metadata raid type will be defaultly DUP, and we do not need to start from index=0(RAID10) and get over two empty loops to index=2(DUP). Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 61fefda74ff5..aeba53191ece 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5560,7 +5560,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; - int index = 0; + int index = __get_raid_index(data); int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; From 3268a2468eb6a31af89930cbae58a62fe6ca6d2d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 28 Dec 2012 09:33:19 +0000 Subject: [PATCH 10/36] Btrfs: reset path lock state to zero We forgot to reset the path lock state to zero after we unlock the path block, and this can lead to the ASSERT checker in tree unlock API. Reported-by: Slava Barinov Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aeba53191ece..85b8454d9608 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6788,11 +6788,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, &wc->flags[level]); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return ret; } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return 1; } } From 1214b53f90131fee1f950010c43e92455fe598ab Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 03:53:08 +0000 Subject: [PATCH 11/36] Btrfs: fix off-by-one in lseek Lock end is inclusive. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 20452c110d7d..fa48051484b8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2242,6 +2242,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) if (lockend <= lockstart) lockend = lockstart + root->sectorsize; + lockend--; len = lockend - lockstart + 1; len = max_t(u64, len, root->sectorsize); From f9e4fb53938de5db01950c9dfe479703b2f5c964 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 10:10:12 +0000 Subject: [PATCH 12/36] Btrfs: fix a bug when llseek for delalloc bytes behind prealloc extents xfstests case 285 complains. It it because btrfs did not try to find unwritten delalloc bytes(only dirty pages, not yet writeback) behind prealloc extents, it ends up finding nothing while we're with SEEK_DATA. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 9 ++++++--- fs/btrfs/inode.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fa48051484b8..841cfe3be0e0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2309,9 +2309,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) } } - *offset = start; - free_extent_map(em); - break; + if (!test_bit(EXTENT_FLAG_PREALLOC, + &em->flags)) { + *offset = start; + free_extent_map(em); + break; + } } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4ddcf79e7894..ac98384b174e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5623,10 +5623,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag return em; if (em) { /* - * if our em maps to a hole, there might - * actually be delalloc bytes behind it + * if our em maps to + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. */ - if (em->block_start != EXTENT_MAP_HOLE) + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return em; else hole_em = em; @@ -5708,6 +5711,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag */ em->block_start = hole_em->block_start; em->block_len = hole_len; + if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { em->start = range_start; em->len = found; From f276795627045a3c599a60b476767861e4318c7d Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Jan 2013 19:37:58 +0000 Subject: [PATCH 13/36] btrfs: fix btrfs_cont_expand() freeing IS_ERR em btrfs_cont_expand() tries to free an IS_ERR em as it gets an error from btrfs_get_extent() and breaks out of its loop. An instance of -EEXIST was reported in the wild: https://bugzilla.redhat.com/show_bug.cgi?id=874407 I have no idea if that -EEXIST is surprising, or not. Regardless, this error handling should be cleaned up to handle other reasonable errors (ENOMEM, EIO; whatever). This seemed to be the only buggy freeing of the relatively rare IS_ERR em so I opted to fix the caller rather than teach free_extent_map() to use IS_ERR_OR_NULL(). Signed-off-by: Zach Brown Reviewed-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ac98384b174e..3d2c64d4734a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3677,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) block_end - cur_offset, 0); if (IS_ERR(em)) { err = PTR_ERR(em); + em = NULL; break; } last_byte = min(extent_map_end(em), block_end); From 3972f2603d8570effaf633cea52b12c7c2773c11 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 12 Jan 2013 02:57:22 +0000 Subject: [PATCH 14/36] btrfs: update timestamps on truncate() truncate() vs. ftruncate() differ in the VFS; truncate() doesn't set (ATTR_CTIME | ATTR_MTIME), and it's up to the fs to do the timestamp updates if the size changes. Signed-off-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d2c64d4734a..9bc6c40b182d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, @@ -3761,16 +3761,27 @@ next: return err; } -static int btrfs_setsize(struct inode *inode, loff_t newsize) +static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; int ret; if (newsize == oldsize) return 0; + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) + inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize > oldsize) { truncate_pagecache(inode, oldsize, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); @@ -3843,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr->ia_size); + err = btrfs_setsize(inode, attr); if (err) return err; } From ed0fb78fb6aa294a719f8f5654fdff0ec8bc00bc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: [PATCH 15/36] Btrfs: bring back balance pause/resume logic Balance pause/resume logic got broken by 5ac00add (went in into 3.8-rc1 as part of dev-replace merge). Offending commit took a stab at making mutually exclusive volume operations (add_dev, rm_dev, resize, balance, replace_dev) not block behind volume_mutex if another such operation is in progress and instead return an error right away. Balancing front-end relied on the blocking behaviour, so the fix is ugly, but short of a complete rework, it's the best we can do. Reported-by: Liu Bo Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 78 +++++++++++++++++++++++++++++++++++++--------- fs/btrfs/volumes.c | 10 ++++-- 2 files changed, 71 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 982c0b9ceea5..77d8273e394c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3440,8 +3440,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; + bool need_unlock; /* for mut. excl. ops lock */ int ret; - int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3450,14 +3450,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&fs_info->volume_mutex); +again: + if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + need_unlock = true; + goto locked; + } + + /* + * mut. excl. ops lock is locked. Three possibilites: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* this is either (2) or (3) */ + if (!atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + if (!mutex_trylock(&fs_info->volume_mutex)) + goto again; + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !atomic_read(&fs_info->balance_running)) { + /* this is (3) */ + need_unlock = false; + goto locked; + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + goto again; + } else { + /* this is (2) */ + mutex_unlock(&fs_info->balance_mutex); + ret = -EINPROGRESS; + goto out; + } + } else { + /* this is (1) */ + mutex_unlock(&fs_info->balance_mutex); + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINVAL; + goto out; + } + +locked: + BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); if (IS_ERR(bargs)) { ret = PTR_ERR(bargs); - goto out; + goto out_unlock; } if (bargs->flags & BTRFS_BALANCE_RESUME) { @@ -3477,13 +3524,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + if (fs_info->balance_ctl) { ret = -EINPROGRESS; goto out_bargs; } - need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3504,11 +3548,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } do_balance: - ret = btrfs_balance(bctl, bargs); /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount + * Ownership of bctl and mutually_exclusive_operation_running + * goes to to btrfs_balance. bctl is freed in __cancel_balance, + * or, if restriper was paused all the way until unmount, in + * free_fs_info. mutually_exclusive_operation_running is + * cleared in __cancel_balance. */ + need_unlock = false; + + ret = btrfs_balance(bctl, bargs); + if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; @@ -3516,12 +3566,12 @@ do_balance: out_bargs: kfree(bargs); -out: - if (need_to_clear_lock) - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); +out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + if (need_unlock) + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out: mnt_drop_write_file(file); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 86279c37de64..9c84dbe64f18 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2959,6 +2959,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); BUG_ON(ret); + + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, @@ -3138,8 +3140,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, out: if (bctl->flags & BTRFS_BALANCE_RESUME) __cancel_balance(fs_info); - else + else { kfree(bctl); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + } return ret; } @@ -3156,7 +3160,6 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -3179,7 +3182,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); @@ -3233,6 +3235,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); From 2c0c9da02a2c4289350da6e54202a86602c0f926 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: [PATCH 16/36] Btrfs: fix "mutually exclusive op is running" error code The error code that is returned in response to starting a mutually exclusive operation when there is one already running got silently changed from EINVAL to EINPROGRESS by 5ac00add. Returning EINPROGRESS to, say, add_dev, when rm_dev is running is misleading. Furthermore, the operation itself may want to use EINPROGRESS for other purposes. Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 77d8273e394c..259dd52d8785 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1340,7 +1340,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2192,7 +2192,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } ret = mnt_want_write_file(file); if (ret) { @@ -2266,7 +2266,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2303,7 +2303,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); From 18f39c416d18d74ac11d157e44247253d3fa30ae Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: [PATCH 17/36] Btrfs: fix unlock order in btrfs_ioctl_resize Fix unlock order in btrfs_ioctl_resize(). Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 259dd52d8785..fcf1b1b40082 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1446,8 +1446,8 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } From 4ac20c70b0734b65662ded735e5f6ba0415bdb71 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: [PATCH 18/36] Btrfs: fix unlock order in btrfs_ioctl_rm_dev Fix unlock order in btrfs_ioctl_rm_dev(). Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fcf1b1b40082..f5c1c150d9f3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2319,8 +2319,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } From 25122d15e21cf252e91e4cad7cea760f97df29f1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: [PATCH 19/36] Btrfs: reorder locks and sanity checks in btrfs_ioctl_defrag Operation-specific check (whether subvol is readonly or not) should go after the mutual exclusiveness check. Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f5c1c150d9f3..afbf3ac2079d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2186,19 +2186,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; - if (btrfs_root_readonly(root)) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINVAL; } - ret = mnt_want_write_file(file); - if (ret) { - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); - return ret; + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; } switch (inode->i_mode & S_IFMT) { @@ -2250,8 +2251,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } From ff24858c65d9c518af41aad22fb964685351051a Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 17 Jan 2013 01:22:08 -0700 Subject: [PATCH 20/36] Btrfs: ignore orphan qgroup relations If a qgroup that has still assignments is deleted by the user, the corresponding relations are left in the tree. This leads to an unmountable filesystem. With this patch, those relations are simple ignored. Reported-by: Eric Hopper Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fe9d02c45f8e..28f2b39f6a25 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -379,6 +379,13 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); + if (ret == -ENOENT) { + printk(KERN_WARNING + "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + (unsigned long long)found_key.objectid, + (unsigned long long)found_key.offset); + ret = 0; /* ignore the error */ + } if (ret) goto out; next2: From 2cf687039676c2b6e1ee96b0b89090aca94babcd Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 17 Jan 2013 01:22:09 -0700 Subject: [PATCH 21/36] Btrfs: prevent qgroup destroy when there are still relations Currently you can just destroy a qgroup even though it is in use by other qgroups or has qgroups assigned to it. This patch prevents destruction of qgroups unless they are completely unused. Otherwise destroy will return EBUSY. Reported-by: Eric Hopper Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 28f2b39f6a25..a5c856234323 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -963,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; int ret = 0; quota_root = fs_info->quota_root; if (!quota_root) return -EINVAL; + /* check if there are no relations to this qgroup */ + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { + spin_unlock(&fs_info->qgroup_lock); + return -EBUSY; + } + } + spin_unlock(&fs_info->qgroup_lock); + ret = del_qgroup_item(trans, quota_root, qgroupid); spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); - spin_unlock(&fs_info->qgroup_lock); return ret; From a105bb88f46b60de2adf1ee98745bd59362b09ab Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 21 Jan 2013 15:15:56 +0200 Subject: [PATCH 22/36] Btrfs: fix a regression in balance usage filter Commit 3fed40cc ("Btrfs: cleanup duplicated division functions"), which was merged into 3.8-rc1, has introduced a regression by removing logic that was guarding us against bad user input. Bring it back. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c84dbe64f18..469609838913 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2614,7 +2614,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (bargs->usage == 0) + user_thresh = 0; + else if (bargs->usage > 100) + user_thresh = cache->key.offset; + else + user_thresh = div_factor_fine(cache->key.offset, + bargs->usage); + if (chunk_used < user_thresh) ret = 0; From 201a90389424d6771d24fc5d72f7e34cb4a8f967 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jan 2013 12:02:07 -0500 Subject: [PATCH 23/36] Btrfs: do not allow logged extents to be merged or removed We drop the extent map tree lock while we're logging extents, so somebody could come in and merge another extent into this one and screw up our logging, or they could even remove us from the list which would keep us from logging the extent or freeing our ref on it, so we need to make sure to not clear LOGGING until after the extent is logged, and then we can merge it to adjacent extents. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 13 ++++++++++++- fs/btrfs/extent_map.h | 1 + fs/btrfs/tree-log.c | 5 +++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fff2c28497b6..ed88f5ee4bea 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) return 0; + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -256,7 +260,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - list_move(&em->list, &tree->modified_extents); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; @@ -281,6 +286,12 @@ out: } +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + try_merge_map(tree, em); +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 922943ce29e8..c6598c89cff8 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7e45d4..de8899b04d69 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3410,13 +3410,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ if (ret) { + clear_em_logging(tree, em); free_extent_map(em); continue; } @@ -3424,8 +3424,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); ret = log_one_extent(trans, inode, root, em, path); - free_extent_map(em); write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); From b0175117b9376a69978bbe80af26fb95dddbd53e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 11:39:19 -0500 Subject: [PATCH 24/36] Btrfs: fix panic when recovering tree log A user reported a BUG_ON(ret) that occured during tree log replay. Ret was -EAGAIN, so what I think happened is that we removed an extent that covered a bitmap entry and an extent entry. We remove the part from the bitmap and return -EAGAIN and then search for the next piece we want to remove, which happens to be an entire extent entry, so we just free the sucker and return. The problem is ret is still set to -EAGAIN so we trip the BUG_ON(). The user used btrfs-zero-log so I'm not 100% sure this is what happened so I've added a WARN_ON() to catch the other possibility. Thanks, Reported-by: Jan Steffens Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 59ea2e4349c9..0be7a8742a43 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; - int ret = 0; + int ret; + bool re_search = false; spin_lock(&ctl->tree_lock); again: + ret = 0; if (!bytes) goto out_lock; @@ -1879,17 +1881,17 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - /* the tree logging code might be calling us before we - * have fully loaded the free space rbtree for this - * block group. So it is possible the entry won't - * be in the rbtree yet at all. The caching code - * will make sure not to put it in the rbtree if - * the logging code has pinned it. + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. */ + WARN_ON(re_search); goto out_lock; } } + re_search = false; if (!info->bitmap) { unlink_free_space(ctl, info); if (offset == info->offset) { @@ -1935,8 +1937,10 @@ again: } ret = remove_from_bitmap(ctl, info, &offset, &bytes); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + re_search = true; goto again; + } BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); From 192000dda22e02225772e862b92e7c09e5a17d08 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sun, 6 Jan 2013 03:38:22 +0000 Subject: [PATCH 25/36] Btrfs: use right range to find checksum for compressed extents For compressed extents, the range of checksum is covered by disk length, and the disk length is different with ram length, so we need to use disk length instead to get us the right checksum. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index de8899b04d69..9027bb1e7466 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, From e58dd74bccb4317e39e4b675bf9c6cd133608fac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Jan 2013 15:43:09 -0500 Subject: [PATCH 26/36] Btrfs: put csums on the right ordered extent I noticed a WARN_ON going off when adding csums because we were going over the amount of csum bytes that should have been allowed for an ordered extent. This is a leftover from when we used to hold the csums privately for direct io, but now we use the normal ordered sum stuff so we need to make sure and check if we've moved on to another extent so that the csums are added to the right extent. Without this we could end up with csums for bytenrs that don't have extents to cover them yet. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bd38cef42358..94aa53b38721 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - if (!contig && (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset)) { + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; this_sum_bytes = 0; From 8d25a086eb104297e3ba1fdd180b04cfaaa84797 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 15 Jan 2013 06:27:25 +0000 Subject: [PATCH 27/36] Btrfs: Add ACCESS_ONCE() to transaction->abort accesses We may access and update transaction->aborted on the different CPUs without lock, so we need ACCESS_ONCE() wrapper to prevent the compiler from creating unsolicited accesses and make sure we can get the right value. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 99545df1b86c..d8982e9601d3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, function, line, errstr); return; } - trans->transaction->aborted = errno; + ACCESS_ONCE(trans->transaction->aborted) = errno; __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea5..0ef29611fade 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1468,7 +1468,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } - if (cur_trans->aborted) { + /* Stop the commit early if ->aborted is set */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; goto cleanup_transaction; } From 2cba30f172afdfa00f3e844f42f21eb3b972d01c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 15 Jan 2013 06:29:12 +0000 Subject: [PATCH 28/36] Btrfs: fix missed transaction->aborted check First, though the current transaction->aborted check can stop the commit early and avoid unnecessary operations, it is too early, and some transaction handles don't end, those handles may set transaction->aborted after the check. Second, when we commit the transaction, we will wake up some worker threads to flush the space cache and inode cache. Those threads also allocate some transaction handles and may set transaction->aborted if some serious error happens. So we need more check for ->aborted when committing the transaction. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0ef29611fade..f15494699f3b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1575,6 +1575,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* ->aborted might be set after the previous check, so check it */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + goto cleanup_transaction; + } /* * the reloc mutex makes sure that we stop * the balancing code from coming in and moving @@ -1658,6 +1663,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } + btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; From c9f01bfe0ca411b4751d7fdbb9d602035ba52f75 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 16 Jan 2013 11:27:17 +0000 Subject: [PATCH 29/36] Btrfs: fix wrong max device number for single profile The max device number of single profile is 1, not 0 (0 means 'as many as possible'). Fix it. Cc: Liu Bo Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 469609838913..15f6efdf6463 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3507,7 +3507,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { { 1, 1, 2, 2, 2, 2 /* raid1 */ }, { 1, 2, 1, 1, 1, 2 /* dup */ }, { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 0, 1, 1, 1 /* single */ }, + { 1, 1, 1, 1, 1, 1 /* single */ }, }; static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, From 1eafa6c73791e4f312324ddad9cbcaf6a1b6052b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 22 Jan 2013 10:49:00 +0000 Subject: [PATCH 30/36] Btrfs: fix repeated delalloc work allocation btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/ btrfs_queue_worker for this inode, and then it locks the list, checks the head of the list again. But because we don't delete the first inode that it deals with before, it will fetch the same inode. As a result, this function allocates a huge amount of btrfs_delalloc_work structures, and OOM happens. Fix this problem by splice this delalloc list. Reported-by: Alex Lyakas Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 55 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9bc6c40b182d..ca7ace7b7b52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7585,41 +7585,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) */ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { - struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; struct list_head works; + struct list_head splice; int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; INIT_LIST_HEAD(&works); - + INIT_LIST_HEAD(&splice); +again: spin_lock(&root->fs_info->delalloc_lock); - while (!list_empty(head)) { - binode = list_entry(head->next, struct btrfs_inode, + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + while (!list_empty(&splice)) { + binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); + + list_del_init(&binode->delalloc_inodes); + inode = igrab(&binode->vfs_inode); if (!inode) - list_del_init(&binode->delalloc_inodes); + continue; + + list_add_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); spin_unlock(&root->fs_info->delalloc_lock); - if (inode) { - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); - if (!work) { - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (unlikely(!work)) { + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); + cond_resched(); spin_lock(&root->fs_info->delalloc_lock); } spin_unlock(&root->fs_info->delalloc_lock); + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&root->fs_info->delalloc_inodes)) { + spin_unlock(&root->fs_info->delalloc_lock); + goto again; + } + spin_unlock(&root->fs_info->delalloc_lock); + /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -7632,11 +7652,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); + return 0; out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->fs_info->delalloc_lock); + list_splice_tail(&splice, &root->fs_info->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + } return ret; } From 222c81dc3874b4fe98371be665d0447a36447653 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 28 Jan 2013 09:45:20 -0500 Subject: [PATCH 31/36] Btrfs: do not merge logged extents if we've removed them from the tree You can run into this problem where if somebody is fsyncing and writing out the existing extents you will have removed the extent map from the em tree, but it's still valid for the current fsync so we go ahead and write it. The problem is we unconditionally try to merge it back into the em tree, but if we've removed it from the em tree that will cause use after free problems. Fix this to only merge if we are still a part of the tree. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ed88f5ee4bea..9759911dd34e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -289,7 +289,8 @@ out: void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - try_merge_map(tree, em); + if (em->in_tree) + try_merge_map(tree, em); } /** From 0a3404dcff29a7589e8d6ce8c36f97c5411f85b4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 28 Jan 2013 12:34:55 +0000 Subject: [PATCH 32/36] Btrfs: fix wrong sync_writers decrement in btrfs_file_aio_write() If the checks at the beginning of btrfs_file_aio_write() fail, we needn't decrease ->sync_writers, because we have not increased it. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 841cfe3be0e0..a902faab7161 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1595,9 +1595,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err < 0 && num_written > 0) num_written = err; } -out: + if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); +out: sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; From 843fcf35733164076a77ad833c72c32da8228ad0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 28 Jan 2013 12:36:22 +0000 Subject: [PATCH 33/36] Btrfs: fix missing release of the space/qgroup reservation in start_transaction() When we fail to start a transaction, we need to release the reserved free space and qgroup space, fix it. Signed-off-by: Miao Xie Reviewed-by: Jan Schmidt Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f15494699f3b..fc03aa60b684 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -333,12 +333,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, &root->fs_info->trans_block_rsv, num_bytes, flush); if (ret) - return ERR_PTR(ret); + goto reserve_fail; } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); - if (!h) - return ERR_PTR(-ENOMEM); + if (!h) { + ret = -ENOMEM; + goto alloc_fail; + } /* * If we are JOIN_NOLOCK we're already committing a transaction and @@ -365,11 +367,7 @@ again: if (ret < 0) { /* We must get the transaction if we are JOIN_NOLOCK. */ BUG_ON(type == TRANS_JOIN_NOLOCK); - - if (type < TRANS_JOIN_NOLOCK) - sb_end_intwrite(root->fs_info->sb); - kmem_cache_free(btrfs_trans_handle_cachep, h); - return ERR_PTR(ret); + goto join_fail; } cur_trans = root->fs_info->running_transaction; @@ -410,6 +408,19 @@ got_it: if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; return h; + +join_fail: + if (type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); + kmem_cache_free(btrfs_trans_handle_cachep, h); +alloc_fail: + if (num_bytes) + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + num_bytes); +reserve_fail: + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); + return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, From 6f1c36055f96e80031c7fdda3fd5be826b8d7782 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 29 Jan 2013 03:22:10 +0000 Subject: [PATCH 34/36] Btrfs: fix race between snapshot deletion and getting inode While running snapshot testscript created by Mitch and David, the race between autodefrag and snapshot deletion can lead to corruption of dead_root list so that we can get crash on btrfs_clean_old_snapshots(). And besides autodefrag, scrub also does the same thing, ie. read root first and get inode. Here is the story(take autodefrag as an example): (1) when we delete a snapshot or subvolume, it will set its root's refs to zero and do a iput() on its own inode, and if this inode happens to be the only active in-meory one in root's inode rbtree, it will add itself to the global dead_roots list for later cleanup. (2) after (1), the autodefrag thread may read another inode for defrag and the inode is just in the deleted snapshot/subvolume, but all of these are without checking if the root is still valid(refs > 0). So the end up result is adding the deleted snapshot/subvolume's root to the global dead_roots list AGAIN. Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu. So all we need to do is to take the lock to protect 'read root and get inode', since we synchronize to wait for the rcu grace period before adding something to the global dead_roots list. Reported-by: Mitch Harder Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 22 ++++++++++++++++++---- fs/btrfs/scrub.c | 25 ++++++++++++++++++++----- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a902faab7161..b06d289f998f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; + int index; + int ret; /* get the inode */ key.objectid = defrag->root; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(inode_root)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode_root); + ret = PTR_ERR(inode_root); + goto cleanup; + } + if (btrfs_root_refs(&inode_root->root_item) == 0) { + ret = -ENOENT; + goto cleanup; } key.objectid = defrag->ino; @@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode); + ret = PTR_ERR(inode); + goto cleanup; } + srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; } /* diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f245c9..67783e03d121 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -580,20 +580,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) int corrected = 0; struct btrfs_key key; struct inode *inode = NULL; + struct btrfs_fs_info *fs_info; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); - if (IS_ERR(local_root)) + + fs_info = fixup->root->fs_info; + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; - inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -606,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) } if (PageUptodate(page)) { - struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -3180,18 +3188,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) u64 physical_for_dev_replace; u64 len; struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + local_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(local_root)) + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); From 5d1f40202bad12d4c70a2d40a420b30d23a72b1a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 30 Jan 2013 14:17:31 -0500 Subject: [PATCH 35/36] Btrfs: fix missing i_size update If we have an ordered extent before the ordered extent we are currently completing that is after the current disk_i_size we will put our i_size update into that ordered extent so that we do not expose stale data. The problem is that if our disk i_size is updated past the previous ordered extent we won't update the i_size with the pending i_size update. So check the pending i_size update and if its above the current disk i_size we need to go ahead and try to update. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f10731297040..fc6840b53d9d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -836,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size == i_size || offset <= disk_i_size) { + if (disk_i_size == i_size) + goto out; + + /* + * We still need to update disk_i_size if outstanding_isize is greater + * than disk_i_size. + */ + if (offset <= disk_i_size && + (!ordered || ordered->outstanding_isize <= disk_i_size)) goto out; - } /* * walk backward from this ordered extent to disk_i_size. From 59fe4f41976f6331b695ff049296d082cf621823 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 30 Jan 2013 14:31:31 -0500 Subject: [PATCH 36/36] Btrfs: fix possible stale data exposure We specifically do not update the disk i_size if there are ordered extents outstanding for any area between the current disk_i_size and our ordered extent so that we do not expose stale data. The problem is the check we have only checks if the ordered extent starts at or after the current disk_i_size, which doesn't take into account an ordered extent that starts before the current disk_i_size and ends past the disk_i_size. Fix this by checking if the extent ends past the disk_i_size. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index fc6840b53d9d..e5ed56729607 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -877,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, break; if (test->file_offset >= i_size) break; - if (test->file_offset >= disk_i_size) { + if (entry_end(test) > disk_i_size) { /* * we don't update disk_i_size now, so record this * undealt i_size. Or we will not know the real