From 663dfbb07774e0fe1049e8db3054a08500122f18 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2014 12:28:37 +0100 Subject: [PATCH] Btrfs: deal with convert_extent_bit errors to avoid fs corruption When committing a transaction or a log, we look for btree extents that need to be durably persisted by searching for ranges in a io tree that have some bits set (EXTENT_DIRTY or EXTENT_NEW). We then attempt to clear those bits and set the EXTENT_NEED_WAIT bit, with calls to the function convert_extent_bit, and then start writeback for the extents. That function however can return an error (at the moment only -ENOMEM is possible, specially when it does GFP_ATOMIC allocation requests through alloc_extent_state_atomic) - that means the ranges didn't got the EXTENT_NEED_WAIT bit set (or at least not for the whole range), which in turn means a call to btrfs_wait_marked_extents() won't find those ranges for which we started writeback, causing a transaction commit or a log commit to persist a new superblock without waiting for the writeback of extents in that range to finish first. Therefore if a crash happens after persisting the new superblock and before writeback finishes, we have a superblock pointing to roots that weren't fully persisted or roots that point to nodes or leafs that weren't fully persisted, causing all sorts of unexpected/bad behaviour as we endup reading garbage from disk or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this later case we end up getting error messages like "parent transid verify failed on X wanted Y found Z" when reading btree nodes/leafs from disk). Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 92 ++++++++++++++++++++++++++++++++++-------- fs/btrfs/transaction.h | 2 - 2 files changed, 76 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dcaae3616728..04dbc800c209 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } +static void clear_btree_io_tree(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } + spin_unlock(&tree->lock); +} + static noinline void switch_commit_roots(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { @@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, root->commit_root = btrfs_root_node(root); if (is_fstree(root->objectid)) btrfs_unpin_free_ino(root); + clear_btree_io_tree(&root->dirty_log_pages); } up_write(&fs_info->commit_root_sem); } @@ -828,17 +855,38 @@ int btrfs_write_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { - convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); - cached_state = NULL; - err = filemap_fdatawrite_range(mapping, start, end); + bool wait_writeback = false; + + err = convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state, GFP_NOFS); + /* + * convert_extent_bit can return -ENOMEM, which is most of the + * time a temporary error. So when it happens, ignore the error + * and wait for writeback of this range to finish - because we + * failed to set the bit EXTENT_NEED_WAIT for the range, a call + * to btrfs_wait_marked_extents() would not know that writeback + * for this range started and therefore wouldn't wait for it to + * finish - we don't want to commit a superblock that points to + * btree nodes/leafs for which writeback hasn't finished yet + * (and without errors). + * We cleanup any entries left in the io tree when committing + * the transaction (through clear_btree_io_tree()). + */ + if (err == -ENOMEM) { + err = 0; + wait_writeback = true; + } + if (!err) + err = filemap_fdatawrite_range(mapping, start, end); if (err) werr = err; + else if (wait_writeback) + werr = filemap_fdatawait_range(mapping, start, end); + cached_state = NULL; cond_resched(); start = end + 1; } - if (err) - werr = err; return werr; } @@ -862,9 +910,21 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { - clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - 0, 0, &cached_state, GFP_NOFS); - err = filemap_fdatawait_range(mapping, start, end); + /* + * Ignore -ENOMEM errors returned by clear_extent_bit(). + * When committing the transaction, we'll remove any entries + * left in the io tree. For a log commit, we don't remove them + * after committing the log because the tree can be accessed + * concurrently - we do it only at transaction commit time when + * it's safe to do it (through clear_btree_io_tree()). + */ + err = clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + 0, 0, &cached_state, GFP_NOFS); + if (err == -ENOMEM) + err = 0; + if (!err) + err = filemap_fdatawait_range(mapping, start, end); if (err) werr = err; cond_resched(); @@ -919,17 +979,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return 0; } -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, +static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!trans || !trans->transaction) { - struct inode *btree_inode; - btree_inode = root->fs_info->btree_inode; - return filemap_write_and_wait(btree_inode->i_mapping); - } - return btrfs_write_and_wait_marked_extents(root, + int ret; + + ret = btrfs_write_and_wait_marked_extents(root, &trans->transaction->dirty_pages, EXTENT_DIRTY); + clear_btree_io_tree(&trans->transaction->dirty_pages); + + return ret; } /* diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8f40e1a5d2d..b3f5b40aab22 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -145,8 +145,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root);