From 17d217fe970d34720f4f1633dca73a6aa2f3d9d1 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Fri, 12 Dec 2008 10:03:38 -0500 Subject: [PATCH] Btrfs: fix nodatasum handling in balancing code Checksums on data can be disabled by mount option, so it's possible some data extents don't have checksums or have invalid checksums. This causes trouble for data relocation. This patch contains following things to make data relocation work. 1) make nodatasum/nodatacow mount option only affects new files. Checksums and COW on data are only controlled by the inode flags. 2) check the existence of checksum in the nodatacow checker. If checksums exist, force COW the data extent. This ensure that checksum for a given block is either valid or does not exist. 3) update data relocation code to properly handle the case of checksum missing. Signed-off-by: Yan Zheng --- fs/btrfs/compression.c | 9 ++-- fs/btrfs/ctree.h | 5 +- fs/btrfs/extent-tree.c | 50 ++++++++++++++++-- fs/btrfs/extent_io.h | 1 + fs/btrfs/file-item.c | 114 +++++++++++++++++++++++++++++++++++++++-- fs/btrfs/file.c | 8 --- fs/btrfs/inode.c | 74 +++++++++++++++++++++----- 7 files changed, 226 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ad7274137309..2436163d5436 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -124,8 +124,7 @@ static int check_compressed_csum(struct inode *inode, u32 csum; u32 *cb_sum = &cb->sums; - if (btrfs_test_opt(root, NODATASUM) || - btrfs_test_flag(inode, NODATASUM)) + if (btrfs_test_flag(inode, NODATASUM)) return 0; for (i = 0; i < cb->nr_pages; i++) { @@ -671,8 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ atomic_inc(&cb->pending_bios); - if (!btrfs_test_opt(root, NODATASUM) && - !btrfs_test_flag(inode, NODATASUM)) { + if (!btrfs_test_flag(inode, NODATASUM)) { btrfs_lookup_bio_sums(root, inode, comp_bio, sums); } @@ -699,8 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!btrfs_test_opt(root, NODATASUM) && - !btrfs_test_flag(inode, NODATASUM)) { + if (!btrfs_test_flag(inode, NODATASUM)) { btrfs_lookup_bio_sums(root, inode, comp_bio, sums); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8733081d97a3..b89999de4564 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1702,7 +1702,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr); + struct btrfs_root *root, u64 objectid, u64 bytenr); int btrfs_extent_post_op(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); @@ -1789,6 +1789,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 orig_start); int btrfs_add_dead_reloc_root(struct btrfs_root *root); int btrfs_cleanup_reloc_trees(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); /* ctree.c */ int btrfs_previous_item(struct btrfs_root *root, @@ -1994,6 +1995,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, int btrfs_csum_truncate(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 isize); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, + u64 end, struct list_head *list); /* inode.c */ /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 171057a32679..8004695d24d6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1359,7 +1359,7 @@ out: } int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr) + struct btrfs_root *root, u64 objectid, u64 bytenr) { struct btrfs_root *extent_root = root->fs_info->extent_root; struct btrfs_path *path; @@ -1418,8 +1418,9 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, ref_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); ref_root = btrfs_ref_root(leaf, ref_item); - if (ref_root != root->root_key.objectid && - ref_root != BTRFS_TREE_LOG_OBJECTID) { + if ((ref_root != root->root_key.objectid && + ref_root != BTRFS_TREE_LOG_OBJECTID) || + objectid != btrfs_ref_objectid(leaf, ref_item)) { ret = 1; goto out; } @@ -5367,7 +5368,6 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root, if (ret) goto out; } - btrfs_record_root_in_trans(found_root); ret = replace_one_extent(trans, found_root, path, extent_key, &first_key, ref_path, @@ -5534,6 +5534,7 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info, } else { BUG_ON(1); } + BTRFS_I(inode)->index_cnt = group->key.objectid; err = btrfs_orphan_add(trans, inode); out: @@ -5546,6 +5547,47 @@ out: return inode; } +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct list_head list; + size_t offset; + int ret; + u64 disk_bytenr; + + INIT_LIST_HEAD(&list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root, disk_bytenr, + disk_bytenr + len - 1, &list); + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return 0; +} + int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) { struct btrfs_trans_handle *trans; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2d5f67065b69..c5b483a79137 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -16,6 +16,7 @@ #define EXTENT_ORDERED (1 << 9) #define EXTENT_ORDERED_METADATA (1 << 10) #define EXTENT_BOUNDARY (1 << 11) +#define EXTENT_NODATASUM (1 << 12) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) /* flags for bio submission */ diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3ebef871ee6c..df0447632dbd 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -140,6 +140,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } + int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst) { @@ -185,9 +186,16 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, if (ret == -ENOENT || ret == -EFBIG) ret = 0; sum = 0; - printk("no csum found for inode %lu start " - "%llu\n", inode->i_ino, - (unsigned long long)offset); + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_extent_bits(io_tree, offset, + offset + bvec->bv_len - 1, + EXTENT_NODATASUM, GFP_NOFS); + } else { + printk("no csum found for inode %lu " + "start %llu\n", inode->i_ino, + (unsigned long long)offset); + } item = NULL; btrfs_release_path(root, path); goto found; @@ -228,6 +236,106 @@ found: return 0; } +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_csum_item *item; + unsigned long offset; + int ret; + size_t size; + u64 csum_end; + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root->fs_info->csum_root, + &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + if (offset * csum_size < + btrfs_item_size_nr(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root->fs_info->csum_root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) + break; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + size = btrfs_item_size_nr(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * root->sectorsize; + + size = min(csum_end, end + 1) - start; + sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS); + BUG_ON(!sums); + + sector_sum = sums->sums; + sums->bytenr = start; + sums->len = size; + + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + offset *= csum_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (size > 0) { + read_extent_buffer(path->nodes[0], §or_sum->sum, + ((unsigned long)item) + offset, + csum_size); + sector_sum->bytenr = start; + + size -= root->sectorsize; + start += root->sectorsize; + offset += csum_size; + sector_sum++; + } + list_add_tail(&sums->list, list); + + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 71bfe3a6a444..507081059d97 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1059,14 +1059,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + count) >> PAGE_CACHE_SHIFT; - /* - * if this is a nodatasum mount, force summing off for the inode - * all the time. That way a later mount with summing on won't - * get confused - */ - if (btrfs_test_opt(root, NODATASUM)) - btrfs_set_flag(inode, NODATASUM); - /* * there are lots of better ways to do this, but this code * makes sure the first and last page in the file range are diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0a28b7706314..e64a4fe19a60 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -771,6 +771,13 @@ static noinline int cow_file_range(struct inode *inode, ram_size, cur_alloc_size, 0); BUG_ON(ret); + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, start, + cur_alloc_size); + BUG_ON(ret); + } + if (disk_num_bytes < cur_alloc_size) { printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes, cur_alloc_size); @@ -910,6 +917,26 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, return 0; } +static int noinline csum_exist_in_range(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + int ret; + struct btrfs_ordered_sum *sums; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_range(root, bytenr, bytenr + num_bytes - 1, + &list); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + return 1; +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -971,6 +998,7 @@ next_slot: nocow = 0; disk_bytenr = 0; + num_bytes = 0; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid > inode->i_ino || @@ -996,19 +1024,29 @@ next_slot: path->slots[0]++; goto next_slot; } + if (disk_bytenr == 0) + goto out_check; if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out_check; - if (disk_bytenr == 0) - goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; - if (btrfs_cross_ref_exist(trans, root, disk_bytenr)) - goto out_check; if (btrfs_extent_readonly(root, disk_bytenr)) goto out_check; + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + disk_bytenr)) + goto out_check; disk_bytenr += btrfs_file_extent_offset(leaf, fi); + disk_bytenr += cur_offset - found_key.offset; + num_bytes = min(end + 1, extent_end) - cur_offset; + /* + * force cow if csum exists in the range. + * this ensure that csum for a given extent are + * either valid or do not exist. + */ + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out_check; nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1041,8 +1079,6 @@ out_check: cow_start = (u64)-1; } - disk_bytenr += cur_offset - found_key.offset; - num_bytes = min(end + 1, extent_end) - cur_offset; if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { struct extent_map *em; struct extent_map_tree *em_tree; @@ -1105,11 +1141,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - if (btrfs_test_opt(root, NODATACOW) || - btrfs_test_flag(inode, NODATACOW)) + if (btrfs_test_flag(inode, NODATACOW)) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); else if (btrfs_test_flag(inode, PREALLOC)) @@ -1252,8 +1286,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); - skip_sum = btrfs_test_opt(root, NODATASUM) || - btrfs_test_flag(inode, NODATASUM); + skip_sum = btrfs_test_flag(inode, NODATASUM); if (!(rw & (1 << BIO_RW))) { if (bio_flags & EXTENT_BIO_COMPRESSED) { @@ -1263,6 +1296,9 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, btrfs_lookup_bio_sums(root, inode, bio, NULL); goto mapit; } else if (!skip_sum) { + /* csum items have already been cloned */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + goto mapit; /* we're doing a write, do the async checksumming */ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, @@ -1692,10 +1728,16 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, ClearPageChecked(page); goto good; } - if (btrfs_test_opt(root, NODATASUM) || - btrfs_test_flag(inode, NODATASUM)) + if (btrfs_test_flag(inode, NODATASUM)) return 0; + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; + } + if (state && state->start == start) { private = state->private; ret = 0; @@ -3391,6 +3433,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, owner = 1; BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_hint, owner); + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + btrfs_set_flag(inode, NODATASUM); + if (btrfs_test_opt(root, NODATACOW)) + btrfs_set_flag(inode, NODATACOW); + } key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);