2018-04-03 20:23:33 +03:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2007-06-12 17:07:21 +04:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
|
|
*/
|
|
|
|
|
2007-03-16 02:03:33 +03:00
|
|
|
#include "ctree.h"
|
|
|
|
#include "disk-io.h"
|
2007-03-16 23:20:31 +03:00
|
|
|
#include "transaction.h"
|
2010-08-06 21:21:20 +04:00
|
|
|
#include "print-tree.h"
|
2007-03-16 02:03:33 +03:00
|
|
|
|
2019-08-27 14:46:28 +03:00
|
|
|
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
|
|
|
|
int slot, const char *name,
|
|
|
|
int name_len)
|
2007-12-12 22:38:19 +03:00
|
|
|
{
|
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
unsigned long ptr;
|
|
|
|
unsigned long name_ptr;
|
|
|
|
u32 item_size;
|
|
|
|
u32 cur_offset = 0;
|
|
|
|
int len;
|
|
|
|
|
Btrfs: fix log replay failure after unlink and link combination
If we have a file with 2 (or more) hard links in the same directory,
remove one of the hard links, create a new file (or link an existing file)
in the same directory with the name of the removed hard link, and then
finally fsync the new file, we end up with a log that fails to replay,
causing a mount failure.
Example:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/testdir
$ touch /mnt/testdir/foo
$ ln /mnt/testdir/foo /mnt/testdir/bar
$ sync
$ unlink /mnt/testdir/bar
$ touch /mnt/testdir/bar
$ xfs_io -c "fsync" /mnt/testdir/bar
<power failure>
$ mount /dev/sdb /mnt
mount: mount(2) failed: /mnt: No such file or directory
When replaying the log, for that example, we also see the following in
dmesg/syslog:
[71813.671307] BTRFS info (device dm-0): failed to delete reference to bar, inode 258 parent 257
[71813.674204] ------------[ cut here ]------------
[71813.675694] BTRFS: Transaction aborted (error -2)
[71813.677236] WARNING: CPU: 1 PID: 13231 at fs/btrfs/inode.c:4128 __btrfs_unlink_inode+0x17b/0x355 [btrfs]
[71813.679669] Modules linked in: btrfs xfs f2fs dm_flakey dm_mod dax ghash_clmulni_intel ppdev pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper evdev psmouse i2c_piix4 parport_pc i2c_core pcspkr sg serio_raw parport button sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod ata_generic sd_mod virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel floppy virtio e1000 scsi_mod [last unloaded: btrfs]
[71813.679669] CPU: 1 PID: 13231 Comm: mount Tainted: G W 4.15.0-rc9-btrfs-next-56+ #1
[71813.679669] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
[71813.679669] RIP: 0010:__btrfs_unlink_inode+0x17b/0x355 [btrfs]
[71813.679669] RSP: 0018:ffffc90001cef738 EFLAGS: 00010286
[71813.679669] RAX: 0000000000000025 RBX: ffff880217ce4708 RCX: 0000000000000001
[71813.679669] RDX: 0000000000000000 RSI: ffffffff81c14bae RDI: 00000000ffffffff
[71813.679669] RBP: ffffc90001cef7c0 R08: 0000000000000001 R09: 0000000000000001
[71813.679669] R10: ffffc90001cef5e0 R11: ffffffff8343f007 R12: ffff880217d474c8
[71813.679669] R13: 00000000fffffffe R14: ffff88021ccf1548 R15: 0000000000000101
[71813.679669] FS: 00007f7cee84c480(0000) GS:ffff88023fc80000(0000) knlGS:0000000000000000
[71813.679669] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[71813.679669] CR2: 00007f7cedc1abf9 CR3: 00000002354b4003 CR4: 00000000001606e0
[71813.679669] Call Trace:
[71813.679669] btrfs_unlink_inode+0x17/0x41 [btrfs]
[71813.679669] drop_one_dir_item+0xfa/0x131 [btrfs]
[71813.679669] add_inode_ref+0x71e/0x851 [btrfs]
[71813.679669] ? __lock_is_held+0x39/0x71
[71813.679669] ? replay_one_buffer+0x53/0x53a [btrfs]
[71813.679669] replay_one_buffer+0x4a4/0x53a [btrfs]
[71813.679669] ? rcu_read_unlock+0x3a/0x57
[71813.679669] ? __lock_is_held+0x39/0x71
[71813.679669] walk_up_log_tree+0x101/0x1d2 [btrfs]
[71813.679669] walk_log_tree+0xad/0x188 [btrfs]
[71813.679669] btrfs_recover_log_trees+0x1fa/0x31e [btrfs]
[71813.679669] ? replay_one_extent+0x544/0x544 [btrfs]
[71813.679669] open_ctree+0x1cf6/0x2209 [btrfs]
[71813.679669] btrfs_mount_root+0x368/0x482 [btrfs]
[71813.679669] ? trace_hardirqs_on_caller+0x14c/0x1a6
[71813.679669] ? __lockdep_init_map+0x176/0x1c2
[71813.679669] ? mount_fs+0x64/0x10b
[71813.679669] mount_fs+0x64/0x10b
[71813.679669] vfs_kern_mount+0x68/0xce
[71813.679669] btrfs_mount+0x13e/0x772 [btrfs]
[71813.679669] ? trace_hardirqs_on_caller+0x14c/0x1a6
[71813.679669] ? __lockdep_init_map+0x176/0x1c2
[71813.679669] ? mount_fs+0x64/0x10b
[71813.679669] mount_fs+0x64/0x10b
[71813.679669] vfs_kern_mount+0x68/0xce
[71813.679669] do_mount+0x6e5/0x973
[71813.679669] ? memdup_user+0x3e/0x5c
[71813.679669] SyS_mount+0x72/0x98
[71813.679669] entry_SYSCALL_64_fastpath+0x1e/0x8b
[71813.679669] RIP: 0033:0x7f7cedf150ba
[71813.679669] RSP: 002b:00007ffca71da688 EFLAGS: 00000206
[71813.679669] Code: 7f a0 e8 51 0c fd ff 48 8b 43 50 f0 0f ba a8 30 2c 00 00 02 72 17 41 83 fd fb 74 11 44 89 ee 48 c7 c7 7d 11 7f a0 e8 38 f5 8d e0 <0f> ff 44 89 e9 ba 20 10 00 00 eb 4d 48 8b 4d b0 48 8b 75 88 4c
[71813.679669] ---[ end trace 83bd473fc5b4663b ]---
[71813.854764] BTRFS: error (device dm-0) in __btrfs_unlink_inode:4128: errno=-2 No such entry
[71813.886994] BTRFS: error (device dm-0) in btrfs_replay_log:2307: errno=-2 No such entry (Failed to recover log tree)
[71813.903357] BTRFS error (device dm-0): cleaner transaction attach returned -30
[71814.128078] BTRFS error (device dm-0): open_ctree failed
This happens because the log has inode reference items for both inode 258
(the first file we created) and inode 259 (the second file created), and
when processing the reference item for inode 258, we replace the
corresponding item in the subvolume tree (which has two names, "foo" and
"bar") witht he one in the log (which only has one name, "foo") without
removing the corresponding dir index keys from the parent directory.
Later, when processing the inode reference item for inode 259, which has
a name of "bar" associated to it, we notice that dir index entries exist
for that name and for a different inode, so we attempt to unlink that
name, which fails because the inode reference item for inode 258 no longer
has the name "bar" associated to it, making a call to btrfs_unlink_inode()
fail with a -ENOENT error.
Fix this by unlinking all the names in an inode reference item from a
subvolume tree that are not present in the inode reference item found in
the log tree, before overwriting it with the item from the log tree.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-02-28 18:56:10 +03:00
|
|
|
item_size = btrfs_item_size_nr(leaf, slot);
|
|
|
|
ptr = btrfs_item_ptr_offset(leaf, slot);
|
2007-12-12 22:38:19 +03:00
|
|
|
while (cur_offset < item_size) {
|
|
|
|
ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
|
|
|
|
len = btrfs_inode_ref_name_len(leaf, ref);
|
|
|
|
name_ptr = (unsigned long)(ref + 1);
|
|
|
|
cur_offset += len + sizeof(*ref);
|
|
|
|
if (len != name_len)
|
|
|
|
continue;
|
2019-08-27 14:46:28 +03:00
|
|
|
if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
|
|
|
|
return ref;
|
2007-12-12 22:38:19 +03:00
|
|
|
}
|
2019-08-27 14:46:28 +03:00
|
|
|
return NULL;
|
2007-12-12 22:38:19 +03:00
|
|
|
}
|
|
|
|
|
2019-08-27 14:46:29 +03:00
|
|
|
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
|
|
|
|
struct extent_buffer *leaf, int slot, u64 ref_objectid,
|
|
|
|
const char *name, int name_len)
|
2012-08-08 22:32:27 +04:00
|
|
|
{
|
|
|
|
struct btrfs_inode_extref *extref;
|
|
|
|
unsigned long ptr;
|
|
|
|
unsigned long name_ptr;
|
|
|
|
u32 item_size;
|
|
|
|
u32 cur_offset = 0;
|
|
|
|
int ref_name_len;
|
|
|
|
|
Btrfs: fix log replay failure after unlink and link combination
If we have a file with 2 (or more) hard links in the same directory,
remove one of the hard links, create a new file (or link an existing file)
in the same directory with the name of the removed hard link, and then
finally fsync the new file, we end up with a log that fails to replay,
causing a mount failure.
Example:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/testdir
$ touch /mnt/testdir/foo
$ ln /mnt/testdir/foo /mnt/testdir/bar
$ sync
$ unlink /mnt/testdir/bar
$ touch /mnt/testdir/bar
$ xfs_io -c "fsync" /mnt/testdir/bar
<power failure>
$ mount /dev/sdb /mnt
mount: mount(2) failed: /mnt: No such file or directory
When replaying the log, for that example, we also see the following in
dmesg/syslog:
[71813.671307] BTRFS info (device dm-0): failed to delete reference to bar, inode 258 parent 257
[71813.674204] ------------[ cut here ]------------
[71813.675694] BTRFS: Transaction aborted (error -2)
[71813.677236] WARNING: CPU: 1 PID: 13231 at fs/btrfs/inode.c:4128 __btrfs_unlink_inode+0x17b/0x355 [btrfs]
[71813.679669] Modules linked in: btrfs xfs f2fs dm_flakey dm_mod dax ghash_clmulni_intel ppdev pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper evdev psmouse i2c_piix4 parport_pc i2c_core pcspkr sg serio_raw parport button sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod ata_generic sd_mod virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel floppy virtio e1000 scsi_mod [last unloaded: btrfs]
[71813.679669] CPU: 1 PID: 13231 Comm: mount Tainted: G W 4.15.0-rc9-btrfs-next-56+ #1
[71813.679669] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
[71813.679669] RIP: 0010:__btrfs_unlink_inode+0x17b/0x355 [btrfs]
[71813.679669] RSP: 0018:ffffc90001cef738 EFLAGS: 00010286
[71813.679669] RAX: 0000000000000025 RBX: ffff880217ce4708 RCX: 0000000000000001
[71813.679669] RDX: 0000000000000000 RSI: ffffffff81c14bae RDI: 00000000ffffffff
[71813.679669] RBP: ffffc90001cef7c0 R08: 0000000000000001 R09: 0000000000000001
[71813.679669] R10: ffffc90001cef5e0 R11: ffffffff8343f007 R12: ffff880217d474c8
[71813.679669] R13: 00000000fffffffe R14: ffff88021ccf1548 R15: 0000000000000101
[71813.679669] FS: 00007f7cee84c480(0000) GS:ffff88023fc80000(0000) knlGS:0000000000000000
[71813.679669] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[71813.679669] CR2: 00007f7cedc1abf9 CR3: 00000002354b4003 CR4: 00000000001606e0
[71813.679669] Call Trace:
[71813.679669] btrfs_unlink_inode+0x17/0x41 [btrfs]
[71813.679669] drop_one_dir_item+0xfa/0x131 [btrfs]
[71813.679669] add_inode_ref+0x71e/0x851 [btrfs]
[71813.679669] ? __lock_is_held+0x39/0x71
[71813.679669] ? replay_one_buffer+0x53/0x53a [btrfs]
[71813.679669] replay_one_buffer+0x4a4/0x53a [btrfs]
[71813.679669] ? rcu_read_unlock+0x3a/0x57
[71813.679669] ? __lock_is_held+0x39/0x71
[71813.679669] walk_up_log_tree+0x101/0x1d2 [btrfs]
[71813.679669] walk_log_tree+0xad/0x188 [btrfs]
[71813.679669] btrfs_recover_log_trees+0x1fa/0x31e [btrfs]
[71813.679669] ? replay_one_extent+0x544/0x544 [btrfs]
[71813.679669] open_ctree+0x1cf6/0x2209 [btrfs]
[71813.679669] btrfs_mount_root+0x368/0x482 [btrfs]
[71813.679669] ? trace_hardirqs_on_caller+0x14c/0x1a6
[71813.679669] ? __lockdep_init_map+0x176/0x1c2
[71813.679669] ? mount_fs+0x64/0x10b
[71813.679669] mount_fs+0x64/0x10b
[71813.679669] vfs_kern_mount+0x68/0xce
[71813.679669] btrfs_mount+0x13e/0x772 [btrfs]
[71813.679669] ? trace_hardirqs_on_caller+0x14c/0x1a6
[71813.679669] ? __lockdep_init_map+0x176/0x1c2
[71813.679669] ? mount_fs+0x64/0x10b
[71813.679669] mount_fs+0x64/0x10b
[71813.679669] vfs_kern_mount+0x68/0xce
[71813.679669] do_mount+0x6e5/0x973
[71813.679669] ? memdup_user+0x3e/0x5c
[71813.679669] SyS_mount+0x72/0x98
[71813.679669] entry_SYSCALL_64_fastpath+0x1e/0x8b
[71813.679669] RIP: 0033:0x7f7cedf150ba
[71813.679669] RSP: 002b:00007ffca71da688 EFLAGS: 00000206
[71813.679669] Code: 7f a0 e8 51 0c fd ff 48 8b 43 50 f0 0f ba a8 30 2c 00 00 02 72 17 41 83 fd fb 74 11 44 89 ee 48 c7 c7 7d 11 7f a0 e8 38 f5 8d e0 <0f> ff 44 89 e9 ba 20 10 00 00 eb 4d 48 8b 4d b0 48 8b 75 88 4c
[71813.679669] ---[ end trace 83bd473fc5b4663b ]---
[71813.854764] BTRFS: error (device dm-0) in __btrfs_unlink_inode:4128: errno=-2 No such entry
[71813.886994] BTRFS: error (device dm-0) in btrfs_replay_log:2307: errno=-2 No such entry (Failed to recover log tree)
[71813.903357] BTRFS error (device dm-0): cleaner transaction attach returned -30
[71814.128078] BTRFS error (device dm-0): open_ctree failed
This happens because the log has inode reference items for both inode 258
(the first file we created) and inode 259 (the second file created), and
when processing the reference item for inode 258, we replace the
corresponding item in the subvolume tree (which has two names, "foo" and
"bar") witht he one in the log (which only has one name, "foo") without
removing the corresponding dir index keys from the parent directory.
Later, when processing the inode reference item for inode 259, which has
a name of "bar" associated to it, we notice that dir index entries exist
for that name and for a different inode, so we attempt to unlink that
name, which fails because the inode reference item for inode 258 no longer
has the name "bar" associated to it, making a call to btrfs_unlink_inode()
fail with a -ENOENT error.
Fix this by unlinking all the names in an inode reference item from a
subvolume tree that are not present in the inode reference item found in
the log tree, before overwriting it with the item from the log tree.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-02-28 18:56:10 +03:00
|
|
|
item_size = btrfs_item_size_nr(leaf, slot);
|
|
|
|
ptr = btrfs_item_ptr_offset(leaf, slot);
|
2012-08-08 22:32:27 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Search all extended backrefs in this item. We're only
|
|
|
|
* looking through any collisions so most of the time this is
|
|
|
|
* just going to compare against one buffer. If all is well,
|
|
|
|
* we'll return success and the inode ref object.
|
|
|
|
*/
|
|
|
|
while (cur_offset < item_size) {
|
|
|
|
extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
|
|
|
|
name_ptr = (unsigned long)(&extref->name);
|
|
|
|
ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
|
|
|
|
|
|
|
|
if (ref_name_len == name_len &&
|
|
|
|
btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
|
2019-08-27 14:46:29 +03:00
|
|
|
(memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0))
|
|
|
|
return extref;
|
2012-08-08 22:32:27 +04:00
|
|
|
|
|
|
|
cur_offset += ref_name_len + sizeof(*extref);
|
|
|
|
}
|
2019-08-27 14:46:29 +03:00
|
|
|
return NULL;
|
2012-08-08 22:32:27 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Returns NULL if no extref found */
|
|
|
|
struct btrfs_inode_extref *
|
|
|
|
btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
const char *name, int name_len,
|
|
|
|
u64 inode_objectid, u64 ref_objectid, int ins_len,
|
|
|
|
int cow)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
|
|
|
key.objectid = inode_objectid;
|
|
|
|
key.type = BTRFS_INODE_EXTREF_KEY;
|
|
|
|
key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
|
|
|
|
if (ret < 0)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
if (ret > 0)
|
|
|
|
return NULL;
|
2019-08-27 14:46:29 +03:00
|
|
|
return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
|
|
|
|
ref_objectid, name, name_len);
|
|
|
|
|
2012-08-08 22:32:27 +04:00
|
|
|
}
|
|
|
|
|
2013-04-26 00:41:01 +04:00
|
|
|
static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
const char *name, int name_len,
|
|
|
|
u64 inode_objectid, u64 ref_objectid,
|
|
|
|
u64 *index)
|
2012-08-08 22:32:27 +04:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_inode_extref *extref;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
int ret;
|
|
|
|
int del_len = name_len + sizeof(*extref);
|
|
|
|
unsigned long ptr;
|
|
|
|
unsigned long item_start;
|
|
|
|
u32 item_size;
|
|
|
|
|
|
|
|
key.objectid = inode_objectid;
|
2014-06-04 20:41:45 +04:00
|
|
|
key.type = BTRFS_INODE_EXTREF_KEY;
|
2012-08-08 22:32:27 +04:00
|
|
|
key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sanity check - did we find the right item for this name?
|
|
|
|
* This should always succeed so error here will make the FS
|
|
|
|
* readonly.
|
|
|
|
*/
|
2019-08-27 14:46:29 +03:00
|
|
|
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
|
|
|
|
ref_objectid, name, name_len);
|
|
|
|
if (!extref) {
|
2016-03-16 11:43:06 +03:00
|
|
|
btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
|
2012-08-08 22:32:27 +04:00
|
|
|
ret = -EROFS;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
|
|
|
|
if (index)
|
|
|
|
*index = btrfs_inode_extref_index(leaf, extref);
|
|
|
|
|
|
|
|
if (del_len == item_size) {
|
|
|
|
/*
|
|
|
|
* Common case only one ref in the item, remove the
|
|
|
|
* whole item.
|
|
|
|
*/
|
|
|
|
ret = btrfs_del_item(trans, root, path);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ptr = (unsigned long)extref;
|
|
|
|
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
|
|
|
|
|
|
|
|
memmove_extent_buffer(leaf, ptr, ptr + del_len,
|
|
|
|
item_size - (ptr + del_len - item_start));
|
|
|
|
|
2019-03-20 16:49:12 +03:00
|
|
|
btrfs_truncate_item(path, item_size - del_len, 1);
|
2012-08-08 22:32:27 +04:00
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
const char *name, int name_len,
|
|
|
|
u64 inode_objectid, u64 ref_objectid, u64 *index)
|
2007-12-12 22:38:19 +03:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
unsigned long ptr;
|
|
|
|
unsigned long item_start;
|
|
|
|
u32 item_size;
|
|
|
|
u32 sub_item_len;
|
|
|
|
int ret;
|
2012-08-08 22:32:27 +04:00
|
|
|
int search_ext_refs = 0;
|
2007-12-12 22:38:19 +03:00
|
|
|
int del_len = name_len + sizeof(*ref);
|
|
|
|
|
|
|
|
key.objectid = inode_objectid;
|
|
|
|
key.offset = ref_objectid;
|
2014-06-04 20:41:45 +04:00
|
|
|
key.type = BTRFS_INODE_REF_KEY;
|
2007-12-12 22:38:19 +03:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
|
|
|
if (ret > 0) {
|
|
|
|
ret = -ENOENT;
|
2012-08-08 22:32:27 +04:00
|
|
|
search_ext_refs = 1;
|
2007-12-12 22:38:19 +03:00
|
|
|
goto out;
|
|
|
|
} else if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
}
|
2019-08-27 14:46:28 +03:00
|
|
|
|
|
|
|
ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name,
|
|
|
|
name_len);
|
|
|
|
if (!ref) {
|
2007-12-12 22:38:19 +03:00
|
|
|
ret = -ENOENT;
|
2012-08-08 22:32:27 +04:00
|
|
|
search_ext_refs = 1;
|
2007-12-12 22:38:19 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
|
2008-07-24 20:12:38 +04:00
|
|
|
|
|
|
|
if (index)
|
|
|
|
*index = btrfs_inode_ref_index(leaf, ref);
|
|
|
|
|
2007-12-12 22:38:19 +03:00
|
|
|
if (del_len == item_size) {
|
|
|
|
ret = btrfs_del_item(trans, root, path);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ptr = (unsigned long)ref;
|
|
|
|
sub_item_len = name_len + sizeof(*ref);
|
|
|
|
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
|
|
|
|
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
|
|
|
|
item_size - (ptr + sub_item_len - item_start));
|
2019-03-20 16:49:12 +03:00
|
|
|
btrfs_truncate_item(path, item_size - sub_item_len, 1);
|
2012-08-08 22:32:27 +04:00
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
if (search_ext_refs) {
|
|
|
|
/*
|
|
|
|
* No refs were found, or we could not find the
|
|
|
|
* name in our ref array. Find and remove the extended
|
|
|
|
* inode ref then.
|
|
|
|
*/
|
|
|
|
return btrfs_del_inode_extref(trans, root, name, name_len,
|
|
|
|
inode_objectid, ref_objectid, index);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
|
|
|
|
*
|
|
|
|
* The caller must have checked against BTRFS_LINK_MAX already.
|
|
|
|
*/
|
|
|
|
static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
const char *name, int name_len,
|
|
|
|
u64 inode_objectid, u64 ref_objectid, u64 index)
|
|
|
|
{
|
|
|
|
struct btrfs_inode_extref *extref;
|
|
|
|
int ret;
|
|
|
|
int ins_len = name_len + sizeof(*extref);
|
|
|
|
unsigned long ptr;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_item *item;
|
|
|
|
|
|
|
|
key.objectid = inode_objectid;
|
|
|
|
key.type = BTRFS_INODE_EXTREF_KEY;
|
|
|
|
key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
ins_len);
|
|
|
|
if (ret == -EEXIST) {
|
Btrfs: fix log replay failure after unlink and link combination
If we have a file with 2 (or more) hard links in the same directory,
remove one of the hard links, create a new file (or link an existing file)
in the same directory with the name of the removed hard link, and then
finally fsync the new file, we end up with a log that fails to replay,
causing a mount failure.
Example:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/testdir
$ touch /mnt/testdir/foo
$ ln /mnt/testdir/foo /mnt/testdir/bar
$ sync
$ unlink /mnt/testdir/bar
$ touch /mnt/testdir/bar
$ xfs_io -c "fsync" /mnt/testdir/bar
<power failure>
$ mount /dev/sdb /mnt
mount: mount(2) failed: /mnt: No such file or directory
When replaying the log, for that example, we also see the following in
dmesg/syslog:
[71813.671307] BTRFS info (device dm-0): failed to delete reference to bar, inode 258 parent 257
[71813.674204] ------------[ cut here ]------------
[71813.675694] BTRFS: Transaction aborted (error -2)
[71813.677236] WARNING: CPU: 1 PID: 13231 at fs/btrfs/inode.c:4128 __btrfs_unlink_inode+0x17b/0x355 [btrfs]
[71813.679669] Modules linked in: btrfs xfs f2fs dm_flakey dm_mod dax ghash_clmulni_intel ppdev pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper evdev psmouse i2c_piix4 parport_pc i2c_core pcspkr sg serio_raw parport button sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod ata_generic sd_mod virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel floppy virtio e1000 scsi_mod [last unloaded: btrfs]
[71813.679669] CPU: 1 PID: 13231 Comm: mount Tainted: G W 4.15.0-rc9-btrfs-next-56+ #1
[71813.679669] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
[71813.679669] RIP: 0010:__btrfs_unlink_inode+0x17b/0x355 [btrfs]
[71813.679669] RSP: 0018:ffffc90001cef738 EFLAGS: 00010286
[71813.679669] RAX: 0000000000000025 RBX: ffff880217ce4708 RCX: 0000000000000001
[71813.679669] RDX: 0000000000000000 RSI: ffffffff81c14bae RDI: 00000000ffffffff
[71813.679669] RBP: ffffc90001cef7c0 R08: 0000000000000001 R09: 0000000000000001
[71813.679669] R10: ffffc90001cef5e0 R11: ffffffff8343f007 R12: ffff880217d474c8
[71813.679669] R13: 00000000fffffffe R14: ffff88021ccf1548 R15: 0000000000000101
[71813.679669] FS: 00007f7cee84c480(0000) GS:ffff88023fc80000(0000) knlGS:0000000000000000
[71813.679669] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[71813.679669] CR2: 00007f7cedc1abf9 CR3: 00000002354b4003 CR4: 00000000001606e0
[71813.679669] Call Trace:
[71813.679669] btrfs_unlink_inode+0x17/0x41 [btrfs]
[71813.679669] drop_one_dir_item+0xfa/0x131 [btrfs]
[71813.679669] add_inode_ref+0x71e/0x851 [btrfs]
[71813.679669] ? __lock_is_held+0x39/0x71
[71813.679669] ? replay_one_buffer+0x53/0x53a [btrfs]
[71813.679669] replay_one_buffer+0x4a4/0x53a [btrfs]
[71813.679669] ? rcu_read_unlock+0x3a/0x57
[71813.679669] ? __lock_is_held+0x39/0x71
[71813.679669] walk_up_log_tree+0x101/0x1d2 [btrfs]
[71813.679669] walk_log_tree+0xad/0x188 [btrfs]
[71813.679669] btrfs_recover_log_trees+0x1fa/0x31e [btrfs]
[71813.679669] ? replay_one_extent+0x544/0x544 [btrfs]
[71813.679669] open_ctree+0x1cf6/0x2209 [btrfs]
[71813.679669] btrfs_mount_root+0x368/0x482 [btrfs]
[71813.679669] ? trace_hardirqs_on_caller+0x14c/0x1a6
[71813.679669] ? __lockdep_init_map+0x176/0x1c2
[71813.679669] ? mount_fs+0x64/0x10b
[71813.679669] mount_fs+0x64/0x10b
[71813.679669] vfs_kern_mount+0x68/0xce
[71813.679669] btrfs_mount+0x13e/0x772 [btrfs]
[71813.679669] ? trace_hardirqs_on_caller+0x14c/0x1a6
[71813.679669] ? __lockdep_init_map+0x176/0x1c2
[71813.679669] ? mount_fs+0x64/0x10b
[71813.679669] mount_fs+0x64/0x10b
[71813.679669] vfs_kern_mount+0x68/0xce
[71813.679669] do_mount+0x6e5/0x973
[71813.679669] ? memdup_user+0x3e/0x5c
[71813.679669] SyS_mount+0x72/0x98
[71813.679669] entry_SYSCALL_64_fastpath+0x1e/0x8b
[71813.679669] RIP: 0033:0x7f7cedf150ba
[71813.679669] RSP: 002b:00007ffca71da688 EFLAGS: 00000206
[71813.679669] Code: 7f a0 e8 51 0c fd ff 48 8b 43 50 f0 0f ba a8 30 2c 00 00 02 72 17 41 83 fd fb 74 11 44 89 ee 48 c7 c7 7d 11 7f a0 e8 38 f5 8d e0 <0f> ff 44 89 e9 ba 20 10 00 00 eb 4d 48 8b 4d b0 48 8b 75 88 4c
[71813.679669] ---[ end trace 83bd473fc5b4663b ]---
[71813.854764] BTRFS: error (device dm-0) in __btrfs_unlink_inode:4128: errno=-2 No such entry
[71813.886994] BTRFS: error (device dm-0) in btrfs_replay_log:2307: errno=-2 No such entry (Failed to recover log tree)
[71813.903357] BTRFS error (device dm-0): cleaner transaction attach returned -30
[71814.128078] BTRFS error (device dm-0): open_ctree failed
This happens because the log has inode reference items for both inode 258
(the first file we created) and inode 259 (the second file created), and
when processing the reference item for inode 258, we replace the
corresponding item in the subvolume tree (which has two names, "foo" and
"bar") witht he one in the log (which only has one name, "foo") without
removing the corresponding dir index keys from the parent directory.
Later, when processing the inode reference item for inode 259, which has
a name of "bar" associated to it, we notice that dir index entries exist
for that name and for a different inode, so we attempt to unlink that
name, which fails because the inode reference item for inode 258 no longer
has the name "bar" associated to it, making a call to btrfs_unlink_inode()
fail with a -ENOENT error.
Fix this by unlinking all the names in an inode reference item from a
subvolume tree that are not present in the inode reference item found in
the log tree, before overwriting it with the item from the log tree.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-02-28 18:56:10 +03:00
|
|
|
if (btrfs_find_name_in_ext_backref(path->nodes[0],
|
|
|
|
path->slots[0],
|
|
|
|
ref_objectid,
|
2019-08-27 14:46:29 +03:00
|
|
|
name, name_len))
|
2012-08-08 22:32:27 +04:00
|
|
|
goto out;
|
|
|
|
|
2019-03-20 16:51:10 +03:00
|
|
|
btrfs_extend_item(path, ins_len);
|
2012-08-08 22:32:27 +04:00
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
2013-09-16 18:58:09 +04:00
|
|
|
item = btrfs_item_nr(path->slots[0]);
|
2012-08-08 22:32:27 +04:00
|
|
|
ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
|
|
|
|
ptr += btrfs_item_size(leaf, item) - ins_len;
|
|
|
|
extref = (struct btrfs_inode_extref *)ptr;
|
|
|
|
|
|
|
|
btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
|
|
|
|
btrfs_set_inode_extref_index(path->nodes[0], extref, index);
|
|
|
|
btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
|
|
|
|
|
|
|
|
ptr = (unsigned long)&extref->name;
|
|
|
|
write_extent_buffer(path->nodes[0], name, ptr, name_len);
|
|
|
|
btrfs_mark_buffer_dirty(path->nodes[0]);
|
|
|
|
|
2007-12-12 22:38:19 +03:00
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-03-12 19:03:00 +04:00
|
|
|
/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
|
2007-12-12 22:38:19 +03:00
|
|
|
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
const char *name, int name_len,
|
2008-07-24 20:12:38 +04:00
|
|
|
u64 inode_objectid, u64 ref_objectid, u64 index)
|
2007-12-12 22:38:19 +03:00
|
|
|
{
|
2016-06-23 01:54:23 +03:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2007-12-12 22:38:19 +03:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
unsigned long ptr;
|
|
|
|
int ret;
|
|
|
|
int ins_len = name_len + sizeof(*ref);
|
|
|
|
|
|
|
|
key.objectid = inode_objectid;
|
|
|
|
key.offset = ref_objectid;
|
2014-06-04 20:41:45 +04:00
|
|
|
key.type = BTRFS_INODE_REF_KEY;
|
2007-12-12 22:38:19 +03:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
Btrfs: fix fsync log replay for inodes with a mix of regular refs and extrefs
If we have an inode with a large number of hard links, some of which may
be extrefs, turn a regular ref into an extref, fsync the inode and then
replay the fsync log (after a crash/reboot), we can endup with an fsync
log that makes the replay code always fail with -EOVERFLOW when processing
the inode's references.
This is easy to reproduce with the test case I made for xfstests. Its steps
are the following:
_scratch_mkfs "-O extref" >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create a test file with 3001 hard links. This number is large enough to
# make btrfs start using extrefs at some point even if the fs has the maximum
# possible leaf/node size (64Kb).
echo "hello world" > $SCRATCH_MNT/foo
for i in `seq 1 3000`; do
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_`printf "%04d" $i`
done
# Make sure all metadata and data are durably persisted.
sync
# Now remove one link, add a new one with a new name, add another new one with
# the same name as the one we just removed and fsync the inode.
rm -f $SCRATCH_MNT/foo_link_0001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_0001
rm -f $SCRATCH_MNT/foo_link_0002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3003
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Simulate a crash/power loss. This makes sure the next mount
# will see an fsync log and will replay that log.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Check that the number of hard links is correct, we are able to remove all
# the hard links and read the file's data. This is just to verify we don't
# get stale file handle errors (due to dangling directory index entries that
# point to inodes that no longer exist).
echo "Link count: $(stat --format=%h $SCRATCH_MNT/foo)"
[ -f $SCRATCH_MNT/foo ] || echo "Link foo is missing"
for ((i = 1; i <= 3003; i++)); do
name=foo_link_`printf "%04d" $i`
if [ $i -eq 2 ]; then
[ -f $SCRATCH_MNT/$name ] && echo "Link $name found"
else
[ -f $SCRATCH_MNT/$name ] || echo "Link $name is missing"
fi
done
rm -f $SCRATCH_MNT/foo_link_*
cat $SCRATCH_MNT/foo
rm -f $SCRATCH_MNT/foo
status=0
exit
The fix is simply to correct the overflow condition when overwriting a
reference item because it was wrong, trying to increase the item in the
fs/subvol tree by an impossible amount. Also ensure that we don't insert
one normal ref and one ext ref for the same dentry - this happened because
processing a dir index entry from the parent in the log happened when
the normal ref item was full, which made the logic insert an extref and
later when the normal ref had enough room, it would be inserted again
when processing the ref item from the child inode in the log.
This issue has been present since the introduction of the extrefs feature
(2012).
A test case for xfstests follows soon. This test only passes if the previous
patch titled "Btrfs: fix fsync when extend references are added to an inode"
is applied too.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-01-14 04:52:25 +03:00
|
|
|
path->skip_release_on_error = 1;
|
2007-12-12 22:38:19 +03:00
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
ins_len);
|
|
|
|
if (ret == -EEXIST) {
|
|
|
|
u32 old_size;
|
2019-08-27 14:46:28 +03:00
|
|
|
ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
|
|
|
|
name, name_len);
|
|
|
|
if (ref)
|
2007-12-12 22:38:19 +03:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
|
2019-03-20 16:51:10 +03:00
|
|
|
btrfs_extend_item(path, ins_len);
|
2007-12-12 22:38:19 +03:00
|
|
|
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_inode_ref);
|
|
|
|
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
|
|
|
|
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
|
2008-07-24 20:12:38 +04:00
|
|
|
btrfs_set_inode_ref_index(path->nodes[0], ref, index);
|
2007-12-12 22:38:19 +03:00
|
|
|
ptr = (unsigned long)(ref + 1);
|
|
|
|
ret = 0;
|
|
|
|
} else if (ret < 0) {
|
Btrfs: fix fsync log replay for inodes with a mix of regular refs and extrefs
If we have an inode with a large number of hard links, some of which may
be extrefs, turn a regular ref into an extref, fsync the inode and then
replay the fsync log (after a crash/reboot), we can endup with an fsync
log that makes the replay code always fail with -EOVERFLOW when processing
the inode's references.
This is easy to reproduce with the test case I made for xfstests. Its steps
are the following:
_scratch_mkfs "-O extref" >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create a test file with 3001 hard links. This number is large enough to
# make btrfs start using extrefs at some point even if the fs has the maximum
# possible leaf/node size (64Kb).
echo "hello world" > $SCRATCH_MNT/foo
for i in `seq 1 3000`; do
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_`printf "%04d" $i`
done
# Make sure all metadata and data are durably persisted.
sync
# Now remove one link, add a new one with a new name, add another new one with
# the same name as the one we just removed and fsync the inode.
rm -f $SCRATCH_MNT/foo_link_0001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_0001
rm -f $SCRATCH_MNT/foo_link_0002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3003
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Simulate a crash/power loss. This makes sure the next mount
# will see an fsync log and will replay that log.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Check that the number of hard links is correct, we are able to remove all
# the hard links and read the file's data. This is just to verify we don't
# get stale file handle errors (due to dangling directory index entries that
# point to inodes that no longer exist).
echo "Link count: $(stat --format=%h $SCRATCH_MNT/foo)"
[ -f $SCRATCH_MNT/foo ] || echo "Link foo is missing"
for ((i = 1; i <= 3003; i++)); do
name=foo_link_`printf "%04d" $i`
if [ $i -eq 2 ]; then
[ -f $SCRATCH_MNT/$name ] && echo "Link $name found"
else
[ -f $SCRATCH_MNT/$name ] || echo "Link $name is missing"
fi
done
rm -f $SCRATCH_MNT/foo_link_*
cat $SCRATCH_MNT/foo
rm -f $SCRATCH_MNT/foo
status=0
exit
The fix is simply to correct the overflow condition when overwriting a
reference item because it was wrong, trying to increase the item in the
fs/subvol tree by an impossible amount. Also ensure that we don't insert
one normal ref and one ext ref for the same dentry - this happened because
processing a dir index entry from the parent in the log happened when
the normal ref item was full, which made the logic insert an extref and
later when the normal ref had enough room, it would be inserted again
when processing the ref item from the child inode in the log.
This issue has been present since the introduction of the extrefs feature
(2012).
A test case for xfstests follows soon. This test only passes if the previous
patch titled "Btrfs: fix fsync when extend references are added to an inode"
is applied too.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-01-14 04:52:25 +03:00
|
|
|
if (ret == -EOVERFLOW) {
|
Btrfs: fix log replay failure after unlink and link combination
If we have a file with 2 (or more) hard links in the same directory,
remove one of the hard links, create a new file (or link an existing file)
in the same directory with the name of the removed hard link, and then
finally fsync the new file, we end up with a log that fails to replay,
causing a mount failure.
Example:
$ mkfs.btrfs -f /dev/sdb
$ mount /dev/sdb /mnt
$ mkdir /mnt/testdir
$ touch /mnt/testdir/foo
$ ln /mnt/testdir/foo /mnt/testdir/bar
$ sync
$ unlink /mnt/testdir/bar
$ touch /mnt/testdir/bar
$ xfs_io -c "fsync" /mnt/testdir/bar
<power failure>
$ mount /dev/sdb /mnt
mount: mount(2) failed: /mnt: No such file or directory
When replaying the log, for that example, we also see the following in
dmesg/syslog:
[71813.671307] BTRFS info (device dm-0): failed to delete reference to bar, inode 258 parent 257
[71813.674204] ------------[ cut here ]------------
[71813.675694] BTRFS: Transaction aborted (error -2)
[71813.677236] WARNING: CPU: 1 PID: 13231 at fs/btrfs/inode.c:4128 __btrfs_unlink_inode+0x17b/0x355 [btrfs]
[71813.679669] Modules linked in: btrfs xfs f2fs dm_flakey dm_mod dax ghash_clmulni_intel ppdev pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper evdev psmouse i2c_piix4 parport_pc i2c_core pcspkr sg serio_raw parport button sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod ata_generic sd_mod virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel floppy virtio e1000 scsi_mod [last unloaded: btrfs]
[71813.679669] CPU: 1 PID: 13231 Comm: mount Tainted: G W 4.15.0-rc9-btrfs-next-56+ #1
[71813.679669] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
[71813.679669] RIP: 0010:__btrfs_unlink_inode+0x17b/0x355 [btrfs]
[71813.679669] RSP: 0018:ffffc90001cef738 EFLAGS: 00010286
[71813.679669] RAX: 0000000000000025 RBX: ffff880217ce4708 RCX: 0000000000000001
[71813.679669] RDX: 0000000000000000 RSI: ffffffff81c14bae RDI: 00000000ffffffff
[71813.679669] RBP: ffffc90001cef7c0 R08: 0000000000000001 R09: 0000000000000001
[71813.679669] R10: ffffc90001cef5e0 R11: ffffffff8343f007 R12: ffff880217d474c8
[71813.679669] R13: 00000000fffffffe R14: ffff88021ccf1548 R15: 0000000000000101
[71813.679669] FS: 00007f7cee84c480(0000) GS:ffff88023fc80000(0000) knlGS:0000000000000000
[71813.679669] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[71813.679669] CR2: 00007f7cedc1abf9 CR3: 00000002354b4003 CR4: 00000000001606e0
[71813.679669] Call Trace:
[71813.679669] btrfs_unlink_inode+0x17/0x41 [btrfs]
[71813.679669] drop_one_dir_item+0xfa/0x131 [btrfs]
[71813.679669] add_inode_ref+0x71e/0x851 [btrfs]
[71813.679669] ? __lock_is_held+0x39/0x71
[71813.679669] ? replay_one_buffer+0x53/0x53a [btrfs]
[71813.679669] replay_one_buffer+0x4a4/0x53a [btrfs]
[71813.679669] ? rcu_read_unlock+0x3a/0x57
[71813.679669] ? __lock_is_held+0x39/0x71
[71813.679669] walk_up_log_tree+0x101/0x1d2 [btrfs]
[71813.679669] walk_log_tree+0xad/0x188 [btrfs]
[71813.679669] btrfs_recover_log_trees+0x1fa/0x31e [btrfs]
[71813.679669] ? replay_one_extent+0x544/0x544 [btrfs]
[71813.679669] open_ctree+0x1cf6/0x2209 [btrfs]
[71813.679669] btrfs_mount_root+0x368/0x482 [btrfs]
[71813.679669] ? trace_hardirqs_on_caller+0x14c/0x1a6
[71813.679669] ? __lockdep_init_map+0x176/0x1c2
[71813.679669] ? mount_fs+0x64/0x10b
[71813.679669] mount_fs+0x64/0x10b
[71813.679669] vfs_kern_mount+0x68/0xce
[71813.679669] btrfs_mount+0x13e/0x772 [btrfs]
[71813.679669] ? trace_hardirqs_on_caller+0x14c/0x1a6
[71813.679669] ? __lockdep_init_map+0x176/0x1c2
[71813.679669] ? mount_fs+0x64/0x10b
[71813.679669] mount_fs+0x64/0x10b
[71813.679669] vfs_kern_mount+0x68/0xce
[71813.679669] do_mount+0x6e5/0x973
[71813.679669] ? memdup_user+0x3e/0x5c
[71813.679669] SyS_mount+0x72/0x98
[71813.679669] entry_SYSCALL_64_fastpath+0x1e/0x8b
[71813.679669] RIP: 0033:0x7f7cedf150ba
[71813.679669] RSP: 002b:00007ffca71da688 EFLAGS: 00000206
[71813.679669] Code: 7f a0 e8 51 0c fd ff 48 8b 43 50 f0 0f ba a8 30 2c 00 00 02 72 17 41 83 fd fb 74 11 44 89 ee 48 c7 c7 7d 11 7f a0 e8 38 f5 8d e0 <0f> ff 44 89 e9 ba 20 10 00 00 eb 4d 48 8b 4d b0 48 8b 75 88 4c
[71813.679669] ---[ end trace 83bd473fc5b4663b ]---
[71813.854764] BTRFS: error (device dm-0) in __btrfs_unlink_inode:4128: errno=-2 No such entry
[71813.886994] BTRFS: error (device dm-0) in btrfs_replay_log:2307: errno=-2 No such entry (Failed to recover log tree)
[71813.903357] BTRFS error (device dm-0): cleaner transaction attach returned -30
[71814.128078] BTRFS error (device dm-0): open_ctree failed
This happens because the log has inode reference items for both inode 258
(the first file we created) and inode 259 (the second file created), and
when processing the reference item for inode 258, we replace the
corresponding item in the subvolume tree (which has two names, "foo" and
"bar") witht he one in the log (which only has one name, "foo") without
removing the corresponding dir index keys from the parent directory.
Later, when processing the inode reference item for inode 259, which has
a name of "bar" associated to it, we notice that dir index entries exist
for that name and for a different inode, so we attempt to unlink that
name, which fails because the inode reference item for inode 258 no longer
has the name "bar" associated to it, making a call to btrfs_unlink_inode()
fail with a -ENOENT error.
Fix this by unlinking all the names in an inode reference item from a
subvolume tree that are not present in the inode reference item found in
the log tree, before overwriting it with the item from the log tree.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-02-28 18:56:10 +03:00
|
|
|
if (btrfs_find_name_in_backref(path->nodes[0],
|
|
|
|
path->slots[0],
|
2019-08-27 14:46:28 +03:00
|
|
|
name, name_len))
|
Btrfs: fix fsync log replay for inodes with a mix of regular refs and extrefs
If we have an inode with a large number of hard links, some of which may
be extrefs, turn a regular ref into an extref, fsync the inode and then
replay the fsync log (after a crash/reboot), we can endup with an fsync
log that makes the replay code always fail with -EOVERFLOW when processing
the inode's references.
This is easy to reproduce with the test case I made for xfstests. Its steps
are the following:
_scratch_mkfs "-O extref" >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create a test file with 3001 hard links. This number is large enough to
# make btrfs start using extrefs at some point even if the fs has the maximum
# possible leaf/node size (64Kb).
echo "hello world" > $SCRATCH_MNT/foo
for i in `seq 1 3000`; do
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_`printf "%04d" $i`
done
# Make sure all metadata and data are durably persisted.
sync
# Now remove one link, add a new one with a new name, add another new one with
# the same name as the one we just removed and fsync the inode.
rm -f $SCRATCH_MNT/foo_link_0001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_0001
rm -f $SCRATCH_MNT/foo_link_0002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3003
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Simulate a crash/power loss. This makes sure the next mount
# will see an fsync log and will replay that log.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Check that the number of hard links is correct, we are able to remove all
# the hard links and read the file's data. This is just to verify we don't
# get stale file handle errors (due to dangling directory index entries that
# point to inodes that no longer exist).
echo "Link count: $(stat --format=%h $SCRATCH_MNT/foo)"
[ -f $SCRATCH_MNT/foo ] || echo "Link foo is missing"
for ((i = 1; i <= 3003; i++)); do
name=foo_link_`printf "%04d" $i`
if [ $i -eq 2 ]; then
[ -f $SCRATCH_MNT/$name ] && echo "Link $name found"
else
[ -f $SCRATCH_MNT/$name ] || echo "Link $name is missing"
fi
done
rm -f $SCRATCH_MNT/foo_link_*
cat $SCRATCH_MNT/foo
rm -f $SCRATCH_MNT/foo
status=0
exit
The fix is simply to correct the overflow condition when overwriting a
reference item because it was wrong, trying to increase the item in the
fs/subvol tree by an impossible amount. Also ensure that we don't insert
one normal ref and one ext ref for the same dentry - this happened because
processing a dir index entry from the parent in the log happened when
the normal ref item was full, which made the logic insert an extref and
later when the normal ref had enough room, it would be inserted again
when processing the ref item from the child inode in the log.
This issue has been present since the introduction of the extrefs feature
(2012).
A test case for xfstests follows soon. This test only passes if the previous
patch titled "Btrfs: fix fsync when extend references are added to an inode"
is applied too.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-01-14 04:52:25 +03:00
|
|
|
ret = -EEXIST;
|
|
|
|
else
|
|
|
|
ret = -EMLINK;
|
|
|
|
}
|
2007-12-12 22:38:19 +03:00
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_inode_ref);
|
|
|
|
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
|
2008-07-24 20:12:38 +04:00
|
|
|
btrfs_set_inode_ref_index(path->nodes[0], ref, index);
|
2007-12-12 22:38:19 +03:00
|
|
|
ptr = (unsigned long)(ref + 1);
|
|
|
|
}
|
|
|
|
write_extent_buffer(path->nodes[0], name, ptr, name_len);
|
|
|
|
btrfs_mark_buffer_dirty(path->nodes[0]);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
2012-08-08 22:32:27 +04:00
|
|
|
|
|
|
|
if (ret == -EMLINK) {
|
2016-06-23 01:54:23 +03:00
|
|
|
struct btrfs_super_block *disk_super = fs_info->super_copy;
|
2012-08-08 22:32:27 +04:00
|
|
|
/* We ran out of space in the ref array. Need to
|
|
|
|
* add an extended ref. */
|
|
|
|
if (btrfs_super_incompat_flags(disk_super)
|
|
|
|
& BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
|
|
|
|
ret = btrfs_insert_inode_extref(trans, root, name,
|
|
|
|
name_len,
|
|
|
|
inode_objectid,
|
|
|
|
ref_objectid, index);
|
|
|
|
}
|
|
|
|
|
2007-12-12 22:38:19 +03:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-10-16 00:14:19 +04:00
|
|
|
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct btrfs_path *path, u64 objectid)
|
2007-03-16 02:03:33 +03:00
|
|
|
{
|
|
|
|
struct btrfs_key key;
|
|
|
|
int ret;
|
|
|
|
key.objectid = objectid;
|
2014-06-04 20:41:45 +04:00
|
|
|
key.type = BTRFS_INODE_ITEM_KEY;
|
2007-03-16 02:03:33 +03:00
|
|
|
key.offset = 0;
|
|
|
|
|
2007-10-16 00:14:19 +04:00
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
sizeof(struct btrfs_inode_item));
|
2007-03-16 02:03:33 +03:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-03-16 23:20:31 +03:00
|
|
|
int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
|
2007-04-06 23:37:36 +04:00
|
|
|
*root, struct btrfs_path *path,
|
|
|
|
struct btrfs_key *location, int mod)
|
2007-03-16 02:03:33 +03:00
|
|
|
{
|
|
|
|
int ins_len = mod < 0 ? -1 : 0;
|
|
|
|
int cow = mod != 0;
|
2007-04-06 23:37:36 +04:00
|
|
|
int ret;
|
|
|
|
int slot;
|
2007-10-16 00:14:19 +04:00
|
|
|
struct extent_buffer *leaf;
|
2007-04-06 23:37:36 +04:00
|
|
|
struct btrfs_key found_key;
|
2007-03-16 02:03:33 +03:00
|
|
|
|
2007-04-06 23:37:36 +04:00
|
|
|
ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
|
2014-06-04 20:41:45 +04:00
|
|
|
if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
|
2007-04-06 23:37:36 +04:00
|
|
|
location->offset == (u64)-1 && path->slots[0] != 0) {
|
|
|
|
slot = path->slots[0] - 1;
|
2007-10-16 00:14:19 +04:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
2007-04-06 23:37:36 +04:00
|
|
|
if (found_key.objectid == location->objectid &&
|
2014-06-04 20:41:45 +04:00
|
|
|
found_key.type == location->type) {
|
2007-04-06 23:37:36 +04:00
|
|
|
path->slots[0]--;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
2007-03-16 02:03:33 +03:00
|
|
|
}
|