From 313b085851c13ca08320372a05a7047ea25d3dd4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:18:26 -0400 Subject: [PATCH 1/3] btrfs: move btrfs_scratch_superblocks into btrfs_dev_replace_finishing We need to move the closing of the src_device out of all the device replace locking, but we definitely want to zero out the superblock before we commit the last time to make sure the device is properly removed. Handle this by pushing btrfs_scratch_superblocks into btrfs_dev_replace_finishing, and then later on we'll move the src_device closing and freeing stuff where we need it to be. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 3 +++ fs/btrfs/volumes.c | 12 +++--------- fs/btrfs/volumes.h | 3 +++ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index db93909b25e0..7cf48aeb6f14 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -745,6 +745,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* replace the sysfs entry */ btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); btrfs_sysfs_update_devid(tgt_device); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) + btrfs_scratch_superblocks(fs_info, src_device->bdev, + src_device->name->str); btrfs_rm_dev_replace_free_srcdev(src_device); /* write back the superblocks */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 117b43367629..01ebdb69fbdb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1999,9 +1999,9 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } -static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, - struct block_device *bdev, - const char *device_path) +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path) { struct btrfs_super_block *disk_super; int copy_num; @@ -2224,12 +2224,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(fs_info, srcdev->bdev, - srcdev->name->str); - } - btrfs_close_bdev(srcdev); synchronize_rcu(); btrfs_free_device(srcdev); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5eea93916fbf..302c9234f7d0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -573,6 +573,9 @@ void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path); int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); From a466c85edc6fbe845facc8f57c408c544f42899e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:18:27 -0400 Subject: [PATCH 2/3] btrfs: move btrfs_rm_dev_replace_free_srcdev outside of all locks When closing and freeing the source device we could end up doing our final blkdev_put() on the bdev, which will grab the bd_mutex. As such we want to be holding as few locks as possible, so move this call outside of the dev_replace->lock_finishing_cancel_unmount lock. Since we're modifying the fs_devices we need to make sure we're holding the uuid_mutex here, so take that as well. There's a report from syzbot probably hitting one of the cases where the bd_mutex and device_list_mutex are taken in the wrong order, however it's not with device replace, like this patch fixes. As there's no reproducer available so far, we can't verify the fix. https://lore.kernel.org/lkml/000000000000fc04d105afcf86d7@google.com/ dashboard link: https://syzkaller.appspot.com/bug?extid=84a0634dc5d21d488419 WARNING: possible circular locking dependency detected 5.9.0-rc5-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.0/6878 is trying to acquire lock: ffff88804c17d780 (&bdev->bd_mutex){+.+.}-{3:3}, at: blkdev_put+0x30/0x520 fs/block_dev.c:1804 but task is already holding lock: ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #4 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock_common kernel/locking/mutex.c:956 [inline] __mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103 btrfs_finish_chunk_alloc+0x281/0xf90 fs/btrfs/volumes.c:5255 btrfs_create_pending_block_groups+0x2f3/0x700 fs/btrfs/block-group.c:2109 __btrfs_end_transaction+0xf5/0x690 fs/btrfs/transaction.c:916 find_free_extent_update_loop fs/btrfs/extent-tree.c:3807 [inline] find_free_extent+0x23b7/0x2e60 fs/btrfs/extent-tree.c:4127 btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206 cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063 btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838 writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439 __extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653 extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249 extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370 do_writepages+0xec/0x290 mm/page-writeback.c:2352 __writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461 writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721 wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894 wb_do_writeback fs/fs-writeback.c:2039 [inline] wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080 process_one_work+0x94c/0x1670 kernel/workqueue.c:2269 worker_thread+0x64c/0x1120 kernel/workqueue.c:2415 kthread+0x3b5/0x4a0 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 -> #3 (sb_internal#2){.+.+}-{0:0}: percpu_down_read include/linux/percpu-rwsem.h:51 [inline] __sb_start_write+0x234/0x470 fs/super.c:1672 sb_start_intwrite include/linux/fs.h:1690 [inline] start_transaction+0xbe7/0x1170 fs/btrfs/transaction.c:624 find_free_extent_update_loop fs/btrfs/extent-tree.c:3789 [inline] find_free_extent+0x25e1/0x2e60 fs/btrfs/extent-tree.c:4127 btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206 cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063 btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838 writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439 __extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653 extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249 extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370 do_writepages+0xec/0x290 mm/page-writeback.c:2352 __writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461 writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721 wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894 wb_do_writeback fs/fs-writeback.c:2039 [inline] wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080 process_one_work+0x94c/0x1670 kernel/workqueue.c:2269 worker_thread+0x64c/0x1120 kernel/workqueue.c:2415 kthread+0x3b5/0x4a0 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 -> #2 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}: __flush_work+0x60e/0xac0 kernel/workqueue.c:3041 wb_shutdown+0x180/0x220 mm/backing-dev.c:355 bdi_unregister+0x174/0x590 mm/backing-dev.c:872 del_gendisk+0x820/0xa10 block/genhd.c:933 loop_remove drivers/block/loop.c:2192 [inline] loop_control_ioctl drivers/block/loop.c:2291 [inline] loop_control_ioctl+0x3b1/0x480 drivers/block/loop.c:2257 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (loop_ctl_mutex){+.+.}-{3:3}: __mutex_lock_common kernel/locking/mutex.c:956 [inline] __mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103 lo_open+0x19/0xd0 drivers/block/loop.c:1893 __blkdev_get+0x759/0x1aa0 fs/block_dev.c:1507 blkdev_get fs/block_dev.c:1639 [inline] blkdev_open+0x227/0x300 fs/block_dev.c:1753 do_dentry_open+0x4b9/0x11b0 fs/open.c:817 do_open fs/namei.c:3251 [inline] path_openat+0x1b9a/0x2730 fs/namei.c:3368 do_filp_open+0x17e/0x3c0 fs/namei.c:3395 do_sys_openat2+0x16d/0x420 fs/open.c:1168 do_sys_open fs/open.c:1184 [inline] __do_sys_open fs/open.c:1192 [inline] __se_sys_open fs/open.c:1188 [inline] __x64_sys_open+0x119/0x1c0 fs/open.c:1188 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&bdev->bd_mutex){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:2496 [inline] check_prevs_add kernel/locking/lockdep.c:2601 [inline] validate_chain kernel/locking/lockdep.c:3218 [inline] __lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426 lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006 __mutex_lock_common kernel/locking/mutex.c:956 [inline] __mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103 blkdev_put+0x30/0x520 fs/block_dev.c:1804 btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline] btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline] btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline] close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161 close_fs_devices fs/btrfs/volumes.c:1193 [inline] btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179 close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149 generic_shutdown_super+0x144/0x370 fs/super.c:464 kill_anon_super+0x36/0x60 fs/super.c:1108 btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265 deactivate_locked_super+0x94/0x160 fs/super.c:335 deactivate_super+0xad/0xd0 fs/super.c:366 cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118 task_work_run+0xdd/0x190 kernel/task_work.c:141 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_user_mode_loop kernel/entry/common.c:163 [inline] exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190 syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Chain exists of: &bdev->bd_mutex --> sb_internal#2 --> &fs_devs->device_list_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_devs->device_list_mutex); lock(sb_internal#2); lock(&fs_devs->device_list_mutex); lock(&bdev->bd_mutex); *** DEADLOCK *** 3 locks held by syz-executor.0/6878: #0: ffff88809070c0e0 (&type->s_umount_key#70){++++}-{3:3}, at: deactivate_super+0xa5/0xd0 fs/super.c:365 #1: ffffffff8a5b37a8 (uuid_mutex){+.+.}-{3:3}, at: btrfs_close_devices+0x23/0x1f0 fs/btrfs/volumes.c:1178 #2: ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159 stack backtrace: CPU: 0 PID: 6878 Comm: syz-executor.0 Not tainted 5.9.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x198/0x1fd lib/dump_stack.c:118 check_noncircular+0x324/0x3e0 kernel/locking/lockdep.c:1827 check_prev_add kernel/locking/lockdep.c:2496 [inline] check_prevs_add kernel/locking/lockdep.c:2601 [inline] validate_chain kernel/locking/lockdep.c:3218 [inline] __lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426 lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006 __mutex_lock_common kernel/locking/mutex.c:956 [inline] __mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103 blkdev_put+0x30/0x520 fs/block_dev.c:1804 btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline] btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline] btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline] close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161 close_fs_devices fs/btrfs/volumes.c:1193 [inline] btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179 close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149 generic_shutdown_super+0x144/0x370 fs/super.c:464 kill_anon_super+0x36/0x60 fs/super.c:1108 btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265 deactivate_locked_super+0x94/0x160 fs/super.c:335 deactivate_super+0xad/0xd0 fs/super.c:366 cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118 task_work_run+0xdd/0x190 kernel/task_work.c:141 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_user_mode_loop kernel/entry/common.c:163 [inline] exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190 syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x460027 RSP: 002b:00007fff59216328 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 0000000000076035 RCX: 0000000000460027 RDX: 0000000000403188 RSI: 0000000000000002 RDI: 00007fff592163d0 RBP: 0000000000000333 R08: 0000000000000000 R09: 000000000000000b R10: 0000000000000005 R11: 0000000000000246 R12: 00007fff59217460 R13: 0000000002df2a60 R14: 0000000000000000 R15: 00007fff59217460 Signed-off-by: Josef Bacik [ add syzbot reference ] Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 3 ++- fs/btrfs/volumes.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7cf48aeb6f14..62aece1bb8ec 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -748,7 +748,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) btrfs_scratch_superblocks(fs_info, src_device->bdev, src_device->name->str); - btrfs_rm_dev_replace_free_srcdev(src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -757,6 +756,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_rm_dev_replace_free_srcdev(src_device); + return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 01ebdb69fbdb..1997a7d67f22 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2224,6 +2224,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + mutex_lock(&uuid_mutex); + btrfs_close_bdev(srcdev); synchronize_rcu(); btrfs_free_device(srcdev); @@ -2252,6 +2254,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) close_fs_devices(fs_devices); free_fs_devices(fs_devices); } + mutex_unlock(&uuid_mutex); } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) From 4c8f353272dd1262013873990c0fafd0e3c8f274 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 23 Sep 2020 15:30:16 +0100 Subject: [PATCH 3/3] btrfs: fix filesystem corruption after a device replace We use a device's allocation state tree to track ranges in a device used for allocated chunks, and we set ranges in this tree when allocating a new chunk. However after a device replace operation, we were not setting the allocated ranges in the new device's allocation state tree, so that tree is empty after a device replace. This means that a fitrim operation after a device replace will trim the device ranges that have allocated chunks and extents, as we trim every range for which there is not a range marked in the device's allocation state tree. It is also important during chunk allocation, since the device's allocation state is used to determine if a range is already allocated when allocating a new chunk. This is trivial to reproduce and the following script triggers the bug: $ cat reproducer.sh #!/bin/bash DEV1="/dev/sdg" DEV2="/dev/sdh" DEV3="/dev/sdi" wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null # Create a raid1 test fs on 2 devices. mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null mount $DEV1 /mnt/btrfs xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo echo "Starting to replace $DEV1 with $DEV3" btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs echo echo "Running fstrim" fstrim /mnt/btrfs echo echo "Unmounting filesystem" umount /mnt/btrfs echo "Mounting filesystem in degraded mode using $DEV3 only" wipefs -a $DEV1 $DEV2 &> /dev/null mount -o degraded $DEV3 /mnt/btrfs if [ $? -ne 0 ]; then dmesg | tail echo echo "Failed to mount in degraded mode" exit 1 fi echo echo "File foo data (expected all bytes = 0xab):" od -A d -t x1 /mnt/btrfs/foo umount /mnt/btrfs When running the reproducer: $ ./replace-test.sh wrote 10485760/10485760 bytes at offset 0 10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec) Starting to replace /dev/sdg with /dev/sdi Running fstrim Unmounting filesystem Mounting filesystem in degraded mode using /dev/sdi only mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error. [19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started [19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished [19582.208293] BTRFS info (device sdi): allowing degraded mounts [19582.208298] BTRFS info (device sdi): disk space caching is enabled [19582.208301] BTRFS info (device sdi): has skinny extents [19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing [19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed [19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0 [19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5 [19582.231576] BTRFS error (device sdi): open_ctree failed Failed to mount in degraded mode So fix by setting all allocated ranges in the replace target device when the replace operation is finishing, when we are holding the chunk mutex and we can not race with new chunk allocations. A test case for fstests follows soon. Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree") CC: stable@vger.kernel.org # 5.2+ Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 62aece1bb8ec..e4a1c6afe35d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -599,6 +599,37 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) wake_up(&fs_info->dev_replace.replace_wait); } +/* + * When finishing the device replace, before swapping the source device with the + * target device we must update the chunk allocation state in the target device, + * as it is empty because replace works by directly copying the chunks and not + * through the normal chunk allocation path. + */ +static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 found_start; + u64 found_end; + int ret = 0; + + lockdep_assert_held(&srcdev->fs_info->chunk_mutex); + + while (!find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = set_extent_bits(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED); + if (ret) + break; + start = found_end + 1; + } + + free_extent_state(cached_state); + return ret; +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -673,8 +704,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; - /* replace old device with new one in mapping tree */ + /* + * Update allocation state in the new device and replace the old device + * with the new one in the mapping tree. + */ if (!scrub_ret) { + scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device); + if (scrub_ret) + goto error; btrfs_dev_replace_update_device_in_mapping_tree(fs_info, src_device, tgt_device); @@ -685,6 +722,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); +error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex);