From f55985f4dda5cfb6967c17e96237f3c859076eb3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 Feb 2015 21:14:24 +0000 Subject: [PATCH] Btrfs: scrub, fix sleep in atomic context My previous patch "Btrfs: fix scrub race leading to use-after-free" introduced the possibility to sleep in an atomic context, which happens when the scrub_lock mutex is held at the time scrub_pending_bio_dec() is called - this function can be called under an atomic context. Chris ran into this in a debug kernel which gave the following trace: [ 1928.950319] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:621 [ 1928.967334] in_atomic(): 1, irqs_disabled(): 0, pid: 149670, name: fsstress [ 1928.981324] INFO: lockdep is turned off. [ 1928.989244] CPU: 24 PID: 149670 Comm: fsstress Tainted: G W 3.19.0-rc7-mason+ #41 [ 1929.006418] Hardware name: ZTSYSTEMS Echo Ridge T4 /A9DRPF-10D, BIOS 1.07 05/10/2012 [ 1929.022207] ffffffff81a22cf8 ffff881076e03b78 ffffffff816b8dd9 ffff881076e03b78 [ 1929.037267] ffff880d8e828710 ffff881076e03ba8 ffffffff810856c4 ffff881076e03bc8 [ 1929.052315] 0000000000000000 000000000000026d ffffffff81a22cf8 ffff881076e03bd8 [ 1929.067381] Call Trace: [ 1929.072344] [] dump_stack+0x4f/0x6e [ 1929.083968] [] ___might_sleep+0x174/0x230 [ 1929.095352] [] __might_sleep+0x52/0x90 [ 1929.106223] [] mutex_lock_nested+0x2f/0x3b0 [ 1929.117951] [] ? trace_hardirqs_on+0xd/0x10 [ 1929.129708] [] scrub_pending_bio_dec+0x38/0x70 [btrfs] [ 1929.143370] [] scrub_parity_bio_endio+0x50/0x70 [btrfs] [ 1929.157191] [] bio_endio+0x53/0xa0 [ 1929.167382] [] rbio_orig_end_io+0x7c/0xa0 [btrfs] [ 1929.180161] [] raid_write_parity_end_io+0x5a/0x80 [btrfs] [ 1929.194318] [] bio_endio+0x53/0xa0 [ 1929.204496] [] blk_update_request+0x1eb/0x450 [ 1929.216569] [] ? trigger_load_balance+0x78/0x500 [ 1929.229176] [] scsi_end_request+0x3d/0x1f0 [ 1929.240740] [] scsi_io_completion+0xac/0x5b0 [ 1929.252654] [] scsi_finish_command+0xf0/0x150 [ 1929.264725] [] scsi_softirq_done+0x147/0x170 [ 1929.276635] [] blk_done_softirq+0x86/0xa0 [ 1929.288014] [] __do_softirq+0xde/0x600 [ 1929.298885] [] irq_exit+0xbd/0xd0 (...) Fix this by using a reference count on the scrub context structure instead of locking the scrub_lock mutex. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6d6e155c8c8b..db21f17df996 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -193,6 +193,15 @@ struct scrub_ctx { */ struct btrfs_scrub_progress stat; spinlock_t stat_lock; + + /* + * Use a ref counter to avoid use-after-free issues. Scrub workers + * decrement bios_in_flight and workers_pending and then do a wakeup + * on the list_wait wait queue. We must ensure the main scrub task + * doesn't free the scrub context before or while the workers are + * doing the wakeup() call. + */ + atomic_t refs; }; struct scrub_fixup_nodatasum { @@ -297,26 +306,20 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, static void copy_nocow_pages_worker(struct btrfs_work *work); static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { + atomic_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } static void scrub_pending_bio_dec(struct scrub_ctx *sctx) { - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; - - /* - * Hold the scrub_lock while doing the wakeup to ensure the - * sctx (and its wait queue list_wait) isn't destroyed/freed - * during the wakeup. - */ - mutex_lock(&fs_info->scrub_lock); atomic_dec(&sctx->bios_in_flight); wake_up(&sctx->list_wait); - mutex_unlock(&fs_info->scrub_lock); + scrub_put_ctx(sctx); } static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) @@ -350,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + atomic_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running. we must also @@ -388,15 +392,11 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) mutex_lock(&fs_info->scrub_lock); atomic_dec(&fs_info->scrubs_running); atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); atomic_dec(&sctx->workers_pending); wake_up(&fs_info->scrub_pause_wait); - /* - * Hold the scrub_lock while doing the wakeup to ensure the - * sctx (and its wait queue list_wait) isn't destroyed/freed - * during the wakeup. - */ wake_up(&sctx->list_wait); - mutex_unlock(&fs_info->scrub_lock); + scrub_put_ctx(sctx); } static void scrub_free_csums(struct scrub_ctx *sctx) @@ -442,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) kfree(sctx); } +static void scrub_put_ctx(struct scrub_ctx *sctx) +{ + if (atomic_dec_and_test(&sctx->refs)) + scrub_free_ctx(sctx); +} + static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { @@ -466,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; + atomic_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = pages_per_rd_bio; sctx->curr = -1; @@ -3739,7 +3746,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); - scrub_free_ctx(sctx); + scrub_put_ctx(sctx); return ret; }