From f55985f4dda5cfb6967c17e96237f3c859076eb3 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 9 Feb 2015 21:14:24 +0000
Subject: [PATCH] Btrfs: scrub, fix sleep in atomic context

My previous patch "Btrfs: fix scrub race leading to use-after-free"
introduced the possibility to sleep in an atomic context, which happens
when the scrub_lock mutex is held at the time scrub_pending_bio_dec()
is called - this function can be called under an atomic context.
Chris ran into this in a debug kernel which gave the following trace:

[ 1928.950319] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:621
[ 1928.967334] in_atomic(): 1, irqs_disabled(): 0, pid: 149670, name: fsstress
[ 1928.981324] INFO: lockdep is turned off.
[ 1928.989244] CPU: 24 PID: 149670 Comm: fsstress Tainted: G        W     3.19.0-rc7-mason+ #41
[ 1929.006418] Hardware name: ZTSYSTEMS Echo Ridge T4  /A9DRPF-10D, BIOS 1.07 05/10/2012
[ 1929.022207]  ffffffff81a22cf8 ffff881076e03b78 ffffffff816b8dd9 ffff881076e03b78
[ 1929.037267]  ffff880d8e828710 ffff881076e03ba8 ffffffff810856c4 ffff881076e03bc8
[ 1929.052315]  0000000000000000 000000000000026d ffffffff81a22cf8 ffff881076e03bd8
[ 1929.067381] Call Trace:
[ 1929.072344]  <IRQ>  [<ffffffff816b8dd9>] dump_stack+0x4f/0x6e
[ 1929.083968]  [<ffffffff810856c4>] ___might_sleep+0x174/0x230
[ 1929.095352]  [<ffffffff810857d2>] __might_sleep+0x52/0x90
[ 1929.106223]  [<ffffffff816bb68f>] mutex_lock_nested+0x2f/0x3b0
[ 1929.117951]  [<ffffffff810ab37d>] ? trace_hardirqs_on+0xd/0x10
[ 1929.129708]  [<ffffffffa05dc838>] scrub_pending_bio_dec+0x38/0x70 [btrfs]
[ 1929.143370]  [<ffffffffa05dd0e0>] scrub_parity_bio_endio+0x50/0x70 [btrfs]
[ 1929.157191]  [<ffffffff812fa603>] bio_endio+0x53/0xa0
[ 1929.167382]  [<ffffffffa05f96bc>] rbio_orig_end_io+0x7c/0xa0 [btrfs]
[ 1929.180161]  [<ffffffffa05f97ba>] raid_write_parity_end_io+0x5a/0x80 [btrfs]
[ 1929.194318]  [<ffffffff812fa603>] bio_endio+0x53/0xa0
[ 1929.204496]  [<ffffffff8130401b>] blk_update_request+0x1eb/0x450
[ 1929.216569]  [<ffffffff81096e58>] ? trigger_load_balance+0x78/0x500
[ 1929.229176]  [<ffffffff8144c74d>] scsi_end_request+0x3d/0x1f0
[ 1929.240740]  [<ffffffff8144ccac>] scsi_io_completion+0xac/0x5b0
[ 1929.252654]  [<ffffffff81441c50>] scsi_finish_command+0xf0/0x150
[ 1929.264725]  [<ffffffff8144d317>] scsi_softirq_done+0x147/0x170
[ 1929.276635]  [<ffffffff8130ace6>] blk_done_softirq+0x86/0xa0
[ 1929.288014]  [<ffffffff8105d92e>] __do_softirq+0xde/0x600
[ 1929.298885]  [<ffffffff8105df6d>] irq_exit+0xbd/0xd0
(...)

Fix this by using a reference count on the scrub context structure
instead of locking the scrub_lock mutex.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/scrub.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6d6e155c8c8b..db21f17df996 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -193,6 +193,15 @@ struct scrub_ctx {
 	 */
 	struct btrfs_scrub_progress stat;
 	spinlock_t		stat_lock;
+
+	/*
+	 * Use a ref counter to avoid use-after-free issues. Scrub workers
+	 * decrement bios_in_flight and workers_pending and then do a wakeup
+	 * on the list_wait wait queue. We must ensure the main scrub task
+	 * doesn't free the scrub context before or while the workers are
+	 * doing the wakeup() call.
+	 */
+	atomic_t                refs;
 };
 
 struct scrub_fixup_nodatasum {
@@ -297,26 +306,20 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 static void copy_nocow_pages_worker(struct btrfs_work *work);
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
 
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 {
+	atomic_inc(&sctx->refs);
 	atomic_inc(&sctx->bios_in_flight);
 }
 
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 {
-	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
-
-	/*
-	 * Hold the scrub_lock while doing the wakeup to ensure the
-	 * sctx (and its wait queue list_wait) isn't destroyed/freed
-	 * during the wakeup.
-	 */
-	mutex_lock(&fs_info->scrub_lock);
 	atomic_dec(&sctx->bios_in_flight);
 	wake_up(&sctx->list_wait);
-	mutex_unlock(&fs_info->scrub_lock);
+	scrub_put_ctx(sctx);
 }
 
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
@@ -350,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 {
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 
+	atomic_inc(&sctx->refs);
 	/*
 	 * increment scrubs_running to prevent cancel requests from
 	 * completing as long as a worker is running. we must also
@@ -388,15 +392,11 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 	mutex_lock(&fs_info->scrub_lock);
 	atomic_dec(&fs_info->scrubs_running);
 	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
 	atomic_dec(&sctx->workers_pending);
 	wake_up(&fs_info->scrub_pause_wait);
-	/*
-	 * Hold the scrub_lock while doing the wakeup to ensure the
-	 * sctx (and its wait queue list_wait) isn't destroyed/freed
-	 * during the wakeup.
-	 */
 	wake_up(&sctx->list_wait);
-	mutex_unlock(&fs_info->scrub_lock);
+	scrub_put_ctx(sctx);
 }
 
 static void scrub_free_csums(struct scrub_ctx *sctx)
@@ -442,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 	kfree(sctx);
 }
 
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+	if (atomic_dec_and_test(&sctx->refs))
+		scrub_free_ctx(sctx);
+}
+
 static noinline_for_stack
 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 {
@@ -466,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 	if (!sctx)
 		goto nomem;
+	atomic_set(&sctx->refs, 1);
 	sctx->is_dev_replace = is_dev_replace;
 	sctx->pages_per_rd_bio = pages_per_rd_bio;
 	sctx->curr = -1;
@@ -3739,7 +3746,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	scrub_workers_put(fs_info);
 	mutex_unlock(&fs_info->scrub_lock);
 
-	scrub_free_ctx(sctx);
+	scrub_put_ctx(sctx);
 
 	return ret;
 }