From 78c931b8be75456562b55ed4e27878f1519e1367 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 28 Nov 2014 13:59:58 +1100 Subject: [PATCH 1/5] xfs: replace global xfslogd wq with per-mount wq The xfslogd workqueue is a global, single-job workqueue for buffer ioend processing. This means we allow for a single work item at a time for all possible XFS mounts on a system. fsstress testing in loopback XFS over XFS configurations has reproduced xfslogd deadlocks due to the single threaded nature of the queue and dependencies introduced between the separate XFS instances by online discard (-o discard). Discard over a loopback device converts the discard request to a hole punch (fallocate) on the underlying file. Online discard requests are issued synchronously and from xfslogd context in XFS, hence the xfslogd workqueue is blocked in the upper fs waiting on a hole punch request to be servied in the lower fs. If the lower fs issues I/O that depends on xfslogd to complete, both filesystems end up hung indefinitely. This is reproduced reliabily by generic/013 on XFS->loop->XFS test devices with the '-o discard' mount option. Further, docker implementations appear to use this kind of configuration for container instance filesystems by default (container fs->dm-> loop->base fs) and therefore are subject to this deadlock when running on XFS. Replace the global xfslogd workqueue with a per-mount variant. This guarantees each mount access to a single worker and prevents deadlocks due to inter-fs dependencies introduced by discard. Since the queue is only responsible for buffer iodone processing at this point in time, rename xfslogd to xfs-buf. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 12 +----------- fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_super.c | 11 ++++++++++- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 24b4ebea0d4d..c06d790a3000 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -44,8 +44,6 @@ static kmem_zone_t *xfs_buf_zone; -static struct workqueue_struct *xfslogd_workqueue; - #ifdef XFS_BUF_LOCK_TRACKING # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) @@ -1053,7 +1051,7 @@ xfs_buf_ioend_async( struct xfs_buf *bp) { INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work); - queue_work(xfslogd_workqueue, &bp->b_iodone_work); + queue_work(bp->b_target->bt_mount->m_buf_workqueue, &bp->b_iodone_work); } void @@ -1882,15 +1880,8 @@ xfs_buf_init(void) if (!xfs_buf_zone) goto out; - xfslogd_workqueue = alloc_workqueue("xfslogd", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1); - if (!xfslogd_workqueue) - goto out_free_buf_zone; - return 0; - out_free_buf_zone: - kmem_zone_destroy(xfs_buf_zone); out: return -ENOMEM; } @@ -1898,6 +1889,5 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b0447c86e7e2..394bc711171a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -168,6 +168,7 @@ typedef struct xfs_mount { /* low free space thresholds */ struct xfs_kobj m_kobj; + struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9f622feda6a4..03e3cc242902 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -842,10 +842,16 @@ STATIC int xfs_init_mount_workqueues( struct xfs_mount *mp) { + mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", + WQ_MEM_RECLAIM|WQ_HIGHPRI|WQ_FREEZABLE, 1, + mp->m_fsname); + if (!mp->m_buf_workqueue) + goto out; + mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_data_workqueue) - goto out; + goto out_destroy_buf; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); @@ -884,6 +890,8 @@ out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: destroy_workqueue(mp->m_data_workqueue); +out_destroy_buf: + destroy_workqueue(mp->m_buf_workqueue); out: return -ENOMEM; } @@ -898,6 +906,7 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); + destroy_workqueue(mp->m_buf_workqueue); } /* From 062647a8b41928f4fb97f967b24092be68f5f0f0 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 28 Nov 2014 14:00:16 +1100 Subject: [PATCH 2/5] xfs: replace on-stack xfs_trans_res with pointer in xfs_create() There's no need to store a full struct xfs_trans_res on the stack in xfs_create() and copy the fields. Use a pointer to the appropriate structures embedded in the xfs_mount. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8ed049d1e332..2ffb80267e37 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1082,7 +1082,7 @@ xfs_create( struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; struct xfs_dquot *pdqp = NULL; - struct xfs_trans_res tres; + struct xfs_trans_res *tres; uint resblks; trace_xfs_create(dp, name); @@ -1105,13 +1105,11 @@ xfs_create( if (is_dir) { rdev = 0; resblks = XFS_MKDIR_SPACE_RES(mp, name->len); - tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres; - tres.tr_logcount = XFS_MKDIR_LOG_COUNT; + tres = &M_RES(mp)->tr_mkdir; tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); } else { resblks = XFS_CREATE_SPACE_RES(mp, name->len); - tres.tr_logres = M_RES(mp)->tr_create.tr_logres; - tres.tr_logcount = XFS_CREATE_LOG_COUNT; + tres = &M_RES(mp)->tr_create; tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } @@ -1123,17 +1121,16 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. */ - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(tp, &tres, resblks, 0); + error = xfs_trans_reserve(tp, tres, resblks, 0); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_reserve(tp, &tres, resblks, 0); + error = xfs_trans_reserve(tp, tres, resblks, 0); } if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, &tres, 0, 0); + error = xfs_trans_reserve(tp, tres, 0, 0); } if (error) { cancel_flags = 0; From 5d45ee1b41b02269ce04920a48cd2c6b2a458090 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 28 Nov 2014 14:00:53 +1100 Subject: [PATCH 3/5] xfs: fix error handling in xfs_qm_log_quotaoff() The error handling in xfs_qm_log_quotaoff() has a couple problems. If xfs_trans_commit() fails, we fall through to the error block and call xfs_trans_cancel(). This is incorrect on commit failure. If xfs_trans_reserve() fails, we jump to the error block, cancel the tp and restore the superblock qflags to oldsbqflag. However, oldsbqflag has been initialized to zero and not yet updated from the original flags so we set the flags to zero. Fix up the error handling in xfs_qm_log_quotaoff() to not restore flags if they haven't been modified and not cancel the tp on commit failure. Remove the flag restore code altogether because commit error is the only failure condition and we don't know whether the transaction made it to disk. Reported-by: Dan Carpenter Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_qm_syscalls.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 80f2d77d929a..d1e0ab7a5d12 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -784,19 +784,21 @@ xfs_qm_log_quotaoff( { xfs_trans_t *tp; int error; - xfs_qoff_logitem_t *qoffi=NULL; - uint oldsbqflag=0; + xfs_qoff_logitem_t *qoffi; + + *qoffstartp = NULL; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); - if (error) - goto error0; + if (error) { + xfs_trans_cancel(tp, 0); + goto out; + } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); spin_lock(&mp->m_sb_lock); - oldsbqflag = mp->m_sb.sb_qflags; mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; spin_unlock(&mp->m_sb_lock); @@ -809,19 +811,11 @@ xfs_qm_log_quotaoff( */ xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); + if (error) + goto out; -error0: - if (error) { - xfs_trans_cancel(tp, 0); - /* - * No one else is modifying sb_qflags, so this is OK. - * We still hold the quotaofflock. - */ - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = oldsbqflag; - spin_unlock(&mp->m_sb_lock); - } *qoffstartp = qoffi; +out: return error; } From 91ee575f2b35d1307412f917787195c2f6a38dfb Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 28 Nov 2014 14:02:59 +1100 Subject: [PATCH 4/5] xfs: allow lazy sb counter sync during filesystem freeze sequence The expectation since the introduction the lazy superblock counters is that the counters are synced and superblock logged appropriately as part of the filesystem freeze sequence. This does not occur, however, due to the logic in xfs_fs_writable() that prevents progress when the fs is in any state other than SB_UNFROZEN. While this is a bug, it has not been exposed to date because the last thing XFS does during freeze is dirty the log. The log recovery process recalculates the counters from AGI/AGF metadata to ensure everything is correct. Therefore should a crash occur while an fs is frozen, the subsequent log recovery puts everything back in order. See the following commit for reference: 92821e2b [XFS] Lazy Superblock Counters We might not always want to rely on dirtying the log on a frozen fs. Modify xfs_log_sbcount() to proceed when the filesystem is freezing but not once the freeze process has completed. Modify xfs_fs_writable() to accept the minimum freeze level for which modifications should be blocked to support various codepaths. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_mount.c | 29 +++++++++++++++++++++-------- fs/xfs/xfs_mount.h | 2 +- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fe88ef67f93a..e810e9df91b7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1031,7 +1031,7 @@ xfs_log_need_covered(xfs_mount_t *mp) struct xlog *log = mp->m_log; int needed = 0; - if (!xfs_fs_writable(mp)) + if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) return 0; if (!xlog_cil_empty(log)) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 51435dbce9c4..13d117089101 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1074,11 +1074,23 @@ xfs_unmountfs( xfs_sysfs_del(&mp->m_kobj); } -int -xfs_fs_writable(xfs_mount_t *mp) +/* + * Determine whether modifications can proceed. The caller specifies the minimum + * freeze level for which modifications should not be allowed. This allows + * certain operations to proceed while the freeze sequence is in progress, if + * necessary. + */ +bool +xfs_fs_writable( + struct xfs_mount *mp, + int level) { - return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || - (mp->m_flags & XFS_MOUNT_RDONLY)); + ASSERT(level > SB_UNFROZEN); + if ((mp->m_super->s_writers.frozen >= level) || + XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)) + return false; + + return true; } /* @@ -1086,9 +1098,9 @@ xfs_fs_writable(xfs_mount_t *mp) * * Sync the superblock counters to disk. * - * Note this code can be called during the process of freezing, so - * we may need to use the transaction allocator which does not - * block when the transaction subsystem is in its frozen state. + * Note this code can be called during the process of freezing, so we use the + * transaction allocator that does not block when the transaction subsystem is + * in its frozen state. */ int xfs_log_sbcount(xfs_mount_t *mp) @@ -1096,7 +1108,8 @@ xfs_log_sbcount(xfs_mount_t *mp) xfs_trans_t *tp; int error; - if (!xfs_fs_writable(mp)) + /* allow this to proceed during the freeze sequence... */ + if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) return 0; xfs_icsb_sync_counters(mp, 0); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 394bc711171a..01fb28f5ae1c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -385,7 +385,7 @@ extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); -extern int xfs_fs_writable(xfs_mount_t *); +extern bool xfs_fs_writable(struct xfs_mount *mp, int level); extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); From db52d09ecbf85c54e263a9d1ebfb615a9b2b3ba6 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 28 Nov 2014 14:03:55 +1100 Subject: [PATCH 5/5] xfs: catch invalid negative blknos in _xfs_buf_find() Here blkno is a daddr_t, which is a __s64; it's possible to hold a value which is negative, and thus pass the (blkno >= eofs) test. Then we try to do a xfs_perag_get() for a ridiculous agno via xfs_daddr_to_agno(), and bad things happen when that fails, and returns a null pag which is dereferenced shortly thereafter. Found via a user-supplied fuzzed image... Signed-off-by: Eric Sandeen Reviewed-by: Mark Tinguely Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c06d790a3000..d083889535a2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -461,7 +461,7 @@ _xfs_buf_find( * have to check that the buffer falls within the filesystem bounds. */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (blkno >= eofs) { + if (blkno < 0 || blkno >= eofs) { /* * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports