From 65552b02a10acea68127081faf414b84a65d1855 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 4 Jan 2022 17:38:36 -0800 Subject: [PATCH 1/2] xfs: take the ILOCK when readdir inspects directory mapping data I was poking around in the directory code while diagnosing online fsck bugs, and noticed that xfs_readdir doesn't actually take the directory ILOCK when it calls xfs_dir2_isblock. xfs_dir_open most probably loaded the data fork mappings and the VFS took i_rwsem (aka IOLOCK_SHARED) so we're protected against writer threads, but we really need to follow the locking model like we do in other places. To avoid unnecessarily cycling the ILOCK for fairly small directories, change the block/leaf _getdents functions to consume the ILOCK hold that the parent readdir function took to decide on a _getdents implementation. It is ok to cycle the ILOCK in readdir because the VFS takes the IOLOCK in the appropriate mode during lookups and writes, and we don't want to be holding the ILOCK when we copy directory entries to userspace in case there's a page fault. We really only need it to protect against data fork lookups, like we do for other files. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_dir2_readdir.c | 53 +++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 8310005af00f..a7174a5b3203 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -138,7 +138,8 @@ xfs_dir2_sf_getdents( STATIC int xfs_dir2_block_getdents( struct xfs_da_args *args, - struct dir_context *ctx) + struct dir_context *ctx, + unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; /* incore directory inode */ struct xfs_buf *bp; /* buffer for block */ @@ -146,7 +147,6 @@ xfs_dir2_block_getdents( int wantoff; /* starting block offset */ xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; - int lock_mode; unsigned int offset, next_offset; unsigned int end; @@ -156,12 +156,13 @@ xfs_dir2_block_getdents( if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; - lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir3_block_read(args->trans, dp, &bp); - xfs_iunlock(dp, lock_mode); if (error) return error; + xfs_iunlock(dp, *lock_mode); + *lock_mode = 0; + /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. @@ -344,7 +345,8 @@ STATIC int xfs_dir2_leaf_getdents( struct xfs_da_args *args, struct dir_context *ctx, - size_t bufsize) + size_t bufsize, + unsigned int *lock_mode) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; @@ -356,7 +358,6 @@ xfs_dir2_leaf_getdents( xfs_dir2_off_t curoff; /* current overall offset */ int length; /* temporary length value */ int byteoff; /* offset in current block */ - int lock_mode; unsigned int offset = 0; int error = 0; /* error return value */ @@ -390,13 +391,16 @@ xfs_dir2_leaf_getdents( bp = NULL; } - lock_mode = xfs_ilock_data_map_shared(dp); + if (*lock_mode == 0) + *lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff, &rablk, &bp); - xfs_iunlock(dp, lock_mode); if (error || !bp) break; + xfs_iunlock(dp, *lock_mode); + *lock_mode = 0; + xfs_dir3_data_check(dp, bp); /* * Find our position in the block. @@ -496,7 +500,7 @@ xfs_dir2_leaf_getdents( * * If supplied, the transaction collects locked dir buffers to avoid * nested buffer deadlocks. This function does not dirty the - * transaction. The caller should ensure that the inode is locked + * transaction. The caller must hold the IOLOCK (shared or exclusive) * before calling this function. */ int @@ -507,8 +511,9 @@ xfs_readdir( size_t bufsize) { struct xfs_da_args args = { NULL }; - int rval; - int v; + unsigned int lock_mode; + int isblock; + int error; trace_xfs_readdir(dp); @@ -516,6 +521,7 @@ xfs_readdir( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; @@ -523,13 +529,22 @@ xfs_readdir( args.trans = tp; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(&args, ctx); - else if ((rval = xfs_dir2_isblock(&args, &v))) - ; - else if (v) - rval = xfs_dir2_block_getdents(&args, ctx); - else - rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); + return xfs_dir2_sf_getdents(&args, ctx); - return rval; + lock_mode = xfs_ilock_data_map_shared(dp); + error = xfs_dir2_isblock(&args, &isblock); + if (error) + goto out_unlock; + + if (isblock) { + error = xfs_dir2_block_getdents(&args, ctx, &lock_mode); + goto out_unlock; + } + + error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode); + +out_unlock: + if (lock_mode) + xfs_iunlock(dp, lock_mode); + return error; } From 4a9bca86806fa6fc4fbccf050c1bd36a4778948a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 7 Jan 2022 17:45:51 -0800 Subject: [PATCH 2/2] xfs: fix online fsck handling of v5 feature bits on secondary supers While I was auditing the code in xfs_repair that adds feature bits to existing V5 filesystems, I decided to have a look at how online fsck handles feature bits, and I found a few problems: 1) ATTR2 is added to the primary super when an xattr is set to a file, but that isn't consistently propagated to secondary supers. This isn't a corruption, merely a discrepancy that repair will fix if it ever has to restore the primary from a secondary. Hence, if we find a mismatch on a secondary, this is a preen condition, not a corruption. 2) There are more compat and ro_compat features now than there used to be, but we mask off the newer features from testing. This means we ignore inconsistencies in the INOBTCOUNT and BIGTIME features, which is wrong. Get rid of the masking and compare directly. 3) NEEDSREPAIR, when set on a secondary, is ignored by everyone. Hence a mismatch here should also be flagged for preening, and online repair should clear the flag. Right now we ignore it due to (2). 4) log_incompat features are ephemeral, since we can clear the feature bit as soon as the log no longer contains live records for a particular log feature. As such, the only copy we care about is the one in the primary super. If we find any bits set in the secondary super, we should flag that for preening, and clear the bits if the user elects to repair it. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader.c | 53 +++++++++++++++++----------------- fs/xfs/scrub/agheader_repair.c | 12 ++++++++ 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index bed798792226..90aebfe9dc5f 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -281,7 +281,7 @@ xchk_superblock( features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT); if ((sb->sb_features2 & features_mask) != (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) - xchk_block_set_corrupt(sc, bp); + xchk_block_set_preen(sc, bp); if (!xfs_has_crc(mp)) { /* all v5 fields must be zero */ @@ -290,38 +290,37 @@ xchk_superblock( offsetof(struct xfs_dsb, sb_features_compat))) xchk_block_set_corrupt(sc, bp); } else { - /* Check compat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN); - if ((sb->sb_features_compat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask)) + /* compat features must match */ + if (sb->sb_features_compat != + cpu_to_be32(mp->m_sb.sb_features_compat)) xchk_block_set_corrupt(sc, bp); - /* Check ro compat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN | - XFS_SB_FEAT_RO_COMPAT_FINOBT | - XFS_SB_FEAT_RO_COMPAT_RMAPBT | - XFS_SB_FEAT_RO_COMPAT_REFLINK); - if ((sb->sb_features_ro_compat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_ro_compat) & - features_mask)) + /* ro compat features must match */ + if (sb->sb_features_ro_compat != + cpu_to_be32(mp->m_sb.sb_features_ro_compat)) xchk_block_set_corrupt(sc, bp); - /* Check incompat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN | - XFS_SB_FEAT_INCOMPAT_FTYPE | - XFS_SB_FEAT_INCOMPAT_SPINODES | - XFS_SB_FEAT_INCOMPAT_META_UUID); - if ((sb->sb_features_incompat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_incompat) & - features_mask)) + /* + * NEEDSREPAIR is ignored on a secondary super, so we should + * clear it when we find it, though it's not a corruption. + */ + features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & features_mask) + xchk_block_set_preen(sc, bp); + + /* all other incompat features must match */ + if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^ + sb->sb_features_incompat) & ~features_mask) xchk_block_set_corrupt(sc, bp); - /* Check log incompat flags; all are set at mkfs time. */ - features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN); - if ((sb->sb_features_log_incompat & features_mask) != - (cpu_to_be32(mp->m_sb.sb_features_log_incompat) & - features_mask)) - xchk_block_set_corrupt(sc, bp); + /* + * log incompat features protect newer log record types from + * older log recovery code. Log recovery doesn't check the + * secondary supers, so we can clear these if needed. + */ + if (sb->sb_features_log_incompat) + xchk_block_set_preen(sc, bp); /* Don't care about sb_crc */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index d7bfed52f4cd..6da7f2ca77de 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -52,6 +52,18 @@ xrep_superblock( xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); + /* + * Don't write out a secondary super with NEEDSREPAIR or log incompat + * features set, since both are ignored when set on a secondary. + */ + if (xfs_has_crc(mp)) { + struct xfs_dsb *sb = bp->b_addr; + + sb->sb_features_incompat &= + ~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + sb->sb_features_log_incompat = 0; + } + /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);