From f9581b1443abac50c90168301d40a7734b13a5dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Oct 2009 20:29:26 +0000 Subject: [PATCH 1/7] xfs: implement ->dirty_inode to fix timestamp handling This is picking up on Felix's repost of Dave's patch to implement a .dirty_inode method. We really need this notification because the VFS keeps writing directly into the inode structure instead of going through methods to update this state. In addition to the long-known atime issue we now also have a caller in VM code that updates c/mtime that way for shared writeable mmaps. And I found another one that no one has noticed in practice in the FIFO code. So implement ->dirty_inode to set i_update_core whenever the inode gets externally dirtied, and switch the c/mtime handling to the same scheme we already use for atime (always picking up the value from the Linux inode). Note that this patch also removes the xfs_synchronize_atime call in xfs_reclaim it was superflous as we already synchronize the time when writing the inode via the log (xfs_inode_item_format) or the normal buffers (xfs_iflush_int). In addition also remove the I_CLEAR check before copying the Linux timestamps - now that we always have the Linux inode available we can always use the timestamps in it. Also switch to just using file_update_time for regular reads/writes - that will get us all optimization done to it for free and make sure we notice early when it breaks. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_aops.c | 1 - fs/xfs/linux-2.6/xfs_iops.c | 41 +++++++++++++----------------------- fs/xfs/linux-2.6/xfs_lrw.c | 2 +- fs/xfs/linux-2.6/xfs_super.c | 23 ++++++++++++++++++++ fs/xfs/xfs_dfrag.c | 8 +++---- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_inode_item.c | 18 ++++++++++------ fs/xfs/xfs_itable.c | 21 +++++++++++------- fs/xfs/xfs_vnodeops.c | 6 ------ 10 files changed, 70 insertions(+), 56 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index d5e5559e31db..40d2226060d1 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -215,7 +215,6 @@ xfs_setfilesize( if (ip->i_d.di_size < isize) { ip->i_d.di_size = isize; - ip->i_update_core = 1; xfs_mark_inode_dirty_sync(ip); } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 626b474b2cdd..cdf7114d41fc 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -57,19 +57,22 @@ #include /* - * Bring the atime in the XFS inode uptodate. - * Used before logging the inode to disk or when the Linux inode goes away. + * Bring the timestamps in the XFS inode uptodate. + * + * Used before writing the inode to disk. */ void -xfs_synchronize_atime( +xfs_synchronize_times( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); - if (!(inode->i_state & I_CLEAR)) { - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - } + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; } /* @@ -106,32 +109,20 @@ xfs_ichgtime( if ((flags & XFS_ICHGTIME_MOD) && !timespec_equal(&inode->i_mtime, &tv)) { inode->i_mtime = tv; - ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } if ((flags & XFS_ICHGTIME_CHG) && !timespec_equal(&inode->i_ctime, &tv)) { inode->i_ctime = tv; - ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. + * Update complete - now make sure everyone knows that the inode + * is dirty. */ - if (sync_it) { - SYNCHRONIZE(); - ip->i_update_core = 1; + if (sync_it) xfs_mark_inode_dirty_sync(ip); - } } /* @@ -514,10 +505,8 @@ xfs_vn_getattr( stat->gid = ip->i_d.di_gid; stat->ino = ip->i_ino; stat->atime = inode->i_atime; - stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; - stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec; - stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 49e4a6aea73c..072050f8d346 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -667,7 +667,7 @@ start: xip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + file_update_time(file); /* * If the offset is beyond the size of the file, we have a couple diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 5d7c60ac77b4..1ea65b6e5ab6 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -976,6 +976,28 @@ xfs_fs_inode_init_once( mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); } +/* + * Dirty the XFS inode when mark_inode_dirty_sync() is called so that + * we catch unlogged VFS level updates to the inode. Care must be taken + * here - the transaction code calls mark_inode_dirty_sync() to mark the + * VFS inode dirty in a transaction and clears the i_update_core field; + * it must clear the field after calling mark_inode_dirty_sync() to + * correctly indicate that the dirty state has been propagated into the + * inode log item. + * + * We need the barrier() to maintain correct ordering between unlogged + * updates and the transaction commit code that clears the i_update_core + * field. This requires all updates to be completed before marking the + * inode dirty. + */ +STATIC void +xfs_fs_dirty_inode( + struct inode *inode) +{ + barrier(); + XFS_I(inode)->i_update_core = 1; +} + /* * Attempt to flush the inode, this will actually fail * if the inode is pinned, but we dirty the inode again @@ -1539,6 +1561,7 @@ xfs_fs_get_sb( static struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 7465f9ee125f..ab89a7e94a0f 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -206,10 +206,10 @@ xfs_swap_extents( * process that the file was not changed out from * under it. */ - if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || - (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || - (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || - (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = XFS_ERROR(EBUSY); goto out_unlock; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c1dc7ef5a1d8..b92a4fa2a0a1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3068,9 +3068,9 @@ xfs_iflush_int( SYNCHRONIZE(); /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); + xfs_synchronize_times(ip); if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0b38b9a869ec..41555de1d1db 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); -void xfs_synchronize_atime(xfs_inode_t *); +void xfs_synchronize_times(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); #if defined(XFS_INODE_TRACE) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 47d5b663c37e..9794b876d6ff 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -231,6 +231,15 @@ xfs_inode_item_format( vecp++; nvecs = 1; + /* + * Make sure the linux inode is dirty. We do this before + * clearing i_update_core as the VFS will call back into + * XFS here and set i_update_core, so we need to dirty the + * inode first so that the ordering of i_update_core and + * unlogged modifications still works as described below. + */ + xfs_mark_inode_dirty_sync(ip); + /* * Clear i_update_core if the timestamps (or any other * non-transactional modification) need flushing/logging @@ -263,14 +272,9 @@ xfs_inode_item_format( } /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); - - /* - * make sure the linux inode is dirty - */ - xfs_mark_inode_dirty_sync(ip); + xfs_synchronize_times(ip); vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index b68f9107e26c..62efab2f3839 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -59,6 +59,7 @@ xfs_bulkstat_one_iget( { xfs_icdinode_t *dic; /* dinode core info pointer */ xfs_inode_t *ip; /* incore inode pointer */ + struct inode *inode; int error; error = xfs_iget(mp, NULL, ino, @@ -72,6 +73,7 @@ xfs_bulkstat_one_iget( ASSERT(ip->i_imap.im_blkno != 0); dic = &ip->i_d; + inode = VFS_I(ip); /* xfs_iget returns the following without needing * further change. @@ -83,16 +85,19 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; + /* - * We are reading the atime from the Linux inode because the - * dinode might not be uptodate. + * We need to read the timestamps from the Linux inode because + * the VFS keeps writing directly into the inode structure instead + * of telling us about the updates. */ - buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec; - buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec; - buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; - buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; - buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; - buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index a434f287962d..b572f7e840e0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2475,12 +2475,6 @@ xfs_reclaim( ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); - /* - * Make sure the atime in the XFS inode is correct before freeing the - * Linux inode. - */ - xfs_synchronize_atime(ip); - /* * If we have nothing to flush with this inode then complete the * teardown now, otherwise break the link between the xfs inode and the From c90b07e8dd9f263d2e2af1d9648ba269f4d0d8fd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:27 +0000 Subject: [PATCH 2/7] xfs: fix xfs_quiesce_data We need to do a synchronous xfs_sync_fsdata to make sure the superblock actually is on disk when we return. Also remove SYNC_BDFLUSH flag to xfs_sync_inodes because that particular flag is never checked. Move xfs_filestream_flush call later to only release inodes after they have been written out. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 320be6aea492..7647b8db3ca3 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -419,14 +419,16 @@ xfs_quiesce_data( /* push non-blocking */ xfs_sync_data(mp, 0); xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_filestream_flush(mp); - /* push and block */ + /* push and block till complete */ xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); + /* drop inode references pinned by filestreams */ + xfs_filestream_flush(mp); + /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, 0); + error = xfs_sync_fsdata(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) From 69961a26b82931601e07c9e3656bd90c598f2395 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Oct 2009 20:29:28 +0000 Subject: [PATCH 3/7] xfs: cleanup ->sync_fs Sort out ->sync_fs to not perform a superblock writeback for the wait = 0 case as that is just an optional first pass and the superblock will be written back properly in the next call with wait = 1. Instead perform an opportunistic quota writeback to have less work later. Also remove the freeze special case as we do a proper wait = 1 call in the freeze code anyway. Also rename the function to xfs_fs_sync_fs to match the normal naming convention, update comments and avoid calling into the laptop_mode logic on an error. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_super.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1ea65b6e5ab6..eefea0d9d03d 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1148,7 +1148,7 @@ xfs_fs_put_super( } STATIC int -xfs_fs_sync_super( +xfs_fs_sync_fs( struct super_block *sb, int wait) { @@ -1156,23 +1156,23 @@ xfs_fs_sync_super( int error; /* - * Treat a sync operation like a freeze. This is to work - * around a race in sync_inodes() which works in two phases - * - an asynchronous flush, which can write out an inode - * without waiting for file size updates to complete, and a - * synchronous flush, which wont do anything because the - * async flush removed the inode's dirty flag. Also - * sync_inodes() will not see any files that just have - * outstanding transactions to be flushed because we don't - * dirty the Linux inode until after the transaction I/O - * completes. + * Not much we can do for the first async pass. Writing out the + * superblock would be counter-productive as we are going to redirty + * when writing out other data and metadata (and writing out a single + * block is quite fast anyway). + * + * Try to asynchronously kick off quota syncing at least. */ - if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) - error = xfs_quiesce_data(mp); - else - error = xfs_sync_fsdata(mp, 0); + if (!wait) { + xfs_qm_sync(mp, SYNC_TRYLOCK); + return 0; + } - if (unlikely(laptop_mode)) { + error = xfs_quiesce_data(mp); + if (error) + return -error; + + if (laptop_mode) { int prev_sync_seq = mp->m_sync_seq; /* @@ -1191,7 +1191,7 @@ xfs_fs_sync_super( mp->m_sync_seq != prev_sync_seq); } - return -error; + return 0; } STATIC int @@ -1565,7 +1565,7 @@ static struct super_operations xfs_super_operations = { .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, - .sync_fs = xfs_fs_sync_super, + .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, From 932640e8adaff3d0c28ee32d4863c7a7a74df055 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:29 +0000 Subject: [PATCH 4/7] xfs: mark inodes dirty before issuing I/O To make sure they get properly waited on in sync when I/O is in flight and we latter need to update the inode size. Requires a new helper to check if an ioend structure is beyond the current EOF. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_aops.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 40d2226060d1..6a2910ee8a28 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -185,6 +185,24 @@ xfs_destroy_ioend( mempool_free(ioend, xfs_ioend_pool); } +/* + * If the end of the current ioend is beyond the current EOF, + * return the new EOF value, otherwise zero. + */ +STATIC xfs_fsize_t +xfs_ioend_new_eof( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + xfs_fsize_t bsize; + + bsize = ioend->io_offset + ioend->io_size; + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + return isize > ip->i_d.di_size ? isize : 0; +} + /* * Update on-disk file size now that data has been written to disk. * The current in-memory file size is i_size. If a write is beyond @@ -192,13 +210,13 @@ xfs_destroy_ioend( * updated. If this write does not extend all the way to the valid * file size then restrict this update to the end of the write. */ + STATIC void xfs_setfilesize( xfs_ioend_t *ioend) { xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - xfs_fsize_t bsize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); ASSERT(ioend->io_type != IOMAP_READ); @@ -206,14 +224,9 @@ xfs_setfilesize( if (unlikely(ioend->io_error)) return; - bsize = ioend->io_offset + ioend->io_size; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); - - if (ip->i_d.di_size < isize) { + isize = xfs_ioend_new_eof(ioend); + if (isize) { ip->i_d.di_size = isize; xfs_mark_inode_dirty_sync(ip); } @@ -403,10 +416,16 @@ xfs_submit_ioend_bio( struct bio *bio) { atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; + /* + * If the I/O is beyond EOF we mark the inode dirty immediately + * but don't update the inode size until I/O completion. + */ + if (xfs_ioend_new_eof(ioend)) + xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + submit_bio(WRITE, bio); ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); bio_put(bio); From dce5065a57c158e0ca5db8e63c50118b2ecf4ac5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:30 +0000 Subject: [PATCH 5/7] xfs: make sure xfs_sync_fsdata covers the log We want to always cover the log after writing out the superblock, and in case of a synchronous writeout make sure we actually wait for the log to be covered. That way a filesystem that has been sync()ed can be considered clean by log recovery. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 7647b8db3ca3..961df0a22c78 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -309,11 +309,15 @@ xfs_sync_attr( STATIC int xfs_commit_dummy_trans( struct xfs_mount *mp, - uint log_flags) + uint flags) { struct xfs_inode *ip = mp->m_rootip; struct xfs_trans *tp; int error; + int log_flags = XFS_LOG_FORCE; + + if (flags & SYNC_WAIT) + log_flags |= XFS_LOG_SYNC; /* * Put a dummy transaction in the log to tell recovery @@ -331,13 +335,12 @@ xfs_commit_dummy_trans( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* XXX(hch): ignoring the error here.. */ error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* the log force ensures this transaction is pushed to disk */ xfs_log_force(mp, 0, log_flags); - return 0; + return error; } int @@ -385,7 +388,20 @@ xfs_sync_fsdata( else XFS_BUF_ASYNC(bp); - return xfs_bwrite(mp, bp); + error = xfs_bwrite(mp, bp); + if (error) + return error; + + /* + * If this is a data integrity sync make sure all pending buffers + * are flushed out for the log coverage check below. + */ + if (flags & SYNC_WAIT) + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, flags); + return error; out_brelse: xfs_buf_relse(bp); @@ -572,8 +588,6 @@ xfs_sync_worker( /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); From 8e69ce147127a0235e8d1f2b75ea214be78c61b3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 25 Sep 2009 19:42:26 +0000 Subject: [PATCH 6/7] fix readahead calculations in xfs_dir2_leaf_getdents() This is for bug #850, http://oss.sgi.com/bugzilla/show_bug.cgi?id=850 XFS file system segfaults , repeatedly and 100% reproducable in 2.6.30 , 2.6.31 The above only showed up on a CONFIG_XFS_DEBUG=y kernel, because xfs_bmapi() ASSERTs that it has been asked for at least one map, and it was getting 0. The root cause is that our guesstimated "bufsize" from xfs_file_readdir was fairly small, and the bufsize -= length; in the loop was going negative - except bufsize is a size_t, so it was wrapping to a very large number. Then when we did ra_want = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize) - 1; with that very large number, the (int) ra_want was coming out negative, and a subsequent compare: if (1 + ra_want > map_blocks ... was coming out -true- (negative int compare w/ uint) and we went back to xfs_bmapi() for more, even though we did not need more, and asked for 0 maps, and hit the ASSERT. We have kind of a type mess here, but just keeping bufsize from going negative is probably sufficient to avoid the problem. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/xfs_dir2_leaf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index fa913e459442..41ad537c49e9 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents( */ ra_want = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize) - 1; + ASSERT(ra_want >= 0); /* * If we don't have as many as we want, and we haven't @@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents( */ ptr += length; curoff += length; - bufsize -= length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; } /* From d0800703febc04827b8fa91921aa4e254d01e8d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 26 Sep 2009 19:55:04 +0000 Subject: [PATCH 7/7] xfs: stop calling filemap_fdatawait inside ->fsync Now that the VFS actually waits for the data I/O to complete before calling into ->fsync we can stop doing it ourselves. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_file.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 988d8f87bc0f..6d65baf15a14 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -176,14 +176,7 @@ xfs_file_fsync( struct dentry *dentry, int datasync) { - struct inode *inode = dentry->d_inode; - struct xfs_inode *ip = XFS_I(inode); - int error; - - /* capture size updates in I/O completion before writing the inode. */ - error = filemap_fdatawait(inode->i_mapping); - if (error) - return error; + struct xfs_inode *ip = XFS_I(dentry->d_inode); xfs_iflags_clear(ip, XFS_ITRUNCATED); return -xfs_fsync(ip);