From 7c9a84a57b57978f0ea0d2dc16394d75a781e6a5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 12 Mar 2010 17:05:31 +0000 Subject: [PATCH 01/11] GFS2: Remove space from slab cache name Apparently this might confuse parsers. Reported-by: Christoph Hellwig Signed-off-by: Steven Whitehouse --- fs/gfs2/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a88fadc704bb..fb2a5f93b7c3 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_glock_cachep) goto fail; - gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", + gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", sizeof(struct gfs2_glock) + sizeof(struct address_space), 0, 0, gfs2_init_gl_aspace_once); From 4cb947b59c5835783fb96aad2f7d92b1e4250aff Mon Sep 17 00:00:00 2001 From: Andrea Gelmini Date: Thu, 25 Mar 2010 11:04:49 +0100 Subject: [PATCH 02/11] GFS2: docs update Now http://sources.redhat.com/cluster/ is redirected to http://sources.redhat.com/cluster/wiki/ Also fixed tabs in the end. Signed-off-by: Andrea Gelmini Signed-off-by: Steven Whitehouse --- Documentation/filesystems/gfs2.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt index 5e3ab8f3beff..0b59c0200912 100644 --- a/Documentation/filesystems/gfs2.txt +++ b/Documentation/filesystems/gfs2.txt @@ -1,7 +1,7 @@ Global File System ------------------ -http://sources.redhat.com/cluster/ +http://sources.redhat.com/cluster/wiki/ GFS is a cluster file system. It allows a cluster of computers to simultaneously use a block device that is shared between them (with FC, @@ -36,11 +36,11 @@ GFS2 is not on-disk compatible with previous versions of GFS, but it is pretty close. The following man pages can be found at the URL above: - fsck.gfs2 to repair a filesystem - gfs2_grow to expand a filesystem online - gfs2_jadd to add journals to a filesystem online - gfs2_tool to manipulate, examine and tune a filesystem + fsck.gfs2 to repair a filesystem + gfs2_grow to expand a filesystem online + gfs2_jadd to add journals to a filesystem online + gfs2_tool to manipulate, examine and tune a filesystem gfs2_quota to examine and change quota values in a filesystem gfs2_convert to convert a gfs filesystem to gfs2 in-place mount.gfs2 to help mount(8) mount a filesystem - mkfs.gfs2 to make a filesystem + mkfs.gfs2 to make a filesystem From 602c89d2e3e8652f94a697c9a919be739b9bcdd5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 25 Mar 2010 14:32:43 +0000 Subject: [PATCH 03/11] GFS2: Clean up stuffed file copying If the inode size was corrupt for stuffed files, it was possible for the copying of data to overrun the block and/or page. This patch checks for that condition so that this is no longer possible. This is also preparation for the new truncate sequence patch which requires the ability to have stuffed files with larger sizes than (disk block size - sizeof(on disk inode)) with the restriction that only the initial part of the file may be non-zero. Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 8 +++++--- fs/gfs2/bmap.c | 17 ++++++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 0c1d0b82dcf1..a739a0a48067 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping, static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) { struct buffer_head *dibh; + u64 dsize = i_size_read(&ip->i_inode); void *kaddr; int error; @@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) return error; kaddr = kmap_atomic(page, KM_USER0); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_disksize); - memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = (dibh->b_size - sizeof(struct gfs2_dinode)); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(page); brelse(dibh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 583e823307ae..0db0cd92a38d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -72,11 +72,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, if (!PageUptodate(page)) { void *kaddr = kmap(page); + u64 dsize = i_size_read(inode); + + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = dibh->b_size - sizeof(struct gfs2_dinode); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_disksize); - memset(kaddr + ip->i_disksize, 0, - PAGE_CACHE_SIZE - ip->i_disksize); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); kunmap(page); SetPageUptodate(page); @@ -1039,13 +1041,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) goto out; if (gfs2_is_stuffed(ip)) { - ip->i_disksize = size; + u64 dsize = size + sizeof(struct gfs2_inode); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); + if (dsize > dibh->b_size) + dsize = dibh->b_size; + gfs2_buffer_clear_tail(dibh, dsize); error = 1; - } else { if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) error = gfs2_block_truncate_page(ip->i_inode.i_mapping); From 1a0eae8848cde6e0734360f6456496c995ee1e23 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 14 Apr 2010 11:58:16 -0400 Subject: [PATCH 04/11] GFS2: glock livelock This patch fixes a couple gfs2 problems with the reclaiming of unlinked dinodes. First, there were a couple of livelocks where everything would come to a halt waiting for a glock that was seemingly held by a process that no longer existed. In fact, the process did exist, it just had the wrong pid number in the holder information. Second, there was a lock ordering problem between inode locking and glock locking. Third, glock/inode contention could sometimes cause inodes to be improperly marked invalid by iget_failed. Signed-off-by: Bob Peterson --- fs/gfs2/dir.c | 2 +- fs/gfs2/export.c | 2 +- fs/gfs2/glock.c | 3 ++ fs/gfs2/inode.c | 101 +++++++++++++++++++++++++++++++++++++++---- fs/gfs2/inode.h | 5 ++- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/rgrp.c | 58 ++++++++++++++++++------- 7 files changed, 144 insertions(+), 29 deletions(-) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 25fddc100f18..8295c5b5d4a9 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) inode = gfs2_inode_lookup(dir->i_sb, be16_to_cpu(dent->de_type), be64_to_cpu(dent->de_inum.no_addr), - be64_to_cpu(dent->de_inum.no_formal_ino), 0); + be64_to_cpu(dent->de_inum.no_formal_ino)); brelse(bh); return inode; } diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index d15876e9aa26..d81bc7e90e98 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -169,7 +169,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, if (error) goto fail; - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto fail; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 454d4b4eb36b..ddcdbf493536 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder * gh->gh_flags = flags; gh->gh_iflags = 0; gh->gh_ip = (unsigned long)__builtin_return_address(0); + if (gh->gh_owner_pid) + put_pid(gh->gh_owner_pid); + gh->gh_owner_pid = get_pid(task_pid(current)); } /** diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index b1bf2694fb2b..51d8061fa07a 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode) * @sb: The super block * @no_addr: The inode number * @type: The type of the inode - * @skip_freeing: set this not return an inode if it is currently being freed. * * Returns: A VFS inode, or an error */ @@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode) struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, u64 no_addr, - u64 no_formal_ino, int skip_freeing) + u64 no_formal_ino) { struct inode *inode; struct gfs2_inode *ip; struct gfs2_glock *io_gl; int error; - if (skip_freeing) - inode = gfs2_iget_skip(sb, no_addr); - else - inode = gfs2_iget(sb, no_addr); + inode = gfs2_iget(sb, no_addr); ip = GFS2_I(inode); if (!inode) @@ -233,12 +229,99 @@ fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: gfs2_glock_put(io_gl); +fail_put: + if (inode->i_state & I_NEW) + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); +fail: + if (inode->i_state & I_NEW) + iget_failed(inode); + else + iput(inode); + return ERR_PTR(error); +} + +/** + * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation + * @sb: The super block + * no_addr: The inode number + * @@inode: A pointer to the inode found, if any + * + * Returns: 0 and *inode if no errors occurred. If an error occurs, + * the resulting *inode may or may not be NULL. + */ + +int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, + struct inode **inode) +{ + struct gfs2_sbd *sdp; + struct gfs2_inode *ip; + struct gfs2_glock *io_gl; + int error; + struct gfs2_holder gh; + + *inode = gfs2_iget_skip(sb, no_addr); + + if (!(*inode)) + return -ENOBUFS; + + if (!((*inode)->i_state & I_NEW)) + return -ENOBUFS; + + ip = GFS2_I(*inode); + sdp = GFS2_SB(*inode); + ip->i_no_formal_ino = -1; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (unlikely(error)) + goto fail; + ip->i_gl->gl_object = ip; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (unlikely(error)) + goto fail_put; + + set_bit(GIF_INVALID, &ip->i_flags); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, + &ip->i_iopen_gh); + if (unlikely(error)) { + if (error == GLR_TRYFAILED) + error = 0; + goto fail_iopen; + } + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + + (*inode)->i_mode = DT2IF(DT_UNKNOWN); + + /* + * We must read the inode in order to work out its type in + * this case. Note that this doesn't happen often as we normally + * know the type beforehand. This code path only occurs during + * unlinked inode recovery (where it is safe to do this glock, + * which is not true in the general case). + */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, + &gh); + if (unlikely(error)) { + if (error == GLR_TRYFAILED) + error = 0; + goto fail_glock; + } + /* Inode is now uptodate */ + gfs2_glock_dq_uninit(&gh); + gfs2_set_iop(*inode); + + return 0; +fail_glock: + gfs2_glock_dq(&ip->i_iopen_gh); +fail_iopen: + gfs2_glock_put(io_gl); fail_put: ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); fail: - iget_failed(inode); - return ERR_PTR(error); + return error; } static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) @@ -862,7 +945,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, goto fail_gunlock2; inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, - inum.no_formal_ino, 0); + inum.no_formal_ino); if (IS_ERR(inode)) goto fail_gunlock2; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c341aaf67adb..e161461d4c57 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -83,8 +83,9 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - int skip_freeing); + u64 no_addr, u64 no_formal_ino); +extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, + struct inode **inode); extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c1309ed1c496..dc35f3470271 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -487,7 +487,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, struct dentry *dentry; struct inode *inode; - inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); if (IS_ERR(inode)) { fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); return PTR_ERR(inode); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 503b842f3ba2..37391550284f 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -948,18 +948,20 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes * @rgd: The rgrp * - * Returns: The inode, if one has been found + * Returns: 0 if no error + * The inode, if one has been found, in inode. */ -static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, - u64 skip) +static int try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, + u64 skip, struct inode **inode) { - struct inode *inode; u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; unsigned int n; + int error = 0; + *inode = NULL; for(;;) { if (goal >= rgd->rd_data) break; @@ -979,14 +981,14 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, if (no_addr == skip) continue; *last_unlinked = no_addr; - inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, - no_addr, -1, 1); - if (!IS_ERR(inode)) - return inode; + error = gfs2_unlinked_inode_lookup(rgd->rd_sbd->sd_vfs, + no_addr, inode); + if (*inode || error) + return error; } rgd->rd_flags &= ~GFS2_RDF_CHECK; - return NULL; + return 0; } /** @@ -1096,12 +1098,27 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + /* If the rg came in already locked, there's no + way we can recover from a failed try_rgrp_unlink + because that would require an iput which can only + happen after the rgrp is unlocked. */ + if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) + error = try_rgrp_unlink(rgd, last_unlinked, + ip->i_no_addr, &inode); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) + if (inode) { + if (error) { + if (inode->i_state & I_NEW) + iget_failed(inode); + else + iput(inode); + return ERR_PTR(error); + } return inode; + } + if (error) + return ERR_PTR(error); /* fall through */ case GLR_TRYFAILED: rgd = recent_rgrp_next(rgd); @@ -1130,12 +1147,23 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) + error = try_rgrp_unlink(rgd, last_unlinked, + ip->i_no_addr, &inode); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) + if (inode) { + if (error) { + if (inode->i_state & I_NEW) + iget_failed(inode); + else + iput(inode); + return ERR_PTR(error); + } return inode; + } + if (error) + return ERR_PTR(error); break; case GLR_TRYFAILED: From 5e687eac1bd31baed110d239ef827d3ba666f311 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 4 May 2010 14:29:16 -0500 Subject: [PATCH 05/11] GFS2: Various gfs2_logd improvements This patch contains various tweaks to how log flushes and active item writeback work. gfs2_logd is now managed by a waitqueue, and gfs2_log_reseve now waits for gfs2_logd to do the log flushing. Multiple functions were rewritten to remove the need to call gfs2_log_lock(). Instead of using one test to see if gfs2_logd had work to do, there are now seperate tests to check if there are two many buffers in the incore log or if there are two many items on the active items list. This patch is a port of a patch Steve Whitehouse wrote about a year ago, with some minor changes. Since gfs2_ail1_start always submits all the active items, it no longer needs to keep track of the first ai submitted, so this has been removed. In gfs2_log_reserve(), the order of the calls to prepare_to_wait_exclusive() and wake_up() when firing off the logd thread has been switched. If it called wake_up first there was a small window for a race, where logd could run and return before gfs2_log_reserve was ready to get woken up. If gfs2_logd ran, but did not free up enough blocks, gfs2_log_reserve() would be left waiting for gfs2_logd to eventualy run because it timed out. Finally, gt_logd_secs, which controls how long to wait before gfs2_logd times out, and flushes the log, can now be set on mount with ar_commit. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 10 +-- fs/gfs2/log.c | 159 ++++++++++++++++++++++++------------------- fs/gfs2/log.h | 1 - fs/gfs2/lops.c | 2 + fs/gfs2/meta_io.c | 1 + fs/gfs2/ops_fstype.c | 17 ++--- fs/gfs2/super.c | 8 +-- fs/gfs2/sys.c | 4 -- fs/gfs2/trans.c | 18 +++++ 9 files changed, 127 insertions(+), 93 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3aac46f6853e..08dd65745473 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -439,9 +439,6 @@ struct gfs2_args { struct gfs2_tune { spinlock_t gt_spin; - unsigned int gt_incore_log_blocks; - unsigned int gt_log_flush_secs; - unsigned int gt_logd_secs; unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ @@ -618,6 +615,7 @@ struct gfs2_sbd { unsigned int sd_log_commited_databuf; int sd_log_commited_revoke; + atomic_t sd_log_pinned; unsigned int sd_log_num_buf; unsigned int sd_log_num_revoke; unsigned int sd_log_num_rg; @@ -629,15 +627,17 @@ struct gfs2_sbd { struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; + atomic_t sd_log_thresh1; + atomic_t sd_log_thresh2; atomic_t sd_log_blks_free; - struct mutex sd_log_reserve_mutex; + wait_queue_head_t sd_log_waitq; + wait_queue_head_t sd_logd_waitq; u64 sd_log_sequence; unsigned int sd_log_head; unsigned int sd_log_tail; int sd_log_idle; - unsigned long sd_log_flush_time; struct rw_semaphore sd_log_flush_lock; atomic_t sd_log_in_flight; wait_queue_head_t sd_log_flush_wait; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e5bf4b59d46e..d5959df6deb2 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl return list_empty(&ai->ai_ail1_list); } -static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) +static void gfs2_ail1_start(struct gfs2_sbd *sdp) { struct list_head *head; u64 sync_gen; - struct list_head *first; - struct gfs2_ail *first_ai, *ai, *tmp; + struct gfs2_ail *ai; int done = 0; gfs2_log_lock(sdp); @@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) } sync_gen = sdp->sd_ail_sync_gen++; - first = head->prev; - first_ai = list_entry(first, struct gfs2_ail, ai_list); - first_ai->ai_sync_gen = sync_gen; - gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */ - - if (flags & DIO_ALL) - first = NULL; - while(!done) { - if (first && (head->prev != first || - gfs2_ail1_empty_one(sdp, first_ai, 0))) - break; - done = 1; - list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { + list_for_each_entry_reverse(ai, head, ai_list) { if (ai->ai_sync_gen >= sync_gen) continue; ai->ai_sync_gen = sync_gen; @@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) * flush time, so we ensure that we have just enough free blocks at all * times to avoid running out during a log flush. * + * We no longer flush the log here, instead we wake up logd to do that + * for us. To avoid the thundering herd and to ensure that we deal fairly + * with queued waiters, we use an exclusive wait. This means that when we + * get woken with enough journal space to get our reservation, we need to + * wake the next waiter on the list. + * * Returns: errno */ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) { - unsigned int try = 0; unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned wanted = blks + reserved_blks; + DEFINE_WAIT(wait); + int did_wait = 0; + unsigned int free_blocks; if (gfs2_assert_warn(sdp, blks) || gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) return -EINVAL; - - mutex_lock(&sdp->sd_log_reserve_mutex); - gfs2_log_lock(sdp); - while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { - gfs2_log_unlock(sdp); - gfs2_ail1_empty(sdp, 0); - gfs2_log_flush(sdp, NULL); - - if (try++) - gfs2_ail1_start(sdp, 0); - gfs2_log_lock(sdp); +retry: + free_blocks = atomic_read(&sdp->sd_log_blks_free); + if (unlikely(free_blocks <= wanted)) { + do { + prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sdp->sd_logd_waitq); + did_wait = 1; + if (atomic_read(&sdp->sd_log_blks_free) <= wanted) + io_schedule(); + free_blocks = atomic_read(&sdp->sd_log_blks_free); + } while(free_blocks <= wanted); + finish_wait(&sdp->sd_log_waitq, &wait); } - atomic_sub(blks, &sdp->sd_log_blks_free); + if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, + free_blocks - blks) != free_blocks) + goto retry; trace_gfs2_log_blocks(sdp, -blks); - gfs2_log_unlock(sdp); - mutex_unlock(&sdp->sd_log_reserve_mutex); + + /* + * If we waited, then so might others, wake them up _after_ we get + * our share of the log. + */ + if (unlikely(did_wait)) + wake_up(&sdp->sd_log_waitq); down_read(&sdp->sd_log_flush_lock); return 0; } -/** - * gfs2_log_release - Release a given number of log blocks - * @sdp: The GFS2 superblock - * @blks: The number of blocks - * - */ - -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) -{ - - gfs2_log_lock(sdp); - atomic_add(blks, &sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, blks); - gfs2_assert_withdraw(sdp, - atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - gfs2_log_unlock(sdp); - up_read(&sdp->sd_log_flush_lock); -} - static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) { struct gfs2_journal_extent *je; @@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) ail2_empty(sdp, new_tail); - gfs2_log_lock(sdp); atomic_add(dist, &sdp->sd_log_blks_free); trace_gfs2_log_blocks(sdp, dist); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - gfs2_log_unlock(sdp); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); sdp->sd_log_tail = new_tail; } @@ -822,6 +807,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) * @sdp: the filesystem * @tr: the transaction * + * We wake up gfs2_logd if the number of pinned blocks exceed thresh1 + * or the total number of used blocks (pinned blocks plus AIL blocks) + * is greater than thresh2. + * + * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of + * journal size. + * * Returns: errno */ @@ -832,10 +824,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) up_read(&sdp->sd_log_flush_lock); - gfs2_log_lock(sdp); - if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) - wake_up_process(sdp->sd_logd_process); - gfs2_log_unlock(sdp); + if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || + ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > + atomic_read(&sdp->sd_log_thresh2))) + wake_up(&sdp->sd_logd_waitq); } /** @@ -882,13 +874,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) { gfs2_log_flush(sdp, NULL); for (;;) { - gfs2_ail1_start(sdp, DIO_ALL); + gfs2_ail1_start(sdp); if (gfs2_ail1_empty(sdp, DIO_ALL)) break; msleep(10); } } +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) +{ + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); +} + +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + return used_blocks >= atomic_read(&sdp->sd_log_thresh2); +} /** * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks @@ -901,28 +903,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) int gfs2_logd(void *data) { struct gfs2_sbd *sdp = data; - unsigned long t; - int need_flush; + unsigned long t = 1; + DEFINE_WAIT(wait); + unsigned preflush; while (!kthread_should_stop()) { - /* Advance the log tail */ - t = sdp->sd_log_flush_time + - gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; - - gfs2_ail1_empty(sdp, DIO_ALL); - gfs2_log_lock(sdp); - need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); - gfs2_log_unlock(sdp); - if (need_flush || time_after_eq(jiffies, t)) { + preflush = atomic_read(&sdp->sd_log_pinned); + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { + gfs2_ail1_empty(sdp, DIO_ALL); gfs2_log_flush(sdp, NULL); - sdp->sd_log_flush_time = jiffies; + gfs2_ail1_empty(sdp, DIO_ALL); } + if (gfs2_ail_flush_reqd(sdp)) { + gfs2_ail1_start(sdp); + io_schedule(); + gfs2_ail1_empty(sdp, 0); + gfs2_log_flush(sdp, NULL); + gfs2_ail1_empty(sdp, DIO_ALL); + } + + wake_up(&sdp->sd_log_waitq); t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; if (freezing(current)) refrigerator(); - schedule_timeout_interruptible(t); + + do { + prepare_to_wait(&sdp->sd_logd_waitq, &wait, + TASK_UNINTERRUPTIBLE); + if (!gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()) + t = schedule_timeout(t); + } while(t && !gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()); + finish_wait(&sdp->sd_logd_waitq, &wait); } return 0; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 7c64510ccfd2..eb570b4ad443 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -51,7 +51,6 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); void gfs2_log_incr_head(struct gfs2_sbd *sdp); struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index adc260fbea90..bf33f822058d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) if (bd->bd_ail) list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); get_bh(bh); + atomic_inc(&sdp->sd_log_pinned); trace_gfs2_pin(bd, 1); } @@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, trace_gfs2_pin(bd, 0); gfs2_log_unlock(sdp); unlock_buffer(bh); + atomic_dec(&sdp->sd_log_pinned); } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0bb12c80937a..abafda1f637b 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -313,6 +313,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int struct gfs2_bufdata *bd = bh->b_private; if (test_clear_buffer_pinned(bh)) { + atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_le.le_list); if (meta) { gfs2_assert_warn(sdp, sdp->sd_log_num_buf); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index dc35f3470271..3593b3a7290e 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) { spin_lock_init(>->gt_spin); - gt->gt_incore_log_blocks = 1024; - gt->gt_logd_secs = 1; gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; gt->gt_quota_scale_num = 1; @@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_log_lock); - + atomic_set(&sdp->sd_log_pinned, 0); INIT_LIST_HEAD(&sdp->sd_log_le_buf); INIT_LIST_HEAD(&sdp->sd_log_le_revoke); INIT_LIST_HEAD(&sdp->sd_log_le_rg); INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); - mutex_init(&sdp->sd_log_reserve_mutex); + init_waitqueue_head(&sdp->sd_log_waitq); + init_waitqueue_head(&sdp->sd_logd_waitq); INIT_LIST_HEAD(&sdp->sd_ail1_list); INIT_LIST_HEAD(&sdp->sd_ail2_list); @@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_args.ar_spectator) { sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); } else { if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { fs_err(sdp, "can't mount journal #%u\n", @@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail_jinode_gh; } atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); /* Map the extents for this journal's blocks */ map_journal_extents(sdp); @@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_quotad; - sdp->sd_log_flush_time = jiffies; - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); error = IS_ERR(p); if (error) { @@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent GFS2_BASIC_BLOCK_SHIFT; sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; - sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; + sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; if (sdp->sd_args.ar_statfs_quantum) { sdp->sd_tune.gt_statfs_slow = 0; @@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; args.ar_data = GFS2_DATA_DEFAULT; - args.ar_commit = 60; + args.ar_commit = 30; args.ar_statfs_quantum = 30; args.ar_quota_quantum = 60; args.ar_errors = GFS2_ERRORS_DEFAULT; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 50aac606b990..7a93e9ff7d3c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) int error; spin_lock(>->gt_spin); - args.ar_commit = gt->gt_log_flush_secs; + args.ar_commit = gt->gt_logd_secs; args.ar_quota_quantum = gt->gt_quota_quantum; if (gt->gt_statfs_slow) args.ar_statfs_quantum = 0; @@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) else clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); spin_lock(>->gt_spin); - gt->gt_log_flush_secs = args.ar_commit; + gt->gt_logd_secs = args.ar_commit; gt->gt_quota_quantum = args.ar_quota_quantum; if (args.ar_statfs_quantum) { gt->gt_statfs_slow = 0; @@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (args->ar_discard) seq_printf(s, ",discard"); - val = sdp->sd_tune.gt_log_flush_secs; - if (val != 60) + val = sdp->sd_tune.gt_logd_secs; + if (val != 30) seq_printf(s, ",commit=%d", val); val = sdp->sd_tune.gt_statfs_quantum; if (val != 30) diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 419042f7f0b6..2ac845d9c46c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -469,8 +469,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ } \ TUNE_ATTR_2(name, name##_store) -TUNE_ATTR(incore_log_blocks, 0); -TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); TUNE_ATTR(quota_quantum, 0); TUNE_ATTR(max_readahead, 0); @@ -482,8 +480,6 @@ TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); static struct attribute *tune_attrs[] = { - &tune_attr_incore_log_blocks.attr, - &tune_attr_log_flush_secs.attr, &tune_attr_quota_warn_period.attr, &tune_attr_quota_quantum.attr, &tune_attr_max_readahead.attr, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 4ef0e9fa3549..9ec73a854111 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -23,6 +23,7 @@ #include "meta_io.h" #include "trans.h" #include "util.h" +#include "trace_gfs2.h" int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes) @@ -75,6 +76,23 @@ fail_holder_uninit: return error; } +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + up_read(&sdp->sd_log_flush_lock); +} + void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; From ad6bb90f3401556469489f237cb08626d88703d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 May 2010 00:10:56 +0200 Subject: [PATCH 06/11] GFS2: fix quota state reporting We need to report both the accounting and enforcing flags if we are in enforcing mode. Signed-off-by: Christoph Hellwig Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6dbcbad6ab17..6ca0967ce6e7 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1418,10 +1418,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb, memset(fqs, 0, sizeof(struct fs_quota_stat)); fqs->qs_version = FS_QSTAT_VERSION; - if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) - fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); - else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) - fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + + switch (sdp->sd_args.ar_quota) { + case GFS2_QUOTA_ON: + fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); + /*FALLTHRU*/ + case GFS2_QUOTA_ACCOUNT: + fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + break; + case GFS2_QUOTA_OFF: + break; + } + if (sdp->sd_quota_inode) { fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; From 913a71d250803130eac523e7a2b6439e31a0bc83 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 6 May 2010 11:03:29 +0100 Subject: [PATCH 07/11] GFS2: Add some useful messages The following patch adds a message to indicate when barriers have been disabled due to a block device which doesn't support them. You could already tell this via the mount options in /proc/mounts, but all the other filesystems also log a message at the same time. Also, the same mechanisms are used to indicate when the lock demote interface has been used (only ever used for debugging) which is a request from our support team. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/log.c | 1 + fs/gfs2/super.c | 3 ++- fs/gfs2/sys.c | 2 ++ 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 08dd65745473..b5d7363b22da 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -459,6 +459,7 @@ enum { SDF_SHUTDOWN = 2, SDF_NOBARRIERS = 3, SDF_NORECOVERY = 4, + SDF_DEMOTE = 5, }; #define GFS2_FSNAME_LEN 256 diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index d5959df6deb2..b593f0e28f25 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -600,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); set_buffer_uptodate(bh); + fs_info(sdp, "barrier sync failed - disabling barriers\n"); set_bit(SDF_NOBARRIERS, &sdp->sd_flags); lock_buffer(bh); skip_barrier: diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 7a93e9ff7d3c..4d1aad38f1b1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) seq_printf(s, ",nobarrier"); - + if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) + seq_printf(s, ",demote_interface_used"); return 0; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 2ac845d9c46c..7afb62ec97cf 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -233,6 +233,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len glops = gfs2_glops_list[gltype]; if (glops == NULL) return -EINVAL; + if (test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) + fs_info(sdp, "demote interface used\n"); rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); if (rv) return rv; From 7e619bc3e6252dc746f64ac3b486e784822e9533 Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Fri, 7 May 2010 17:50:18 -0400 Subject: [PATCH 08/11] GFS2: Fix writing to non-page aligned gfs2_quota structures This is the upstream fix for this bug. This patch differs from the RHEL5 fix (Red Hat bz #555754) which simply writes to the 8-byte value field of the quota. In upstream quota code, we're required to write the entire quota (88 bytes) which can be split across a page boundary. We check for such quotas, and read/write the two parts from/to the corresponding pages holding these parts. With this patch, I don't see the bug anymore using the reproducer in Red Hat bz 555754. I successfully ran a couple of simple tests/mounts/ umounts and it doesn't seem like this patch breaks anything else. Signed-off-by: Abhi Das Signed-off-by: Steven Whitehouse --- fs/gfs2/quota.c | 86 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 25 deletions(-) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6ca0967ce6e7..d5f4661287f9 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, unsigned blocksize, iblock, pos; struct buffer_head *bh, *dibh; struct page *page; - void *kaddr; - struct gfs2_quota *qp; - s64 value; - int err = -EIO; + void *kaddr, *ptr; + struct gfs2_quota q, *qp; + int err, nbytes; u64 size; if (gfs2_is_stuffed(ip)) gfs2_unstuff_dinode(ip, NULL); - + + memset(&q, 0, sizeof(struct gfs2_quota)); + err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); + if (err < 0) + return err; + + err = -EIO; + qp = &q; + qp->qu_value = be64_to_cpu(qp->qu_value); + qp->qu_value += change; + qp->qu_value = cpu_to_be64(qp->qu_value); + qd->qd_qb.qb_value = qp->qu_value; + if (fdq) { + if (fdq->d_fieldmask & FS_DQ_BSOFT) { + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); + qd->qd_qb.qb_warn = qp->qu_warn; + } + if (fdq->d_fieldmask & FS_DQ_BHARD) { + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); + qd->qd_qb.qb_limit = qp->qu_limit; + } + } + + /* Write the quota into the quota file on disk */ + ptr = qp; + nbytes = sizeof(struct gfs2_quota); +get_a_page: page = grab_cache_page(mapping, index); if (!page) return -ENOMEM; @@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, if (!buffer_mapped(bh)) { gfs2_block_map(inode, iblock, bh, 1); if (!buffer_mapped(bh)) - goto unlock; + goto unlock_out; + /* If it's a newly allocated disk block for quota, zero it */ + if (buffer_new(bh)) { + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + } } if (PageUptodate(page)) @@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, ll_rw_block(READ_META, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) - goto unlock; + goto unlock_out; } gfs2_trans_add_bh(ip->i_gl, bh, 0); kaddr = kmap_atomic(page, KM_USER0); - qp = kaddr + offset; - value = (s64)be64_to_cpu(qp->qu_value) + change; - qp->qu_value = cpu_to_be64(value); - qd->qd_qb.qb_value = qp->qu_value; - if (fdq) { - if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); - qd->qd_qb.qb_warn = qp->qu_warn; - } - if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); - qd->qd_qb.qb_limit = qp->qu_limit; - } - } + if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) + nbytes = PAGE_CACHE_SIZE - offset; + memcpy(kaddr + offset, ptr, nbytes); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + page_cache_release(page); + /* If quota straddles page boundary, we need to update the rest of the + * quota at the beginning of the next page */ + if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */ + ptr = ptr + nbytes; + nbytes = sizeof(struct gfs2_quota) - nbytes; + offset = 0; + index++; + goto get_a_page; + } + + /* Update the disk inode timestamp and size (if extended) */ err = gfs2_meta_inode_buffer(ip, &dibh); if (err) - goto unlock; + goto out; size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) { @@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, brelse(dibh); mark_inode_dirty(inode); -unlock: +out: + return err; +unlock_out: unlock_page(page); page_cache_release(page); return err; @@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) * rgrp since it won't be allocated during the transaction */ al->al_requested = 1; - /* +1 in the end for block requested above for unstuffing */ - blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; + /* +3 in the end for unstuffing block, inode size update block + * and another block in case quota straddles page boundary and + * two blocks need to be updated instead of 1 */ + blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; if (nalloc) al->al_requested += nalloc * (data_blocks + ind_blocks); From eaefbf968a83a160324225fb2ac9c49e56c86515 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 11 May 2010 17:35:34 -0400 Subject: [PATCH 09/11] GFS2: Eliminate useless err variable This patch removes an unneeded "err" variable that is always returned as zero. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/meta_io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index abafda1f637b..18176d0b75d7 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -34,7 +34,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { - int err; struct buffer_head *bh, *head; int nr_underway = 0; int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? @@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb } while (bh != head); unlock_page(page); - err = 0; if (nr_underway == 0) end_page_writeback(page); - return err; + return 0; } const struct address_space_operations gfs2_meta_aops = { From cc0581bd6132984641e47809552fc9d5dfcadbcf Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 11 May 2010 17:58:11 -0400 Subject: [PATCH 10/11] GFS2: stuck in inode wait, no glocks stuck This patch changes the lock ordering when gfs2 reclaims unlinked dinodes, thereby avoiding a livelock. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 78 +++++++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 48 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 37391550284f..8bce73ed4d8e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -952,16 +952,14 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * The inode, if one has been found, in inode. */ -static int try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, - u64 skip, struct inode **inode) +static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, + u64 skip) { u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; unsigned int n; - int error = 0; - *inode = NULL; for(;;) { if (goal >= rgd->rd_data) break; @@ -981,10 +979,7 @@ static int try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, if (no_addr == skip) continue; *last_unlinked = no_addr; - error = gfs2_unlinked_inode_lookup(rgd->rd_sbd->sd_vfs, - no_addr, inode); - if (*inode || error) - return error; + return no_addr; } rgd->rd_flags &= ~GFS2_RDF_CHECK; @@ -1069,11 +1064,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) * Try to acquire rgrp in way which avoids contending with others. * * Returns: errno + * unlinked: the block address of an unlinked block to be reclaimed */ -static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) +static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, + u64 *last_unlinked) { - struct inode *inode = NULL; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; struct gfs2_alloc *al = ip->i_alloc; @@ -1082,6 +1078,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) int loops = 0; int error, rg_locked; + *unlinked = 0; rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); while (rgd) { @@ -1103,29 +1100,19 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) because that would require an iput which can only happen after the rgrp is unlocked. */ if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - error = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr, &inode); + *unlinked = try_rgrp_unlink(rgd, last_unlinked, + ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) { - if (error) { - if (inode->i_state & I_NEW) - iget_failed(inode); - else - iput(inode); - return ERR_PTR(error); - } - return inode; - } - if (error) - return ERR_PTR(error); + if (*unlinked) + return -EAGAIN; /* fall through */ case GLR_TRYFAILED: rgd = recent_rgrp_next(rgd); break; default: - return ERR_PTR(error); + return error; } } @@ -1148,22 +1135,12 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (try_rgrp_fit(rgd, al)) goto out; if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - error = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr, &inode); + *unlinked = try_rgrp_unlink(rgd, last_unlinked, + ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) { - if (error) { - if (inode->i_state & I_NEW) - iget_failed(inode); - else - iput(inode); - return ERR_PTR(error); - } - return inode; - } - if (error) - return ERR_PTR(error); + if (*unlinked) + return -EAGAIN; break; case GLR_TRYFAILED: @@ -1171,7 +1148,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) break; default: - return ERR_PTR(error); + return error; } rgd = gfs2_rgrpd_get_next(rgd); @@ -1180,7 +1157,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (rgd == begin) { if (++loops >= 3) - return ERR_PTR(-ENOSPC); + return -ENOSPC; if (!skipped) loops++; flags = 0; @@ -1200,7 +1177,7 @@ out: forward_rgrp_set(sdp, rgd); } - return NULL; + return 0; } /** @@ -1216,7 +1193,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) struct gfs2_alloc *al = ip->i_alloc; struct inode *inode; int error = 0; - u64 last_unlinked = NO_BLOCK; + u64 last_unlinked = NO_BLOCK, unlinked; if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; @@ -1232,14 +1209,19 @@ try_again: if (error) return error; - inode = get_local_rgrp(ip, &last_unlinked); - if (inode) { + error = get_local_rgrp(ip, &unlinked, &last_unlinked); + if (error) { if (ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); - if (IS_ERR(inode)) - return PTR_ERR(inode); - iput(inode); + if (error != -EAGAIN) + return error; + error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb, + unlinked, &inode); + if (inode) + iput(inode); gfs2_log_flush(sdp, NULL); + if (error == GLR_TRYFAILED) + error = 0; goto try_again; } From 6a99be5d7b5973767b1ffa4fa68fed0738589c99 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 14 May 2010 14:05:51 +0100 Subject: [PATCH 11/11] GFS2: Fix typo A missing ! in a test. Signed-off-by: Steven Whitehouse --- fs/gfs2/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7afb62ec97cf..68d2795f29a8 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -233,7 +233,7 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len glops = gfs2_glops_list[gltype]; if (glops == NULL) return -EINVAL; - if (test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) + if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) fs_info(sdp, "demote interface used\n"); rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); if (rv)