gfs2 4.19 merge

Changes on top of v4.18-rc1 / iomap-4.19-merge-1: 1. Iomap support for buffered writes and for direct I/O. 2. Two patches that reduce the size of struct gfs2_inode. 3. Lots of fixes and cleanups. -----BEGIN PGP SIGNATURE----- iQIcBAABAgAGBQJbcakSAAoJENW/n+sDE2U6AR0QAJlai+92ERML2pM+1hiuEHWP KizBoV/53pc+dll1dlcEOHQFys2vbcFavcCtcsTXNhLSp1wOqxyzcFQTX6ekWfSZ hTvJvAKTbeXu0zOWSV2DcX40JWb7SKDAxjxNb8XhL0COilgM9r+mdqoY/UNyVSel SVmWvs8UYt6UBnw4G8h5UlzSYxl/M64udU1pVO5D8JMQ5cxDKj3kfFoJLLKBDwLF vaNFxiihdTzmMwMNo3Px7GFSsb5Jnyo9LgAoDKsYd9YlzqGpAvvoYXH8itj4TuSb sM1KTUZK+97XvquZfuv5BniEifP7XZSq4xYIxyr9HMaOefeys0GdzaCSCb3ifFte 7bqjowlAbHWwBNa9ofuJ1NShsAiOv0GUGDzlY+T/0IgSlqRr0JxAikJ3jLIZQ1Hf CwWY66XakeSi5euDTi41SuGZMcxTXaX15VbXl6/SGsv4X0dyVXleBz6RuC9Q+n2H 7nqlGppRW2NB1WUqkJ15n9JaNLAF5I6umERTBXKGODM56p/GmZYoScCEGrqj9obN CntPtL6yrazASjV3+zqXA//OvTb3xfykYu17wVLKhXWD0YWQiuDfA481BLvEut2G aTtNU3b4VDwv5NuBf7G3wvN0+v3WyJ3gRfhTaEdFnX5PpcH3eHz5/fzU7zCJJwDu g53icn3efqNu7WvpAkB+ =oGs4 -----END PGP SIGNATURE----- Merge tag 'gfs2-4.19.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2 Pull gfs2 updates from Andreas Gruenbacher: - iomap support for buffered writes and for direct I/O - two patches that reduce the size of struct gfs2_inode - lots of fixes and cleanups * tag 'gfs2-4.19.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2: (25 commits) gfs2: eliminate update_rgrp_lvb_unlinked gfs2: Fix gfs2_testbit to use clone bitmaps gfs2: Get rid of gfs2_ea_strlen gfs2: cleanup: call gfs2_rgrp_ondisk2lvb from gfs2_rgrp_out gfs2: Special-case rindex for gfs2_grow GFS2: rgrp free blocks used incorrectly gfs2: remove redundant variable 'moved' gfs2: use iomap_readpage for blocksize == PAGE_SIZE gfs2: Use iomap for stuffed direct I/O reads gfs2: fallocate_chunk: Always initialize struct iomap GFS2: Fix recovery issues for spectators fs: gfs2: Adding new return type vm_fault_t gfs2: using posix_acl_xattr_size instead of posix_acl_to_xattr gfs2: Don't reject a 
supposedly full bitmap if we have blocks reserved gfs2: Eliminate redundant ip->i_rgd gfs2: Stop messing with ip->i_rgd in the rlist code gfs2: Remove gfs2_write_{begin,end} gfs2: iomap direct I/O support gfs2: gfs2_extent_length cleanup gfs2: iomap buffered write support ...
2018-08-15 22:40:03 -07:00 · 2018-08-15 22:40:03 -07:00 · 2b2f2aedba
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@ -82,14 +82,12 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
 int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	int error;
-	int len;
+	size_t len;
 	char *data;
 	const char *name = gfs2_acl_name(type);

 	if (acl) {
-		len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
-		if (len == 0)
-			return 0;
+		len = posix_acl_xattr_size(acl->a_count);
 		data = kmalloc(len, GFP_NOFS);
 		if (data == NULL)
 			return -ENOMEM;
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@ -22,6 +22,7 @@
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 #include <trace/events/writeback.h>
+#include <linux/sched/signal.h>

 #include "gfs2.h"
 #include "incore.h"
@ -36,10 +37,11 @@
 #include "super.h"
 #include "util.h"
 #include "glops.h"
+#include "aops.h"


-static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
-				   unsigned int from, unsigned int len)
+void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+			    unsigned int from, unsigned int len)
 {
 	struct buffer_head *head = page_buffers(page);
 	unsigned int bsize = head->b_size;
@ -82,12 +84,6 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
 	return 0;
 }

-static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
-				 struct buffer_head *bh_result, int create)
-{
-	return gfs2_block_map(inode, lblock, bh_result, 0);
-}
-
 /**
 * gfs2_writepage_common - Common bits of writepage
 * @page: The page to be written
@ -462,7 +458,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
 * Returns: errno
 */

-static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 {
 	struct buffer_head *dibh;
 	u64 dsize = i_size_read(&ip->i_inode);
@ -512,9 +508,13 @@ static int __gfs2_readpage(void *file, struct page *page)
 {
 	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
 	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+
 	int error;

-	if (gfs2_is_stuffed(ip)) {
+	if (i_blocksize(page->mapping->host) == PAGE_SIZE &&
+	    !page_has_buffers(page)) {
+		error = iomap_readpage(page, &gfs2_iomap_ops);
+	} else if (gfs2_is_stuffed(ip)) {
 		error = stuffed_readpage(ip, page);
 		unlock_page(page);
 	} else {
@ -643,140 +643,11 @@ out_uninit:
 	return ret;
 }

-/**
- * gfs2_write_begin - Begin to write to a file
- * @file: The file to write to
- * @mapping: The mapping in which to write
- * @pos: The file offset at which to start writing
- * @len: Length of the write
- * @flags: Various flags
- * @pagep: Pointer to return the page
- * @fsdata: Pointer to return fs data (unused by GFS2)
- *
- * Returns: errno
- */
-
-static int gfs2_write_begin(struct file *file, struct address_space *mapping,
-			    loff_t pos, unsigned len, unsigned flags,
-			    struct page **pagep, void **fsdata)
-{
-	struct gfs2_inode *ip = GFS2_I(mapping->host);
-	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
-	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
-	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
-	unsigned requested = 0;
-	int alloc_required;
-	int error = 0;
-	pgoff_t index = pos >> PAGE_SHIFT;
-	unsigned from = pos & (PAGE_SIZE - 1);
-	struct page *page;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
-	error = gfs2_glock_nq(&ip->i_gh);
-	if (unlikely(error))
-		goto out_uninit;
-	if (&ip->i_inode == sdp->sd_rindex) {
-		error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
-					   GL_NOCACHE, &m_ip->i_gh);
-		if (unlikely(error)) {
-			gfs2_glock_dq(&ip->i_gh);
-			goto out_uninit;
-		}
-	}
-
-	alloc_required = gfs2_write_alloc_required(ip, pos, len);
-
-	if (alloc_required || gfs2_is_jdata(ip))
-		gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
-
-	if (alloc_required) {
-		struct gfs2_alloc_parms ap = { .aflags = 0, };
-		requested = data_blocks + ind_blocks;
-		ap.target = requested;
-		error = gfs2_quota_lock_check(ip, &ap);
-		if (error)
-			goto out_unlock;
-
-		error = gfs2_inplace_reserve(ip, &ap);
-		if (error)
-			goto out_qunlock;
-	}
-
-	rblocks = RES_DINODE + ind_blocks;
-	if (gfs2_is_jdata(ip))
-		rblocks += data_blocks ? data_blocks : 1;
-	if (ind_blocks || data_blocks)
-		rblocks += RES_STATFS + RES_QUOTA;
-	if (&ip->i_inode == sdp->sd_rindex)
-		rblocks += 2 * RES_STATFS;
-	if (alloc_required)
-		rblocks += gfs2_rg_blocks(ip, requested);
-
-	error = gfs2_trans_begin(sdp, rblocks,
-				 PAGE_SIZE/sdp->sd_sb.sb_bsize);
-	if (error)
-		goto out_trans_fail;
-
-	error = -ENOMEM;
-	flags |= AOP_FLAG_NOFS;
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	*pagep = page;
-	if (unlikely(!page))
-		goto out_endtrans;
-
-	if (gfs2_is_stuffed(ip)) {
-		error = 0;
-		if (pos + len > gfs2_max_stuffed_size(ip)) {
-			error = gfs2_unstuff_dinode(ip, page);
-			if (error == 0)
-				goto prepare_write;
-		} else if (!PageUptodate(page)) {
-			error = stuffed_readpage(ip, page);
-		}
-		goto out;
-	}
-
-prepare_write:
-	error = __block_write_begin(page, from, len, gfs2_block_map);
-out:
-	if (error == 0)
-		return 0;
-
-	unlock_page(page);
-	put_page(page);
-
-	gfs2_trans_end(sdp);
-	if (alloc_required) {
-		gfs2_inplace_release(ip);
-		if (pos + len > ip->i_inode.i_size)
-			gfs2_trim_blocks(&ip->i_inode);
-	}
-	goto out_qunlock;
-
-out_endtrans:
-	gfs2_trans_end(sdp);
-out_trans_fail:
-	if (alloc_required)
-		gfs2_inplace_release(ip);
-out_qunlock:
-	if (alloc_required)
-		gfs2_quota_unlock(ip);
-out_unlock:
-	if (&ip->i_inode == sdp->sd_rindex) {
-		gfs2_glock_dq(&m_ip->i_gh);
-		gfs2_holder_uninit(&m_ip->i_gh);
-	}
-	gfs2_glock_dq(&ip->i_gh);
-out_uninit:
-	gfs2_holder_uninit(&ip->i_gh);
-	return error;
-}
-
 /**
 * adjust_fs_space - Adjusts the free space available due to gfs2_grow
 * @inode: the rindex inode
 */
-static void adjust_fs_space(struct inode *inode)
+void adjust_fs_space(struct inode *inode)
 {
 	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@ -822,11 +693,11 @@ out:
 * This copies the data from the page into the inode block after
 * the inode data structure itself.
 *
- * Returns: errno
+ * Returns: copied bytes or errno
 */
-static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
-				  loff_t pos, unsigned copied,
-				  struct page *page)
+int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
+			   loff_t pos, unsigned copied,
+			   struct page *page)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	u64 to = pos + copied;
@ -852,84 +723,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 	return copied;
 }

-/**
- * gfs2_write_end
- * @file: The file to write to
- * @mapping: The address space to write to
- * @pos: The file position
- * @len: The length of the data
- * @copied: How much was actually copied by the VFS
- * @page: The page that has been written
- * @fsdata: The fsdata (unused in GFS2)
- *
- * The main write_end function for GFS2. We just put our locking around the VFS
- * provided functions.
- *
- * Returns: errno
- */
-
-static int gfs2_write_end(struct file *file, struct address_space *mapping,
-			  loff_t pos, unsigned len, unsigned copied,
-			  struct page *page, void *fsdata)
-{
-	struct inode *inode = page->mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
-	struct buffer_head *dibh;
-	int ret;
-	struct gfs2_trans *tr = current->journal_info;
-	BUG_ON(!tr);
-
-	BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
-
-	ret = gfs2_meta_inode_buffer(ip, &dibh);
-	if (unlikely(ret))
-		goto out;
-
-	if (gfs2_is_stuffed(ip)) {
-		ret = gfs2_stuffed_write_end(inode, dibh, pos, copied, page);
-		page = NULL;
-		goto out2;
-	}
-
-	if (gfs2_is_jdata(ip))
-		gfs2_page_add_databufs(ip, page, pos & ~PAGE_MASK, len);
-	else
-		gfs2_ordered_add_inode(ip);
-
-	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-	page = NULL;
-	if (tr->tr_num_buf_new)
-		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	else
-		gfs2_trans_add_meta(ip->i_gl, dibh);
-
-out2:
-	if (inode == sdp->sd_rindex) {
-		adjust_fs_space(inode);
-		sdp->sd_rindex_uptodate = 0;
-	}
-
-	brelse(dibh);
-out:
-	if (page) {
-		unlock_page(page);
-		put_page(page);
-	}
-	gfs2_trans_end(sdp);
-	gfs2_inplace_release(ip);
-	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
-		gfs2_quota_unlock(ip);
-	if (inode == sdp->sd_rindex) {
-		gfs2_glock_dq(&m_ip->i_gh);
-		gfs2_holder_uninit(&m_ip->i_gh);
-	}
-	gfs2_glock_dq(&ip->i_gh);
-	gfs2_holder_uninit(&ip->i_gh);
-	return ret;
-}
-
 /**
 * jdata_set_page_dirty - Page dirtying function
 * @page: The page to dirty
@ -1022,96 +815,6 @@ out:
 		try_to_release_page(page, 0);
 }

-/**
- * gfs2_ok_for_dio - check that dio is valid on this file
- * @ip: The inode
- * @offset: The offset at which we are reading or writing
- *
- * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
- *          1 (to accept the i/o request)
- */
-static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset)
-{
-	/*
-	 * Should we return an error here? I can't see that O_DIRECT for
-	 * a stuffed file makes any sense. For now we'll silently fall
-	 * back to buffered I/O
-	 */
-	if (gfs2_is_stuffed(ip))
-		return 0;
-
-	if (offset >= i_size_read(&ip->i_inode))
-		return 0;
-	return 1;
-}
-
-
-
-static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	struct address_space *mapping = inode->i_mapping;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	loff_t offset = iocb->ki_pos;
-	struct gfs2_holder gh;
-	int rv;
-
-	/*
-	 * Deferred lock, even if its a write, since we do no allocation
-	 * on this path. All we need change is atime, and this lock mode
-	 * ensures that other nodes have flushed their buffered read caches
-	 * (i.e. their page cache entries for this inode). We do not,
-	 * unfortunately have the option of only flushing a range like
-	 * the VFS does.
-	 */
-	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
-	rv = gfs2_glock_nq(&gh);
-	if (rv)
-		goto out_uninit;
-	rv = gfs2_ok_for_dio(ip, offset);
-	if (rv != 1)
-		goto out; /* dio not valid, fall back to buffered i/o */
-
-	/*
-	 * Now since we are holding a deferred (CW) lock at this point, you
-	 * might be wondering why this is ever needed. There is a case however
-	 * where we've granted a deferred local lock against a cached exclusive
-	 * glock. That is ok provided all granted local locks are deferred, but
-	 * it also means that it is possible to encounter pages which are
-	 * cached and possibly also mapped. So here we check for that and sort
-	 * them out ahead of the dio. The glock state machine will take care of
-	 * everything else.
-	 *
-	 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
-	 * the first place, mapping->nr_pages will always be zero.
-	 */
-	if (mapping->nrpages) {
-		loff_t lstart = offset & ~(PAGE_SIZE - 1);
-		loff_t len = iov_iter_count(iter);
-		loff_t end = PAGE_ALIGN(offset + len) - 1;
-
-		rv = 0;
-		if (len == 0)
-			goto out;
-		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
-			unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
-		rv = filemap_write_and_wait_range(mapping, lstart, end);
-		if (rv)
-			goto out;
-		if (iov_iter_rw(iter) == WRITE)
-			truncate_inode_pages_range(mapping, lstart, end);
-	}
-
-	rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				  gfs2_get_block_direct, NULL, NULL, 0);
-out:
-	gfs2_glock_dq(&gh);
-out_uninit:
-	gfs2_holder_uninit(&gh);
-	return rv;
-}
-
 /**
 * gfs2_releasepage - free the metadata associated with a page
 * @page: the page that's being released
@ -1187,12 +890,10 @@ static const struct address_space_operations gfs2_writeback_aops = {
 	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
-	.write_begin = gfs2_write_begin,
-	.write_end = gfs2_write_end,
 	.bmap = gfs2_bmap,
 	.invalidatepage = gfs2_invalidatepage,
 	.releasepage = gfs2_releasepage,
-	.direct_IO = gfs2_direct_IO,
+	.direct_IO = noop_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
@ -1203,13 +904,11 @@ static const struct address_space_operations gfs2_ordered_aops = {
 	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
-	.write_begin = gfs2_write_begin,
-	.write_end = gfs2_write_end,
 	.set_page_dirty = __set_page_dirty_buffers,
 	.bmap = gfs2_bmap,
 	.invalidatepage = gfs2_invalidatepage,
 	.releasepage = gfs2_releasepage,
-	.direct_IO = gfs2_direct_IO,
+	.direct_IO = noop_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
@ -1220,8 +919,6 @@ static const struct address_space_operations gfs2_jdata_aops = {
 	.writepages = gfs2_jdata_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
-	.write_begin = gfs2_write_begin,
-	.write_end = gfs2_write_end,
 	.set_page_dirty = jdata_set_page_dirty,
 	.bmap = gfs2_bmap,
 	.invalidatepage = gfs2_invalidatepage,
--- a/fs/gfs2/aops.h
+++ b/fs/gfs2/aops.h
@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Red Hat, Inc.  All rights reserved.
+ */
+
+#ifndef __AOPS_DOT_H__
+#define __AOPS_DOT_H__
+
+#include "incore.h"
+
+extern int stuffed_readpage(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
+				  loff_t pos, unsigned copied,
+				  struct page *page);
+extern void adjust_fs_space(struct inode *inode);
+extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+				   unsigned int from, unsigned int len);
+
+#endif /* __AOPS_DOT_H__ */
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@ -28,6 +28,7 @@
 #include "trans.h"
 #include "dir.h"
 #include "util.h"
+#include "aops.h"
 #include "trace_gfs2.h"

 /* This doesn't need to be that large as max 64 bit pointers in a 4k
@ -41,6 +42,8 @@ struct metapath {
 	int mp_aheight; /* actual height (lookup height) */
 };

+static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
+
 /**
 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
 * @ip: the inode
@ -389,7 +392,7 @@ static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 	return mp->mp_aheight - x - 1;
 }

-static inline void release_metapath(struct metapath *mp)
+static void release_metapath(struct metapath *mp)
 {
 	int i;

@ -397,27 +400,23 @@ static inline void release_metapath(struct metapath *mp)
 		if (mp->mp_bh[i] == NULL)
 			break;
 		brelse(mp->mp_bh[i]);
+		mp->mp_bh[i] = NULL;
 	}
 }

 /**
 * gfs2_extent_length - Returns length of an extent of blocks
- * @start: Start of the buffer
- * @len: Length of the buffer in bytes
- * @ptr: Current position in the buffer
- * @limit: Max extent length to return (0 = unlimited)
+ * @bh: The metadata block
+ * @ptr: Current position in @bh
+ * @limit: Max extent length to return
 * @eob: Set to 1 if we hit "end of block"
 *
- * If the first block is zero (unallocated) it will return the number of
- * unallocated blocks in the extent, otherwise it will return the number
- * of contiguous blocks in the extent.
- *
 * Returns: The length of the extent (minimum of one block)
 */

-static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
+static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
 {
-	const __be64 *end = (start + len);
+	const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
 	const __be64 *first = ptr;
 	u64 d = be64_to_cpu(*ptr);

@ -426,14 +425,11 @@ static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __b
 		ptr++;
 		if (ptr >= end)
 			break;
-		if (limit && --limit == 0)
-			break;
-		if (d)
-			d++;
+		d++;
 	} while(be64_to_cpu(*ptr) == d);
 	if (ptr >= end)
 		*eob = 1;
-	return (ptr - first);
+	return ptr - first;
 }

 typedef const __be64 *(*gfs2_metadata_walker)(
@ -609,11 +605,13 @@ enum alloc_state {
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
- * The function is in two parts. The first part works out the total
- * number of blocks which we need. The second part does the actual
- * allocation asking for an extent at a time (if enough contiguous free
- * blocks are available, there will only be one request per bmap call)
- * and uses the state machine to initialise the blocks in order.
+ * This function is called after gfs2_iomap_get, which works out the
+ * total number of blocks which we need via gfs2_alloc_size.
+ *
+ * We then do the actual allocation asking for an extent at a time (if
+ * enough contiguous free blocks are available, there will only be one
+ * allocation request per call) and use the state machine to initialise
+ * the blocks in order.
 *
 * Right now, this function will allocate at most one indirect block
 * worth of data -- with a default block size of 4K, that's slightly
@ -633,39 +631,26 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 	struct buffer_head *dibh = mp->mp_bh[0];
 	u64 bn;
 	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
-	unsigned dblks = 0;
-	unsigned ptrs_per_blk;
+	size_t dblks = iomap->length >> inode->i_blkbits;
 	const unsigned end_of_metadata = mp->mp_fheight - 1;
 	int ret;
 	enum alloc_state state;
 	__be64 *ptr;
 	__be64 zero_bn = 0;
-	size_t maxlen = iomap->length >> inode->i_blkbits;

 	BUG_ON(mp->mp_aheight < 1);
 	BUG_ON(dibh == NULL);
+	BUG_ON(dblks < 1);

 	gfs2_trans_add_meta(ip->i_gl, dibh);

 	down_write(&ip->i_rw_mutex);

 	if (mp->mp_fheight == mp->mp_aheight) {
-		struct buffer_head *bh;
-		int eob;
-
-		/* Bottom indirect block exists, find unalloced extent size */
-		ptr = metapointer(end_of_metadata, mp);
-		bh = mp->mp_bh[end_of_metadata];
-		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
-					   maxlen, &eob);
-		BUG_ON(dblks < 1);
+		/* Bottom indirect block exists */
 		state = ALLOC_DATA;
 	} else {
 		/* Need to allocate indirect blocks */
-		ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
-			sdp->sd_diptrs;
-		dblks = min(maxlen, (size_t)(ptrs_per_blk -
-					     mp->mp_list[end_of_metadata]));
 		if (mp->mp_fheight == ip->i_height) {
 			/* Writing into existing tree, extend tree down */
 			iblks = mp->mp_fheight - mp->mp_aheight;
@ -750,6 +735,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 		}
 	} while (iomap->addr == IOMAP_NULL_ADDR);

+	iomap->type = IOMAP_MAPPED;
 	iomap->length = (u64)dblks << inode->i_blkbits;
 	ip->i_height = mp->mp_fheight;
 	gfs2_add_inode_blocks(&ip->i_inode, alloced);
@ -759,19 +745,52 @@ out:
 	return ret;
 }

-static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
+#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
+
+/**
+ * gfs2_alloc_size - Compute the maximum allocation size
+ * @inode: The inode
+ * @mp: The metapath
+ * @size: Requested size in blocks
+ *
+ * Compute the maximum size of the next allocation at @mp.
+ *
+ * Returns: size in blocks
+ */
+static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	const __be64 *first, *ptr, *end;

-	iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
-		      sizeof(struct gfs2_dinode);
-	iomap->offset = 0;
-	iomap->length = i_size_read(inode);
-	iomap->type = IOMAP_INLINE;
+	/*
+	 * For writes to stuffed files, this function is called twice via
+	 * gfs2_iomap_get, before and after unstuffing. The size we return the
+	 * first time needs to be large enough to get the reservation and
+	 * allocation sizes right.  The size we return the second time must
+	 * be exact or else gfs2_iomap_alloc won't do the right thing.
+	 */
+
+	if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
+		unsigned int maxsize = mp->mp_fheight > 1 ?
+			sdp->sd_inptrs : sdp->sd_diptrs;
+		maxsize -= mp->mp_list[mp->mp_fheight - 1];
+		if (size > maxsize)
+			size = maxsize;
+		return size;
+	}
+
+	first = metapointer(ip->i_height - 1, mp);
+	end = metaend(ip->i_height - 1, mp);
+	if (end - first > size)
+		end = first + size;
+	for (ptr = first; ptr < end; ptr++) {
+		if (*ptr)
+			break;
+	}
+	return ptr - first;
 }

-#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
-
 /**
 * gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
@ -789,37 +808,63 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	loff_t size = i_size_read(inode);
 	__be64 *ptr;
 	sector_t lblock;
 	sector_t lblock_stop;
 	int ret;
 	int eob;
 	u64 len;
-	struct buffer_head *bh;
+	struct buffer_head *dibh = NULL, *bh;
 	u8 height;

 	if (!length)
 		return -EINVAL;

+	down_read(&ip->i_rw_mutex);
+
+	ret = gfs2_meta_inode_buffer(ip, &dibh);
+	if (ret)
+		goto unlock;
+	iomap->private = dibh;
+
 	if (gfs2_is_stuffed(ip)) {
-		if (flags & IOMAP_REPORT) {
-			if (pos >= i_size_read(inode))
-				return -ENOENT;
-			gfs2_stuffed_iomap(inode, iomap);
-			return 0;
+		if (flags & IOMAP_WRITE) {
+			loff_t max_size = gfs2_max_stuffed_size(ip);
+
+			if (pos + length > max_size)
+				goto unstuff;
+			iomap->length = max_size;
+		} else {
+			if (pos >= size) {
+				if (flags & IOMAP_REPORT) {
+					ret = -ENOENT;
+					goto unlock;
+				} else {
+					/* report a hole */
+					iomap->offset = pos;
+					iomap->length = length;
+					goto do_alloc;
+				}
+			}
+			iomap->length = size;
 		}
-		BUG_ON(!(flags & IOMAP_WRITE));
+		iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
+			      sizeof(struct gfs2_dinode);
+		iomap->type = IOMAP_INLINE;
+		iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
+		goto out;
 	}
+
+unstuff:
 	lblock = pos >> inode->i_blkbits;
 	iomap->offset = lblock << inode->i_blkbits;
 	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
 	len = lblock_stop - lblock + 1;
+	iomap->length = len << inode->i_blkbits;

-	down_read(&ip->i_rw_mutex);
-
-	ret = gfs2_meta_inode_buffer(ip, &mp->mp_bh[0]);
-	if (ret)
-		goto unlock;
+	get_bh(dibh);
+	mp->mp_bh[0] = dibh;

 	height = ip->i_height;
 	while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
@ -840,12 +885,12 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 		goto do_alloc;

 	bh = mp->mp_bh[ip->i_height - 1];
-	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, len, &eob);
+	len = gfs2_extent_length(bh, ptr, len, &eob);

 	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
 	iomap->length = len << inode->i_blkbits;
 	iomap->type = IOMAP_MAPPED;
-	iomap->flags = IOMAP_F_MERGED;
+	iomap->flags |= IOMAP_F_MERGED;
 	if (eob)
 		iomap->flags |= IOMAP_F_GFS2_BOUNDARY;

@ -853,25 +898,185 @@ out:
 	iomap->bdev = inode->i_sb->s_bdev;
 unlock:
 	up_read(&ip->i_rw_mutex);
+	if (ret && dibh)
+		brelse(dibh);
 	return ret;

 do_alloc:
 	iomap->addr = IOMAP_NULL_ADDR;
-	iomap->length = len << inode->i_blkbits;
 	iomap->type = IOMAP_HOLE;
-	iomap->flags = 0;
 	if (flags & IOMAP_REPORT) {
-		loff_t size = i_size_read(inode);
 		if (pos >= size)
 			ret = -ENOENT;
 		else if (height == ip->i_height)
 			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 		else
 			iomap->length = size - pos;
+	} else if (flags & IOMAP_WRITE) {
+		u64 alloc_size;
+
+		if (flags & IOMAP_DIRECT)
+			goto out;  /* (see gfs2_file_direct_write) */
+
+		len = gfs2_alloc_size(inode, mp, len);
+		alloc_size = len << inode->i_blkbits;
+		if (alloc_size < iomap->length)
+			iomap->length = alloc_size;
+	} else {
+		if (pos < size && height == ip->i_height)
+			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 	}
 	goto out;
 }

+static int gfs2_write_lock(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	int error;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
+	if (error)
+		goto out_uninit;
+	if (&ip->i_inode == sdp->sd_rindex) {
+		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+		error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+					   GL_NOCACHE, &m_ip->i_gh);
+		if (error)
+			goto out_unlock;
+	}
+	return 0;
+
+out_unlock:
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
+static void gfs2_write_unlock(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+	if (&ip->i_inode == sdp->sd_rindex) {
+		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+		gfs2_glock_dq_uninit(&m_ip->i_gh);
+	}
+	gfs2_glock_dq_uninit(&ip->i_gh);
+}
+
+static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos,
+				unsigned copied, struct page *page,
+				struct iomap *iomap)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
+}
+
+static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
+				  loff_t length, unsigned flags,
+				  struct iomap *iomap)
+{
+	struct metapath mp = { .mp_aheight = 1, };
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	bool unstuff, alloc_required;
+	int ret;
+
+	ret = gfs2_write_lock(inode);
+	if (ret)
+		return ret;
+
+	unstuff = gfs2_is_stuffed(ip) &&
+		  pos + length > gfs2_max_stuffed_size(ip);
+
+	ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+	if (ret)
+		goto out_release;
+
+	alloc_required = unstuff || iomap->type == IOMAP_HOLE;
+
+	if (alloc_required || gfs2_is_jdata(ip))
+		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
+				       &ind_blocks);
+
+	if (alloc_required) {
+		struct gfs2_alloc_parms ap = {
+			.target = data_blocks + ind_blocks
+		};
+
+		ret = gfs2_quota_lock_check(ip, &ap);
+		if (ret)
+			goto out_release;
+
+		ret = gfs2_inplace_reserve(ip, &ap);
+		if (ret)
+			goto out_qunlock;
+	}
+
+	rblocks = RES_DINODE + ind_blocks;
+	if (gfs2_is_jdata(ip))
+		rblocks += data_blocks;
+	if (ind_blocks || data_blocks)
+		rblocks += RES_STATFS + RES_QUOTA;
+	if (inode == sdp->sd_rindex)
+		rblocks += 2 * RES_STATFS;
+	if (alloc_required)
+		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
+
+	ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits);
+	if (ret)
+		goto out_trans_fail;
+
+	if (unstuff) {
+		ret = gfs2_unstuff_dinode(ip, NULL);
+		if (ret)
+			goto out_trans_end;
+		release_metapath(&mp);
+		brelse(iomap->private);
+		iomap->private = NULL;
+		ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
+				     flags, iomap, &mp);
+		if (ret)
+			goto out_trans_end;
+	}
+
+	if (iomap->type == IOMAP_HOLE) {
+		ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
+		if (ret) {
+			gfs2_trans_end(sdp);
+			gfs2_inplace_release(ip);
+			punch_hole(ip, iomap->offset, iomap->length);
+			goto out_qunlock;
+		}
+	}
+	release_metapath(&mp);
+	if (gfs2_is_jdata(ip))
+		iomap->page_done = gfs2_iomap_journaled_page_done;
+	return 0;
+
+out_trans_end:
+	gfs2_trans_end(sdp);
+out_trans_fail:
+	if (alloc_required)
+		gfs2_inplace_release(ip);
+out_qunlock:
+	if (alloc_required)
+		gfs2_quota_unlock(ip);
+out_release:
+	if (iomap->private)
+		brelse(iomap->private);
+	release_metapath(&mp);
+	gfs2_write_unlock(inode);
+	return ret;
+}
+
 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 			    unsigned flags, struct iomap *iomap)
 {
@ -879,22 +1084,79 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 	struct metapath mp = { .mp_aheight = 1, };
 	int ret;

+	iomap->flags |= IOMAP_F_BUFFER_HEAD;
+
 	trace_gfs2_iomap_start(ip, pos, length, flags);
-	if (flags & IOMAP_WRITE) {
-		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
-		if (!ret && iomap->type == IOMAP_HOLE)
-			ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
-		release_metapath(&mp);
+	if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) {
+		ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap);
 	} else {
 		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
 		release_metapath(&mp);
+		/*
+		 * Silently fall back to buffered I/O for stuffed files or if
+		 * we've hit a hole (see gfs2_file_direct_write).
+		 */
+		if ((flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT) &&
+		    iomap->type != IOMAP_MAPPED)
+			ret = -ENOTBLK;
 	}
 	trace_gfs2_iomap_end(ip, iomap, ret);
 	return ret;
 }

+static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+			  ssize_t written, unsigned flags, struct iomap *iomap)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_trans *tr = current->journal_info;
+	struct buffer_head *dibh = iomap->private;
+
+	if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
+		goto out;
+
+	if (iomap->type != IOMAP_INLINE) {
+		gfs2_ordered_add_inode(ip);
+
+		if (tr->tr_num_buf_new)
+			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		else
+			gfs2_trans_add_meta(ip->i_gl, dibh);
+	}
+
+	if (inode == sdp->sd_rindex) {
+		adjust_fs_space(inode);
+		sdp->sd_rindex_uptodate = 0;
+	}
+
+	gfs2_trans_end(sdp);
+	gfs2_inplace_release(ip);
+
+	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
+		/* Deallocate blocks that were just allocated. */
+		loff_t blockmask = i_blocksize(inode) - 1;
+		loff_t end = (pos + length) & ~blockmask;
+
+		pos = (pos + written + blockmask) & ~blockmask;
+		if (pos < end) {
+			truncate_pagecache_range(inode, pos, end - 1);
+			punch_hole(ip, pos, end - pos);
+		}
+	}
+
+	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
+		gfs2_quota_unlock(ip);
+	gfs2_write_unlock(inode);
+
+out:
+	if (dibh)
+		brelse(dibh);
+	return 0;
+}
+
 const struct iomap_ops gfs2_iomap_ops = {
 	.iomap_begin = gfs2_iomap_begin,
+	.iomap_end = gfs2_iomap_end,
 };

 /**
@ -941,12 +1203,6 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	} else {
 		ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
 		release_metapath(&mp);
-
-		/* Return unmapped buffer beyond the end of file. */
-		if (ret == -ENOENT) {
-			ret = 0;
-			goto out;
-		}
 	}
 	if (ret)
 		goto out;
@ -2060,7 +2316,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
 	lblock = offset >> shift;
 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
-	if (lblock_stop > end_of_file)
+	if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
 		return 1;

 	size = (lblock_stop - lblock) << shift;
@ -2154,11 +2410,11 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
 		if (error)
 			goto out;
 	} else {
-		unsigned int start_off, end_off, blocksize;
+		unsigned int start_off, end_len, blocksize;

 		blocksize = i_blocksize(inode);
 		start_off = offset & (blocksize - 1);
-		end_off = (offset + length) & (blocksize - 1);
+		end_len = (offset + length) & (blocksize - 1);
 		if (start_off) {
 			unsigned int len = length;
 			if (length > blocksize - start_off)
@ -2167,11 +2423,11 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
 			if (error)
 				goto out;
 			if (start_off + length < blocksize)
-				end_off = 0;
+				end_len = 0;
 		}
-		if (end_off) {
+		if (end_len) {
 			error = gfs2_block_zero_range(inode,
-				offset + length - end_off, end_off);
+				offset + length - end_len, end_len);
 			if (error)
 				goto out;
 		}
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@ -1011,7 +1011,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	u64 bn, leaf_no;
 	__be64 *lp;
 	u32 index;
-	int x, moved = 0;
+	int x;
 	int error;

 	index = name->hash >> (32 - dip->i_depth);
@ -1113,8 +1113,6 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)

 			if (!prev)
 				prev = dent;
-
-			moved = 1;
 		} else {
 			prev = dent;
 		}
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@ -26,10 +26,12 @@
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
 #include <linux/delay.h>
+#include <linux/backing-dev.h>

 #include "gfs2.h"
 #include "incore.h"
 #include "bmap.h"
+#include "aops.h"
 #include "dir.h"
 #include "glock.h"
 #include "glops.h"
@ -387,7 +389,7 @@ static int gfs2_allocate_page_backing(struct page *page)
 * blocks allocated on disk to back that page.
 */

-static int gfs2_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
@ -688,12 +690,83 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 	return ret ? ret : ret1;
 }

+static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *file = iocb->ki_filp;
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	size_t count = iov_iter_count(to);
+	struct gfs2_holder gh;
+	ssize_t ret;
+
+	if (!count)
+		return 0; /* skip atime */
+
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out_uninit;
+
+	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL);
+
+	gfs2_glock_dq(&gh);
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	size_t len = iov_iter_count(from);
+	loff_t offset = iocb->ki_pos;
+	struct gfs2_holder gh;
+	ssize_t ret;
+
+	/*
+	 * Deferred lock, even if it's a write, since we do no allocation on
+	 * this path. All we need to change is the atime, and this lock mode
+	 * ensures that other nodes have flushed their buffered read caches
+	 * (i.e. their page cache entries for this inode). We do not,
+	 * unfortunately, have the option of only flushing a range like the
+	 * VFS does.
+	 */
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out_uninit;
+
+	/* Silently fall back to buffered I/O when writing beyond EOF */
+	if (offset + len > i_size_read(&ip->i_inode))
+		goto out;
+
+	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL);
+
+out:
+	gfs2_glock_dq(&gh);
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	ssize_t ret;
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		ret = gfs2_file_direct_read(iocb, to);
+		if (likely(ret != -ENOTBLK))
+			return ret;
+		iocb->ki_flags &= ~IOCB_DIRECT;
+	}
+	return generic_file_read_iter(iocb, to);
+}
+
 /**
 * gfs2_file_write_iter - Perform a write to a file
 * @iocb: The io context
- * @iov: The data to write
- * @nr_segs: Number of @iov segments
- * @pos: The file position
+ * @from: The data to write
 *
 * We have to do a lock/unlock here to refresh the inode size for
 * O_APPEND writes, otherwise we can land up writing at the wrong
@ -705,8 +778,9 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
-	struct gfs2_inode *ip = GFS2_I(file_inode(file));
-	int ret;
+	struct inode *inode = file_inode(file);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	ssize_t written = 0, ret;

 	ret = gfs2_rsqa_alloc(ip);
 	if (ret)
@ -723,7 +797,71 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		gfs2_glock_dq_uninit(&gh);
 	}

-	return generic_file_write_iter(iocb, from);
+	inode_lock(inode);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = inode_to_bdi(inode);
+
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out2;
+
+	ret = file_update_time(file);
+	if (ret)
+		goto out2;
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		struct address_space *mapping = file->f_mapping;
+		loff_t pos, endbyte;
+		ssize_t buffered;
+
+		written = gfs2_file_direct_write(iocb, from);
+		if (written < 0 || !iov_iter_count(from))
+			goto out2;
+
+		ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+		if (unlikely(ret < 0))
+			goto out2;
+		buffered = ret;
+
+		/*
+		 * We need to ensure that the page cache pages are written to
+		 * disk and invalidated to preserve the expected O_DIRECT
+		 * semantics.
+		 */
+		pos = iocb->ki_pos;
+		endbyte = pos + buffered - 1;
+		ret = filemap_write_and_wait_range(mapping, pos, endbyte);
+		if (!ret) {
+			iocb->ki_pos += buffered;
+			written += buffered;
+			invalidate_mapping_pages(mapping,
+						 pos >> PAGE_SHIFT,
+						 endbyte >> PAGE_SHIFT);
+		} else {
+			/*
+			 * We don't know how much we wrote, so just return
+			 * the number of bytes which were direct-written
+			 */
+		}
+	} else {
+		ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+		if (likely(ret > 0))
+			iocb->ki_pos += ret;
+	}
+
+out2:
+	current->backing_dev_info = NULL;
+out:
+	inode_unlock(inode);
+	if (likely(ret > 0)) {
+		/* Handle various SYNC-type writes */
+		ret = generic_write_sync(iocb, ret);
+	}
+	return written ? written : ret;
 }

 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
@ -733,7 +871,6 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	loff_t end = offset + len;
 	struct buffer_head *dibh;
-	struct iomap iomap = { };
 	int error;

 	error = gfs2_meta_inode_buffer(ip, &dibh);
@ -749,12 +886,14 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
 	}

 	while (offset < end) {
+		struct iomap iomap = { };
+
 		error = gfs2_iomap_get_alloc(inode, offset, end - offset,
 					     &iomap);
 		if (error)
 			goto out;
 		offset = iomap.offset + iomap.length;
-		if (iomap.type != IOMAP_HOLE)
+		if (!(iomap.flags & IOMAP_F_NEW))
 			continue;
 		error = sb_issue_zeroout(sb, iomap.addr >> inode->i_blkbits,
 					 iomap.length >> inode->i_blkbits,
@ -1125,7 +1264,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)

 const struct file_operations gfs2_file_fops = {
 	.llseek		= gfs2_llseek,
-	.read_iter	= generic_file_read_iter,
+	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
@ -1155,7 +1294,7 @@ const struct file_operations gfs2_dir_fops = {

 const struct file_operations gfs2_file_fops_nolock = {
 	.llseek		= gfs2_llseek,
-	.read_iter	= generic_file_read_iter,
+	.read_iter	= gfs2_file_read_iter,
 	.write_iter	= gfs2_file_write_iter,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.mmap		= gfs2_mmap,
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@ -65,6 +65,27 @@ struct gfs2_log_operations {

 #define GBF_FULL 1

+/**
+ * Clone bitmaps (bi_clone):
+ *
+ * - When a block is freed, we remember the previous state of the block in the
+ *   clone bitmap, and only mark the block as free in the real bitmap.
+ *
+ * - When looking for a block to allocate, we check for a free block in the
+ *   clone bitmap, and if no clone bitmap exists, in the real bitmap.
+ *
+ * - For allocating a block, we mark it as allocated in the real bitmap, and if
+ *   a clone bitmap exists, also in the clone bitmap.
+ *
+ * - At the end of a log_flush, we copy the real bitmap into the clone bitmap
+ *   to make the clone bitmap reflect the current allocation state.
+ *   (Alternatively, we could remove the clone bitmap.)
+ *
+ * The clone bitmaps are in-core only, and are never written to disk.
+ *
+ * These steps ensure that blocks which have been freed in a transaction cannot
+ * be reallocated in that same transaction.
+ */
 struct gfs2_bitmap {
 	struct buffer_head *bi_bh;
 	char *bi_clone;
@ -295,7 +316,6 @@ struct gfs2_blkreserv {
 	struct rb_node rs_node;       /* link to other block reservations */
 	struct gfs2_rbm rs_rbm;       /* Start of reservation */
 	u32 rs_free;                  /* how many blocks are still free */
-	u64 rs_inum;                  /* Inode number for reservation */
 };

 /*
@ -398,7 +418,6 @@ struct gfs2_inode {
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
 	struct gfs2_qadata *i_qadata; /* quota allocation data */
 	struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
-	struct gfs2_rgrpd *i_rgd;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
 	struct list_head i_ordered;
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@ -821,6 +821,13 @@ restart:
 		goto fail;
 	}

+	/**
+	 * If we're a spectator, we don't want to take the lock in EX because
+	 * we cannot do the first-mount responsibility it implies: recovery.
+	 */
+	if (sdp->sd_args.ar_spectator)
+		goto locks_done;
+
 	error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
 	if (!error) {
 		mounted_mode = DLM_LOCK_EX;
@ -896,9 +903,16 @@ locks_done:
 	if (lvb_gen < mount_gen) {
 		/* wait for mounted nodes to update control_lock lvb to our
 		   generation, which might include new recovery bits set */
-		fs_info(sdp, "control_mount wait1 block %u start %u mount %u "
-			"lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
-			lvb_gen, ls->ls_recover_flags);
+		if (sdp->sd_args.ar_spectator) {
+			fs_info(sdp, "Recovery is required. Waiting for a "
+				"non-spectator to mount.\n");
+			msleep_interruptible(1000);
+		} else {
+			fs_info(sdp, "control_mount wait1 block %u start %u "
+				"mount %u lvb %u flags %lx\n", block_gen,
+				start_gen, mount_gen, lvb_gen,
+				ls->ls_recover_flags);
+		}
 		spin_unlock(&ls->ls_recover_spin);
 		goto restart;
 	}
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@ -92,7 +92,8 @@ static void gfs2_remove_from_ail(struct gfs2_bufdata *bd)

 static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
 			       struct writeback_control *wbc,
-			       struct gfs2_trans *tr)
+			       struct gfs2_trans *tr,
+			       bool *withdraw)
 __releases(&sdp->sd_ail_lock)
 __acquires(&sdp->sd_ail_lock)
 {
@ -107,8 +108,10 @@ __acquires(&sdp->sd_ail_lock)
 		gfs2_assert(sdp, bd->bd_tr == tr);

 		if (!buffer_busy(bh)) {
-			if (!buffer_uptodate(bh))
+			if (!buffer_uptodate(bh)) {
 				gfs2_io_error_bh(sdp, bh);
+				*withdraw = true;
+			}
 			list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
 			continue;
 		}
@ -148,6 +151,7 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
 	struct list_head *head = &sdp->sd_ail1_list;
 	struct gfs2_trans *tr;
 	struct blk_plug plug;
+	bool withdraw = false;

 	trace_gfs2_ail_flush(sdp, wbc, 1);
 	blk_start_plug(&plug);
@ -156,11 +160,13 @@ restart:
 	list_for_each_entry_reverse(tr, head, tr_list) {
 		if (wbc->nr_to_write <= 0)
 			break;
-		if (gfs2_ail1_start_one(sdp, wbc, tr))
+		if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw))
 			goto restart;
 	}
 	spin_unlock(&sdp->sd_ail_lock);
 	blk_finish_plug(&plug);
+	if (withdraw)
+		gfs2_lm_withdraw(sdp, NULL);
 	trace_gfs2_ail_flush(sdp, wbc, 0);
 }

@ -188,7 +194,8 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
 *
 */

-static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+				bool *withdraw)
 {
 	struct gfs2_bufdata *bd, *s;
 	struct buffer_head *bh;
@ -199,11 +206,12 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 		gfs2_assert(sdp, bd->bd_tr == tr);
 		if (buffer_busy(bh))
 			continue;
-		if (!buffer_uptodate(bh))
+		if (!buffer_uptodate(bh)) {
 			gfs2_io_error_bh(sdp, bh);
+			*withdraw = true;
+		}
 		list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
 	}
-
 }

 /**
@ -218,10 +226,11 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
 	struct gfs2_trans *tr, *s;
 	int oldest_tr = 1;
 	int ret;
+	bool withdraw = false;

 	spin_lock(&sdp->sd_ail_lock);
 	list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
-		gfs2_ail1_empty_one(sdp, tr);
+		gfs2_ail1_empty_one(sdp, tr, &withdraw);
 		if (list_empty(&tr->tr_ail1_list) && oldest_tr)
 			list_move(&tr->tr_list, &sdp->sd_ail2_list);
 		else
@ -230,6 +239,9 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
 	ret = list_empty(&sdp->sd_ail1_list);
 	spin_unlock(&sdp->sd_ail_lock);

+	if (withdraw)
+		gfs2_lm_withdraw(sdp, "fatal: I/O error(s)\n");
+
 	return ret;
 }

@ -689,7 +701,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
 	hash = ~crc32(~0, lh, LH_V1_SIZE);
 	lh->lh_hash = cpu_to_be32(hash);

-	tv = current_kernel_time64();
+	ktime_get_coarse_real_ts64(&tv);
 	lh->lh_nsec = cpu_to_be32(tv.tv_nsec);
 	lh->lh_sec = cpu_to_be64(tv.tv_sec);
 	addr = gfs2_log_bmap(sdp);
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@ -49,7 +49,7 @@ void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	if (test_set_buffer_pinned(bh))
 		gfs2_assert_withdraw(sdp, 0);
 	if (!buffer_uptodate(bh))
-		gfs2_io_error_bh(sdp, bh);
+		gfs2_io_error_bh_wd(sdp, bh);
 	bd = bh->b_private;
 	/* If this buffer is in the AIL and it has already been written
 	 * to in-place disk block, remove it from the AIL.
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@ -293,7 +293,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	if (unlikely(!buffer_uptodate(bh))) {
 		struct gfs2_trans *tr = current->journal_info;
 		if (tr && test_bit(TR_TOUCHED, &tr->tr_flags))
-			gfs2_io_error_bh(sdp, bh);
+			gfs2_io_error_bh_wd(sdp, bh);
 		brelse(bh);
 		*bhp = NULL;
 		return -EIO;
@ -320,7 +320,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	if (!buffer_uptodate(bh)) {
 		struct gfs2_trans *tr = current->journal_info;
 		if (tr && test_bit(TR_TOUCHED, &tr->tr_flags))
-			gfs2_io_error_bh(sdp, bh);
+			gfs2_io_error_bh_wd(sdp, bh);
 		return -EIO;
 	}
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@ -413,12 +413,13 @@ void gfs2_recover_func(struct work_struct *work)
 	ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep;
 	int ro = 0;
 	unsigned int pass;
-	int error;
+	int error = 0;
 	int jlocked = 0;

 	t_start = ktime_get();
-	if (sdp->sd_args.ar_spectator ||
-	    (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
+	if (sdp->sd_args.ar_spectator)
+		goto fail;
+	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
 		fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
 			jd->jd_jid);
 		jlocked = 1;
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@ -123,17 +123,26 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
 /**
 * gfs2_testbit - test a bit in the bitmaps
 * @rbm: The bit to test
+ * @use_clone: If true, test the clone bitmap, not the official bitmap.
+ *
+ * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
+ * not the "real" bitmaps, to avoid allocating recently freed blocks.
 *
 * Returns: The two bit block state of the requested bit
 */

-static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm)
+static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
 {
 	struct gfs2_bitmap *bi = rbm_bi(rbm);
-	const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
+	const u8 *buffer;
 	const u8 *byte;
 	unsigned int bit;

+	if (use_clone && bi->bi_clone)
+		buffer = bi->bi_clone;
+	else
+		buffer = bi->bi_bh->b_data;
+	buffer += bi->bi_offset;
 	byte = buffer + (rbm->offset / GFS2_NBBY);
 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;

@ -322,7 +331,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le
 	u8 res;

 	for (n = 0; n < n_unaligned; n++) {
-		res = gfs2_testbit(rbm);
+		res = gfs2_testbit(rbm, true);
 		if (res != GFS2_BLKST_FREE)
 			return true;
 		(*len)--;
@ -607,8 +616,10 @@ int gfs2_rsqa_alloc(struct gfs2_inode *ip)

 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
 {
+	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
+
 	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
-		       (unsigned long long)rs->rs_inum,
+		       (unsigned long long)ip->i_no_addr,
 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
 		       rs->rs_rbm.offset, rs->rs_free);
 }
@ -1051,6 +1062,18 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
 }

+static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
+{
+	const struct gfs2_rgrp *str = buf;
+
+	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
+	rgl->rl_flags = str->rg_flags;
+	rgl->rl_free = str->rg_free;
+	rgl->rl_dinodes = str->rg_dinodes;
+	rgl->rl_igeneration = str->rg_igeneration;
+	rgl->__pad = 0UL;
+}
+
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
@ -1073,6 +1096,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 	str->rg_crc = cpu_to_be32(crc);

 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
+	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
 }

 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
@ -1087,25 +1111,6 @@ static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
 	return 1;
 }

-static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
-{
-	const struct gfs2_rgrp *str = buf;
-
-	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
-	rgl->rl_flags = str->rg_flags;
-	rgl->rl_free = str->rg_free;
-	rgl->rl_dinodes = str->rg_dinodes;
-	rgl->rl_igeneration = str->rg_igeneration;
-	rgl->__pad = 0UL;
-}
-
-static void update_rgrp_lvb_unlinked(struct gfs2_rgrpd *rgd, u32 change)
-{
-	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
-	u32 unlinked = be32_to_cpu(rgl->rl_unlinked) + change;
-	rgl->rl_unlinked = cpu_to_be32(unlinked);
-}
-
 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
 {
 	struct gfs2_bitmap *bi;
@ -1424,7 +1429,6 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
 				gfs2_trans_add_meta(rgd->rd_gl, bh);
 				gfs2_rgrp_out(rgd, bh->b_data);
-				gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);
 				gfs2_trans_end(sdp);
 			}
 		}
@ -1487,6 +1491,34 @@ static void rs_insert(struct gfs2_inode *ip)
 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
 }

+/**
+ * rgd_free - return the number of free blocks we can allocate.
+ * @rgd: the resource group
+ *
+ * This function returns the number of free blocks for an rgrp.
+ * That's the clone-free blocks (blocks that are free, not including those
+ * still being used for unlinked files that haven't been deleted.)
+ *
+ * It also subtracts any blocks reserved by someone else, but does not
+ * include free blocks that are still part of our current reservation,
+ * because obviously we can (and will) allocate them.
+ */
+static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
+{
+	u32 tot_reserved, tot_free;
+
+	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
+		return 0;
+	tot_reserved = rgd->rd_reserved - rs->rs_free;
+
+	if (rgd->rd_free_clone < tot_reserved)
+		tot_reserved = 0;
+
+	tot_free = rgd->rd_free_clone - tot_reserved;
+
+	return tot_free;
+}
+
 /**
 * rg_mblk_search - find a group of multiple free blocks to form a reservation
 * @rgd: the resource group descriptor
@ -1502,7 +1534,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
 	u64 goal;
 	struct gfs2_blkreserv *rs = &ip->i_res;
 	u32 extlen;
-	u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
+	u32 free_blocks = rgd_free(rgd, rs);
 	int ret;
 	struct inode *inode = &ip->i_inode;

@ -1528,7 +1560,6 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
 	if (ret == 0) {
 		rs->rs_rbm = rbm;
 		rs->rs_free = extlen;
-		rs->rs_inum = ip->i_no_addr;
 		rs_insert(ip);
 	} else {
 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
@ -1686,7 +1717,8 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,

 	while(1) {
 		bi = rbm_bi(rbm);
-		if (test_bit(GBF_FULL, &bi->bi_flags) &&
+		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
+		    test_bit(GBF_FULL, &bi->bi_flags) &&
 		    (state == GFS2_BLKST_FREE))
 			goto next_bitmap;

@ -1983,7 +2015,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 	int error = 0, rg_locked, flags = 0;
 	u64 last_unlinked = NO_BLOCK;
 	int loops = 0;
-	u32 skip = 0;
+	u32 free_blocks, skip = 0;

 	if (sdp->sd_args.ar_rgrplvb)
 		flags |= GL_SKIP;
@ -1991,8 +2023,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 		return -EINVAL;
 	if (gfs2_rs_active(rs)) {
 		begin = rs->rs_rbm.rgd;
-	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
-		rs->rs_rbm.rgd = begin = ip->i_rgd;
+	} else if (rs->rs_rbm.rgd &&
+		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
+		begin = rs->rs_rbm.rgd;
 	} else {
 		check_and_update_goal(ip);
 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
@ -2053,11 +2086,11 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 			goto check_rgrp;

 		/* If rgrp has enough free space, use it */
-		if (rs->rs_rbm.rgd->rd_free_clone >= ap->target ||
+		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
+		if (free_blocks >= ap->target ||
 		    (loops == 2 && ap->min_target &&
-		     rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) {
-			ip->i_rgd = rs->rs_rbm.rgd;
-			ap->allowed = ip->i_rgd->rd_free_clone;
+		     free_blocks >= ap->min_target)) {
+			ap->allowed = free_blocks;
 			return 0;
 		}
 check_rgrp:
@ -2115,26 +2148,6 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
 }

-/**
- * gfs2_get_block_type - Check a block in a RG is of given type
- * @rgd: the resource group holding the block
- * @block: the block number
- *
- * Returns: The block type (GFS2_BLKST_*)
- */
-
-static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
-{
-	struct gfs2_rbm rbm = { .rgd = rgd, };
-	int ret;
-
-	ret = gfs2_rbm_from_block(&rbm, block);
-	WARN_ON_ONCE(ret != 0);
-
-	return gfs2_testbit(&rbm);
-}
-
-
 /**
 * gfs2_alloc_extent - allocate an extent from a given bitmap
 * @rbm: the resource group information
@ -2159,7 +2172,7 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
 	block++;
 	while (*n < elen) {
 		ret = gfs2_rbm_from_block(&pos, block);
-		if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
+		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
 			break;
 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
@ -2335,7 +2348,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
-	struct gfs2_rbm rbm = { .rgd = ip->i_rgd, };
+	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
 	unsigned int ndata;
 	u64 block; /* block, within the file system scope */
 	int error;
@ -2393,7 +2406,6 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,

 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
-	gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data);

 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
 	if (dinode)
@ -2434,7 +2446,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
-	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);

 	/* Directories keep their data in the metadata address space */
 	if (meta || ip->i_depth)
@ -2471,8 +2482,7 @@ void gfs2_unlink_di(struct inode *inode)
 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
-	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
-	update_rgrp_lvb_unlinked(rgd, 1);
+	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
 }

 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
@ -2492,8 +2502,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)

 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
-	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
-	update_rgrp_lvb_unlinked(rgd, -1);
+	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);

 	gfs2_statfs_change(sdp, 0, +1, -1);
 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
@ -2516,6 +2525,7 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
 {
 	struct gfs2_rgrpd *rgd;
 	struct gfs2_holder rgd_gh;
+	struct gfs2_rbm rbm;
 	int error = -EINVAL;

 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
@ -2526,7 +2536,11 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
 	if (error)
 		goto fail;

-	if (gfs2_get_block_type(rgd, no_addr) != type)
+	rbm.rgd = rgd;
+	error = gfs2_rbm_from_block(&rbm, no_addr);
+	WARN_ON_ONCE(error != 0);
+
+	if (gfs2_testbit(&rbm, false) != type)
 		error = -ESTALE;

 	gfs2_glock_dq_uninit(&rgd_gh);
@ -2558,19 +2572,34 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
 		return;

-	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
-		rgd = ip->i_rgd;
-	else
+	/*
+	 * The resource group last accessed is kept in the last position.
+	 */
+
+	if (rlist->rl_rgrps) {
+		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
+		if (rgrp_contains_block(rgd, block))
+			return;
 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
+	} else {
+		rgd = ip->i_res.rs_rbm.rgd;
+		if (!rgd || !rgrp_contains_block(rgd, block))
+			rgd = gfs2_blk2rgrpd(sdp, block, 1);
+	}
+
 	if (!rgd) {
-		fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
+		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
+		       (unsigned long long)block);
 		return;
 	}
-	ip->i_rgd = rgd;

-	for (x = 0; x < rlist->rl_rgrps; x++)
-		if (rlist->rl_rgd[x] == rgd)
+	for (x = 0; x < rlist->rl_rgrps; x++) {
+		if (rlist->rl_rgd[x] == rgd) {
+			swap(rlist->rl_rgd[x],
+			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
 			return;
+		}
+	}

 	if (rlist->rl_rgrps == rlist->rl_space) {
 		new_space = rlist->rl_space + 10;
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@ -1729,7 +1729,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
 	if (ip) {
 		ip->i_flags = 0;
 		ip->i_gl = NULL;
-		ip->i_rgd = NULL;
 		memset(&ip->i_res, 0, sizeof(ip->i_res));
 		RB_CLEAR_NODE(&ip->i_res.rs_node);
 		ip->i_rahead = 0;
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@ -429,11 +429,18 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)

 	spin_lock(&sdp->sd_jindex_spin);
 	rv = -EBUSY;
-	if (sdp->sd_jdesc->jd_jid == jid)
+	/**
+	 * If we're a spectator, we use journal0, but it's not really ours.
+	 * So we need to wait for its recovery too. If we skip it we'd never
+	 * queue work to the recovery workqueue, and so its completion would
+	 * never clear the DFL_BLOCK_LOCKS flag, so all our locks would
+	 * permanently stop working.
+	 */
+	if (sdp->sd_jdesc->jd_jid == jid && !sdp->sd_args.ar_spectator)
 		goto out;
 	rv = -ENOENT;
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (jd->jd_jid != jid)
+		if (jd->jd_jid != jid && !sdp->sd_args.ar_spectator)
 			continue;
 		rv = gfs2_recover_journal(jd, false);
 		break;
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@ -606,7 +606,8 @@ TRACE_EVENT(gfs2_rs,
 		__entry->rd_addr	= rs->rs_rbm.rgd->rd_addr;
 		__entry->rd_free_clone	= rs->rs_rbm.rgd->rd_free_clone;
 		__entry->rd_reserved	= rs->rs_rbm.rgd->rd_reserved;
-		__entry->inum		= rs->rs_inum;
+		__entry->inum		= container_of(rs, struct gfs2_inode,
+						       i_res)->i_no_addr;
 		__entry->start		= gfs2_rbm_to_block(&rs->rs_rbm);
 		__entry->free		= rs->rs_free;
 		__entry->func		= func;
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@ -30,9 +30,11 @@ struct gfs2_glock;
 * block, or all of the blocks in the rg, whichever is smaller */
 static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested)
 {
-	if (requested < ip->i_rgd->rd_length)
+	struct gfs2_rgrpd *rgd = ip->i_res.rs_rbm.rgd;
+
+	if (requested < rgd->rd_length)
 		return requested + 1;
-	return ip->i_rgd->rd_length;
+	return rgd->rd_length;
 }

 extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@ -46,14 +46,16 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
 	    test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
 		return 0;

-	va_start(args, fmt);
+	if (fmt) {
+		va_start(args, fmt);

-	vaf.fmt = fmt;
-	vaf.va = &args;
+		vaf.fmt = fmt;
+		vaf.va = &args;

-	fs_err(sdp, "%pV", &vaf);
+		fs_err(sdp, "%pV", &vaf);

-	va_end(args);
+		va_end(args);
+	}

 	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
 		fs_err(sdp, "about to withdraw this file system\n");
@ -246,21 +248,21 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
 }

 /**
- * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
- * Returns: -1 if this call withdrew the machine,
- *          0 if it was already withdrawn
+ * gfs2_io_error_bh_i - Flag a buffer I/O error
+ * @withdraw: withdraw the filesystem
 */

-int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
-		       const char *function, char *file, unsigned int line)
+void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			const char *function, char *file, unsigned int line,
+			bool withdraw)
 {
-	int rv;
-	rv = gfs2_lm_withdraw(sdp,
-			      "fatal: I/O error\n"
-			      "  block = %llu\n"
-			      "  function = %s, file = %s, line = %u\n",
-			      (unsigned long long)bh->b_blocknr,
-			      function, file, line);
-	return rv;
+	fs_err(sdp,
+	       "fatal: I/O error\n"
+	       "  block = %llu\n"
+	       "  function = %s, file = %s, line = %u\n",
+	       (unsigned long long)bh->b_blocknr,
+	       function, file, line);
+	if (withdraw)
+		gfs2_lm_withdraw(sdp, NULL);
 }

--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -136,11 +136,15 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
 gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__);


-int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
-		       const char *function, char *file, unsigned int line);
+void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			const char *function, char *file, unsigned int line,
+			bool withdraw);
+
+#define gfs2_io_error_bh_wd(sdp, bh) \
+gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__, true);

 #define gfs2_io_error_bh(sdp, bh) \
-gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
+gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__, false);


 extern struct kmem_cache *gfs2_glock_cachep;
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -343,60 +343,45 @@ struct ea_list {
 	unsigned int ei_size;
 };

-static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
-{
-	switch (ea->ea_type) {
-	case GFS2_EATYPE_USR:
-		return 5 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SYS:
-		return 7 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SECURITY:
-		return 9 + ea->ea_name_len + 1;
-	default:
-		return 0;
-	}
-}
-
 static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
 		     void *private)
 {
 	struct ea_list *ei = private;
 	struct gfs2_ea_request *er = ei->ei_er;
-	unsigned int ea_size = gfs2_ea_strlen(ea);
+	unsigned int ea_size;
+	char *prefix;
+	unsigned int l;

 	if (ea->ea_type == GFS2_EATYPE_UNUSED)
 		return 0;

-	if (er->er_data_len) {
-		char *prefix = NULL;
-		unsigned int l = 0;
-		char c = 0;
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		prefix = "user.";
+		l = 5;
+		break;
+	case GFS2_EATYPE_SYS:
+		prefix = "system.";
+		l = 7;
+		break;
+	case GFS2_EATYPE_SECURITY:
+		prefix = "security.";
+		l = 9;
+		break;
+	default:
+		BUG();
+	}

+	ea_size = l + ea->ea_name_len + 1;
+	if (er->er_data_len) {
 		if (ei->ei_size + ea_size > er->er_data_len)
 			return -ERANGE;

-		switch (ea->ea_type) {
-		case GFS2_EATYPE_USR:
-			prefix = "user.";
-			l = 5;
-			break;
-		case GFS2_EATYPE_SYS:
-			prefix = "system.";
-			l = 7;
-			break;
-		case GFS2_EATYPE_SECURITY:
-			prefix = "security.";
-			l = 9;
-			break;
-		}
-
-		BUG_ON(l == 0);
-
 		memcpy(er->er_data + ei->ei_size, prefix, l);
 		memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
 		       ea->ea_name_len);
-		memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
+		er->er_data[ei->ei_size + ea_size - 1] = 0;
 	}

 	ei->ei_size += ea_size;