Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2

* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (56 commits)
  ocfs2: Make cached block reads the common case.
  ocfs2: Kill the last naked wait_on_buffer() for cached reads.
  ocfs2: Move ocfs2_bread() into dir.c
  ocfs2: Simplify ocfs2_read_block()
  ocfs2: Require an inode for ocfs2_read_block(s)().
  ocfs2: Separate out sync reads from ocfs2_read_blocks()
  ocfs2: Refactor xattr list and remove ocfs2_xattr_handler().
  ocfs2: Calculate EA hash only by its suffix.
  ocfs2: Move trusted and user attribute support into xattr.c
  ocfs2: Uninline ocfs2_xattr_name_hash()
  ocfs2: Don't check for NULL before brelse()
  ocfs2: use smaller counters in ocfs2_remove_xattr_clusters_from_cache
  ocfs2: Documentation update for user_xattr / nouser_xattr mount options
  ocfs2: make la_debug_mutex static
  ocfs2: Remove pointless !!
  ocfs2: Add empty bucket support in xattr.
  ocfs2/xattr.c: Fix a bug when inserting xattr.
  ocfs2: Add xattr mount option in ocfs2_show_options()
  ocfs2: Switch over to JBD2.
  ocfs2: Add the 'inode64' mount option.
  ...
Linus Torvalds 2008-10-14 16:34:11 -07:00
Parent 72f22b1eb6 d4a8c93c82
Commit acd15a8360
42 changed files: 7361 additions and 948 deletions


@ -76,3 +76,9 @@ localalloc=8(*) Allows custom localalloc size in MB. If the value is too
large, the fs will silently revert it to the default.
Localalloc is not enabled for local mounts.
localflocks This disables cluster aware flock.
inode64 Indicates that Ocfs2 is allowed to create inodes at
any location in the filesystem, including those which
will result in inode numbers occupying more than 32
bits of significance.
user_xattr (*) Enables Extended User Attributes.
nouser_xattr Disables Extended User Attributes.


@ -220,17 +220,16 @@ config JBD
tristate
help
This is a generic journalling layer for block devices. It is
currently used by the ext3 and OCFS2 file systems, but it could
also be used to add journal support to other file systems or block
currently used by the ext3 file system, but it could also be
used to add journal support to other file systems or block
devices such as RAID or LVM.
If you are using the ext3 or OCFS2 file systems, you need to
say Y here. If you are not using ext3 OCFS2 then you will probably
want to say N.
If you are using the ext3 file system, you need to say Y here.
If you are not using ext3 then you will probably want to say N.
To compile this device as a module, choose M here: the module will be
called jbd. If you are compiling ext3 or OCFS2 into the kernel,
you cannot compile this code as a module.
called jbd. If you are compiling ext3 into the kernel, you
cannot compile this code as a module.
config JBD_DEBUG
bool "JBD (ext3) debugging support"
@ -254,15 +253,16 @@ config JBD2
help
This is a generic journaling layer for block devices that support
both 32-bit and 64-bit block numbers. It is currently used by
the ext4 filesystem, but it could also be used to add
the ext4 and OCFS2 filesystems, but it could also be used to add
journal support to other file systems or block devices such
as RAID or LVM.
If you are using ext4, you need to say Y here. If you are not
using ext4 then you will probably want to say N.
If you are using ext4 or OCFS2, you need to say Y here.
If you are not using ext4 or OCFS2 then you will
probably want to say N.
To compile this device as a module, choose M here. The module will be
called jbd2. If you are compiling ext4 into the kernel,
called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
you cannot compile this code as a module.
config JBD2_DEBUG
@ -448,7 +448,7 @@ config OCFS2_FS
tristate "OCFS2 file system support"
depends on NET && SYSFS
select CONFIGFS_FS
select JBD
select JBD2
select CRC32
help
OCFS2 is a general purpose extent based shared disk cluster file
@ -519,6 +519,16 @@ config OCFS2_DEBUG_FS
this option for debugging only as it is likely to decrease
performance of the filesystem.
config OCFS2_COMPAT_JBD
bool "Use JBD for compatibility"
depends on OCFS2_FS
default n
select JBD
help
The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
is backwards compatible with JBD. It is safe to say N here.
However, if you really want to use the original JBD, say Y here.
endif # BLOCK
config DNOTIFY


@ -34,7 +34,8 @@ ocfs2-objs := \
symlink.o \
sysfile.o \
uptodate.o \
ver.o
ver.o \
xattr.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o

The diff for this file is not shown because of its large size.


@ -26,30 +26,102 @@
#ifndef OCFS2_ALLOC_H
#define OCFS2_ALLOC_H
/*
* For xattr tree leaf, we limit the leaf byte size to be 64K.
*/
#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
/*
* ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
* the b-tree operations in ocfs2. Now all the b-tree operations are not
* limited to ocfs2_dinode only. Any data which need to allocate clusters
* to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
* and operation.
*
* ocfs2_extent_tree becomes the first-class object for extent tree
* manipulation. Callers of the alloc.c code need to fill it via one of
* the ocfs2_init_*_extent_tree() operations below.
*
* ocfs2_extent_tree contains info for the root of the b-tree, it must have a
* root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
* functions.
* ocfs2_extent_tree_operations abstract the normal operations we do for
* the root of extent b-tree.
*/
struct ocfs2_extent_tree_operations;
struct ocfs2_extent_tree {
struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
void *et_object;
unsigned int et_max_leaf_clusters;
};
/*
* ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
* specified object buffer.
*/
void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh);
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh);
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh,
struct ocfs2_xattr_value_root *xv);
struct ocfs2_alloc_context;
int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
u32 new_clusters,
u8 flags,
struct ocfs2_alloc_context *meta_ac);
enum ocfs2_alloc_restarted {
RESTART_NONE = 0,
RESTART_TRANS,
RESTART_META
};
int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
struct inode *inode,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
struct ocfs2_extent_tree *et,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
struct ocfs2_cached_dealloc_ctxt;
int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
int ocfs2_mark_extent_written(struct inode *inode,
struct ocfs2_extent_tree *et,
handle_t *handle, u32 cpos, u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
int ocfs2_remove_extent(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 cpos, u32 len, handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
struct ocfs2_dinode *fe);
/* how many new metadata chunks would an allocation need at maximum? */
static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
struct ocfs2_extent_tree *et);
/*
* how many new metadata chunks would an allocation need at maximum?
*
* Please note that the caller must make sure that root_el is the root
* of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
* the result may be wrong.
*/
static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
{
/*
* Rather than do all the work of determining how much we need
@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
* new tree_depth==0 extent_block, and one block at the new
* top-of-the tree.
*/
return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
return le16_to_cpu(root_el->l_tree_depth) + 2;
}
void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
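
As the alloc.h comment above explains, ocfs2_extent_tree is now the first-class handle for b-tree manipulation. The following is a minimal, purely illustrative sketch of the new calling convention, modelled on the callers converted later in this merge (for example in the directory code); the helper name is made up, the include paths follow the usual fs/ocfs2 layout, and the transaction handle, dinode buffer and allocation context are assumed to have been set up as in any other ocfs2 allocation path.

#include "ocfs2.h"
#include "alloc.h"

/* Hypothetical helper showing the new extent-tree calling convention. */
static int example_insert_extent(struct ocfs2_super *osb, handle_t *handle,
				 struct inode *inode, struct buffer_head *di_bh,
				 u32 cpos, u64 blkno, u32 len,
				 struct ocfs2_alloc_context *meta_ac)
{
	struct ocfs2_extent_tree et;

	/* Fill the first-class extent-tree object from the inode's dinode. */
	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);

	/* Pass &et, not the raw dinode buffer, to the b-tree code. */
	return ocfs2_insert_extent(osb, handle, inode, &et,
				   cpos, blkno, len, 0, meta_ac);
}

The same pattern, with ocfs2_init_xattr_tree_extent_tree() or ocfs2_init_xattr_value_extent_tree() instead, is what lets the new xattr code reuse the b-tree machinery.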


@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
OCFS2_I(inode)->ip_blkno,
&bh, OCFS2_BH_CACHED, inode);
status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
err = 0;
bail:
if (bh)
brelse(bh);
brelse(bh);
mlog_exit(err);
return err;
@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
{
int ret;
struct buffer_head *di_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
BUG_ON(!PageLocked(page));
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
OCFS2_BH_CACHED, inode);
ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
}
if (ocfs2_should_order_data(inode)) {
ret = ocfs2_jbd2_file_inode(handle, inode);
#ifdef CONFIG_OCFS2_COMPAT_JBD
ret = walk_page_buffers(handle,
page_buffers(page),
from, to, NULL,
ocfs2_journal_dirty_data);
if (ret < 0)
#endif
if (ret < 0)
mlog_errno(ret);
}
out:
@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
{
journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
journal_invalidatepage(journal, page, offset);
jbd2_journal_invalidatepage(journal, page, offset);
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
if (!page_has_buffers(page))
return 0;
return journal_try_to_free_buffers(journal, page, wait);
return jbd2_journal_try_to_free_buffers(journal, page, wait);
}
static ssize_t ocfs2_direct_IO(int rw,
@ -1074,11 +1072,15 @@ static void ocfs2_write_failure(struct inode *inode,
tmppage = wc->w_pages[i];
if (page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
if (ocfs2_should_order_data(inode)) {
ocfs2_jbd2_file_inode(wc->w_handle, inode);
#ifdef CONFIG_OCFS2_COMPAT_JBD
walk_page_buffers(wc->w_handle,
page_buffers(tmppage),
from, to, NULL,
ocfs2_journal_dirty_data);
#endif
}
block_commit_write(tmppage, from, to);
}
@ -1242,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
int ret, i, new, should_zero = 0;
u64 v_blkno, p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
new = phys == 0 ? 1 : 0;
if (new || unwritten)
@ -1255,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
* any additional semaphores or cluster locks.
*/
tmp_pos = cpos;
ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
&tmp_pos, 1, 0, wc->w_di_bh,
wc->w_handle, data_ac,
meta_ac, NULL);
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
&tmp_pos, 1, 0, wc->w_di_bh,
wc->w_handle, data_ac,
meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@ -1276,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
goto out;
}
} else if (unwritten) {
ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
wc->w_handle, cpos, 1, phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
@ -1665,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle;
struct ocfs2_extent_tree et;
ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
if (ret) {
@ -1712,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* ocfs2_lock_allocators(). It greatly over-estimates
* the work to be done.
*/
ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
extents_to_split, &data_ac, &meta_ac);
mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
" clusters_to_add = %u, extents_to_split = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
clusters_to_alloc, extents_to_split);
ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
ret = ocfs2_lock_allocators(inode, &et,
clusters_to_alloc, extents_to_split,
&data_ac, &meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
credits = ocfs2_calc_extend_credits(inode->i_sb, di,
credits = ocfs2_calc_extend_credits(inode->i_sb,
&di->id2.i_list,
clusters_to_alloc);
}
@ -1905,11 +1919,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
if (ocfs2_should_order_data(inode)) {
ocfs2_jbd2_file_inode(wc->w_handle, inode);
#ifdef CONFIG_OCFS2_COMPAT_JBD
walk_page_buffers(wc->w_handle,
page_buffers(tmppage),
from, to, NULL,
ocfs2_journal_dirty_data);
#endif
}
block_commit_write(tmppage, from, to);
}
}


@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
/* remove from dirty list before I/O. */
clear_buffer_dirty(bh);
get_bh(bh); /* for end_buffer_write_sync() */
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
@ -88,22 +88,103 @@ out:
return ret;
}
int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
struct buffer_head *bhs[], int flags,
struct inode *inode)
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[])
{
int status = 0;
unsigned int i;
struct buffer_head *bh;
if (!nr) {
mlog(ML_BH_IO, "No buffers will be read!\n");
goto bail;
}
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
status = -EIO;
mlog_errno(status);
goto bail;
}
}
bh = bhs[i];
if (buffer_jbd(bh)) {
mlog(ML_ERROR,
"trying to sync read a jbd "
"managed bh (blocknr = %llu), skipping\n",
(unsigned long long)bh->b_blocknr);
continue;
}
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
mlog(ML_ERROR,
"trying to sync read a dirty "
"buffer! (blocknr = %llu), skipping\n",
(unsigned long long)bh->b_blocknr);
continue;
}
lock_buffer(bh);
if (buffer_jbd(bh)) {
mlog(ML_ERROR,
"block %llu had the JBD bit set "
"while I was in lock_buffer!",
(unsigned long long)bh->b_blocknr);
BUG();
}
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
}
for (i = nr; i > 0; i--) {
bh = bhs[i - 1];
if (buffer_jbd(bh)) {
mlog(ML_ERROR,
"the journal got the buffer while it was "
"locked for io! (blocknr = %llu)\n",
(unsigned long long)bh->b_blocknr);
BUG();
}
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* Status won't be cleared from here on out,
* so we can safely record this and loop back
* to cleanup the other buffers. */
status = -EIO;
put_bh(bh);
bhs[i - 1] = NULL;
}
}
bail:
return status;
}
int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
struct buffer_head *bhs[], int flags)
{
int status = 0;
struct super_block *sb;
int i, ignore_cache = 0;
struct buffer_head *bh;
mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
(unsigned long long)block, nr, flags, inode);
mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
inode, (unsigned long long)block, nr, flags);
BUG_ON(!inode);
BUG_ON((flags & OCFS2_BH_READAHEAD) &&
(!inode || !(flags & OCFS2_BH_CACHED)));
(flags & OCFS2_BH_IGNORE_CACHE));
if (osb == NULL || osb->sb == NULL || bhs == NULL) {
if (bhs == NULL) {
status = -EINVAL;
mlog_errno(status);
goto bail;
@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
goto bail;
}
sb = osb->sb;
if (flags & OCFS2_BH_CACHED && !inode)
flags &= ~OCFS2_BH_CACHED;
if (inode)
mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(sb, block++);
bhs[i] = sb_getblk(inode->i_sb, block++);
if (bhs[i] == NULL) {
if (inode)
mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
status = -EIO;
mlog_errno(status);
goto bail;
}
}
bh = bhs[i];
ignore_cache = 0;
ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
/* There are three read-ahead cases here which we need to
* be concerned with. All three assume a buffer has
@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* before our is-it-in-flight check.
*/
if (flags & OCFS2_BH_CACHED &&
!ocfs2_buffer_uptodate(inode, bh)) {
if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
mlog(ML_UPTODATE,
"bh (%llu), inode %llu not uptodate\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
/* We're using ignore_cache here to say
* "go to disk" */
ignore_cache = 1;
}
/* XXX: Can we ever get this and *not* have the cached
* flag set? */
if (buffer_jbd(bh)) {
if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
if (ignore_cache)
mlog(ML_BH_IO, "trying to sync read a jbd "
"managed bh (blocknr = %llu)\n",
(unsigned long long)bh->b_blocknr);
continue;
}
if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
if (ignore_cache) {
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* previously read-ahead buffer may have
* completed I/O while we were waiting for the
* buffer lock. */
if ((flags & OCFS2_BH_CACHED)
if (!(flags & OCFS2_BH_IGNORE_CACHE)
&& !(flags & OCFS2_BH_READAHEAD)
&& ocfs2_buffer_uptodate(inode, bh)) {
unlock_buffer(bh);
@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
if (inode)
ocfs2_set_buffer_uptodate(inode, bh);
ocfs2_set_buffer_uptodate(inode, bh);
}
if (inode)
mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
(unsigned long long)block, nr,
(!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
flags);
bail:


@ -31,31 +31,29 @@
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int uptodate);
static inline int ocfs2_read_block(struct ocfs2_super *osb,
static inline int ocfs2_read_block(struct inode *inode,
u64 off,
struct buffer_head **bh,
int flags,
struct inode *inode);
struct buffer_head **bh);
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
struct inode *inode);
int ocfs2_read_blocks(struct ocfs2_super *osb,
int ocfs2_read_blocks(struct inode *inode,
u64 block,
int nr,
struct buffer_head *bhs[],
int flags,
struct inode *inode);
int flags);
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[]);
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh);
#define OCFS2_BH_CACHED 1
#define OCFS2_BH_IGNORE_CACHE 1
#define OCFS2_BH_READAHEAD 8
static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
struct buffer_head **bh, int flags,
struct inode *inode)
static inline int ocfs2_read_block(struct inode *inode, u64 off,
struct buffer_head **bh)
{
int status = 0;
@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
goto bail;
}
status = ocfs2_read_blocks(osb, off, 1, bh,
flags, inode);
status = ocfs2_read_blocks(inode, off, 1, bh, 0);
bail:
return status;
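
The buffer_head I/O header above drops the osb and flags arguments from the common cached-read path and moves uncached reads into ocfs2_read_blocks_sync(). A short caller sketch under those declarations, with error handling trimmed, a made-up function name, and include paths assumed from the usual fs/ocfs2 layout:

#include "ocfs2.h"
#include "inode.h"
#include "buffer_head_io.h"

/* Hypothetical reader showing the reworked read helpers. */
static int example_read_dinode(struct ocfs2_super *osb, struct inode *inode)
{
	struct buffer_head *bh = NULL;
	int ret;

	/* Cached read: the inode, not the super, is now the handle. */
	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
	if (ret)
		return ret;
	brelse(bh);
	bh = NULL;

	/* Uncached, synchronous read, e.g. before cluster locks exist. */
	ret = ocfs2_read_blocks_sync(osb, OCFS2_I(inode)->ip_blkno, 1, &bh);
	brelse(bh);
	return ret;
}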


@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(CONN),
define_mask(QUORUM),
define_mask(EXPORT),
define_mask(XATTR),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),


@ -112,6 +112,7 @@
#define ML_CONN 0x0000000004000000ULL /* net connection management */
#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
/* bits that are infrequently given and frequently matched in the high word */
#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
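
With the ML_XATTR bit defined above, the xattr code can emit maskable debug output in the same way the other subsystems in this diff do. A trivial, purely illustrative sketch; the helper and message are made up:

#include <linux/fs.h>
#include "cluster/masklog.h"

/* Hypothetical trace helper gated on the new xattr log mask. */
static void example_xattr_trace(struct inode *inode)
{
	mlog(ML_XATTR, "xattr operation on inode %lu\n", inode->i_ino);
}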


@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **new_bh);
static struct buffer_head *ocfs2_bread(struct inode *inode,
int block, int *err, int reada)
{
struct buffer_head *bh = NULL;
int tmperr;
u64 p_blkno;
int readflags = 0;
if (reada)
readflags |= OCFS2_BH_READAHEAD;
if (((u64)block << inode->i_sb->s_blocksize_bits) >=
i_size_read(inode)) {
BUG_ON(!reada);
return NULL;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
NULL);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
if (tmperr < 0) {
mlog_errno(tmperr);
goto fail;
}
tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
if (tmperr < 0)
goto fail;
tmperr = 0;
*err = 0;
return bh;
fail:
brelse(bh);
bh = NULL;
*err = -EIO;
return NULL;
}
/*
* bh passed here can be an inode block or a dir data block, depending
* on the inode inline data flag.
@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
&di_bh, OCFS2_BH_CACHED, dir);
ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@ -260,14 +302,13 @@ restart:
}
if ((bh = bh_use[ra_ptr++]) == NULL)
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* read error, skip block & hope for the best */
if (ocfs2_read_block(dir, block, &bh)) {
/* read error, skip block & hope for the best.
* ocfs2_read_block() has released the bh. */
ocfs2_error(dir->i_sb, "reading directory %llu, "
"offset %lu\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno,
block);
brelse(bh);
goto next;
}
i = ocfs2_search_dirblock(bh, dir, name, namelen,
@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
&di_bh, OCFS2_BH_CACHED, dir);
ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
struct ocfs2_inline_data *data;
struct ocfs2_dir_entry *de;
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
&di_bh, OCFS2_BH_CACHED, inode);
ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
if (ret) {
mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
i > 0; i--) {
tmp = ocfs2_bread(inode, ++blk, &err, 1);
if (tmp)
brelse(tmp);
brelse(tmp);
}
last_ra_blk = blk;
ra_sectors = 8;
@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name,
leave:
if (status < 0) {
*dirent = NULL;
if (*dirent_bh) {
brelse(*dirent_bh);
*dirent_bh = NULL;
}
brelse(*dirent_bh);
*dirent_bh = NULL;
}
mlog_exit(status);
@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
ret = 0;
bail:
if (dirent_bh)
brelse(dirent_bh);
brelse(dirent_bh);
mlog_exit(ret);
return ret;
@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
status = 0;
bail:
if (new_bh)
brelse(new_bh);
brelse(new_bh);
mlog_exit(status);
return status;
@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
struct buffer_head *dirdata_bh = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
handle_t *handle;
struct ocfs2_extent_tree et;
ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
alloc = ocfs2_clusters_for_bytes(sb, bytes);
@ -1305,8 +1342,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* This should never fail as our extent list is empty and all
* related blocks have been journaled already.
*/
ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
NULL);
ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
0, NULL);
if (ret) {
mlog_errno(ret);
goto out_commit;
@ -1337,8 +1374,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
}
blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
len, 0, NULL);
ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
blkno, len, 0, NULL);
if (ret) {
mlog_errno(ret);
goto out_commit;
@ -1383,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
if (extend) {
u32 offset = OCFS2_I(dir)->ip_clusters;
status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
1, 0, parent_fe_bh, handle,
data_ac, meta_ac, NULL);
status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
1, 0, parent_fe_bh, handle,
data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN);
if (status < 0) {
mlog_errno(status);
@ -1430,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
int credits, num_free_extents, drop_alloc_sem = 0;
loff_t dir_i_size;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
struct ocfs2_extent_list *el = &fe->id2.i_list;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle = NULL;
struct buffer_head *new_bh = NULL;
struct ocfs2_dir_entry * de;
struct super_block *sb = osb->sb;
struct ocfs2_extent_tree et;
mlog_entry_void();
@ -1479,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
spin_lock(&OCFS2_I(dir)->ip_lock);
if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
spin_unlock(&OCFS2_I(dir)->ip_lock);
num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
@ -1487,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
}
if (!num_free_extents) {
status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@ -1502,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
goto bail;
}
credits = ocfs2_calc_extend_credits(sb, fe, 1);
credits = ocfs2_calc_extend_credits(sb, el, 1);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@ -1568,8 +1608,7 @@ bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
if (new_bh)
brelse(new_bh);
brelse(new_bh);
mlog_exit(status);
return status;
@ -1696,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = 0;
bail:
if (bh)
brelse(bh);
brelse(bh);
mlog_exit(status);
return status;
@ -1756,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
*ret_de_bh = bh;
bh = NULL;
out:
if (bh)
brelse(bh);
brelse(bh);
return ret;
}


@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
} else {
/* Boo, we have to go to disk. */
/* read bh, cast, ocfs2_refresh_inode */
status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
bh, OCFS2_BH_CACHED, inode);
status = ocfs2_read_block(inode, oi->ip_blkno, bh);
if (status < 0) {
mlog_errno(status);
goto bail_refresh;
@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode,
return 0;
}
status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
OCFS2_I(inode)->ip_blkno,
ret_bh,
OCFS2_BH_CACHED,
inode);
status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
if (status < 0)
mlog_errno(status);


@ -293,8 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
&eb_bh, OCFS2_BH_CACHED, inode);
ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@ -382,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
goto no_more_extents;
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
ret = ocfs2_read_block(inode,
le64_to_cpu(eb->h_next_leaf_blk),
&next_eb_bh, OCFS2_BH_CACHED, inode);
&next_eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@ -551,6 +550,66 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
}
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el)
{
int ret = 0, i;
struct buffer_head *eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_rec *rec;
u32 coff;
if (el->l_tree_depth) {
ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in "
"xattr leaf block %llu\n", inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
}
}
i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
ret = -EROFS;
mlog_errno(ret);
goto out;
} else {
rec = &el->l_recs[i];
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
"record (%u, %u, 0) in xattr", inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
goto out;
}
coff = v_cluster - le32_to_cpu(rec->e_cpos);
*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(rec->e_blkno));
*p_cluster = *p_cluster + coff;
if (num_clusters)
*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
}
out:
if (eb_bh)
brelse(eb_bh);
return ret;
}
int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
unsigned int *extent_flags)
@ -571,8 +630,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
if (ret == 0)
goto out;
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
&di_bh, OCFS2_BH_CACHED, inode);
ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;


@ -53,4 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el);
#endif /* _EXTENT_MAP_H */
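
The declaration above gives the xattr code its own virtual-to-physical cluster lookup. A hedged sketch of how a caller might turn an offset in an xattr tree into a disk block number; "el" is assumed to be the root extent list of the xattr tree, the function name is invented, and error handling is abbreviated:

#include "ocfs2.h"
#include "extent_map.h"

/* Hypothetical lookup: map a virtual cluster in an xattr tree to a block. */
static u64 example_xattr_block(struct inode *inode,
			       struct ocfs2_extent_list *el, u32 v_cluster)
{
	u32 p_cluster, num_clusters;

	if (ocfs2_xattr_get_clusters(inode, v_cluster, &p_cluster,
				     &num_clusters, el))
		return 0;

	/* Same cluster-to-block conversion the other allocation paths use. */
	return ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
}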


@ -55,6 +55,7 @@
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
#include "xattr.h"
#include "buffer_head_io.h"
@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file,
goto bail;
journal = osb->journal->j_journal;
err = journal_force_commit(journal);
err = jbd2_journal_force_commit(journal);
bail:
mlog_exit(err);
@ -488,7 +489,7 @@ bail:
}
/*
* extend allocation only here.
* extend file allocation only here.
* we'll update all the disk stuff, and oip->alloc_size
*
* expect stuff to be locked, a transaction started and enough data /
@ -497,189 +498,25 @@ bail:
* Will return -EAGAIN, and a reason if a restart is needed.
* If passed in, *reason will always be set, even in error.
*/
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct inode *inode,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
{
int status = 0;
int free_extents;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
enum ocfs2_alloc_restarted reason = RESTART_NONE;
u32 bit_off, num_bits;
u64 block;
u8 flags = 0;
int ret;
struct ocfs2_extent_tree et;
BUG_ON(!clusters_to_add);
if (mark_unwritten)
flags = OCFS2_EXT_UNWRITTEN;
free_extents = ocfs2_num_free_extents(osb, inode, fe);
if (free_extents < 0) {
status = free_extents;
mlog_errno(status);
goto leave;
}
/* there are two cases which could cause us to EAGAIN in the
* we-need-more-metadata case:
* 1) we haven't reserved *any*
* 2) we are so fragmented, we've needed to add metadata too
* many times. */
if (!free_extents && !meta_ac) {
mlog(0, "we haven't reserved any metadata!\n");
status = -EAGAIN;
reason = RESTART_META;
goto leave;
} else if ((!free_extents)
&& (ocfs2_alloc_context_bits_left(meta_ac)
< ocfs2_extend_meta_needed(fe))) {
mlog(0, "filesystem is really fragmented...\n");
status = -EAGAIN;
reason = RESTART_META;
goto leave;
}
status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
clusters_to_add, &bit_off, &num_bits);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto leave;
}
BUG_ON(num_bits > clusters_to_add);
/* reserve our write early -- insert_extent may update the inode */
status = ocfs2_journal_access(handle, inode, fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
}
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
*logical_offset, block, num_bits,
flags, meta_ac);
if (status < 0) {
mlog_errno(status);
goto leave;
}
status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0) {
mlog_errno(status);
goto leave;
}
clusters_to_add -= num_bits;
*logical_offset += num_bits;
if (clusters_to_add) {
mlog(0, "need to alloc once more, clusters = %u, wanted = "
"%u\n", fe->i_clusters, clusters_to_add);
status = -EAGAIN;
reason = RESTART_TRANS;
}
leave:
mlog_exit(status);
if (reason_ret)
*reason_ret = reason;
return status;
}
/*
* For a given allocation, determine which allocators will need to be
* accessed, and lock them, reserving the appropriate number of bits.
*
* Sparse file systems call this from ocfs2_write_begin_nolock()
* and ocfs2_allocate_unwritten_extents().
*
* File systems which don't support holes call this from
* ocfs2_extend_allocation().
*/
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac)
{
int ret = 0, num_free_extents;
unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
*meta_ac = NULL;
if (data_ac)
*data_ac = NULL;
BUG_ON(clusters_to_add != 0 && data_ac == NULL);
mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
"clusters_to_add = %u, extents_to_split = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
num_free_extents = ocfs2_num_free_extents(osb, inode, di);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
goto out;
}
/*
* Sparse allocation file systems need to be more conservative
* with reserving room for expansion - the actual allocation
* happens while we've got a journal handle open so re-taking
* a cluster lock (because we ran out of room for another
* extent) will violate ordering rules.
*
* Most of the time we'll only be seeing this 1 cluster at a time
* anyway.
*
* Always lock for any unwritten extents - we might want to
* add blocks during a split.
*/
if (!num_free_extents ||
(ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
}
if (clusters_to_add == 0)
goto out;
ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
out:
if (ret) {
if (*meta_ac) {
ocfs2_free_alloc_context(*meta_ac);
*meta_ac = NULL;
}
/*
* We cannot have an error and a non null *data_ac.
*/
}
ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
clusters_to_add, mark_unwritten,
&et, handle,
data_ac, meta_ac, reason_ret);
return ret;
}
@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
struct ocfs2_alloc_context *meta_ac = NULL;
enum ocfs2_alloc_restarted why;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
*/
BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
OCFS2_BH_CACHED, inode);
status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto leave;
@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
&meta_ac);
mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
"clusters_to_add = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
clusters_to_add);
ocfs2_init_dinode_extent_tree(&et, inode, bh);
status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
&data_ac, &meta_ac);
if (status) {
mlog_errno(status);
goto leave;
}
credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
clusters_to_add);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@ -753,16 +597,16 @@ restarted_transaction:
prev_clusters = OCFS2_I(inode)->ip_clusters;
status = ocfs2_do_extend_allocation(osb,
inode,
&logical_start,
clusters_to_add,
mark_unwritten,
bh,
handle,
data_ac,
meta_ac,
&why);
status = ocfs2_add_inode_data(osb,
inode,
&logical_start,
clusters_to_add,
mark_unwritten,
bh,
handle,
data_ac,
meta_ac,
&why);
if ((status < 0) && (status != -EAGAIN)) {
if (status != -ENOSPC)
mlog_errno(status);
@ -789,7 +633,7 @@ restarted_transaction:
mlog(0, "restarting transaction.\n");
/* TODO: This can be more intelligent. */
credits = ocfs2_calc_extend_credits(osb->sb,
fe,
&fe->id2.i_list,
clusters_to_add);
status = ocfs2_extend_trans(handle, credits);
if (status < 0) {
@ -826,10 +670,8 @@ leave:
restart_func = 0;
goto restart_all;
}
if (bh) {
brelse(bh);
bh = NULL;
}
brelse(bh);
bh = NULL;
mlog_exit(status);
return status;
@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock;
}
if (i_size_read(inode) > attr->ia_size)
if (i_size_read(inode) > attr->ia_size) {
if (ocfs2_should_order_data(inode)) {
status = ocfs2_begin_ordered_truncate(inode,
attr->ia_size);
if (status)
goto bail_unlock;
}
status = ocfs2_truncate_file(inode, bh, attr->ia_size);
else
} else
status = ocfs2_extend_file(inode, bh, attr->ia_size);
if (status < 0) {
if (status != -ENOSPC)
@ -1140,8 +988,7 @@ bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
bail:
if (bh)
brelse(bh);
brelse(bh);
mlog_exit(status);
return status;
@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
struct buffer_head *bh = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
struct buffer_head *di_bh = NULL;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
OCFS2_I(inode)->ip_blkno, &di_bh,
OCFS2_BH_CACHED, inode);
ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
&di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
handle_t *handle;
struct ocfs2_alloc_context *meta_ac = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_extent_tree et;
ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
if (ret) {
mlog_errno(ret);
return ret;
@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
goto out;
}
ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
dealloc);
if (ret) {
mlog_errno(ret);
@ -2040,7 +1888,7 @@ out_dio:
*/
if (old_size != i_size_read(inode) ||
old_clusters != OCFS2_I(inode)->ip_clusters) {
ret = journal_force_commit(osb->journal->j_journal);
ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
}
@ -2227,6 +2075,10 @@ const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
.fallocate = ocfs2_fallocate,
.fiemap = ocfs2_fiemap,
};
@ -2237,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = {
.permission = ocfs2_permission,
};
/*
* Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
*/
const struct file_operations ocfs2_fops = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@ -2250,13 +2106,60 @@ const struct file_operations ocfs2_fops = {
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.lock = ocfs2_lock,
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
};
const struct file_operations ocfs2_dops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.lock = ocfs2_lock,
.flock = ocfs2_flock,
};
/*
* POSIX-lockless variants of our file_operations.
*
* These will be used if the underlying cluster stack does not support
* posix file locking, if the user passes the "localflocks" mount
* option, or if we have a local-only fs.
*
* ocfs2_flock is in here because all stacks handle UNIX file locks,
* so we still want it in the case of no stack support for
* plocks. Internally, it will do the right thing when asked to ignore
* the cluster.
*/
const struct file_operations ocfs2_fops_no_plocks = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
.aio_read = ocfs2_file_aio_read,
.aio_write = ocfs2_file_aio_write,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
};
const struct file_operations ocfs2_dops = {
const struct file_operations ocfs2_dops_no_plocks = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = ocfs2_readdir,


@ -28,9 +28,12 @@
extern const struct file_operations ocfs2_fops;
extern const struct file_operations ocfs2_dops;
extern const struct file_operations ocfs2_fops_no_plocks;
extern const struct file_operations ocfs2_dops_no_plocks;
extern const struct inode_operations ocfs2_file_iops;
extern const struct inode_operations ocfs2_special_file_iops;
struct ocfs2_alloc_context;
enum ocfs2_alloc_restarted;
struct ocfs2_file_private {
struct file *fp_file;
@ -38,27 +41,18 @@ struct ocfs2_file_private {
struct ocfs2_lock_res fp_flock;
};
enum ocfs2_alloc_restarted {
RESTART_NONE = 0,
RESTART_TRANS,
RESTART_META
};
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct inode *inode,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
u64 zero_to);
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);


@ -49,6 +49,7 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
#include "xattr.h"
#include "buffer_head_io.h"
@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
struct super_block *sb;
struct ocfs2_super *osb;
int status = -EINVAL;
int use_plocks = 1;
mlog_entry("(0x%p, size:%llu)\n", inode,
(unsigned long long)le64_to_cpu(fe->i_size));
@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
sb = inode->i_sb;
osb = OCFS2_SB(sb);
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
use_plocks = 0;
/* this means that read_inode cannot create a superblock inode
* today. change if needed. */
if (!OCFS2_IS_VALID_DINODE(fe) ||
@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_fop = &ocfs2_fops;
if (use_plocks)
inode->i_fop = &ocfs2_fops;
else
inode->i_fop = &ocfs2_fops_no_plocks;
inode->i_op = &ocfs2_file_iops;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
case S_IFDIR:
inode->i_op = &ocfs2_dir_iops;
inode->i_fop = &ocfs2_dops;
if (use_plocks)
inode->i_fop = &ocfs2_dops;
else
inode->i_fop = &ocfs2_dops_no_plocks;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
case S_IFLNK:
@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
can_lock ? inode : NULL);
if (can_lock)
status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
OCFS2_BH_IGNORE_CACHE);
else
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
* data and fast symlinks.
*/
if (fe->i_clusters) {
if (ocfs2_should_order_data(inode))
ocfs2_begin_ordered_truncate(inode, 0);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
goto bail_unlock_dir;
}
/*Free extended attribute resources associated with this inode.*/
status = ocfs2_xattr_remove(inode, di_bh);
if (status < 0) {
mlog_errno(status);
goto bail_unlock_dir;
}
status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
orphan_dir_bh);
if (status < 0)
@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode)
oi->ip_last_trans = 0;
oi->ip_dir_start_lookup = 0;
oi->ip_blkno = 0ULL;
jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
&oi->ip_jinode);
bail:
mlog_exit_void();
@ -1106,58 +1133,6 @@ void ocfs2_drop_inode(struct inode *inode)
mlog_exit_void();
}
/*
* TODO: this should probably be merged into ocfs2_get_block
*
* However, you now need to pay attention to the cont_prepare_write()
* stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
* expects never to extend).
*/
struct buffer_head *ocfs2_bread(struct inode *inode,
int block, int *err, int reada)
{
struct buffer_head *bh = NULL;
int tmperr;
u64 p_blkno;
int readflags = OCFS2_BH_CACHED;
if (reada)
readflags |= OCFS2_BH_READAHEAD;
if (((u64)block << inode->i_sb->s_blocksize_bits) >=
i_size_read(inode)) {
BUG_ON(!reada);
return NULL;
}
down_read(&OCFS2_I(inode)->ip_alloc_sem);
tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
NULL);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
if (tmperr < 0) {
mlog_errno(tmperr);
goto fail;
}
tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
readflags, inode);
if (tmperr < 0)
goto fail;
tmperr = 0;
*err = 0;
return bh;
fail:
if (bh) {
brelse(bh);
bh = NULL;
}
*err = -EIO;
return NULL;
}
/*
* This is called from our getattr.
*/


@ -40,6 +40,9 @@ struct ocfs2_inode_info
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
@ -68,6 +71,7 @@ struct ocfs2_inode_info
struct ocfs2_extent_map ip_extent_map;
struct inode vfs_inode;
struct jbd2_inode ip_jinode;
};
/*
@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache;
extern const struct address_space_operations ocfs2_aops;
struct buffer_head *ocfs2_bread(struct inode *inode, int block,
int *err, int reada);
void ocfs2_clear_inode(struct inode *inode);
void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode);


@ -102,8 +102,7 @@ bail_unlock:
bail:
mutex_unlock(&inode->i_mutex);
if (bh)
brelse(bh);
brelse(bh);
mlog_exit(status);
return status;


@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
goto finally;
}
journal_lock_updates(journal->j_journal);
status = journal_flush(journal->j_journal);
journal_unlock_updates(journal->j_journal);
jbd2_journal_lock_updates(journal->j_journal);
status = jbd2_journal_flush(journal->j_journal);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0) {
up_write(&journal->j_trans_barrier);
mlog_errno(status);
@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
down_read(&osb->journal->j_trans_barrier);
handle = journal_start(journal, max_buffs);
handle = jbd2_journal_start(journal, max_buffs);
if (IS_ERR(handle)) {
up_read(&osb->journal->j_trans_barrier);
@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
BUG_ON(!handle);
ret = journal_stop(handle);
ret = jbd2_journal_stop(handle);
if (ret < 0)
mlog_errno(ret);
@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
* transaction. extend_trans will either extend the current handle by
* nblocks, or commit it and start a new one with nblocks credits.
*
* This might call journal_restart() which will commit dirty buffers
* This might call jbd2_journal_restart() which will commit dirty buffers
* and then restart the transaction. Before calling
* ocfs2_extend_trans(), any changed blocks should have been
* dirtied. After calling it, all blocks which need to be changed must
@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
#ifdef CONFIG_OCFS2_DEBUG_FS
status = 1;
#else
status = journal_extend(handle, nblocks);
status = jbd2_journal_extend(handle, nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
#endif
if (status > 0) {
mlog(0, "journal_extend failed, trying journal_restart\n");
status = journal_restart(handle, nblocks);
mlog(0,
"jbd2_journal_extend failed, trying "
"jbd2_journal_restart\n");
status = jbd2_journal_restart(handle, nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle,
switch (type) {
case OCFS2_JOURNAL_ACCESS_CREATE:
case OCFS2_JOURNAL_ACCESS_WRITE:
status = journal_get_write_access(handle, bh);
status = jbd2_journal_get_write_access(handle, bh);
break;
case OCFS2_JOURNAL_ACCESS_UNDO:
status = journal_get_undo_access(handle, bh);
status = jbd2_journal_get_undo_access(handle, bh);
break;
default:
@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle,
mlog_entry("(bh->b_blocknr=%llu)\n",
(unsigned long long)bh->b_blocknr);
status = journal_dirty_metadata(handle, bh);
status = jbd2_journal_dirty_metadata(handle, bh);
if (status < 0)
mlog(ML_ERROR, "Could not dirty metadata buffer. "
"(bh->b_blocknr=%llu)\n",
@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle,
return status;
}
#ifdef CONFIG_OCFS2_COMPAT_JBD
int ocfs2_journal_dirty_data(handle_t *handle,
struct buffer_head *bh)
{
@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
return err;
}
#endif
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
void ocfs2_set_journal_params(struct ocfs2_super *osb)
{
@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
spin_lock(&journal->j_state_lock);
journal->j_commit_interval = commit_interval;
if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
journal->j_flags |= JFS_BARRIER;
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JFS_BARRIER;
journal->j_flags &= ~JBD2_BARRIER;
spin_unlock(&journal->j_state_lock);
}
@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
/* call the kernels journal init function now */
j_journal = journal_init_inode(inode);
j_journal = jbd2_journal_init_inode(inode);
if (j_journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
status = -EINVAL;
goto done;
}
mlog(0, "Returned from journal_init_inode\n");
mlog(0, "Returned from jbd2_journal_init_inode\n");
mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
@ -550,8 +554,7 @@ done:
if (status < 0) {
if (inode_lock)
ocfs2_inode_unlock(inode, 1);
if (bh != NULL)
brelse(bh);
brelse(bh);
if (inode) {
OCFS2_I(inode)->ip_open_count--;
iput(inode);
@ -639,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (journal->j_state != OCFS2_JOURNAL_LOADED)
goto done;
/* need to inc inode use count as journal_destroy will iput. */
/* need to inc inode use count - jbd2_journal_destroy will iput. */
if (!igrab(inode))
BUG();
@ -668,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
if (ocfs2_mount_local(osb)) {
journal_lock_updates(journal->j_journal);
status = journal_flush(journal->j_journal);
journal_unlock_updates(journal->j_journal);
jbd2_journal_lock_updates(journal->j_journal);
status = jbd2_journal_flush(journal->j_journal);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0)
mlog_errno(status);
}
@ -686,7 +689,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
}
/* Shutdown the kernel journal system */
journal_destroy(journal->j_journal);
jbd2_journal_destroy(journal->j_journal);
OCFS2_I(inode)->ip_open_count--;
@ -711,15 +714,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
{
int olderr;
olderr = journal_errno(journal);
olderr = jbd2_journal_errno(journal);
if (olderr) {
mlog(ML_ERROR, "File system error %d recorded in "
"journal %u.\n", olderr, slot);
mlog(ML_ERROR, "File system on device %s needs checking.\n",
sb->s_id);
journal_ack_err(journal);
journal_clear_err(journal);
jbd2_journal_ack_err(journal);
jbd2_journal_clear_err(journal);
}
}
@ -734,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
osb = journal->j_osb;
status = journal_load(journal->j_journal);
status = jbd2_journal_load(journal->j_journal);
if (status < 0) {
mlog(ML_ERROR, "Failed to load journal!\n");
goto done;
@ -778,7 +781,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
BUG_ON(!journal);
status = journal_wipe(journal->j_journal, full);
status = jbd2_journal_wipe(journal->j_journal, full);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -847,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
/* We are reading journal data which should not
* be put in the uptodate cache */
status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
p_blkno, p_blocks, bhs, 0,
NULL);
status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
p_blkno, p_blocks, bhs);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -865,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
bail:
for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
if (bhs[i])
brelse(bhs[i]);
brelse(bhs[i]);
mlog_exit(status);
return status;
}
@ -1133,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
}
SET_INODE_JOURNAL(inode);
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode);
status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -1229,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
}
mlog(0, "calling journal_init_inode\n");
journal = journal_init_inode(inode);
journal = jbd2_journal_init_inode(inode);
if (journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
status = -EIO;
goto done;
}
status = journal_load(journal);
status = jbd2_journal_load(journal);
if (status < 0) {
mlog_errno(status);
if (!igrab(inode))
BUG();
journal_destroy(journal);
jbd2_journal_destroy(journal);
goto done;
}
@ -1249,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* wipe the journal */
mlog(0, "flushing the journal.\n");
journal_lock_updates(journal);
status = journal_flush(journal);
journal_unlock_updates(journal);
jbd2_journal_lock_updates(journal);
status = jbd2_journal_flush(journal);
jbd2_journal_unlock_updates(journal);
if (status < 0)
mlog_errno(status);
@ -1272,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
if (!igrab(inode))
BUG();
journal_destroy(journal);
jbd2_journal_destroy(journal);
done:
/* drop the lock on this node's journal */
@ -1282,8 +1284,7 @@ done:
if (inode)
iput(inode);
if (bh)
brelse(bh);
brelse(bh);
mlog_exit(status);
return status;


@ -27,7 +27,12 @@
#define OCFS2_JOURNAL_H
#include <linux/fs.h>
#include <linux/jbd.h>
#ifndef CONFIG_OCFS2_COMPAT_JBD
# include <linux/jbd2.h>
#else
# include <linux/jbd.h>
# include "ocfs2_jbd_compat.h"
#endif
enum ocfs2_journal_state {
OCFS2_JOURNAL_FREE = 0,
@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
* buffer. Will have to call ocfs2_journal_dirty once
* we've actually dirtied it. Type is one of . or .
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
* ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
* the current handle commits.
* ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
* the current handle commits.
*/
/* You must always start_trans with a number of buffs > 0, but it's
@ -268,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle,
*/
int ocfs2_journal_dirty(handle_t *handle,
struct buffer_head *bh);
#ifdef CONFIG_OCFS2_COMPAT_JBD
int ocfs2_journal_dirty_data(handle_t *handle,
struct buffer_head *bh);
#endif
/*
* Credit Macros:
@ -283,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
/* simple file updates like chmod, etc. */
#define OCFS2_INODE_UPDATE_CREDITS 1
/* extended attribute block update */
#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
@ -340,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
+ OCFS2_UNLINK_CREDITS)
/* global bitmap dinode, group desc., relinked group,
* suballocator dinode, group desc., relinked group,
* dinode, xattr block */
#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
+ OCFS2_INODE_UPDATE_CREDITS \
+ OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
/*
* Please note that the caller must make sure that root_el is the root
* of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
* the result may be wrong.
*/
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
struct ocfs2_dinode *fe,
struct ocfs2_extent_list *root_el,
u32 bits_wanted)
{
int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
/* bitmap dinode, group desc. + relinked group. */
bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@ -355,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
* however many metadata chunks needed * a remaining suballoc
* alloc. */
sysfile_bitmap_blocks = 1 +
(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
/* this does not include *new* metadata blocks, which are
* accounted for in sysfile_bitmap_blocks. fe +
* accounted for in sysfile_bitmap_blocks. root_el +
* prev. last_eb_blk + blocks along edge of tree.
* calc_symlink_credits passes because we just need 1
* credit for the dinode there. */
dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
}
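/*
 * Usage sketch (illustrative only, not part of this change): a caller
 * extending an inode by clusters_to_add clusters passes the inode's own
 * extent list as the root and sizes its handle from the result.  'di' is
 * assumed to be the inode's ocfs2_dinode here.
 *
 *	credits = ocfs2_calc_extend_credits(osb->sb, &di->id2.i_list,
 *					    clusters_to_add);
 *	handle = ocfs2_start_trans(osb, credits);
 */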
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
@ -415,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
return credits;
}
static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
{
return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
}
static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
new_size);
}
#endif /* OCFS2_JOURNAL_H */


@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
#include <linux/debugfs.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
@ -47,8 +48,6 @@
#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
{
BUG_ON(osb->s_clustersize_bits > 20);
#ifdef CONFIG_OCFS2_FS_STATS
/* Size local alloc windows by the megabyte */
return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
{
file->private_data = inode->i_private;
return 0;
}
#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
#define LA_DEBUG_VER 1
static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
static DEFINE_MUTEX(la_debug_mutex);
struct ocfs2_super *osb = file->private_data;
int written, ret;
char *buf = osb->local_alloc_debug_buf;
mutex_lock(&la_debug_mutex);
memset(buf, 0, LA_DEBUG_BUF_SZ);
written = snprintf(buf, LA_DEBUG_BUF_SZ,
"0x%x\t0x%llx\t%u\t%u\t0x%x\n",
LA_DEBUG_VER,
(unsigned long long)osb->la_last_gd,
osb->local_alloc_default_bits,
osb->local_alloc_bits, osb->local_alloc_state);
ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
mutex_unlock(&la_debug_mutex);
return ret;
}
static const struct file_operations ocfs2_la_debug_fops = {
.open = ocfs2_la_debug_open,
.read = ocfs2_la_debug_read,
};
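/*
 * Illustrative only: reading the "local_alloc_stats" debugfs file yields
 * one tab-separated line in the format built above -- version, last group
 * descriptor block, default window bits, current window bits and state.
 * With made-up values for a throttled window:
 *
 *	0x1	0x21a80	2048	1024	0x2
 */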
static void ocfs2_init_la_debug(struct ocfs2_super *osb)
{
osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
if (!osb->local_alloc_debug_buf)
return;
osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
S_IFREG|S_IRUSR,
osb->osb_debug_root,
osb,
&ocfs2_la_debug_fops);
if (!osb->local_alloc_debug) {
kfree(osb->local_alloc_debug_buf);
osb->local_alloc_debug_buf = NULL;
}
}
static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
{
if (osb->local_alloc_debug)
debugfs_remove(osb->local_alloc_debug);
if (osb->local_alloc_debug_buf)
kfree(osb->local_alloc_debug_buf);
osb->local_alloc_debug_buf = NULL;
osb->local_alloc_debug = NULL;
}
#else /* CONFIG_OCFS2_FS_STATS */
static void ocfs2_init_la_debug(struct ocfs2_super *osb)
{
return;
}
static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
{
return;
}
#endif
static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
{
return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
osb->local_alloc_state == OCFS2_LA_ENABLED);
}
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters)
{
spin_lock(&osb->osb_lock);
if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
osb->local_alloc_state == OCFS2_LA_THROTTLED)
if (num_clusters >= osb->local_alloc_default_bits) {
cancel_delayed_work(&osb->la_enable_wq);
osb->local_alloc_state = OCFS2_LA_ENABLED;
}
spin_unlock(&osb->osb_lock);
}
void ocfs2_la_enable_worker(struct work_struct *work)
{
struct ocfs2_super *osb =
container_of(work, struct ocfs2_super,
la_enable_wq.work);
spin_lock(&osb->osb_lock);
osb->local_alloc_state = OCFS2_LA_ENABLED;
spin_unlock(&osb->osb_lock);
}
/*
* Tell us whether a given allocation should use the local alloc
* file. Otherwise, it has to go to the main bitmap.
*
* This function does semi-dirty reads of local alloc size and state!
* This is ok however, as the values are re-checked once under mutex.
*/
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
{
int la_bits = ocfs2_local_alloc_window_bits(osb);
int ret = 0;
int la_bits;
if (osb->local_alloc_state != OCFS2_LA_ENABLED)
spin_lock(&osb->osb_lock);
la_bits = osb->local_alloc_bits;
if (!ocfs2_la_state_enabled(osb))
goto bail;
/* la_bits should be at least twice the size (in clusters) of
@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
bail:
mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
spin_unlock(&osb->osb_lock);
return ret;
}
@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
mlog_entry_void();
if (osb->local_alloc_size == 0)
ocfs2_init_la_debug(osb);
if (osb->local_alloc_bits == 0)
goto bail;
if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
if (osb->local_alloc_bits >= osb->bitmap_cpg) {
mlog(ML_NOTICE, "Requested local alloc window %d is larger "
"than max possible %u. Using defaults.\n",
ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
osb->local_alloc_bits, (osb->bitmap_cpg - 1));
osb->local_alloc_bits =
ocfs2_megabytes_to_clusters(osb->sb,
OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
}
/* read the alloc off disk */
@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
goto bail;
}
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
&alloc_bh, 0, inode);
status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
&alloc_bh, OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
bail:
if (status < 0)
if (alloc_bh)
brelse(alloc_bh);
brelse(alloc_bh);
if (inode)
iput(inode);
mlog(0, "Local alloc window bits = %d\n",
ocfs2_local_alloc_window_bits(osb));
if (status < 0)
ocfs2_shutdown_la_debug(osb);
mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
mlog_exit(status);
return status;
@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
mlog_entry_void();
cancel_delayed_work(&osb->la_enable_wq);
flush_workqueue(ocfs2_wq);
ocfs2_shutdown_la_debug(osb);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
@ -295,8 +410,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock:
if (main_bm_bh)
brelse(main_bm_bh);
brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
mutex_lock(&inode->i_mutex);
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
&alloc_bh, 0, inode);
status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
&alloc_bh, OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -372,8 +486,7 @@ bail:
*alloc_copy = NULL;
}
if (alloc_bh)
brelse(alloc_bh);
brelse(alloc_bh);
if (inode) {
mutex_unlock(&inode->i_mutex);
@ -441,8 +554,7 @@ out_unlock:
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
if (main_bm_bh)
brelse(main_bm_bh);
brelse(main_bm_bh);
iput(main_bm_inode);
@ -453,8 +565,48 @@ out:
return status;
}
/* Check to see if the local alloc window is within ac->ac_max_block */
static int ocfs2_local_alloc_in_range(struct inode *inode,
struct ocfs2_alloc_context *ac,
u32 bits_wanted)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *alloc;
struct ocfs2_local_alloc *la;
int start;
u64 block_off;
if (!ac->ac_max_block)
return 1;
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
if (start == -1) {
mlog_errno(-ENOSPC);
return 0;
}
/*
* Converting (bm_off + start + bits_wanted) to blocks gives us
* the blkno just past our actual allocation. This is perfect
* to compare with ac_max_block.
*/
block_off = ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(la->la_bm_off) +
start + bits_wanted);
mlog(0, "Checking %llu against %llu\n",
(unsigned long long)block_off,
(unsigned long long)ac->ac_max_block);
if (block_off > ac->ac_max_block)
return 0;
return 1;
}
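/*
 * Worked example for the check above (numbers are illustrative; assume 4K
 * clusters and 4K blocks so clusters_to_blocks() is a 1:1 mapping): with
 * la_bm_off == 1000, start == 16 and bits_wanted == 8, block_off == 1024,
 * the first block past the would-be allocation.  If ac_max_block were
 * 1000, block_off > ac_max_block and the caller falls back to the main
 * bitmap.
 */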
/*
* make sure we've got at least bitswanted contiguous bits in the
* make sure we've got at least bits_wanted contiguous bits in the
* local alloc. You lose them when you drop i_mutex.
*
* We will add ourselves to the transaction passed in, but may start
@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
mutex_lock(&local_alloc_inode->i_mutex);
if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
status = -ENOSPC;
goto bail;
}
if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
mlog(0, "Asking for more than my max window size!\n");
/*
* We must double check state and allocator bits because
* another process may have changed them while holding i_mutex.
*/
spin_lock(&osb->osb_lock);
if (!ocfs2_la_state_enabled(osb) ||
(bits_wanted > osb->local_alloc_bits)) {
spin_unlock(&osb->osb_lock);
status = -ENOSPC;
goto bail;
}
spin_unlock(&osb->osb_lock);
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
/*
* Under certain conditions, the window slide code
* might have reduced the number of bits available or
* disabled the local alloc entirely. Re-check
* here and return -ENOSPC if necessary.
*/
status = -ENOSPC;
if (!ocfs2_la_state_enabled(osb))
goto bail;
free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
le32_to_cpu(alloc->id1.bitmap1.i_used);
if (bits_wanted > free_bits)
goto bail;
}
if (ac->ac_max_block)
mlog(0, "Calling in_range for max block %llu\n",
(unsigned long long)ac->ac_max_block);
if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
bits_wanted)) {
/*
* The window is outside ac->ac_max_block.
* This errno tells the caller to keep localalloc enabled
* but to get the allocation from the main bitmap.
*/
status = -EFBIG;
goto bail;
}
ac->ac_inode = local_alloc_inode;
@ -789,6 +973,85 @@ bail:
return status;
}
enum ocfs2_la_event {
OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */
OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has
* enough bits theoretically
* free, but a contiguous
* allocation could not be
* found. */
OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have
* enough bits free to satisfy
* our request. */
};
#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
/*
* Given an event, calculate the size of our next local alloc window.
*
* This should always be called under i_mutex of the local alloc inode
* so that local alloc disabling doesn't race with processes trying to
* use the allocator.
*
* Returns the state which the local alloc was left in. This value can
* be ignored by some paths.
*/
static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
enum ocfs2_la_event event)
{
unsigned int bits;
int state;
spin_lock(&osb->osb_lock);
if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
goto out_unlock;
}
/*
* ENOSPC and fragmentation are treated similarly for now.
*/
if (event == OCFS2_LA_EVENT_ENOSPC ||
event == OCFS2_LA_EVENT_FRAGMENTED) {
/*
* We ran out of contiguous space in the primary
* bitmap. Drastically reduce the number of bits used
* by local alloc until we have to disable it.
*/
bits = osb->local_alloc_bits >> 1;
if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
/*
* By setting state to THROTTLED, we'll keep
* the number of local alloc bits used down
* until an event occurs which would give us
* reason to assume the bitmap situation might
* have changed.
*/
osb->local_alloc_state = OCFS2_LA_THROTTLED;
osb->local_alloc_bits = bits;
} else {
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
OCFS2_LA_ENABLE_INTERVAL);
goto out_unlock;
}
/*
* Don't increase the size of the local alloc window until we
* know we might be able to fulfill the request. Otherwise, we
* risk bouncing around the global bitmap during periods of
* low space.
*/
if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
osb->local_alloc_bits = osb->local_alloc_default_bits;
out_unlock:
state = osb->local_alloc_state;
spin_unlock(&osb->osb_lock);
return state;
}
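/*
 * Numeric sketch of the throttling above (assuming 4K clusters, so
 * ocfs2_megabytes_to_clusters(sb, 1) == 256 and the default 8MB window is
 * 2048 bits): repeated ENOSPC/FRAGMENTED events halve the window,
 * 2048 -> 1024 -> 512, leaving the state THROTTLED each time; the next
 * halving would yield 256, which is not greater than the 1MB floor, so
 * local alloc is DISABLED and the delayed work re-enables it after
 * OCFS2_LA_ENABLE_INTERVAL.
 */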
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
struct ocfs2_alloc_context **ac,
struct inode **bitmap_inode,
@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
goto bail;
}
(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
retry_enospc:
(*ac)->ac_bits_wanted = osb->local_alloc_bits;
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
if (status == -ENOSPC) {
if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
OCFS2_LA_DISABLED)
goto bail;
ocfs2_free_ac_resource(*ac);
memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
goto retry_enospc;
}
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
mlog_errno(status);
goto bail;
}
@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
"one\n");
mlog(0, "Allocating %u clusters for a new window.\n",
ocfs2_local_alloc_window_bits(osb));
osb->local_alloc_bits);
/* Instruct the allocation code to try the most recently used
* cluster group. We'll re-record the group used this pass
@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
/* we used the generic suballoc reserve function, but we set
* everything up nicely, so there's no reason why we can't use
* the more specific cluster api to claim bits. */
status = ocfs2_claim_clusters(osb, handle, ac,
ocfs2_local_alloc_window_bits(osb),
status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
&cluster_off, &cluster_count);
if (status == -ENOSPC) {
retry_enospc:
/*
* Note: We could also try syncing the journal here to
* allow use of any free bits which the current
* transaction can't give us access to. --Mark
*/
if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
OCFS2_LA_DISABLED)
goto bail;
status = ocfs2_claim_clusters(osb, handle, ac,
osb->local_alloc_bits,
&cluster_off,
&cluster_count);
if (status == -ENOSPC)
goto retry_enospc;
/*
* We only shrunk the *minimum* number of bits in our
* request - it's entirely possible that the allocator
* might give us more than we asked for.
*/
if (status == 0) {
spin_lock(&osb->osb_lock);
osb->local_alloc_bits = cluster_count;
spin_unlock(&osb->osb_lock);
}
}
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
mlog_entry_void();
ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
/* This will lock the main bitmap for us. */
status = ocfs2_local_alloc_reserve_for_window(osb,
&ac,
@ -976,8 +1277,7 @@ bail:
if (handle)
ocfs2_commit_trans(osb, handle);
if (main_bm_bh)
brelse(main_bm_bh);
brelse(main_bm_bh);
if (main_bm_inode)
iput(main_bm_inode);


@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
u32 *bit_off,
u32 *num_bits);
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters);
void ocfs2_la_enable_worker(struct work_struct *work);
#endif /* OCFS2_LOCALALLOC_H */


@ -24,6 +24,7 @@
*/
#include <linux/fs.h>
#include <linux/fcntl.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@ -32,6 +33,7 @@
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
#include "locks.h"
static int ocfs2_do_flock(struct file *file, struct inode *inode,
@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
else
return ocfs2_do_flock(file, inode, cmd, fl);
}
int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
if (__mandatory_lock(inode))
return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
}


@ -27,5 +27,6 @@
#define OCFS2_LOCKS_H
int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
#endif /* OCFS2_LOCKS_H */


@ -60,6 +60,7 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
#include "xattr.h"
#include "buffer_head_io.h"
@ -327,14 +328,9 @@ leave:
if (status == -ENOSPC)
mlog(0, "Disk is full\n");
if (new_fe_bh)
brelse(new_fe_bh);
if (de_bh)
brelse(de_bh);
if (parent_fe_bh)
brelse(parent_fe_bh);
brelse(new_fe_bh);
brelse(de_bh);
brelse(parent_fe_bh);
if ((status < 0) && inode)
iput(inode);
@ -647,12 +643,9 @@ out_unlock_inode:
out:
ocfs2_inode_unlock(dir, 1);
if (de_bh)
brelse(de_bh);
if (fe_bh)
brelse(fe_bh);
if (parent_fe_bh)
brelse(parent_fe_bh);
brelse(de_bh);
brelse(fe_bh);
brelse(parent_fe_bh);
mlog_exit(err);
@ -851,17 +844,10 @@ leave:
iput(orphan_dir);
}
if (fe_bh)
brelse(fe_bh);
if (dirent_bh)
brelse(dirent_bh);
if (parent_node_bh)
brelse(parent_node_bh);
if (orphan_entry_bh)
brelse(orphan_entry_bh);
brelse(fe_bh);
brelse(dirent_bh);
brelse(parent_node_bh);
brelse(orphan_entry_bh);
mlog_exit(status);
@ -1372,24 +1358,15 @@ bail:
if (new_inode)
iput(new_inode);
if (newfe_bh)
brelse(newfe_bh);
if (old_inode_bh)
brelse(old_inode_bh);
if (old_dir_bh)
brelse(old_dir_bh);
if (new_dir_bh)
brelse(new_dir_bh);
if (new_de_bh)
brelse(new_de_bh);
if (old_de_bh)
brelse(old_de_bh);
if (old_inode_de_bh)
brelse(old_inode_de_bh);
if (orphan_entry_bh)
brelse(orphan_entry_bh);
if (insert_entry_bh)
brelse(insert_entry_bh);
brelse(newfe_bh);
brelse(old_inode_bh);
brelse(old_dir_bh);
brelse(new_dir_bh);
brelse(new_de_bh);
brelse(old_de_bh);
brelse(old_inode_de_bh);
brelse(orphan_entry_bh);
brelse(insert_entry_bh);
mlog_exit(status);
@ -1492,8 +1469,7 @@ bail:
if (bhs) {
for(i = 0; i < blocks; i++)
if (bhs[i])
brelse(bhs[i]);
brelse(bhs[i]);
kfree(bhs);
}
@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir,
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations;
status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
new_fe_bh,
handle, data_ac, NULL,
NULL);
status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
new_fe_bh,
handle, data_ac, NULL,
NULL);
if (status < 0) {
if (status != -ENOSPC && status != -EINTR) {
mlog(ML_ERROR,
@ -1659,12 +1635,9 @@ bail:
ocfs2_inode_unlock(dir, 1);
if (new_fe_bh)
brelse(new_fe_bh);
if (parent_fe_bh)
brelse(parent_fe_bh);
if (de_bh)
brelse(de_bh);
brelse(new_fe_bh);
brelse(parent_fe_bh);
brelse(de_bh);
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
if (data_ac)
@ -1759,8 +1732,7 @@ leave:
iput(orphan_dir_inode);
}
if (orphan_dir_bh)
brelse(orphan_dir_bh);
brelse(orphan_dir_bh);
mlog_exit(status);
return status;
@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
status = ocfs2_read_block(osb,
status = ocfs2_read_block(orphan_dir_inode,
OCFS2_I(orphan_dir_inode)->ip_blkno,
&orphan_dir_bh, OCFS2_BH_CACHED,
orphan_dir_inode);
&orphan_dir_bh);
if (status < 0) {
mlog_errno(status);
goto leave;
@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
leave:
if (orphan_dir_bh)
brelse(orphan_dir_bh);
brelse(orphan_dir_bh);
mlog_exit(status);
return status;
@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
}
leave:
if (target_de_bh)
brelse(target_de_bh);
brelse(target_de_bh);
mlog_exit(status);
return status;
@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
};


@ -34,7 +34,12 @@
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/jbd.h>
#ifndef CONFIG_OCFS2_COMPAT_JBD
# include <linux/jbd2.h>
#else
# include <linux/jbd.h>
# include "ocfs2_jbd_compat.h"
#endif
/* For union ocfs2_dlm_lksb */
#include "stackglue.h"
@ -171,9 +176,13 @@ struct ocfs2_alloc_stats
enum ocfs2_local_alloc_state
{
OCFS2_LA_UNUSED = 0,
OCFS2_LA_ENABLED,
OCFS2_LA_DISABLED
OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for
* this mountpoint. */
OCFS2_LA_ENABLED, /* Local alloc is in use. */
OCFS2_LA_THROTTLED, /* Local alloc is in use, but number
* of bits has been reduced. */
OCFS2_LA_DISABLED /* Local alloc has temporarily been
* disabled. */
};
enum ocfs2_mount_options
@ -184,6 +193,8 @@ enum ocfs2_mount_options
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@ -214,6 +225,7 @@ struct ocfs2_super
u32 bitmap_cpg;
u8 *uuid;
char *uuid_str;
u32 uuid_hash;
u8 *vol_label;
u64 first_cluster_group_blkno;
u32 fs_generation;
@ -241,6 +253,7 @@ struct ocfs2_super
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
unsigned int s_xattr_inline_size;
atomic_t vol_state;
struct mutex recovery_lock;
@ -252,11 +265,27 @@ struct ocfs2_super
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
int local_alloc_size;
enum ocfs2_local_alloc_state local_alloc_state;
struct delayed_work la_enable_wq;
/*
* Must hold local alloc i_mutex and osb->osb_lock to change
* local_alloc_bits. Reads can be done under either lock.
*/
unsigned int local_alloc_bits;
unsigned int local_alloc_default_bits;
enum ocfs2_local_alloc_state local_alloc_state; /* protected
* by osb_lock */
struct buffer_head *local_alloc_bh;
u64 la_last_gd;
#ifdef CONFIG_OCFS2_FS_STATS
struct dentry *local_alloc_debug;
char *local_alloc_debug_buf;
#endif
/* Next two fields are for local node slot recovery during
* mount. */
int dirty;
@ -340,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
return 0;
}
static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
return 1;
return 0;
}
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@ -554,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
return pages_per_cluster;
}
static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
unsigned int megs)
{
BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
}
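/*
 * Example: with a 4K cluster size, s_clustersize_bits == 12, so
 * ocfs2_megabytes_to_clusters(sb, 8) == 8 << (20 - 12) == 2048 clusters --
 * the default 8MB local alloc window expressed in bits.
 */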
static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
spin_lock(&osb->osb_lock);


@ -64,6 +64,7 @@
#define OCFS2_INODE_SIGNATURE "INODE01"
#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@ -90,7 +91,8 @@
| OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
| OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
| OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
| OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
| OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
| OCFS2_FEATURE_INCOMPAT_XATTR)
#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
/*
@ -127,10 +129,6 @@
/* Support for data packed into inode blocks */
#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
/* Support for the extended slot map */
#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
/*
* Support for alternate, userspace cluster stacks. If set, the superblock
* field s_cluster_info contains a tag for the alternate stack in use as
@ -142,6 +140,12 @@
*/
#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
/* Support for the extended slot map */
#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
/* Support for extended attributes */
#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@ -299,6 +303,12 @@ struct ocfs2_new_group_input {
*/
#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
/*
* Inline extended attribute size (in bytes)
* The value chosen should be aligned to 16 byte boundaries.
*/
#define OCFS2_MIN_XATTR_INLINE_SIZE 256
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
@ -563,7 +573,7 @@ struct ocfs2_super_block {
/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
before tunefs required */
__le16 s_tunefs_flag;
__le32 s_reserved1;
__le32 s_uuid_hash; /* hash value of uuid */
__le64 s_first_cluster_group; /* Block offset of 1st cluster
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
@ -571,7 +581,11 @@ struct ocfs2_super_block {
/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
stack. Only valid
with INCOMPAT flag. */
/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */
/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
for this fs*/
__le16 s_reserved0;
__le32 s_reserved1;
/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */
/*140*/
/*
@ -621,7 +635,8 @@ struct ocfs2_dinode {
belongs to */
__le16 i_suballoc_bit; /* Bit offset in suballocator
block group */
/*10*/ __le32 i_reserved0;
/*10*/ __le16 i_reserved0;
__le16 i_xattr_inline_size;
__le32 i_clusters; /* Cluster count */
__le32 i_uid; /* Owner UID */
__le32 i_gid; /* Owning GID */
@ -640,11 +655,12 @@ struct ocfs2_dinode {
__le32 i_atime_nsec;
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
__le32 i_attr;
/*70*/ __le32 i_attr;
__le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
was set in i_flags */
__le16 i_dyn_features;
/*70*/ __le64 i_reserved2[8];
__le64 i_xattr_loc;
/*80*/ __le64 i_reserved2[7];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@ -715,6 +731,136 @@ struct ocfs2_group_desc
/*40*/ __u8 bg_bitmap[0];
};
/*
* On disk extended attribute structure for OCFS2.
*/
/*
* ocfs2_xattr_entry indicates one extended attribute.
*
* Note that it can be stored in inode, one block or one xattr bucket.
*/
struct ocfs2_xattr_entry {
__le32 xe_name_hash; /* hash value of xattr prefix+suffix. */
__le16 xe_name_offset; /* byte offset from the 1st entry in the local
xattr storage (inode, xattr block or
xattr bucket). */
__u8 xe_name_len; /* xattr name len, doesn't include prefix. */
__u8 xe_type; /* the low 7 bits indicate the name prefix's
* type and the highest bit indicates whether
* the EA is stored in the local storage. */
__le64 xe_value_size; /* real xattr value length. */
};
/*
* On disk structure for xattr header.
*
* One ocfs2_xattr_header describes how many ocfs2_xattr_entry records are in
* the local xattr storage.
*/
struct ocfs2_xattr_header {
__le16 xh_count; /* contains the count of how
many records are in the
local xattr storage. */
__le16 xh_free_start; /* current offset for storing
xattr. */
__le16 xh_name_value_len; /* total length of name/value
length in this bucket. */
__le16 xh_num_buckets; /* number of buckets in one extent
record; only valid in the
first bucket. */
__le64 xh_csum;
struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
};
/*
* On disk structure for xattr value root.
*
* It is used when one extended attribute's size is large, and we will save it
* in an outside cluster. It will be stored in a b-tree, like file content.
*/
struct ocfs2_xattr_value_root {
/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */
__le32 xr_reserved0;
__le64 xr_last_eb_blk; /* Pointer to last extent block */
/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */
};
/*
* On disk structure for xattr tree root.
*
* It is used when there are too many extended attributes for one file. These
* attributes will be organized and stored in an indexed-btree.
*/
struct ocfs2_xattr_tree_root {
/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */
__le32 xt_reserved0;
__le64 xt_last_eb_blk; /* Pointer to last extent block */
/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */
};
#define OCFS2_XATTR_INDEXED 0x1
#define OCFS2_HASH_SHIFT 5
#define OCFS2_XATTR_ROUND 3
#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \
~(OCFS2_XATTR_ROUND))
#define OCFS2_XATTR_BUCKET_SIZE 4096
#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \
/ OCFS2_MIN_BLOCKSIZE)
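/*
 * Quick arithmetic for the macros above: OCFS2_XATTR_SIZE() rounds a
 * length up to a 4-byte boundary, so OCFS2_XATTR_SIZE(5) == 8 and
 * OCFS2_XATTR_SIZE(8) == 8.  With the minimum 512-byte block size, one 4K
 * bucket spans OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET == 4096 / 512 == 8 blocks.
 */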
/*
* On disk structure for xattr block.
*/
struct ocfs2_xattr_block {
/*00*/ __u8 xb_signature[8]; /* Signature for verification */
__le16 xb_suballoc_slot; /* Slot suballocator this
block belongs to. */
__le16 xb_suballoc_bit; /* Bit offset in suballocator
block group */
__le32 xb_fs_generation; /* Must match super block */
/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
__le64 xb_csum;
/*20*/ __le16 xb_flags; /* Indicates whether this block contains
real xattr or a xattr tree. */
__le16 xb_reserved0;
__le32 xb_reserved1;
__le64 xb_reserved2;
/*30*/ union {
struct ocfs2_xattr_header xb_header; /* xattr header if this
block contains xattr */
struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
block contains xattr
tree. */
} xb_attrs;
};
#define OCFS2_XATTR_ENTRY_LOCAL 0x80
#define OCFS2_XATTR_TYPE_MASK 0x7F
static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
int local)
{
if (local)
xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
else
xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
}
static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
{
return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
}
static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
{
xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
}
static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
{
return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
}
#ifdef __KERNEL__
static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
{
@ -728,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb)
offsetof(struct ocfs2_dinode, id2.i_data.id_data);
}
static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
struct ocfs2_dinode *di)
{
unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
return sb->s_blocksize -
offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
xattrsize;
else
return sb->s_blocksize -
offsetof(struct ocfs2_dinode, id2.i_data.id_data);
}
static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
{
int size;
@ -738,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
static inline int ocfs2_extent_recs_per_inode_with_xattr(
struct super_block *sb,
struct ocfs2_dinode *di)
{
int size;
unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
size = sb->s_blocksize -
offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
xattrsize;
else
size = sb->s_blocksize -
offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
return size / sizeof(struct ocfs2_extent_rec);
}
static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
{
int size;
@ -801,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
return 0;
}
static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
{
int size;
size = sb->s_blocksize -
offsetof(struct ocfs2_xattr_block,
xb_attrs.xb_root.xt_list.l_recs);
return size / sizeof(struct ocfs2_extent_rec);
}
#else
static inline int ocfs2_fast_symlink_chars(int blocksize)
{
@ -884,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
return 0;
}
static inline int ocfs2_xattr_recs_per_xb(int blocksize)
{
int size;
size = blocksize -
offsetof(struct ocfs2_xattr_block,
xb_attrs.xb_root.xt_list.l_recs);
return size / sizeof(struct ocfs2_extent_rec);
}
#endif /* __KERNEL__ */


@ -0,0 +1,82 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* ocfs2_jbd_compat.h
*
* Compatibility defines for JBD.
*
* Copyright (C) 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#ifndef OCFS2_JBD_COMPAT_H
#define OCFS2_JBD_COMPAT_H
#ifndef CONFIG_OCFS2_COMPAT_JBD
# error Should not have been included
#endif
struct jbd2_inode {
unsigned int dummy;
};
#define JBD2_BARRIER JFS_BARRIER
#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
#define jbd2_journal_ack_err journal_ack_err
#define jbd2_journal_clear_err journal_clear_err
#define jbd2_journal_destroy journal_destroy
#define jbd2_journal_dirty_metadata journal_dirty_metadata
#define jbd2_journal_errno journal_errno
#define jbd2_journal_extend journal_extend
#define jbd2_journal_flush journal_flush
#define jbd2_journal_force_commit journal_force_commit
#define jbd2_journal_get_write_access journal_get_write_access
#define jbd2_journal_get_undo_access journal_get_undo_access
#define jbd2_journal_init_inode journal_init_inode
#define jbd2_journal_invalidatepage journal_invalidatepage
#define jbd2_journal_load journal_load
#define jbd2_journal_lock_updates journal_lock_updates
#define jbd2_journal_restart journal_restart
#define jbd2_journal_start journal_start
#define jbd2_journal_start_commit journal_start_commit
#define jbd2_journal_stop journal_stop
#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
#define jbd2_journal_unlock_updates journal_unlock_updates
#define jbd2_journal_wipe journal_wipe
#define jbd2_log_wait_commit log_wait_commit
static inline int jbd2_journal_file_inode(handle_t *handle,
struct jbd2_inode *inode)
{
return 0;
}
static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
loff_t new_size)
{
return 0;
}
static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
struct inode *inode)
{
return;
}
static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
struct jbd2_inode *jinode)
{
return;
}
#endif /* OCFS2_JBD_COMPAT_H */


@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
if (cluster > clusters)
break;
ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
if (ret < 0) {
mlog_errno(ret);
break;
@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
* update the superblock last.
* It doesn't matter if the write failed.
*/
ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
&super_bh, 0, NULL);
ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
&super_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
first_new_cluster - 1);
ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
main_bm_inode);
ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out_unlock;
}
ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
if (ret < 0) {
mlog(ML_ERROR, "Can't read the group descriptor # %llu "
"from the device.", (unsigned long long)input->group);


@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
* be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
si->si_inode);
ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
OCFS2_BH_IGNORE_CACHE);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
(unsigned long long)blkno);
bh = NULL; /* Acquire a fresh bh */
status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;


@ -28,6 +28,7 @@
#include "ocfs2.h" /* For struct ocfs2_lock_res */
#include "stackglue.h"
#include <linux/dlm_plock.h>
/*
* The control protocol starts with a handshake. Until the handshake
@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
{
}
static int user_plock(struct ocfs2_cluster_connection *conn,
u64 ino,
struct file *file,
int cmd,
struct file_lock *fl)
{
/*
* This more or less just demuxes the plock request into any
* one of three dlm calls.
*
* Internally, fs/dlm will pass these to a misc device, which
* a userspace daemon will read and write to.
*
* For now, cancel requests (which happen internally only)
* are turned into unlocks. Most of this function was taken from
* gfs2_lock.
*/
if (cmd == F_CANCELLK) {
cmd = F_SETLK;
fl->fl_type = F_UNLCK;
}
if (IS_GETLK(cmd))
return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
else if (fl->fl_type == F_UNLCK)
return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
else
return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
}
/*
* Compare a requested locking protocol version against the current one.
*
@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
.dlm_unlock = user_dlm_unlock,
.lock_status = user_dlm_lock_status,
.lock_lvb = user_dlm_lvb,
.plock = user_plock,
.dump_lksb = user_dlm_dump_lksb,
};


@ -288,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
int ocfs2_stack_supports_plocks(void)
{
return active_stack && active_stack->sp_ops->plock;
}
EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
/*
* ocfs2_plock() can only be safely called if
* ocfs2_stack_supports_plocks() returned true
*/
int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
struct file *file, int cmd, struct file_lock *fl)
{
WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
if (active_stack->sp_ops->plock)
return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(ocfs2_plock);
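/*
 * Usage sketch (illustrative, not part of this change): callers are
 * expected to gate ocfs2_plock() on ocfs2_stack_supports_plocks(), e.g.
 * when deciding whether to honor a POSIX lock request:
 *
 *	if (ocfs2_stack_supports_plocks())
 *		ret = ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno,
 *				  file, cmd, fl);
 *	else
 *		ret = -ENOLCK;
 */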
int ocfs2_cluster_connect(const char *stack_name,
const char *group,
int grouplen,


@ -28,6 +28,10 @@
#include "dlm/dlmapi.h"
#include <linux/dlm.h>
/* Needed for plock-related prototypes */
struct file;
struct file_lock;
/*
* dlmconstants.h does not have a LOCAL flag. We hope to remove it
* some day, but right now we need it. Let's fake it. This value is larger
@ -186,6 +190,17 @@ struct ocfs2_stack_operations {
*/
void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
/*
* Cluster-aware posix locks
*
* This is NULL for stacks which do not support posix locks.
*/
int (*plock)(struct ocfs2_cluster_connection *conn,
u64 ino,
struct file *file,
int cmd,
struct file_lock *fl);
/*
* This is an optional debugging hook. If provided, the
* stack can dump debugging information about this lock.
@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
int ocfs2_stack_supports_plocks(void);
int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
struct file *file, int cmd, struct file_lock *fl);
void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);


@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct inode *alloc_inode,
struct buffer_head *bh);
struct buffer_head *bh,
u64 max_block);
static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
u16 *bit_off, u16 *bits_found);
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
u16 *bit_off, u16 *bits_found);
static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac,
@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
u64 data_blkno,
u64 *bg_blkno,
u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
u32 bits_wanted, u64 max_block,
struct ocfs2_alloc_context **ac);
static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
struct inode *inode = ac->ac_inode;
@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
iput(inode);
ac->ac_inode = NULL;
}
if (ac->ac_bh) {
brelse(ac->ac_bh);
ac->ac_bh = NULL;
}
brelse(ac->ac_bh);
ac->ac_bh = NULL;
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
*/
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct inode *alloc_inode,
struct buffer_head *bh)
struct buffer_head *bh,
u64 max_block)
{
int status, credits;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
mlog_entry_void();
cl = &fe->id2.i_chain;
status = ocfs2_reserve_clusters(osb,
le16_to_cpu(cl->cl_cpg),
&ac);
status = ocfs2_reserve_clusters_with_limit(osb,
le16_to_cpu(cl->cl_cpg),
max_block, &ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@ -394,8 +399,7 @@ bail:
if (ac)
ocfs2_free_alloc_context(ac);
if (bg_bh)
brelse(bg_bh);
brelse(bg_bh);
mlog_exit(status);
return status;
@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
goto bail;
}
status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
ac->ac_max_block);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
get_bh(bh);
ac->ac_bh = bh;
bail:
if (bh)
brelse(bh);
brelse(bh);
mlog_exit(status);
return status;
}
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
struct ocfs2_dinode *fe,
struct ocfs2_alloc_context **ac)
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
int blocks,
struct ocfs2_alloc_context **ac)
{
int status;
u32 slot;
@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
goto bail;
}
(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
(*ac)->ac_bits_wanted = blocks;
(*ac)->ac_which = OCFS2_AC_USE_META;
slot = osb->slot_num;
(*ac)->ac_group_search = ocfs2_block_group_search;
@ -532,6 +536,15 @@ bail:
return status;
}
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
struct ocfs2_extent_list *root_el,
struct ocfs2_alloc_context **ac)
{
return ocfs2_reserve_new_metadata_blocks(osb,
ocfs2_extend_meta_needed(root_el),
ac);
}
static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac)
{
@ -581,6 +594,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
(*ac)->ac_group_search = ocfs2_block_group_search;
/*
* stat(2) can't handle i_ino > 32bits, so we tell the
* lower levels not to allocate us a block group past that
* limit. The 'inode64' mount option avoids this behavior.
*/
if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
(*ac)->ac_max_block = (u32)~0U;
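/*
 * Back-of-the-envelope for the cap above (illustrative): with 4K
 * blocks, (u32)~0U block numbers cover 2^32 * 4K == 16TB, so without
 * 'inode64' new inode block groups -- and therefore inode numbers,
 * which are block numbers in ocfs2 -- stay within the first ~16TB of
 * the volume and fit in 32 bits.
 */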
/*
* slot is set when we successfully steal inode from other nodes.
* It is reset in 3 places:
@ -661,9 +682,9 @@ bail:
/* Callers don't need to care which bitmap (local alloc or main) to
* use so we figure it out for them, but unfortunately this clutters
* things a bit. */
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context **ac)
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
u32 bits_wanted, u64 max_block,
struct ocfs2_alloc_context **ac)
{
int status;
@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
}
(*ac)->ac_bits_wanted = bits_wanted;
(*ac)->ac_max_block = max_block;
status = -ENOSPC;
if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
status = ocfs2_reserve_local_alloc_bits(osb,
bits_wanted,
*ac);
if ((status < 0) && (status != -ENOSPC)) {
if (status == -EFBIG) {
/* The local alloc window is outside ac_max_block.
* use the main bitmap. */
status = -ENOSPC;
} else if ((status < 0) && (status != -ENOSPC)) {
mlog_errno(status);
goto bail;
} else if (status == -ENOSPC) {
/* reserve_local_bits will return enospc with
* the local alloc inode still locked, so we
* can change this safely here. */
mlog(0, "Disabling local alloc\n");
/* We set to OCFS2_LA_DISABLED so that umount
* can clean up what's left of the local
* allocation */
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
}
@ -718,6 +735,13 @@ bail:
return status;
}
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context **ac)
{
return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
}
/*
* More or less lifted from ext3. I'll leave their description below:
*
@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
u16 *bit_off, u16 *bits_found)
{
int search = -ENOSPC;
int ret;
u64 blkoff;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u16 tmp_off, tmp_found;
unsigned int max_bits, gd_cluster_off;
@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
if (ret)
return ret;
if (max_block) {
blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
gd_cluster_off +
tmp_off + tmp_found);
mlog(0, "Checking %llu against %llu\n",
(unsigned long long)blkoff,
(unsigned long long)max_block);
if (blkoff > max_block)
return -ENOSPC;
}
/* ocfs2_block_group_find_clear_bits() might
* return success, but we still want to return
* -ENOSPC unless it found the minimum number
@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
*bit_off = tmp_off;
*bits_found = tmp_found;
search = 0; /* success */
} else if (tmp_found) {
/*
* Don't show bits which we'll be returning
* for allocation to the local alloc bitmap.
*/
ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
}
}
@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode,
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
u16 *bit_off, u16 *bits_found)
{
int ret = -ENOSPC;
u64 blkoff;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
BUG_ON(min_bits != 1);
BUG_ON(ocfs2_is_cluster_bitmap(inode));
if (bg->bg_free_bits_count)
if (bg->bg_free_bits_count) {
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
le16_to_cpu(bg->bg_bits),
bit_off, bits_found);
if (!ret && max_block) {
blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
*bits_found;
mlog(0, "Checking %llu against %llu\n",
(unsigned long long)blkoff,
(unsigned long long)max_block);
if (blkoff > max_block)
ret = -ENOSPC;
}
}
return ret;
}
@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
struct ocfs2_group_desc *gd;
struct inode *alloc_inode = ac->ac_inode;
ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
&group_bh, OCFS2_BH_CACHED, alloc_inode);
ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
}
ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
bit_off, &found);
ac->ac_max_block, bit_off, &found);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
bits_wanted, chain,
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
status = ocfs2_read_block(alloc_inode,
le64_to_cpu(cl->cl_recs[chain].c_blkno),
&group_bh, OCFS2_BH_CACHED, alloc_inode);
&group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
/* for now, the chain search is a bit simplistic. We just use
* the 1st group with any empty bits. */
while ((status = ac->ac_group_search(alloc_inode, group_bh,
bits_wanted, min_bits, bit_off,
bits_wanted, min_bits,
ac->ac_max_block, bit_off,
&tmp_bits)) == -ENOSPC) {
if (!bg->bg_next_group)
break;
if (prev_group_bh) {
brelse(prev_group_bh);
prev_group_bh = NULL;
}
brelse(prev_group_bh);
prev_group_bh = NULL;
next_group = le64_to_cpu(bg->bg_next_group);
prev_group_bh = group_bh;
group_bh = NULL;
status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
next_group, &group_bh,
OCFS2_BH_CACHED, alloc_inode);
status = ocfs2_read_block(alloc_inode,
next_group, &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
*bg_blkno = le64_to_cpu(bg->bg_blkno);
*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
if (group_bh)
brelse(group_bh);
if (prev_group_bh)
brelse(prev_group_bh);
brelse(group_bh);
brelse(prev_group_bh);
mlog_exit(status);
return status;
@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
{
int status = 0;
u32 tmp_used;
struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
struct ocfs2_chain_list *cl = &fe->id2.i_chain;
struct buffer_head *group_bh = NULL;
@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
(unsigned long long)bg_blkno, start_bit);
status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
alloc_inode);
status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
}
bail:
if (group_bh)
brelse(group_bh);
brelse(group_bh);
mlog_exit(status);
return status;
@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle,
status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
bg_start_bit, bg_blkno,
num_clusters);
if (status < 0)
if (status < 0) {
mlog_errno(status);
goto out;
}
ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
num_clusters);
out:
mlog_exit(status);
return status;
}
@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
(unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
}
}
/*
* For a given allocation, determine which allocators will need to be
* accessed, and lock them, reserving the appropriate number of bits.
*
* Sparse file systems call this from ocfs2_write_begin_nolock()
* and ocfs2_allocate_unwritten_extents().
*
* File systems which don't support holes call this from
* ocfs2_extend_allocation().
*/
int ocfs2_lock_allocators(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac)
{
int ret = 0, num_free_extents;
unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
*meta_ac = NULL;
if (data_ac)
*data_ac = NULL;
BUG_ON(clusters_to_add != 0 && data_ac == NULL);
num_free_extents = ocfs2_num_free_extents(osb, inode, et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
goto out;
}
/*
* Sparse allocation file systems need to be more conservative
* with reserving room for expansion - the actual allocation
* happens while we've got a journal handle open so re-taking
* a cluster lock (because we ran out of room for another
* extent) will violate ordering rules.
*
* Most of the time we'll only be seeing this 1 cluster at a time
* anyway.
*
* Always lock for any unwritten extents - we might want to
* add blocks during a split.
*/
if (!num_free_extents ||
(ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
}
if (clusters_to_add == 0)
goto out;
ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
out:
if (ret) {
if (*meta_ac) {
ocfs2_free_alloc_context(*meta_ac);
*meta_ac = NULL;
}
/*
* We cannot have an error and a non-null *data_ac.
*/
}
return ret;
}
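As the comment above describes, callers take both allocator reservations before opening a journal handle. A hedged caller sketch (the function name and the elided transaction steps are hypothetical, not part of this patch):

static int example_extend(struct inode *inode, struct ocfs2_extent_tree *et,
			  u32 clusters_to_add)
{
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	int ret;

	ret = ocfs2_lock_allocators(inode, et, clusters_to_add, 0,
				    &data_ac, &meta_ac);
	if (ret < 0)
		return ret;	/* contexts are already cleaned up on error */

	/* ... start a handle, insert the new extents, commit ... */

	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	return ret;
}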


@ -28,10 +28,11 @@
typedef int (group_search_t)(struct inode *,
struct buffer_head *,
u32,
u32,
u16 *,
u16 *);
u32, /* bits_wanted */
u32, /* min_bits */
u64, /* max_block */
u16 *, /* *bit_off */
u16 *); /* *bits_found */
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
@ -51,6 +52,8 @@ struct ocfs2_alloc_context {
group_search_t *ac_group_search;
u64 ac_last_group;
u64 ac_max_block; /* Highest block number to allocate. 0 is the same as ~0 - unlimited */
};
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
return ac->ac_bits_wanted - ac->ac_bits_given;
}
/*
* Please note that the caller must make sure that root_el is the root
* of the extent tree. For an inode, that is &fe->id2.i_list; otherwise
* the result may be wrong.
*/
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
struct ocfs2_dinode *fe,
struct ocfs2_extent_list *root_el,
struct ocfs2_alloc_context **ac);
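A minimal usage sketch of the note above, assuming an inode-rooted tree whose dinode buffer di_bh has already been read (di_bh and osb are placeholders, not defined by this patch):

struct ocfs2_dinode *fe = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_alloc_context *meta_ac = NULL;
int ret;

/* For an inode, the extent tree root is the dinode's extent list. */
ret = ocfs2_reserve_new_metadata(osb, &fe->id2.i_list, &meta_ac);
if (ret < 0 && ret != -ENOSPC)
	mlog_errno(ret);
/* Release with ocfs2_free_alloc_context(meta_ac) once the handle commits. */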
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
int blocks,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
* apis above. */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac);
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
/* given a cluster offset, calculate which block group it belongs to
* and return that block offset. */
@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
int ocfs2_check_group_descriptor(struct super_block *sb,
struct ocfs2_dinode *di,
struct ocfs2_group_desc *gd);
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac);
#endif /* _CHAINALLOC_H_ */


@ -64,6 +64,7 @@
#include "sysfile.h"
#include "uptodate.h"
#include "ver.h"
#include "xattr.h"
#include "buffer_head_io.h"
@ -154,6 +155,9 @@ enum {
Opt_localalloc,
Opt_localflocks,
Opt_stack,
Opt_user_xattr,
Opt_nouser_xattr,
Opt_inode64,
Opt_err,
};
@ -173,6 +177,9 @@ static const match_table_t tokens = {
{Opt_localalloc, "localalloc=%d"},
{Opt_localflocks, "localflocks"},
{Opt_stack, "cluster_stack=%s"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_inode64, "inode64"},
{Opt_err, NULL}
};
@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
ocfs2_schedule_truncate_log_flush(osb, 0);
}
if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
&target)) {
if (wait)
log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
target);
jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
target);
}
return 0;
}
@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
if (!oi)
return NULL;
jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
return &oi->vfs_inode;
}
@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
/* Probably don't want this on remount; it might
* mess with other nodes */
if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
(parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
goto out;
}
/* We're going to/from readonly mode. */
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
/* Lock here so the check of HARD_RO and the potential
@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
osb->local_alloc_size = parsed_options.localalloc_opt;
osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
osb->local_alloc_bits = osb->local_alloc_default_bits;
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
return status;
read_super_error:
if (bh != NULL)
brelse(bh);
brelse(bh);
if (inode)
iput(inode);
@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_data_writeback:
mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
break;
case Opt_user_xattr:
mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
break;
case Opt_nouser_xattr:
mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
break;
case Opt_atime_quantum:
if (match_int(&args[0], &option)) {
status = 0;
@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb,
if (option < 0)
return 0;
if (option == 0)
option = JBD_DEFAULT_MAX_COMMIT_AGE;
option = JBD2_DEFAULT_MAX_COMMIT_AGE;
mopt->commit_interval = HZ * option;
break;
case Opt_localalloc:
@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb,
OCFS2_STACK_LABEL_LEN);
mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
break;
case Opt_inode64:
mopt->mount_opt |= OCFS2_MOUNT_INODE64;
break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
{
struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
unsigned long opts = osb->s_mount_opt;
unsigned int local_alloc_megs;
if (opts & OCFS2_MOUNT_HB_LOCAL)
seq_printf(s, ",_netdev,heartbeat=local");
@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",commit=%u",
(unsigned) (osb->osb_commit_interval / HZ));
if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
seq_printf(s, ",localalloc=%d", local_alloc_megs);
if (opts & OCFS2_MOUNT_LOCALFLOCKS)
seq_printf(s, ",localflocks,");
@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
osb->osb_cluster_stack);
if (opts & OCFS2_MOUNT_NOUSERXATTR)
seq_printf(s, ",nouser_xattr");
else
seq_printf(s, ",user_xattr");
if (opts & OCFS2_MOUNT_INODE64)
seq_printf(s, ",inode64");
return 0;
}
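The reporting above converts the cluster count back to megabytes with local_alloc_bits >> (20 - s_clustersize_bits). A worked example, assuming 32 KB clusters (s_clustersize_bits = 15) and the 8 MB default window (illustrative values):

/*
 * 8 MB / 32 KB per cluster = 256 clusters, so local_alloc_bits == 256.
 * 256 >> (20 - 15) == 256 >> 5 == 8, and ",localalloc=8" is reported,
 * the inverse of ocfs2_megabytes_to_clusters() applied at mount time.
 */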
@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(void *data)
oi->ip_dir_start_lookup = 0;
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
oi->ip_blkno = 0ULL;
@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
sb->s_export_op = &ocfs2_export_ops;
sb->s_xattr = ocfs2_xattr_handlers;
sb->s_time_gran = 1;
sb->s_flags |= MS_NOATIME;
/* this is needed to support O_LARGEFILE */
@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->slot_num = OCFS2_INVALID_SLOT;
osb->s_xattr_inline_size = le16_to_cpu(
di->id2.i_super.s_xattr_inline_size);
osb->local_alloc_state = OCFS2_LA_UNUSED;
osb->local_alloc_bh = NULL;
INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
init_waitqueue_head(&osb->osb_mount_event);
@ -1568,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->first_cluster_group_blkno =
le64_to_cpu(di->id2.i_super.s_first_cluster_group);
osb->fs_generation = le32_to_cpu(di->i_fs_generation);
osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
mlog(0, "vol_label: %s\n", osb->vol_label);
mlog(0, "uuid: %s\n", osb->uuid_str);
mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",


@ -50,6 +50,7 @@
#include "inode.h"
#include "journal.h"
#include "symlink.h"
#include "xattr.h"
#include "buffer_head_io.h"
@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
mlog_entry_void();
status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
OCFS2_I(inode)->ip_blkno,
bh,
OCFS2_BH_CACHED,
inode);
status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
if (status < 0) {
mlog_errno(status);
link = ERR_PTR(status);
@ -157,8 +154,7 @@ bail:
kunmap(page);
page_cache_release(page);
}
if (bh)
brelse(bh);
brelse(bh);
return ERR_PTR(status);
}
@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
.follow_link = ocfs2_follow_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
};
const struct inode_operations ocfs2_fast_symlink_inode_operations = {
.readlink = ocfs2_readlink,
.follow_link = ocfs2_follow_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
};


@ -53,7 +53,11 @@
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
#include <linux/jbd.h>
#ifndef CONFIG_OCFS2_COMPAT_JBD
# include <linux/jbd2.h>
#else
# include <linux/jbd.h>
#endif
#define MLOG_MASK_PREFIX ML_UPTODATE
@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
ci->ci_num_cached--;
}
/* Called when we remove a chunk of metadata from an inode. We don't
* bother reverting things to an inlined array in the case of a remove
* which moves us back under the limit. */
void ocfs2_remove_from_cache(struct inode *inode,
struct buffer_head *bh)
static void ocfs2_remove_block_from_cache(struct inode *inode,
sector_t block)
{
int index;
sector_t block = bh->b_blocknr;
struct ocfs2_meta_cache_item *item = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode,
kmem_cache_free(ocfs2_uptodate_cachep, item);
}
/*
* Called when we remove a chunk of metadata from an inode. We don't
* bother reverting things to an inlined array in the case of a remove
* which moves us back under the limit.
*/
void ocfs2_remove_from_cache(struct inode *inode,
struct buffer_head *bh)
{
sector_t block = bh->b_blocknr;
ocfs2_remove_block_from_cache(inode, block);
}
/* Called when we remove xattr clusters from an inode. */
void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
sector_t block,
u32 c_len)
{
unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
for (i = 0; i < b_len; i++, block++)
ocfs2_remove_block_from_cache(inode, block);
}
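A worked example of the cluster-to-block expansion above, assuming 32 KB clusters and 4 KB blocks (an assumed geometry, not required by the patch):

/*
 * ocfs2_clusters_to_blocks(sb, 1) == 32 KB / 4 KB == 8 blocks per cluster,
 * so c_len == 2 xattr clusters gives b_len == 16 and the loop drops 16
 * consecutive blocks, starting at 'block', from the metadata cache.
 */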
int __init init_ocfs2_uptodate_cache(void)
{
ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",


@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
struct buffer_head *bh);
void ocfs2_remove_from_cache(struct inode *inode,
struct buffer_head *bh);
void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
sector_t block,
u32 c_len);
int ocfs2_buffer_read_ahead(struct inode *inode,
struct buffer_head *bh);

fs/ocfs2/xattr.c (new file, 4834 lines): the diff is not shown because of its large size.

fs/ocfs2/xattr.h (new file, 68 lines):

@ -0,0 +1,68 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* xattr.h
*
* Function prototypes
*
* Copyright (C) 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef OCFS2_XATTR_H
#define OCFS2_XATTR_H
#include <linux/init.h>
#include <linux/xattr.h>
enum ocfs2_xattr_type {
OCFS2_XATTR_INDEX_USER = 1,
OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
OCFS2_XATTR_INDEX_TRUSTED,
OCFS2_XATTR_INDEX_SECURITY,
OCFS2_XATTR_MAX
};
extern struct xattr_handler ocfs2_xattr_user_handler;
extern struct xattr_handler ocfs2_xattr_trusted_handler;
extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
size_t, int);
extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
extern struct xattr_handler *ocfs2_xattr_handlers[];
static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
{
return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
}
static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
{
return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
}
static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
{
u16 len = sb->s_blocksize -
offsetof(struct ocfs2_xattr_header, xh_entries);
return len / sizeof(struct ocfs2_xattr_entry);
}
#endif /* OCFS2_XATTR_H */
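A worked example of the three helpers above, assuming OCFS2_XATTR_BUCKET_SIZE is 4 KB on a volume with 64 KB clusters and 4 KB blocks (assumed values, for illustration only):

/*
 * ocfs2_xattr_buckets_per_cluster(): (1 << 16) / 4096 == 16 buckets per cluster.
 * ocfs2_blocks_per_xattr_bucket():   4096 / (1 << 12) == 1 block per bucket.
 * ocfs2_xattr_max_xe_in_bucket():    (4096 - offsetof(struct ocfs2_xattr_header,
 *                                     xh_entries)) / sizeof(struct ocfs2_xattr_entry)
 *                                     xattr entries fit in one bucket block.
 */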