[GFS2] Faster gfs2_bitfit algorithm

This version of the gfs2_bitfit algorithm includes the latest suggestions from Steve Whitehouse. It is typically eight to ten times faster than the version we're using today. If there is a lot of metadata mixed in (lots of small files) the algorithm is often 15 times faster, and given the right conditions, I've seen peaks of 20 times faster. Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2008-03-10 18:17:47 -05:00 · 2008-03-10 18:17:47 -05:00 · 1f466a47e8
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/prefetch.h>

 #include "gfs2.h"
 #include "incore.h"
@ -33,6 +34,16 @@
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)

+#if BITS_PER_LONG == 32
+#define LBITMASK   (0x55555555UL)
+#define LBITSKIP55 (0x55555555UL)
+#define LBITSKIP00 (0x00000000UL)
+#else
+#define LBITMASK   (0x5555555555555555UL)
+#define LBITSKIP55 (0x5555555555555555UL)
+#define LBITSKIP00 (0x0000000000000000UL)
+#endif
+
 /*
 * These routines are used by the resource group routines (rgrp.c)
 * to keep track of block allocation.  Each block is represented by two
@ -138,45 +149,63 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
 static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
 		       u8 old_state)
 {
-	const u8 *byte;
-	u32 blk = goal;
-	unsigned int bit, bitlong;
-	const unsigned long *plong;
-#if BITS_PER_LONG == 32
-	const unsigned long plong55 = 0x55555555;
-#else
-	const unsigned long plong55 = 0x5555555555555555;
-#endif
+	const u8 *byte, *start, *end;
+	int bit, startbit;
+	u32 g1, g2, misaligned;
+	unsigned long *plong;
+	unsigned long lskipval;

-	byte = buffer + (goal / GFS2_NBBY);
-	plong = (const unsigned long *)(buffer + (goal / GFS2_NBBY));
-	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-	bitlong = bit;
-
-	while (byte < buffer + buflen) {
-
-		if (bitlong == 0 && old_state == 0 && *plong == plong55) {
-			plong++;
-			byte += sizeof(unsigned long);
-			blk += sizeof(unsigned long) * GFS2_NBBY;
-			continue;
+	lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
+	g1 = (goal / GFS2_NBBY);
+	start = buffer + g1;
+	byte = start;
+        end = buffer + buflen;
+	g2 = ALIGN(g1, sizeof(unsigned long));
+	plong = (unsigned long *)(buffer + g2);
+	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+	misaligned = g2 - g1;
+	if (!misaligned)
+		goto ulong_aligned;
+/* parse the bitmap a byte at a time */
+misaligned:
+	while (byte < end) {
+		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
+			return goal +
+				(((byte - start) * GFS2_NBBY) +
+				 ((bit - startbit) >> 1));
 		}
-		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
-			return blk;
 		bit += GFS2_BIT_SIZE;
-		if (bit >= 8) {
+		if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
 			bit = 0;
 			byte++;
+			misaligned--;
+			if (!misaligned) {
+				plong = (unsigned long *)byte;
+				goto ulong_aligned;
 			}
-		bitlong += GFS2_BIT_SIZE;
-		if (bitlong >= sizeof(unsigned long) * 8) {
-			bitlong = 0;
+		}
+	}
+	return BFITNOENT;
+
+/* parse the bitmap a unsigned long at a time */
+ulong_aligned:
+	/* Stop at "end - 1" or else prefetch can go past the end and segfault.
+	   We could "if" it but we'd lose some of the performance gained.
+	   This way will only slow down searching the very last 4/8 bytes
+	   depending on architecture.  I've experimented with several ways
+	   of writing this section such as using an else before the goto
+	   but this one seems to be the fastest. */
+	while ((unsigned char *)plong < end - 1) {
+		prefetch(plong + 1);
+		if (((*plong) & LBITMASK) != lskipval)
+			break;
 		plong++;
 	}
-
-		blk++;
+	if ((unsigned char *)plong < end) {
+		byte = (const u8 *)plong;
+		misaligned += sizeof(unsigned long) - 1;
+		goto misaligned;
 	}
-
 	return BFITNOENT;
 }