bcache: Kill unused freelist

This was originally added as an optimization that for various reasons isn't
needed anymore, but it does add a lot of nasty corner cases (and it was
responsible for some recently fixed bugs). Just get rid of it now.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>

Parent: 0a63b66db5
Commit: 2531d9ee61
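For orientation, here is a minimal sketch of the per-cache bucket pipelines this commit touches, condensed from the bcache.h hunks below (toy stand-in types, not the kernel definitions):

#include <stddef.h>

/* Toy FIFO standing in for bcache's DECLARE_FIFO(). */
struct fifo { long *data; size_t front, back, size; };

#define RESERVE_NR 4	/* btree/prio/movinggc/none reserves */

struct cache_before {			/* before this commit */
	struct fifo free[RESERVE_NR];	/* ready to be handed out */
	struct fifo free_inc;		/* invalidated; waiting for the
					 * prios/gens to be rewritten */
	struct fifo unused;		/* GC saw no pointers into these, so
					 * they skipped the prios/gens write:
					 * the optimization being removed */
	unsigned char need_save_prio;	/* forced prio writes so the on-disk
					 * gens (disk_gen) never lagged far */
};

struct cache_after {			/* after this commit */
	struct fifo free[RESERVE_NR];
	struct fifo free_inc;		/* now the only path onto free[] */
};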
drivers/md/bcache/alloc.c

@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
 	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
 	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
 
-	if (CACHE_SYNC(&ca->set->sb)) {
-		ca->need_save_prio = max(ca->need_save_prio,
-					 bucket_disk_gen(b));
-		WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
-	}
-
 	return ret;
 }
 
@@ -120,58 +114,46 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
 	mutex_unlock(&c->bucket_lock);
 }
 
-/* Allocation */
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
 
 static inline bool can_inc_bucket_gen(struct bucket *b)
 {
-	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
-		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
+	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
 }
 
-bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
+	BUG_ON(!ca->set->gc_mark_valid);
 
-	if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
-		unsigned i;
-
-		for (i = 0; i < RESERVE_NONE; i++)
-			if (!fifo_full(&ca->free[i]))
-				goto add;
-
-		return false;
-	}
-add:
-	b->prio = 0;
-
-	if (can_inc_bucket_gen(b) &&
-	    fifo_push(&ca->unused, b - ca->buckets)) {
-		atomic_inc(&b->pin);
-		return true;
-	}
-
-	return false;
-}
-
-static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
-{
 	return (!GC_MARK(b) ||
 		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
 		can_inc_bucket_gen(b);
 }
 
-static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 {
-	size_t bucket = b - ca->buckets;
+	lockdep_assert_held(&ca->set->bucket_lock);
+	BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
 
 	if (GC_SECTORS_USED(b))
-		trace_bcache_invalidate(ca, bucket);
+		trace_bcache_invalidate(ca, b - ca->buckets);
 
 	bch_inc_gen(ca, b);
 	b->prio = INITIAL_PRIO;
 	atomic_inc(&b->pin);
-	fifo_push(&ca->free_inc, bucket);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+	__bch_invalidate_one_bucket(ca, b);
+
+	fifo_push(&ca->free_inc, b - ca->buckets);
 }
 
 /*
@@ -201,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca)
 	ca->heap.used = 0;
 
 	for_each_bucket(b, ca) {
-		/*
-		 * If we fill up the unused list, if we then return before
-		 * adding anything to the free_inc list we'll skip writing
-		 * prios/gens and just go back to allocating from the unused
-		 * list:
-		 */
-		if (fifo_full(&ca->unused))
-			return;
-
-		if (!can_invalidate_bucket(ca, b))
-			continue;
-
-		if (!GC_SECTORS_USED(b) &&
-		    bch_bucket_add_unused(ca, b))
+		if (!bch_can_invalidate_bucket(ca, b))
 			continue;
 
 		if (!heap_full(&ca->heap))
@@ -239,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca)
 			return;
 		}
 
-		invalidate_one_bucket(ca, b);
+		bch_invalidate_one_bucket(ca, b);
 	}
 }
 
@@ -255,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca)
 
 		b = ca->buckets + ca->fifo_last_bucket++;
 
-		if (can_invalidate_bucket(ca, b))
-			invalidate_one_bucket(ca, b);
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
 
 		if (++checked >= ca->sb.nbuckets) {
 			ca->invalidate_needs_gc = 1;
@@ -280,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca)
 
 		b = ca->buckets + n;
 
-		if (can_invalidate_bucket(ca, b))
-			invalidate_one_bucket(ca, b);
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
 
 		if (++checked >= ca->sb.nbuckets / 2) {
 			ca->invalidate_needs_gc = 1;
@@ -293,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca)
 
 static void invalidate_buckets(struct cache *ca)
 {
-	if (ca->invalidate_needs_gc)
-		return;
+	BUG_ON(ca->invalidate_needs_gc);
 
 	switch (CACHE_REPLACEMENT(&ca->sb)) {
 	case CACHE_REPLACEMENT_LRU:
@@ -354,17 +322,10 @@ static int bch_allocator_thread(void *arg)
 		 * possibly issue discards to them, then we add the bucket to
 		 * the free list:
 		 */
-		while (1) {
+		while (!fifo_empty(&ca->free_inc)) {
 			long bucket;
 
-			if ((!atomic_read(&ca->set->prio_blocked) ||
-			     !CACHE_SYNC(&ca->set->sb)) &&
-			    !fifo_empty(&ca->unused))
-				fifo_pop(&ca->unused, bucket);
-			else if (!fifo_empty(&ca->free_inc))
-				fifo_pop(&ca->free_inc, bucket);
-			else
-				break;
+			fifo_pop(&ca->free_inc, bucket);
 
 			if (ca->discard) {
 				mutex_unlock(&ca->set->bucket_lock);
@@ -385,9 +346,9 @@ static int bch_allocator_thread(void *arg)
 		 * them to the free_inc list:
 		 */
 
+retry_invalidate:
 		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       (ca->need_save_prio > 64 ||
-				!ca->invalidate_needs_gc));
+			       !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
@@ -395,13 +356,28 @@ static int bch_allocator_thread(void *arg)
 		 * new stuff to them:
 		 */
 		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
-		if (CACHE_SYNC(&ca->set->sb) &&
-		    (!fifo_empty(&ca->free_inc) ||
-		     ca->need_save_prio > 64))
+		if (CACHE_SYNC(&ca->set->sb)) {
+			/*
+			 * This could deadlock if an allocation with a btree
+			 * node locked ever blocked - having the btree node
+			 * locked would block garbage collection, but here we're
+			 * waiting on garbage collection before we invalidate
+			 * and free anything.
+			 *
+			 * But this should be safe since the btree code always
+			 * uses btree_check_reserve() before allocating now, and
+			 * if it fails it blocks without btree nodes locked.
+			 */
+			if (!fifo_full(&ca->free_inc))
+				goto retry_invalidate;
+
 			bch_prio_write(ca);
+		}
 	}
 }
 
+/* Allocation */
+
 long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 {
 	DEFINE_WAIT(w);
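Taken together, the three allocator-thread hunks above leave the main loop looking roughly like this (a condensed paraphrase of the diff; locking, discards, and kthread stop handling omitted):

	while (1) {
		/* 1) Hand out what we have: drain free_inc onto the free[]
		 *    lists, optionally issuing discards along the way. */
		while (!fifo_empty(&ca->free_inc)) {
			long bucket;

			fifo_pop(&ca->free_inc, bucket);
			/* ...discard if enabled, push onto ca->free[]... */
		}

retry_invalidate:
		/* 2) Refill free_inc from buckets GC says are reclaimable;
		 *    a full GC pass may be needed first. */
		allocator_wait(ca, ca->set->gc_mark_valid &&
			       !ca->invalidate_needs_gc);
		invalidate_buckets(ca);

		/* 3) Persist the new gens/prios before any of these buckets
		 *    can be reused; keep invalidating until free_inc is full
		 *    so each bch_prio_write() covers a full batch. */
		if (CACHE_SYNC(&ca->set->sb)) {
			if (!fifo_full(&ca->free_inc))
				goto retry_invalidate;

			bch_prio_write(ca);
		}
	}

With the unused list gone, there is exactly one way for a bucket to reach the free lists, and it always passes through a prios/gens write first.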
@@ -447,8 +423,6 @@ out:
 			BUG_ON(i == r);
 		fifo_for_each(i, &ca->free_inc, iter)
 			BUG_ON(i == r);
-		fifo_for_each(i, &ca->unused, iter)
-			BUG_ON(i == r);
 	}
 
 	b = ca->buckets + r;
@@ -470,17 +444,19 @@ out:
 	return r;
 }
 
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+	SET_GC_MARK(b, 0);
+	SET_GC_SECTORS_USED(b, 0);
+}
+
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
 {
 	unsigned i;
 
-	for (i = 0; i < KEY_PTRS(k); i++) {
-		struct bucket *b = PTR_BUCKET(c, k, i);
-
-		SET_GC_MARK(b, 0);
-		SET_GC_SECTORS_USED(b, 0);
-		bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
-	}
+	for (i = 0; i < KEY_PTRS(k); i++)
+		__bch_bucket_free(PTR_CACHE(c, k, i),
+				  PTR_BUCKET(c, k, i));
 }
 
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
drivers/md/bcache/bcache.h
@@ -195,7 +195,6 @@ struct bucket {
 	atomic_t	pin;
 	uint16_t	prio;
 	uint8_t		gen;
-	uint8_t		disk_gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint8_t		gc_gen;
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
@@ -426,14 +425,9 @@ struct cache {
 	 * their new gen to disk. After prio_write() finishes writing the new
 	 * gens/prios, they'll be moved to the free list (and possibly discarded
 	 * in the process)
-	 *
-	 * unused: GC found nothing pointing into these buckets (possibly
-	 * because all the data they contained was overwritten), so we only
-	 * need to discard them before they can be moved to the free list.
 	 */
 	DECLARE_FIFO(long, free)[RESERVE_NR];
 	DECLARE_FIFO(long, free_inc);
-	DECLARE_FIFO(long, unused);
 
 	size_t			fifo_last_bucket;
@@ -442,12 +436,6 @@ struct cache {
 
 	DECLARE_HEAP(struct bucket *, heap);
 
-	/*
-	 * max(gen - disk_gen) for all buckets. When it gets too big we have to
-	 * call prio_write() to keep gens from wrapping.
-	 */
-	uint8_t			need_save_prio;
-
 	/*
 	 * If nonzero, we know we aren't going to find any buckets to invalidate
 	 * until a gc finishes - otherwise we could pointlessly burn a ton of
@@ -848,9 +836,6 @@ static inline bool cached_dev_get(struct cached_dev *dc)
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree (last_gc).
- *
- * bucket_disk_gen() returns the difference between the current gen and the gen
- * on disk; they're both used to make sure gens don't wrap around.
 */
 
 static inline uint8_t bucket_gc_gen(struct bucket *b)
@@ -858,13 +843,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
 	return b->gen - b->last_gc;
 }
 
-static inline uint8_t bucket_disk_gen(struct bucket *b)
-{
-	return b->gen - b->disk_gen;
-}
-
 #define BUCKET_GC_GEN_MAX	96U
-#define BUCKET_DISK_GEN_MAX	64U
 
 #define kobj_attribute_write(n, fn)					\
 	static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
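With bucket_disk_gen() gone, bucket_gc_gen() is the only remaining guard against generation wraparound. A standalone illustration (toy values, not kernel code) of why the 8-bit gens are compared by subtraction and why their distance must stay capped:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t gen = 250, last_gc = 200;	/* hypothetical gen values */

	/* Unsigned subtraction is wrap-safe: the distance is correct
	 * modulo 256... */
	printf("%u\n", (uint8_t)(gen - last_gc));	/* prints 50 */

	gen += 20;	/* 250 + 20 wraps around to 14 */
	printf("%u\n", (uint8_t)(gen - last_gc));	/* still correct: 70 */

	/* ...but only while the true distance stays under 256, which is why
	 * the allocator refuses to invalidate a bucket once bucket_gc_gen()
	 * would exceed BUCKET_GC_GEN_MAX (96) and instead waits for GC to
	 * advance last_gc. */
	return 0;
}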
@@ -897,11 +876,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
 
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
-bool bch_bucket_add_unused(struct cache *, struct bucket *);
 
-long bch_bucket_alloc(struct cache *, unsigned, bool);
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
+
+long bch_bucket_alloc(struct cache *, unsigned, bool);
 int __bch_bucket_alloc_set(struct cache_set *, unsigned,
 			   struct bkey *, int, bool);
 int bch_bucket_alloc_set(struct cache_set *, unsigned,
drivers/md/bcache/btree.c
@@ -1641,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c)
 	mutex_unlock(&c->bucket_lock);
 }
 
-size_t bch_btree_gc_finish(struct cache_set *c)
+static size_t bch_btree_gc_finish(struct cache_set *c)
 {
 	size_t available = 0;
 	struct bucket *b;
@@ -1703,9 +1703,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 
 			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
 				available++;
-
-			if (!GC_MARK(b))
-				bch_bucket_add_unused(ca, b);
 		}
 	}
 
@@ -1836,6 +1833,42 @@ int bch_btree_check(struct cache_set *c)
 	return btree_root(check_recurse, c, &op);
 }
 
+void bch_initial_gc_finish(struct cache_set *c)
+{
+	struct cache *ca;
+	struct bucket *b;
+	unsigned i;
+
+	bch_btree_gc_finish(c);
+
+	mutex_lock(&c->bucket_lock);
+
+	/*
+	 * We need to put some unused buckets directly on the prio freelist in
+	 * order to get the allocator thread started - it needs freed buckets in
+	 * order to rewrite the prios and gens, and it needs to rewrite prios
+	 * and gens in order to free buckets.
+	 *
+	 * This is only safe for buckets that have no live data in them, which
+	 * there should always be some of.
+	 */
+	for_each_cache(ca, c, i) {
+		for_each_bucket(b, ca) {
+			if (fifo_full(&ca->free[RESERVE_PRIO]))
+				break;
+
+			if (bch_can_invalidate_bucket(ca, b) &&
+			    !GC_MARK(b)) {
+				__bch_invalidate_one_bucket(ca, b);
+				fifo_push(&ca->free[RESERVE_PRIO],
+					  b - ca->buckets);
+			}
+		}
+	}
+
+	mutex_unlock(&c->bucket_lock);
+}
+
 /* Btree insertion */
 
 static bool btree_insert_key(struct btree *b, struct bkey *k,
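The comment in the hunk above describes a bootstrap cycle; restated as a compilable sketch (the stub names are hypothetical stand-ins, the real logic is bch_initial_gc_finish() itself):

struct cache_set;

/* Empty stubs so the sketch compiles; each stands for the kernel step
 * named in its comment. */
static void compute_gc_marks(struct cache_set *c)  { (void)c; }	/* bch_btree_gc_finish() */
static void seed_prio_reserve(struct cache_set *c) { (void)c; }	/* fifo_push() onto free[RESERVE_PRIO] */

/* The allocator can only free buckets after rewriting prios/gens, but a
 * prios/gens write itself consumes buckets. Seeding the prio reserve with
 * buckets GC proved empty (!GC_MARK(b)) breaks the cycle at startup. */
void initial_gc_finish_sketch(struct cache_set *c)
{
	compute_gc_marks(c);	/* find buckets with no live data */
	seed_prio_reserve(c);	/* give the allocator its first free buckets */
}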
drivers/md/bcache/btree.h
@@ -252,7 +252,7 @@ int bch_btree_insert(struct cache_set *, struct keylist *,
 		     atomic_t *, struct bkey *);
 
 int bch_gc_thread_start(struct cache_set *);
-size_t bch_btree_gc_finish(struct cache_set *);
+void bch_initial_gc_finish(struct cache_set *);
 void bch_moving_gc(struct cache_set *);
 int bch_btree_check(struct cache_set *);
 void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
drivers/md/bcache/super.c
@@ -541,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
 	closure_sync(cl);
 }
 
-#define buckets_free(c)	"free %zu, free_inc %zu, unused %zu",		\
-	fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
-
 void bch_prio_write(struct cache *ca)
 {
 	int i;
@@ -554,10 +551,6 @@ void bch_prio_write(struct cache *ca)
 
 	lockdep_assert_held(&ca->set->bucket_lock);
 
-	for (b = ca->buckets;
-	     b < ca->buckets + ca->sb.nbuckets; b++)
-		b->disk_gen = b->gen;
-
 	ca->disk_buckets->seq++;
 
 	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
@@ -601,14 +594,17 @@ void bch_prio_write(struct cache *ca)
 
 	mutex_lock(&ca->set->bucket_lock);
 
-	ca->need_save_prio = 0;
-
 	/*
 	 * Don't want the old priorities to get garbage collected until after we
 	 * finish writing the new ones, and they're journalled
 	 */
-	for (i = 0; i < prio_buckets(ca); i++)
+	for (i = 0; i < prio_buckets(ca); i++) {
+		if (ca->prio_last_buckets[i])
+			__bch_bucket_free(ca,
+				&ca->buckets[ca->prio_last_buckets[i]]);
+
 		ca->prio_last_buckets[i] = ca->prio_buckets[i];
+	}
 }
 
 static void prio_read(struct cache *ca, uint64_t bucket)
@@ -639,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 		}
 
 		b->prio = le16_to_cpu(d->prio);
-		b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
+		b->gen = b->last_gc = b->gc_gen = d->gen;
 	}
 }
 
@@ -1606,7 +1602,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		bch_journal_mark(c, &journal);
-		bch_btree_gc_finish(c);
+		bch_initial_gc_finish(c);
 		pr_debug("btree_check() done");
 
 		/*
@@ -1648,7 +1644,7 @@ static void run_cache_set(struct cache_set *c)
 			ca->sb.d[j] = ca->sb.first_bucket + j;
 		}
 
-		bch_btree_gc_finish(c);
+		bch_initial_gc_finish(c);
 
 		err = "error starting allocator thread";
 		for_each_cache(ca, c, i)
@@ -1794,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj)
 	vfree(ca->buckets);
 
 	free_heap(&ca->heap);
-	free_fifo(&ca->unused);
 	free_fifo(&ca->free_inc);
 
 	for (i = 0; i < RESERVE_NR; i++)
@@ -1831,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
 	    !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
-	    !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
 	    !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
 	    !(ca->buckets = vzalloc(sizeof(struct bucket) *
 				    ca->sb.nbuckets)) ||
include/trace/events/bcache.h
@@ -446,7 +446,6 @@ TRACE_EVENT(bcache_alloc_fail,
 		__field(dev_t,		dev			)
 		__field(unsigned,	free			)
 		__field(unsigned,	free_inc		)
-		__field(unsigned,	unused			)
 		__field(unsigned,	blocked			)
 	),
 
@@ -454,13 +453,12 @@ TRACE_EVENT(bcache_alloc_fail,
 		__entry->dev		= ca->bdev->bd_dev;
 		__entry->free		= fifo_used(&ca->free[reserve]);
 		__entry->free_inc	= fifo_used(&ca->free_inc);
-		__entry->unused		= fifo_used(&ca->unused);
 		__entry->blocked	= atomic_read(&ca->set->prio_blocked);
 	),
 
-	TP_printk("alloc fail %d,%d free %u free_inc %u unused %u blocked %u",
+	TP_printk("alloc fail %d,%d free %u free_inc %u blocked %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->free,
-		  __entry->free_inc, __entry->unused, __entry->blocked)
+		  __entry->free_inc, __entry->blocked)
 );
 
 /* Background writeback */