Merge branch 'jk/pack-objects-optim-mru'

"git pack-objects" in a repository with many packfiles used to spend a lot of time looking for/at objects in them; the accesses to the packfiles are now optimized by checking the most-recently-used packfile first. * jk/pack-objects-optim-mru: pack-objects: use mru list when iterating over packs pack-objects: break delta cycles before delta-search phase sha1_file: make packed_object_info public provide an initializer for "struct object_info"
2016-10-10 14:03:46 -07:00 · 2016-10-10 14:03:46 -07:00 · e6e24c94df
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@ -53,7 +53,7 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name,
 	char *buf;
 	unsigned long size;
 	struct object_context obj_context;
-	struct object_info oi = {NULL};
+	struct object_info oi = OBJECT_INFO_INIT;
 	struct strbuf sb = STRBUF_INIT;
 	unsigned flags = LOOKUP_REPLACE_OBJECT;
 	const char *path = force_path;
@ -449,8 +449,7 @@ static int batch_objects(struct batch_options *opt)
 		data.split_on_whitespace = 1;

 	if (opt->all_objects) {
-		struct object_info empty;
-		memset(&empty, 0, sizeof(empty));
+		struct object_info empty = OBJECT_INFO_INIT;
 		if (!memcmp(&data.info, &empty, sizeof(empty)))
 			data.skip_object_info = 1;
 	}
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@ -23,6 +23,7 @@
 #include "reachable.h"
 #include "sha1-array.h"
 #include "argv-array.h"
+#include "mru.h"

 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@ -994,7 +995,7 @@ static int want_object_in_pack(const unsigned char *sha1,
 			       struct packed_git **found_pack,
 			       off_t *found_offset)
 {
-	struct packed_git *p;
+	struct mru_entry *entry;
 	int want;

 	if (!exclude && local && has_loose_object_nonlocal(sha1))
@ -1011,7 +1012,8 @@ static int want_object_in_pack(const unsigned char *sha1,
 			return want;
 	}

-	for (p = packed_git; p; p = p->next) {
+	for (entry = packed_git_mru->head; entry; entry = entry->next) {
+		struct packed_git *p = entry->item;
 		off_t offset;

 		if (p == *found_pack)
@ -1027,6 +1029,8 @@ static int want_object_in_pack(const unsigned char *sha1,
 				*found_pack = p;
 			}
 			want = want_found_object(exclude, p);
+			if (!exclude && want > 0)
+				mru_mark(packed_git_mru, entry);
 			if (want != -1)
 				return want;
 		}
@ -1527,6 +1531,83 @@ static int pack_offset_sort(const void *_a, const void *_b)
 			(a->in_pack_offset > b->in_pack_offset);
 }

+/*
+ * Drop an on-disk delta we were planning to reuse. Naively, this would
+ * just involve blanking out the "delta" field, but we have to deal
+ * with some extra book-keeping:
+ *
+ *   1. Removing ourselves from the delta_sibling linked list.
+ *
+ *   2. Updating our size/type to the non-delta representation. These were
+ *      either not recorded initially (size) or overwritten with the delta type
+ *      (type) when check_object() decided to reuse the delta.
+ */
+static void drop_reused_delta(struct object_entry *entry)
+{
+	struct object_entry **p = &entry->delta->delta_child;
+	struct object_info oi = OBJECT_INFO_INIT;
+
+	while (*p) {
+		if (*p == entry)
+			*p = (*p)->delta_sibling;
+		else
+			p = &(*p)->delta_sibling;
+	}
+	entry->delta = NULL;
+
+	oi.sizep = &entry->size;
+	oi.typep = &entry->type;
+	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+		/*
+		 * We failed to get the info from this pack for some reason;
+		 * fall back to sha1_object_info, which may find another copy.
+		 * And if that fails, the error will be recorded in entry->type
+		 * and dealt with in prepare_pack().
+		 */
+		entry->type = sha1_object_info(entry->idx.sha1, &entry->size);
+	}
+}
+
+/*
+ * Follow the chain of deltas from this entry onward, throwing away any links
+ * that cause us to hit a cycle (as determined by the DFS state flags in
+ * the entries).
+ */
+static void break_delta_chains(struct object_entry *entry)
+{
+	/* If it's not a delta, it can't be part of a cycle. */
+	if (!entry->delta) {
+		entry->dfs_state = DFS_DONE;
+		return;
+	}
+
+	switch (entry->dfs_state) {
+	case DFS_NONE:
+		/*
+		 * This is the first time we've seen the object. We mark it as
+		 * part of the active potential cycle and recurse.
+		 */
+		entry->dfs_state = DFS_ACTIVE;
+		break_delta_chains(entry->delta);
+		entry->dfs_state = DFS_DONE;
+		break;
+
+	case DFS_DONE:
+		/* object already examined, and not part of a cycle */
+		break;
+
+	case DFS_ACTIVE:
+		/*
+		 * We found a cycle that needs broken. It would be correct to
+		 * break any link in the chain, but it's convenient to
+		 * break this one.
+		 */
+		drop_reused_delta(entry);
+		entry->dfs_state = DFS_DONE;
+		break;
+	}
+}
+
 static void get_object_details(void)
 {
 	uint32_t i;
@ -1544,6 +1625,13 @@ static void get_object_details(void)
 			entry->no_try_delta = 1;
 	}

+	/*
+	 * This must happen in a second pass, since we rely on the delta
+	 * information for the whole list being completed.
+	 */
+	for (i = 0; i < to_pack.nr_objects; i++)
+		break_delta_chains(&to_pack.objects[i]);
+
 	free(sorted_by_offset);
 }

--- a/cache.h
+++ b/cache.h
@ -1602,7 +1602,15 @@ struct object_info {
 		} packed;
 	} u;
 };
+
+/*
+ * Initializer for a "struct object_info" that wants no items. You may
+ * also memset() the memory to all-zeroes.
+ */
+#define OBJECT_INFO_INIT {NULL}
+
 extern int sha1_object_info_extended(const unsigned char *, struct object_info *, unsigned flags);
+extern int packed_object_info(struct packed_git *pack, off_t offset, struct object_info *);

 /* Dumb servers support */
 extern int update_server_info(int);
--- a/pack-objects.h
+++ b/pack-objects.h
@ -27,6 +27,15 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+
+	/*
+	 * State flags for depth-first search used for analyzing delta cycles.
+	 */
+	enum {
+		DFS_NONE = 0,
+		DFS_ACTIVE,
+		DFS_DONE
+	} dfs_state;
 };

 struct packing_data {
--- a/sha1_file.c
+++ b/sha1_file.c
@ -1826,11 +1826,9 @@ static int parse_sha1_header_extended(const char *hdr, struct object_info *oi,

 int parse_sha1_header(const char *hdr, unsigned long *sizep)
 {
-	struct object_info oi;
+	struct object_info oi = OBJECT_INFO_INIT;

 	oi.sizep = sizep;
-	oi.typename = NULL;
-	oi.typep = NULL;
 	return parse_sha1_header_extended(hdr, &oi, LOOKUP_REPLACE_OBJECT);
 }

@ -2068,8 +2066,8 @@ unwind:
 	goto out;
 }

-static int packed_object_info(struct packed_git *p, off_t obj_offset,
-			      struct object_info *oi)
+int packed_object_info(struct packed_git *p, off_t obj_offset,
+		       struct object_info *oi)
 {
 	struct pack_window *w_curs = NULL;
 	unsigned long size;
@ -2840,7 +2838,7 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi,
 int sha1_object_info(const unsigned char *sha1, unsigned long *sizep)
 {
 	enum object_type type;
-	struct object_info oi = {NULL};
+	struct object_info oi = OBJECT_INFO_INIT;

 	oi.typep = &type;
 	oi.sizep = sizep;
--- a/streaming.c
+++ b/streaming.c
@ -135,7 +135,7 @@ struct git_istream *open_istream(const unsigned char *sha1,
 				 struct stream_filter *filter)
 {
 	struct git_istream *st;
-	struct object_info oi = {NULL};
+	struct object_info oi = OBJECT_INFO_INIT;
 	const unsigned char *real = lookup_replace_object(sha1);
 	enum input_source src = istream_source(real, type, &oi);

--- a/t/t5314-pack-cycle-detection.sh
+++ b/t/t5314-pack-cycle-detection.sh
@ -0,0 +1,113 @@
+#!/bin/sh
+
+test_description='test handling of inter-pack delta cycles during repack
+
+The goal here is to create a situation where we have two blobs, A and B, with A
+as a delta against B in one pack, and vice versa in the other. Then if we can
+persuade a full repack to find A from one pack and B from the other, that will
+give us a cycle when we attempt to reuse those deltas.
+
+The trick is in the "persuade" step, as it depends on the internals of how
+pack-objects picks which pack to reuse the deltas from. But we can assume
+that it does so in one of two general strategies:
+
+ 1. Using a static ordering of packs. In this case, no inter-pack cycles can
+    happen. Any objects with a delta relationship must be present in the same
+    pack (i.e., no "--thin" packs on disk), so we will find all related objects
+    from that pack. So assuming there are no cycles within a single pack (and
+    we avoid generating them via pack-objects or importing them via
+    index-pack), then our result will have no cycles.
+
+    So this case should pass the tests no matter how we arrange things.
+
+ 2. Picking the next pack to examine based on locality (i.e., where we found
+    something else recently).
+
+    In this case, we want to make sure that we find the delta versions of A and
+    B and not their base versions. We can do this by putting two blobs in each
+    pack. The first is a "dummy" blob that can only be found in the pack in
+    question.  And then the second is the actual delta we want to find.
+
+    The two blobs must be present in the same tree, not present in other trees,
+    and the dummy pathname must sort before the delta path.
+
+The setup below focuses on case 2. We have two commits HEAD and HEAD^, each
+which has two files: "dummy" and "file". Then we can make two packs which
+contain:
+
+  [pack one]
+  HEAD:dummy
+  HEAD:file  (as delta against HEAD^:file)
+  HEAD^:file (as base)
+
+  [pack two]
+  HEAD^:dummy
+  HEAD^:file (as delta against HEAD:file)
+  HEAD:file  (as base)
+
+Then no matter which order we start looking at the packs in, we know that we
+will always find a delta for "file", because its lookup will always come
+immediately after the lookup for "dummy".
+'
+. ./test-lib.sh
+
+
+
+# Create a pack containing the the tree $1 and blob $1:file, with
+# the latter stored as a delta against $2:file.
+#
+# We convince pack-objects to make the delta in the direction of our choosing
+# by marking $2 as a preferred-base edge. That results in $1:file as a thin
+# delta, and index-pack completes it by adding $2:file as a base.
+#
+# Note that the two variants of "file" must be similar enough to convince git
+# to create the delta.
+make_pack () {
+	{
+		printf '%s\n' "-$(git rev-parse $2)"
+		printf '%s dummy\n' "$(git rev-parse $1:dummy)"
+		printf '%s file\n' "$(git rev-parse $1:file)"
+	} |
+	git pack-objects --stdout |
+	git index-pack --stdin --fix-thin
+}
+
+test_expect_success 'setup' '
+	test-genrandom base 4096 >base &&
+	for i in one two
+	do
+		# we want shared content here to encourage deltas...
+		cp base file &&
+		echo $i >>file &&
+
+		# ...whereas dummy should be short, because we do not want
+		# deltas that would create duplicates when we --fix-thin
+		echo $i >dummy &&
+
+		git add file dummy &&
+		test_tick &&
+		git commit -m $i ||
+		return 1
+	done &&
+
+	make_pack HEAD^ HEAD &&
+	make_pack HEAD HEAD^
+'
+
+test_expect_success 'repack' '
+	# We first want to check that we do not have any internal errors,
+	# and also that we do not hit the last-ditch cycle-breaking code
+	# in write_object(), which will issue a warning to stderr.
+	>expect &&
+	git repack -ad 2>stderr &&
+	test_cmp expect stderr &&
+
+	# And then double-check that the resulting pack is usable (i.e.,
+	# we did not fail to notice any cycles). We know we are accessing
+	# the objects via the new pack here, because "repack -d" will have
+	# removed the others.
+	git cat-file blob HEAD:file >/dev/null &&
+	git cat-file blob HEAD^:file >/dev/null
+'
+
+test_done