Merge branch 'ew/many-alternate-optim'

Optimization for repositories with many alternate object store. * ew/many-alternate-optim: oidtree: a crit-bit tree for odb_loose_cache oidcpy_with_padding: constify `src' arg make object_directory.loose_objects_subdir_seen a bitmap avoid strlen via strbuf_addstr in link_alt_odb_entry speed up alt_odb_usable() with many alternates
2021-07-28 13:17:57 -07:00 · 2021-07-28 13:17:57 -07:00 · e5cc59c77c
--- a/3
+++ b/3
@ -726,6 +726,7 @@ TEST_BUILTINS_OBJS += test-mergesort.o
 TEST_BUILTINS_OBJS += test-mktemp.o
 TEST_BUILTINS_OBJS += test-oid-array.o
 TEST_BUILTINS_OBJS += test-oidmap.o
+TEST_BUILTINS_OBJS += test-oidtree.o
 TEST_BUILTINS_OBJS += test-online-cpus.o
 TEST_BUILTINS_OBJS += test-parse-options.o
 TEST_BUILTINS_OBJS += test-parse-pathspec-file.o
@ -850,6 +851,7 @@ LIB_OBJS += branch.o
 LIB_OBJS += bulk-checkin.o
 LIB_OBJS += bundle.o
 LIB_OBJS += cache-tree.o
+LIB_OBJS += cbtree.o
 LIB_OBJS += chdir-notify.o
 LIB_OBJS += checkout.o
 LIB_OBJS += chunk-format.o
@ -945,6 +947,7 @@ LIB_OBJS += object.o
 LIB_OBJS += oid-array.o
 LIB_OBJS += oidmap.o
 LIB_OBJS += oidset.o
+LIB_OBJS += oidtree.o
 LIB_OBJS += pack-bitmap-write.o
 LIB_OBJS += pack-bitmap.o
 LIB_OBJS += pack-check.o
--- a/cbtree.c
+++ b/cbtree.c
@ -0,0 +1,167 @@
+/*
+ * crit-bit tree implementation, does no allocations internally
+ * For more information on crit-bit trees: https://cr.yp.to/critbit.html
+ * Based on Adam Langley's adaptation of Dan Bernstein's public domain code
+ * git clone https://github.com/agl/critbit.git
+ */
+#include "cbtree.h"
+
+static struct cb_node *cb_node_of(const void *p)
+{
+	return (struct cb_node *)((uintptr_t)p - 1);
+}
+
+/* locate the best match, does not do a final comparision */
+static struct cb_node *cb_internal_best_match(struct cb_node *p,
+					const uint8_t *k, size_t klen)
+{
+	while (1 & (uintptr_t)p) {
+		struct cb_node *q = cb_node_of(p);
+		uint8_t c = q->byte < klen ? k[q->byte] : 0;
+		size_t direction = (1 + (q->otherbits | c)) >> 8;
+
+		p = q->child[direction];
+	}
+	return p;
+}
+
+/* returns NULL if successful, existing cb_node if duplicate */
+struct cb_node *cb_insert(struct cb_tree *t, struct cb_node *node, size_t klen)
+{
+	size_t newbyte, newotherbits;
+	uint8_t c;
+	int newdirection;
+	struct cb_node **wherep, *p;
+
+	assert(!((uintptr_t)node & 1)); /* allocations must be aligned */
+
+	if (!t->root) {		/* insert into empty tree */
+		t->root = node;
+		return NULL;	/* success */
+	}
+
+	/* see if a node already exists */
+	p = cb_internal_best_match(t->root, node->k, klen);
+
+	/* find first differing byte */
+	for (newbyte = 0; newbyte < klen; newbyte++) {
+		if (p->k[newbyte] != node->k[newbyte])
+			goto different_byte_found;
+	}
+	return p;	/* element exists, let user deal with it */
+
+different_byte_found:
+	newotherbits = p->k[newbyte] ^ node->k[newbyte];
+	newotherbits |= newotherbits >> 1;
+	newotherbits |= newotherbits >> 2;
+	newotherbits |= newotherbits >> 4;
+	newotherbits = (newotherbits & ~(newotherbits >> 1)) ^ 255;
+	c = p->k[newbyte];
+	newdirection = (1 + (newotherbits | c)) >> 8;
+
+	node->byte = newbyte;
+	node->otherbits = newotherbits;
+	node->child[1 - newdirection] = node;
+
+	/* find a place to insert it */
+	wherep = &t->root;
+	for (;;) {
+		struct cb_node *q;
+		size_t direction;
+
+		p = *wherep;
+		if (!(1 & (uintptr_t)p))
+			break;
+		q = cb_node_of(p);
+		if (q->byte > newbyte)
+			break;
+		if (q->byte == newbyte && q->otherbits > newotherbits)
+			break;
+		c = q->byte < klen ? node->k[q->byte] : 0;
+		direction = (1 + (q->otherbits | c)) >> 8;
+		wherep = q->child + direction;
+	}
+
+	node->child[newdirection] = *wherep;
+	*wherep = (struct cb_node *)(1 + (uintptr_t)node);
+
+	return NULL; /* success */
+}
+
+struct cb_node *cb_lookup(struct cb_tree *t, const uint8_t *k, size_t klen)
+{
+	struct cb_node *p = cb_internal_best_match(t->root, k, klen);
+
+	return p && !memcmp(p->k, k, klen) ? p : NULL;
+}
+
+struct cb_node *cb_unlink(struct cb_tree *t, const uint8_t *k, size_t klen)
+{
+	struct cb_node **wherep = &t->root;
+	struct cb_node **whereq = NULL;
+	struct cb_node *q = NULL;
+	size_t direction = 0;
+	uint8_t c;
+	struct cb_node *p = t->root;
+
+	if (!p) return NULL;	/* empty tree, nothing to delete */
+
+	/* traverse to find best match, keeping link to parent */
+	while (1 & (uintptr_t)p) {
+		whereq = wherep;
+		q = cb_node_of(p);
+		c = q->byte < klen ? k[q->byte] : 0;
+		direction = (1 + (q->otherbits | c)) >> 8;
+		wherep = q->child + direction;
+		p = *wherep;
+	}
+
+	if (memcmp(p->k, k, klen))
+		return NULL;		/* no match, nothing unlinked */
+
+	/* found an exact match */
+	if (whereq)	/* update parent */
+		*whereq = q->child[1 - direction];
+	else
+		t->root = NULL;
+	return p;
+}
+
+static enum cb_next cb_descend(struct cb_node *p, cb_iter fn, void *arg)
+{
+	if (1 & (uintptr_t)p) {
+		struct cb_node *q = cb_node_of(p);
+		enum cb_next n = cb_descend(q->child[0], fn, arg);
+
+		return n == CB_BREAK ? n : cb_descend(q->child[1], fn, arg);
+	} else {
+		return fn(p, arg);
+	}
+}
+
+void cb_each(struct cb_tree *t, const uint8_t *kpfx, size_t klen,
+			cb_iter fn, void *arg)
+{
+	struct cb_node *p = t->root;
+	struct cb_node *top = p;
+	size_t i = 0;
+
+	if (!p) return; /* empty tree */
+
+	/* Walk tree, maintaining top pointer */
+	while (1 & (uintptr_t)p) {
+		struct cb_node *q = cb_node_of(p);
+		uint8_t c = q->byte < klen ? kpfx[q->byte] : 0;
+		size_t direction = (1 + (q->otherbits | c)) >> 8;
+
+		p = q->child[direction];
+		if (q->byte < klen)
+			top = p;
+	}
+
+	for (i = 0; i < klen; i++) {
+		if (p->k[i] != kpfx[i])
+			return; /* "best" match failed */
+	}
+	cb_descend(top, fn, arg);
+}
--- a/cbtree.h
+++ b/cbtree.h
@ -0,0 +1,56 @@
+/*
+ * crit-bit tree implementation, does no allocations internally
+ * For more information on crit-bit trees: https://cr.yp.to/critbit.html
+ * Based on Adam Langley's adaptation of Dan Bernstein's public domain code
+ * git clone https://github.com/agl/critbit.git
+ *
+ * This is adapted to store arbitrary data (not just NUL-terminated C strings
+ * and allocates no memory internally.  The user needs to allocate
+ * "struct cb_node" and fill cb_node.k[] with arbitrary match data
+ * for memcmp.
+ * If "klen" is variable, then it should be embedded into "c_node.k[]"
+ * Recursion is bound by the maximum value of "klen" used.
+ */
+#ifndef CBTREE_H
+#define CBTREE_H
+
+#include "git-compat-util.h"
+
+struct cb_node;
+struct cb_node {
+	struct cb_node *child[2];
+	/*
+	 * n.b. uint32_t for `byte' is excessive for OIDs,
+	 * we may consider shorter variants if nothing else gets stored.
+	 */
+	uint32_t byte;
+	uint8_t otherbits;
+	uint8_t k[FLEX_ARRAY]; /* arbitrary data */
+};
+
+struct cb_tree {
+	struct cb_node *root;
+};
+
+enum cb_next {
+	CB_CONTINUE = 0,
+	CB_BREAK = 1
+};
+
+#define CBTREE_INIT { .root = NULL }
+
+static inline void cb_init(struct cb_tree *t)
+{
+	t->root = NULL;
+}
+
+struct cb_node *cb_lookup(struct cb_tree *, const uint8_t *k, size_t klen);
+struct cb_node *cb_insert(struct cb_tree *, struct cb_node *, size_t klen);
+struct cb_node *cb_unlink(struct cb_tree *t, const uint8_t *k, size_t klen);
+
+typedef enum cb_next (*cb_iter)(struct cb_node *, void *arg);
+
+void cb_each(struct cb_tree *, const uint8_t *kpfx, size_t klen,
+		cb_iter, void *arg);
+
+#endif /* CBTREE_H */
--- a/dir.c
+++ b/dir.c
@ -78,11 +78,21 @@ int fspathcmp(const char *a, const char *b)
 	return ignore_case ? strcasecmp(a, b) : strcmp(a, b);
 }

+int fspatheq(const char *a, const char *b)
+{
+	return !fspathcmp(a, b);
+}
+
 int fspathncmp(const char *a, const char *b, size_t count)
 {
 	return ignore_case ? strncasecmp(a, b, count) : strncmp(a, b, count);
 }

+unsigned int fspathhash(const char *str)
+{
+	return ignore_case ? strihash(str) : strhash(str);
+}
+
 int git_fnmatch(const struct pathspec_item *item,
 		const char *pattern, const char *string,
 		int prefix)
--- a/dir.h
+++ b/dir.h
@ -489,7 +489,9 @@ int remove_dir_recursively(struct strbuf *path, int flag);
 int remove_path(const char *path);

 int fspathcmp(const char *a, const char *b);
+int fspatheq(const char *a, const char *b);
 int fspathncmp(const char *a, const char *b, size_t count);
+unsigned int fspathhash(const char *str);

 /*
 * The prefix part of pattern must not contains wildcards.
--- a/hash.h
+++ b/hash.h
@ -265,7 +265,7 @@ static inline void oidcpy(struct object_id *dst, const struct object_id *src)

 /* Like oidcpy() but zero-pads the unused bytes in dst's hash array. */
 static inline void oidcpy_with_padding(struct object_id *dst,
-				       struct object_id *src)
+				       const struct object_id *src)
 {
 	size_t hashsz;

--- a/object-file.c
+++ b/object-file.c
@ -517,9 +517,9 @@ const char *loose_object_path(struct repository *r, struct strbuf *buf,
 */
 static int alt_odb_usable(struct raw_object_store *o,
 			  struct strbuf *path,
-			  const char *normalized_objdir)
+			  const char *normalized_objdir, khiter_t *pos)
 {
-	struct object_directory *odb;
+	int r;

 	/* Detect cases where alternate disappeared */
 	if (!is_directory(path->buf)) {
@ -533,14 +533,20 @@ static int alt_odb_usable(struct raw_object_store *o,
 	 * Prevent the common mistake of listing the same
 	 * thing twice, or object directory itself.
 	 */
-	for (odb = o->odb; odb; odb = odb->next) {
-		if (!fspathcmp(path->buf, odb->path))
-			return 0;
-	}
-	if (!fspathcmp(path->buf, normalized_objdir))
-		return 0;
+	if (!o->odb_by_path) {
+		khiter_t p;

-	return 1;
+		o->odb_by_path = kh_init_odb_path_map();
+		assert(!o->odb->next);
+		p = kh_put_odb_path_map(o->odb_by_path, o->odb->path, &r);
+		assert(r == 1); /* never used */
+		kh_value(o->odb_by_path, p) = o->odb;
+	}
+	if (fspatheq(path->buf, normalized_objdir))
+		return 0;
+	*pos = kh_put_odb_path_map(o->odb_by_path, path->buf, &r);
+	/* r: 0 = exists, 1 = never used, 2 = deleted */
+	return r == 0 ? 0 : 1;
 }

 /*
@ -561,17 +567,18 @@ static int alt_odb_usable(struct raw_object_store *o,
 static void read_info_alternates(struct repository *r,
 				 const char *relative_base,
 				 int depth);
-static int link_alt_odb_entry(struct repository *r, const char *entry,
+static int link_alt_odb_entry(struct repository *r, const struct strbuf *entry,
 	const char *relative_base, int depth, const char *normalized_objdir)
 {
 	struct object_directory *ent;
 	struct strbuf pathbuf = STRBUF_INIT;
+	khiter_t pos;

-	if (!is_absolute_path(entry) && relative_base) {
+	if (!is_absolute_path(entry->buf) && relative_base) {
 		strbuf_realpath(&pathbuf, relative_base, 1);
 		strbuf_addch(&pathbuf, '/');
 	}
-	strbuf_addstr(&pathbuf, entry);
+	strbuf_addbuf(&pathbuf, entry);

 	if (strbuf_normalize_path(&pathbuf) < 0 && relative_base) {
 		error(_("unable to normalize alternate object path: %s"),
@ -587,23 +594,25 @@ static int link_alt_odb_entry(struct repository *r, const char *entry,
 	while (pathbuf.len && pathbuf.buf[pathbuf.len - 1] == '/')
 		strbuf_setlen(&pathbuf, pathbuf.len - 1);

-	if (!alt_odb_usable(r->objects, &pathbuf, normalized_objdir)) {
+	if (!alt_odb_usable(r->objects, &pathbuf, normalized_objdir, &pos)) {
 		strbuf_release(&pathbuf);
 		return -1;
 	}

 	CALLOC_ARRAY(ent, 1);
-	ent->path = xstrdup(pathbuf.buf);
+	/* pathbuf.buf is already in r->objects->odb_by_path */
+	ent->path = strbuf_detach(&pathbuf, NULL);

 	/* add the alternate entry */
 	*r->objects->odb_tail = ent;
 	r->objects->odb_tail = &(ent->next);
 	ent->next = NULL;
+	assert(r->objects->odb_by_path);
+	kh_value(r->objects->odb_by_path, pos) = ent;

 	/* recursively add alternates */
-	read_info_alternates(r, pathbuf.buf, depth + 1);
+	read_info_alternates(r, ent->path, depth + 1);

-	strbuf_release(&pathbuf);
 	return 0;
 }

@ -660,7 +669,7 @@ static void link_alt_odb_entries(struct repository *r, const char *alt,
 		alt = parse_alt_odb_entry(alt, sep, &entry);
 		if (!entry.len)
 			continue;
-		link_alt_odb_entry(r, entry.buf,
+		link_alt_odb_entry(r, &entry,
 				   relative_base, depth, objdirbuf.buf);
 	}
 	strbuf_release(&entry);
@ -1178,7 +1187,7 @@ static int quick_has_loose(struct repository *r,

 	prepare_alt_odb(r);
 	for (odb = r->objects->odb; odb; odb = odb->next) {
-		if (oid_array_lookup(odb_loose_cache(odb, oid), oid) >= 0)
+		if (oidtree_contains(odb_loose_cache(odb, oid), oid))
 			return 1;
 	}
 	return 0;
@ -2454,39 +2463,45 @@ int for_each_loose_object(each_loose_object_fn cb, void *data,
 static int append_loose_object(const struct object_id *oid, const char *path,
 			       void *data)
 {
-	oid_array_append(data, oid);
+	oidtree_insert(data, oid);
 	return 0;
 }

-struct oid_array *odb_loose_cache(struct object_directory *odb,
+struct oidtree *odb_loose_cache(struct object_directory *odb,
 				  const struct object_id *oid)
 {
 	int subdir_nr = oid->hash[0];
 	struct strbuf buf = STRBUF_INIT;
+	size_t word_bits = bitsizeof(odb->loose_objects_subdir_seen[0]);
+	size_t word_index = subdir_nr / word_bits;
+	size_t mask = 1 << (subdir_nr % word_bits);
+	uint32_t *bitmap;

 	if (subdir_nr < 0 ||
-	    subdir_nr >= ARRAY_SIZE(odb->loose_objects_subdir_seen))
+	    subdir_nr >= bitsizeof(odb->loose_objects_subdir_seen))
 		BUG("subdir_nr out of range");

-	if (odb->loose_objects_subdir_seen[subdir_nr])
-		return &odb->loose_objects_cache[subdir_nr];
-
+	bitmap = &odb->loose_objects_subdir_seen[word_index];
+	if (*bitmap & mask)
+		return odb->loose_objects_cache;
+	if (!odb->loose_objects_cache) {
+		ALLOC_ARRAY(odb->loose_objects_cache, 1);
+		oidtree_init(odb->loose_objects_cache);
+	}
 	strbuf_addstr(&buf, odb->path);
 	for_each_file_in_obj_subdir(subdir_nr, &buf,
 				    append_loose_object,
 				    NULL, NULL,
-				    &odb->loose_objects_cache[subdir_nr]);
-	odb->loose_objects_subdir_seen[subdir_nr] = 1;
+				    odb->loose_objects_cache);
+	*bitmap |= mask;
 	strbuf_release(&buf);
-	return &odb->loose_objects_cache[subdir_nr];
+	return odb->loose_objects_cache;
 }

 void odb_clear_loose_cache(struct object_directory *odb)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(odb->loose_objects_cache); i++)
-		oid_array_clear(&odb->loose_objects_cache[i]);
+	oidtree_clear(odb->loose_objects_cache);
+	FREE_AND_NULL(odb->loose_objects_cache);
 	memset(&odb->loose_objects_subdir_seen, 0,
 	       sizeof(odb->loose_objects_subdir_seen));
 }
--- a/object-name.c
+++ b/object-name.c
@ -87,27 +87,21 @@ static void update_candidates(struct disambiguate_state *ds, const struct object

 static int match_hash(unsigned, const unsigned char *, const unsigned char *);

+static enum cb_next match_prefix(const struct object_id *oid, void *arg)
+{
+	struct disambiguate_state *ds = arg;
+	/* no need to call match_hash, oidtree_each did prefix match */
+	update_candidates(ds, oid);
+	return ds->ambiguous ? CB_BREAK : CB_CONTINUE;
+}
+
 static void find_short_object_filename(struct disambiguate_state *ds)
 {
 	struct object_directory *odb;

-	for (odb = ds->repo->objects->odb; odb && !ds->ambiguous; odb = odb->next) {
-		int pos;
-		struct oid_array *loose_objects;
-
-		loose_objects = odb_loose_cache(odb, &ds->bin_pfx);
-		pos = oid_array_lookup(loose_objects, &ds->bin_pfx);
-		if (pos < 0)
-			pos = -1 - pos;
-		while (!ds->ambiguous && pos < loose_objects->nr) {
-			const struct object_id *oid;
-			oid = loose_objects->oid + pos;
-			if (!match_hash(ds->len, ds->bin_pfx.hash, oid->hash))
-				break;
-			update_candidates(ds, oid);
-			pos++;
-		}
-	}
+	for (odb = ds->repo->objects->odb; odb && !ds->ambiguous; odb = odb->next)
+		oidtree_each(odb_loose_cache(odb, &ds->bin_pfx),
+				&ds->bin_pfx, ds->len, match_prefix, ds);
 }

 static int match_hash(unsigned len, const unsigned char *a, const unsigned char *b)
--- a/object-store.h
+++ b/object-store.h
@ -7,6 +7,9 @@
 #include "oid-array.h"
 #include "strbuf.h"
 #include "thread-utils.h"
+#include "khash.h"
+#include "dir.h"
+#include "oidtree.h"

 struct object_directory {
 	struct object_directory *next;
@ -20,8 +23,8 @@ struct object_directory {
 	 *
 	 * Be sure to call odb_load_loose_cache() before using.
 	 */
-	char loose_objects_subdir_seen[256];
-	struct oid_array loose_objects_cache[256];
+	uint32_t loose_objects_subdir_seen[8]; /* 256 bits */
+	struct oidtree *loose_objects_cache;

 	/*
 	 * Path to the alternative object store. If this is a relative path,
@ -30,6 +33,9 @@ struct object_directory {
 	char *path;
 };

+KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
+	struct object_directory *, 1, fspathhash, fspatheq);
+
 void prepare_alt_odb(struct repository *r);
 char *compute_alternate_path(const char *path, struct strbuf *err);
 typedef int alt_odb_fn(struct object_directory *, void *);
@ -54,7 +60,7 @@ void add_to_alternates_memory(const char *dir);
 * Populate and return the loose object cache array corresponding to the
 * given object ID.
 */
-struct oid_array *odb_loose_cache(struct object_directory *odb,
+struct oidtree *odb_loose_cache(struct object_directory *odb,
 				  const struct object_id *oid);

 /* Empty the loose object cache for the specified object directory. */
@ -116,6 +122,8 @@ struct raw_object_store {
 	 */
 	struct object_directory *odb;
 	struct object_directory **odb_tail;
+	kh_odb_path_map_t *odb_by_path;
+
 	int loaded_alternates;

 	/*
--- a/object.c
+++ b/object.c
@ -511,6 +511,8 @@ static void free_object_directories(struct raw_object_store *o)
 		free_object_directory(o->odb);
 		o->odb = next;
 	}
+	kh_destroy_odb_path_map(o->odb_by_path);
+	o->odb_by_path = NULL;
 }

 void raw_object_store_clear(struct raw_object_store *o)
--- a/oidtree.c
+++ b/oidtree.c
@ -0,0 +1,104 @@
+/*
+ * A wrapper around cbtree which stores oids
+ * May be used to replace oid-array for prefix (abbreviation) matches
+ */
+#include "oidtree.h"
+#include "alloc.h"
+#include "hash.h"
+
+struct oidtree_node {
+	/* n.k[] is used to store "struct object_id" */
+	struct cb_node n;
+};
+
+struct oidtree_iter_data {
+	oidtree_iter fn;
+	void *arg;
+	size_t *last_nibble_at;
+	int algo;
+	uint8_t last_byte;
+};
+
+void oidtree_init(struct oidtree *ot)
+{
+	cb_init(&ot->tree);
+	mem_pool_init(&ot->mem_pool, 0);
+}
+
+void oidtree_clear(struct oidtree *ot)
+{
+	if (ot) {
+		mem_pool_discard(&ot->mem_pool, 0);
+		oidtree_init(ot);
+	}
+}
+
+void oidtree_insert(struct oidtree *ot, const struct object_id *oid)
+{
+	struct oidtree_node *on;
+
+	if (!oid->algo)
+		BUG("oidtree_insert requires oid->algo");
+
+	on = mem_pool_alloc(&ot->mem_pool, sizeof(*on) + sizeof(*oid));
+	oidcpy_with_padding((struct object_id *)on->n.k, oid);
+
+	/*
+	 * n.b. Current callers won't get us duplicates, here.  If a
+	 * future caller causes duplicates, there'll be a a small leak
+	 * that won't be freed until oidtree_clear.  Currently it's not
+	 * worth maintaining a free list
+	 */
+	cb_insert(&ot->tree, &on->n, sizeof(*oid));
+}
+
+
+int oidtree_contains(struct oidtree *ot, const struct object_id *oid)
+{
+	struct object_id k;
+	size_t klen = sizeof(k);
+
+	oidcpy_with_padding(&k, oid);
+
+	if (oid->algo == GIT_HASH_UNKNOWN)
+		klen -= sizeof(oid->algo);
+
+	/* cb_lookup relies on memcmp on the struct, so order matters: */
+	klen += BUILD_ASSERT_OR_ZERO(offsetof(struct object_id, hash) <
+				offsetof(struct object_id, algo));
+
+	return cb_lookup(&ot->tree, (const uint8_t *)&k, klen) ? 1 : 0;
+}
+
+static enum cb_next iter(struct cb_node *n, void *arg)
+{
+	struct oidtree_iter_data *x = arg;
+	const struct object_id *oid = (const struct object_id *)n->k;
+
+	if (x->algo != GIT_HASH_UNKNOWN && x->algo != oid->algo)
+		return CB_CONTINUE;
+
+	if (x->last_nibble_at) {
+		if ((oid->hash[*x->last_nibble_at] ^ x->last_byte) & 0xf0)
+			return CB_CONTINUE;
+	}
+
+	return x->fn(oid, x->arg);
+}
+
+void oidtree_each(struct oidtree *ot, const struct object_id *oid,
+			size_t oidhexsz, oidtree_iter fn, void *arg)
+{
+	size_t klen = oidhexsz / 2;
+	struct oidtree_iter_data x = { 0 };
+	assert(oidhexsz <= GIT_MAX_HEXSZ);
+
+	x.fn = fn;
+	x.arg = arg;
+	x.algo = oid->algo;
+	if (oidhexsz & 1) {
+		x.last_byte = oid->hash[klen];
+		x.last_nibble_at = &klen;
+	}
+	cb_each(&ot->tree, (const uint8_t *)oid, klen, iter, &x);
+}
--- a/oidtree.h
+++ b/oidtree.h
@ -0,0 +1,22 @@
+#ifndef OIDTREE_H
+#define OIDTREE_H
+
+#include "cbtree.h"
+#include "hash.h"
+#include "mem-pool.h"
+
+struct oidtree {
+	struct cb_tree tree;
+	struct mem_pool mem_pool;
+};
+
+void oidtree_init(struct oidtree *);
+void oidtree_clear(struct oidtree *);
+void oidtree_insert(struct oidtree *, const struct object_id *);
+int oidtree_contains(struct oidtree *, const struct object_id *);
+
+typedef enum cb_next (*oidtree_iter)(const struct object_id *, void *data);
+void oidtree_each(struct oidtree *, const struct object_id *,
+			size_t oidhexsz, oidtree_iter, void *data);
+
+#endif /* OIDTREE_H */
--- a/t/helper/test-oidtree.c
+++ b/t/helper/test-oidtree.c
@ -0,0 +1,49 @@
+#include "test-tool.h"
+#include "cache.h"
+#include "oidtree.h"
+
+static enum cb_next print_oid(const struct object_id *oid, void *data)
+{
+	puts(oid_to_hex(oid));
+	return CB_CONTINUE;
+}
+
+int cmd__oidtree(int argc, const char **argv)
+{
+	struct oidtree ot;
+	struct strbuf line = STRBUF_INIT;
+	int nongit_ok;
+	int algo = GIT_HASH_UNKNOWN;
+
+	oidtree_init(&ot);
+	setup_git_directory_gently(&nongit_ok);
+
+	while (strbuf_getline(&line, stdin) != EOF) {
+		const char *arg;
+		struct object_id oid;
+
+		if (skip_prefix(line.buf, "insert ", &arg)) {
+			if (get_oid_hex_any(arg, &oid) == GIT_HASH_UNKNOWN)
+				die("insert not a hexadecimal oid: %s", arg);
+			algo = oid.algo;
+			oidtree_insert(&ot, &oid);
+		} else if (skip_prefix(line.buf, "contains ", &arg)) {
+			if (get_oid_hex(arg, &oid))
+				die("contains not a hexadecimal oid: %s", arg);
+			printf("%d\n", oidtree_contains(&ot, &oid));
+		} else if (skip_prefix(line.buf, "each ", &arg)) {
+			char buf[GIT_MAX_HEXSZ + 1] = { '0' };
+			memset(&oid, 0, sizeof(oid));
+			memcpy(buf, arg, strlen(arg));
+			buf[hash_algos[algo].hexsz] = '\0';
+			get_oid_hex_any(buf, &oid);
+			oid.algo = algo;
+			oidtree_each(&ot, &oid, strlen(arg), print_oid, NULL);
+		} else if (!strcmp(line.buf, "clear")) {
+			oidtree_clear(&ot);
+		} else {
+			die("unknown command: %s", line.buf);
+		}
+	}
+	return 0;
+}
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@ -43,6 +43,7 @@ static struct test_cmd cmds[] = {
 	{ "mktemp", cmd__mktemp },
 	{ "oid-array", cmd__oid_array },
 	{ "oidmap", cmd__oidmap },
+	{ "oidtree", cmd__oidtree },
 	{ "online-cpus", cmd__online_cpus },
 	{ "parse-options", cmd__parse_options },
 	{ "parse-pathspec-file", cmd__parse_pathspec_file },
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@ -32,6 +32,7 @@ int cmd__match_trees(int argc, const char **argv);
 int cmd__mergesort(int argc, const char **argv);
 int cmd__mktemp(int argc, const char **argv);
 int cmd__oidmap(int argc, const char **argv);
+int cmd__oidtree(int argc, const char **argv);
 int cmd__online_cpus(int argc, const char **argv);
 int cmd__parse_options(int argc, const char **argv);
 int cmd__parse_pathspec_file(int argc, const char** argv);
--- a/t/t0069-oidtree.sh
+++ b/t/t0069-oidtree.sh
@ -0,0 +1,49 @@
+#!/bin/sh
+
+test_description='basic tests for the oidtree implementation'
+. ./test-lib.sh
+
+maxhexsz=$(test_oid hexsz)
+echoid () {
+	prefix="${1:+$1 }"
+	shift
+	while test $# -gt 0
+	do
+		shortoid="$1"
+		shift
+		difference=$(($maxhexsz - ${#shortoid}))
+		printf "%s%s%0${difference}d\\n" "$prefix" "$shortoid" "0"
+	done
+}
+
+test_expect_success 'oidtree insert and contains' '
+	cat >expect <<-\EOF &&
+		0
+		0
+		0
+		1
+		1
+		0
+	EOF
+	{
+		echoid insert 444 1 2 3 4 5 a b c d e &&
+		echoid contains 44 441 440 444 4440 4444
+		echo clear
+	} | test-tool oidtree >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'oidtree each' '
+	echoid "" 123 321 321 >expect &&
+	{
+		echoid insert f 9 8 123 321 a b c d e
+		echo each 12300
+		echo each 3211
+		echo each 3210
+		echo each 32100
+		echo clear
+	} | test-tool oidtree >actual &&
+	test_cmp expect actual
+'
+
+test_done