diff --git a/Makefile b/Makefile index c3565fc0f8..a1525978fb 100644 --- a/Makefile +++ b/Makefile @@ -722,6 +722,7 @@ TEST_BUILTINS_OBJS += test-mergesort.o TEST_BUILTINS_OBJS += test-mktemp.o TEST_BUILTINS_OBJS += test-oid-array.o TEST_BUILTINS_OBJS += test-oidmap.o +TEST_BUILTINS_OBJS += test-oidtree.o TEST_BUILTINS_OBJS += test-online-cpus.o TEST_BUILTINS_OBJS += test-parse-options.o TEST_BUILTINS_OBJS += test-parse-pathspec-file.o @@ -845,6 +846,7 @@ LIB_OBJS += branch.o LIB_OBJS += bulk-checkin.o LIB_OBJS += bundle.o LIB_OBJS += cache-tree.o +LIB_OBJS += cbtree.o LIB_OBJS += chdir-notify.o LIB_OBJS += checkout.o LIB_OBJS += chunk-format.o @@ -940,6 +942,7 @@ LIB_OBJS += object.o LIB_OBJS += oid-array.o LIB_OBJS += oidmap.o LIB_OBJS += oidset.o +LIB_OBJS += oidtree.o LIB_OBJS += pack-bitmap-write.o LIB_OBJS += pack-bitmap.o LIB_OBJS += pack-check.o diff --git a/cbtree.c b/cbtree.c new file mode 100644 index 0000000000..b0c65d810f --- /dev/null +++ b/cbtree.c @@ -0,0 +1,167 @@ +/* + * crit-bit tree implementation, does no allocations internally + * For more information on crit-bit trees: https://cr.yp.to/critbit.html + * Based on Adam Langley's adaptation of Dan Bernstein's public domain code + * git clone https://github.com/agl/critbit.git + */ +#include "cbtree.h" + +static struct cb_node *cb_node_of(const void *p) +{ + return (struct cb_node *)((uintptr_t)p - 1); +} + +/* locate the best match, does not do a final comparision */ +static struct cb_node *cb_internal_best_match(struct cb_node *p, + const uint8_t *k, size_t klen) +{ + while (1 & (uintptr_t)p) { + struct cb_node *q = cb_node_of(p); + uint8_t c = q->byte < klen ? k[q->byte] : 0; + size_t direction = (1 + (q->otherbits | c)) >> 8; + + p = q->child[direction]; + } + return p; +} + +/* returns NULL if successful, existing cb_node if duplicate */ +struct cb_node *cb_insert(struct cb_tree *t, struct cb_node *node, size_t klen) +{ + size_t newbyte, newotherbits; + uint8_t c; + int newdirection; + struct cb_node **wherep, *p; + + assert(!((uintptr_t)node & 1)); /* allocations must be aligned */ + + if (!t->root) { /* insert into empty tree */ + t->root = node; + return NULL; /* success */ + } + + /* see if a node already exists */ + p = cb_internal_best_match(t->root, node->k, klen); + + /* find first differing byte */ + for (newbyte = 0; newbyte < klen; newbyte++) { + if (p->k[newbyte] != node->k[newbyte]) + goto different_byte_found; + } + return p; /* element exists, let user deal with it */ + +different_byte_found: + newotherbits = p->k[newbyte] ^ node->k[newbyte]; + newotherbits |= newotherbits >> 1; + newotherbits |= newotherbits >> 2; + newotherbits |= newotherbits >> 4; + newotherbits = (newotherbits & ~(newotherbits >> 1)) ^ 255; + c = p->k[newbyte]; + newdirection = (1 + (newotherbits | c)) >> 8; + + node->byte = newbyte; + node->otherbits = newotherbits; + node->child[1 - newdirection] = node; + + /* find a place to insert it */ + wherep = &t->root; + for (;;) { + struct cb_node *q; + size_t direction; + + p = *wherep; + if (!(1 & (uintptr_t)p)) + break; + q = cb_node_of(p); + if (q->byte > newbyte) + break; + if (q->byte == newbyte && q->otherbits > newotherbits) + break; + c = q->byte < klen ? node->k[q->byte] : 0; + direction = (1 + (q->otherbits | c)) >> 8; + wherep = q->child + direction; + } + + node->child[newdirection] = *wherep; + *wherep = (struct cb_node *)(1 + (uintptr_t)node); + + return NULL; /* success */ +} + +struct cb_node *cb_lookup(struct cb_tree *t, const uint8_t *k, size_t klen) +{ + struct cb_node *p = cb_internal_best_match(t->root, k, klen); + + return p && !memcmp(p->k, k, klen) ? p : NULL; +} + +struct cb_node *cb_unlink(struct cb_tree *t, const uint8_t *k, size_t klen) +{ + struct cb_node **wherep = &t->root; + struct cb_node **whereq = NULL; + struct cb_node *q = NULL; + size_t direction = 0; + uint8_t c; + struct cb_node *p = t->root; + + if (!p) return NULL; /* empty tree, nothing to delete */ + + /* traverse to find best match, keeping link to parent */ + while (1 & (uintptr_t)p) { + whereq = wherep; + q = cb_node_of(p); + c = q->byte < klen ? k[q->byte] : 0; + direction = (1 + (q->otherbits | c)) >> 8; + wherep = q->child + direction; + p = *wherep; + } + + if (memcmp(p->k, k, klen)) + return NULL; /* no match, nothing unlinked */ + + /* found an exact match */ + if (whereq) /* update parent */ + *whereq = q->child[1 - direction]; + else + t->root = NULL; + return p; +} + +static enum cb_next cb_descend(struct cb_node *p, cb_iter fn, void *arg) +{ + if (1 & (uintptr_t)p) { + struct cb_node *q = cb_node_of(p); + enum cb_next n = cb_descend(q->child[0], fn, arg); + + return n == CB_BREAK ? n : cb_descend(q->child[1], fn, arg); + } else { + return fn(p, arg); + } +} + +void cb_each(struct cb_tree *t, const uint8_t *kpfx, size_t klen, + cb_iter fn, void *arg) +{ + struct cb_node *p = t->root; + struct cb_node *top = p; + size_t i = 0; + + if (!p) return; /* empty tree */ + + /* Walk tree, maintaining top pointer */ + while (1 & (uintptr_t)p) { + struct cb_node *q = cb_node_of(p); + uint8_t c = q->byte < klen ? kpfx[q->byte] : 0; + size_t direction = (1 + (q->otherbits | c)) >> 8; + + p = q->child[direction]; + if (q->byte < klen) + top = p; + } + + for (i = 0; i < klen; i++) { + if (p->k[i] != kpfx[i]) + return; /* "best" match failed */ + } + cb_descend(top, fn, arg); +} diff --git a/cbtree.h b/cbtree.h new file mode 100644 index 0000000000..fe4587087e --- /dev/null +++ b/cbtree.h @@ -0,0 +1,56 @@ +/* + * crit-bit tree implementation, does no allocations internally + * For more information on crit-bit trees: https://cr.yp.to/critbit.html + * Based on Adam Langley's adaptation of Dan Bernstein's public domain code + * git clone https://github.com/agl/critbit.git + * + * This is adapted to store arbitrary data (not just NUL-terminated C strings + * and allocates no memory internally. The user needs to allocate + * "struct cb_node" and fill cb_node.k[] with arbitrary match data + * for memcmp. + * If "klen" is variable, then it should be embedded into "c_node.k[]" + * Recursion is bound by the maximum value of "klen" used. + */ +#ifndef CBTREE_H +#define CBTREE_H + +#include "git-compat-util.h" + +struct cb_node; +struct cb_node { + struct cb_node *child[2]; + /* + * n.b. uint32_t for `byte' is excessive for OIDs, + * we may consider shorter variants if nothing else gets stored. + */ + uint32_t byte; + uint8_t otherbits; + uint8_t k[FLEX_ARRAY]; /* arbitrary data */ +}; + +struct cb_tree { + struct cb_node *root; +}; + +enum cb_next { + CB_CONTINUE = 0, + CB_BREAK = 1 +}; + +#define CBTREE_INIT { .root = NULL } + +static inline void cb_init(struct cb_tree *t) +{ + t->root = NULL; +} + +struct cb_node *cb_lookup(struct cb_tree *, const uint8_t *k, size_t klen); +struct cb_node *cb_insert(struct cb_tree *, struct cb_node *, size_t klen); +struct cb_node *cb_unlink(struct cb_tree *t, const uint8_t *k, size_t klen); + +typedef enum cb_next (*cb_iter)(struct cb_node *, void *arg); + +void cb_each(struct cb_tree *, const uint8_t *kpfx, size_t klen, + cb_iter, void *arg); + +#endif /* CBTREE_H */ diff --git a/object-file.c b/object-file.c index 91ded8c22a..35f3e7e9bb 100644 --- a/object-file.c +++ b/object-file.c @@ -1173,7 +1173,7 @@ static int quick_has_loose(struct repository *r, prepare_alt_odb(r); for (odb = r->objects->odb; odb; odb = odb->next) { - if (oid_array_lookup(odb_loose_cache(odb, oid), oid) >= 0) + if (oidtree_contains(odb_loose_cache(odb, oid), oid)) return 1; } return 0; @@ -2452,11 +2452,11 @@ int for_each_loose_object(each_loose_object_fn cb, void *data, static int append_loose_object(const struct object_id *oid, const char *path, void *data) { - oid_array_append(data, oid); + oidtree_insert(data, oid); return 0; } -struct oid_array *odb_loose_cache(struct object_directory *odb, +struct oidtree *odb_loose_cache(struct object_directory *odb, const struct object_id *oid) { int subdir_nr = oid->hash[0]; @@ -2472,24 +2472,25 @@ struct oid_array *odb_loose_cache(struct object_directory *odb, bitmap = &odb->loose_objects_subdir_seen[word_index]; if (*bitmap & mask) - return &odb->loose_objects_cache[subdir_nr]; - + return odb->loose_objects_cache; + if (!odb->loose_objects_cache) { + ALLOC_ARRAY(odb->loose_objects_cache, 1); + oidtree_init(odb->loose_objects_cache); + } strbuf_addstr(&buf, odb->path); for_each_file_in_obj_subdir(subdir_nr, &buf, append_loose_object, NULL, NULL, - &odb->loose_objects_cache[subdir_nr]); + odb->loose_objects_cache); *bitmap |= mask; strbuf_release(&buf); - return &odb->loose_objects_cache[subdir_nr]; + return odb->loose_objects_cache; } void odb_clear_loose_cache(struct object_directory *odb) { - int i; - - for (i = 0; i < ARRAY_SIZE(odb->loose_objects_cache); i++) - oid_array_clear(&odb->loose_objects_cache[i]); + oidtree_clear(odb->loose_objects_cache); + FREE_AND_NULL(odb->loose_objects_cache); memset(&odb->loose_objects_subdir_seen, 0, sizeof(odb->loose_objects_subdir_seen)); } diff --git a/object-name.c b/object-name.c index 64202de60b..3263c19457 100644 --- a/object-name.c +++ b/object-name.c @@ -87,27 +87,21 @@ static void update_candidates(struct disambiguate_state *ds, const struct object static int match_hash(unsigned, const unsigned char *, const unsigned char *); +static enum cb_next match_prefix(const struct object_id *oid, void *arg) +{ + struct disambiguate_state *ds = arg; + /* no need to call match_hash, oidtree_each did prefix match */ + update_candidates(ds, oid); + return ds->ambiguous ? CB_BREAK : CB_CONTINUE; +} + static void find_short_object_filename(struct disambiguate_state *ds) { struct object_directory *odb; - for (odb = ds->repo->objects->odb; odb && !ds->ambiguous; odb = odb->next) { - int pos; - struct oid_array *loose_objects; - - loose_objects = odb_loose_cache(odb, &ds->bin_pfx); - pos = oid_array_lookup(loose_objects, &ds->bin_pfx); - if (pos < 0) - pos = -1 - pos; - while (!ds->ambiguous && pos < loose_objects->nr) { - const struct object_id *oid; - oid = loose_objects->oid + pos; - if (!match_hash(ds->len, ds->bin_pfx.hash, oid->hash)) - break; - update_candidates(ds, oid); - pos++; - } - } + for (odb = ds->repo->objects->odb; odb && !ds->ambiguous; odb = odb->next) + oidtree_each(odb_loose_cache(odb, &ds->bin_pfx), + &ds->bin_pfx, ds->len, match_prefix, ds); } static int match_hash(unsigned len, const unsigned char *a, const unsigned char *b) diff --git a/object-store.h b/object-store.h index ab6d469970..e679acc4c3 100644 --- a/object-store.h +++ b/object-store.h @@ -9,6 +9,7 @@ #include "thread-utils.h" #include "khash.h" #include "dir.h" +#include "oidtree.h" struct object_directory { struct object_directory *next; @@ -23,7 +24,7 @@ struct object_directory { * Be sure to call odb_load_loose_cache() before using. */ uint32_t loose_objects_subdir_seen[8]; /* 256 bits */ - struct oid_array loose_objects_cache[256]; + struct oidtree *loose_objects_cache; /* * Path to the alternative object store. If this is a relative path, @@ -59,7 +60,7 @@ void add_to_alternates_memory(const char *dir); * Populate and return the loose object cache array corresponding to the * given object ID. */ -struct oid_array *odb_loose_cache(struct object_directory *odb, +struct oidtree *odb_loose_cache(struct object_directory *odb, const struct object_id *oid); /* Empty the loose object cache for the specified object directory. */ diff --git a/oidtree.c b/oidtree.c new file mode 100644 index 0000000000..7eb0e9ba05 --- /dev/null +++ b/oidtree.c @@ -0,0 +1,104 @@ +/* + * A wrapper around cbtree which stores oids + * May be used to replace oid-array for prefix (abbreviation) matches + */ +#include "oidtree.h" +#include "alloc.h" +#include "hash.h" + +struct oidtree_node { + /* n.k[] is used to store "struct object_id" */ + struct cb_node n; +}; + +struct oidtree_iter_data { + oidtree_iter fn; + void *arg; + size_t *last_nibble_at; + int algo; + uint8_t last_byte; +}; + +void oidtree_init(struct oidtree *ot) +{ + cb_init(&ot->tree); + mem_pool_init(&ot->mem_pool, 0); +} + +void oidtree_clear(struct oidtree *ot) +{ + if (ot) { + mem_pool_discard(&ot->mem_pool, 0); + oidtree_init(ot); + } +} + +void oidtree_insert(struct oidtree *ot, const struct object_id *oid) +{ + struct oidtree_node *on; + + if (!oid->algo) + BUG("oidtree_insert requires oid->algo"); + + on = mem_pool_alloc(&ot->mem_pool, sizeof(*on) + sizeof(*oid)); + oidcpy_with_padding((struct object_id *)on->n.k, oid); + + /* + * n.b. Current callers won't get us duplicates, here. If a + * future caller causes duplicates, there'll be a a small leak + * that won't be freed until oidtree_clear. Currently it's not + * worth maintaining a free list + */ + cb_insert(&ot->tree, &on->n, sizeof(*oid)); +} + + +int oidtree_contains(struct oidtree *ot, const struct object_id *oid) +{ + struct object_id k; + size_t klen = sizeof(k); + + oidcpy_with_padding(&k, oid); + + if (oid->algo == GIT_HASH_UNKNOWN) + klen -= sizeof(oid->algo); + + /* cb_lookup relies on memcmp on the struct, so order matters: */ + klen += BUILD_ASSERT_OR_ZERO(offsetof(struct object_id, hash) < + offsetof(struct object_id, algo)); + + return cb_lookup(&ot->tree, (const uint8_t *)&k, klen) ? 1 : 0; +} + +static enum cb_next iter(struct cb_node *n, void *arg) +{ + struct oidtree_iter_data *x = arg; + const struct object_id *oid = (const struct object_id *)n->k; + + if (x->algo != GIT_HASH_UNKNOWN && x->algo != oid->algo) + return CB_CONTINUE; + + if (x->last_nibble_at) { + if ((oid->hash[*x->last_nibble_at] ^ x->last_byte) & 0xf0) + return CB_CONTINUE; + } + + return x->fn(oid, x->arg); +} + +void oidtree_each(struct oidtree *ot, const struct object_id *oid, + size_t oidhexsz, oidtree_iter fn, void *arg) +{ + size_t klen = oidhexsz / 2; + struct oidtree_iter_data x = { 0 }; + assert(oidhexsz <= GIT_MAX_HEXSZ); + + x.fn = fn; + x.arg = arg; + x.algo = oid->algo; + if (oidhexsz & 1) { + x.last_byte = oid->hash[klen]; + x.last_nibble_at = &klen; + } + cb_each(&ot->tree, (const uint8_t *)oid, klen, iter, &x); +} diff --git a/oidtree.h b/oidtree.h new file mode 100644 index 0000000000..77898f510a --- /dev/null +++ b/oidtree.h @@ -0,0 +1,22 @@ +#ifndef OIDTREE_H +#define OIDTREE_H + +#include "cbtree.h" +#include "hash.h" +#include "mem-pool.h" + +struct oidtree { + struct cb_tree tree; + struct mem_pool mem_pool; +}; + +void oidtree_init(struct oidtree *); +void oidtree_clear(struct oidtree *); +void oidtree_insert(struct oidtree *, const struct object_id *); +int oidtree_contains(struct oidtree *, const struct object_id *); + +typedef enum cb_next (*oidtree_iter)(const struct object_id *, void *data); +void oidtree_each(struct oidtree *, const struct object_id *, + size_t oidhexsz, oidtree_iter, void *data); + +#endif /* OIDTREE_H */ diff --git a/t/helper/test-oidtree.c b/t/helper/test-oidtree.c new file mode 100644 index 0000000000..180ee28dd9 --- /dev/null +++ b/t/helper/test-oidtree.c @@ -0,0 +1,49 @@ +#include "test-tool.h" +#include "cache.h" +#include "oidtree.h" + +static enum cb_next print_oid(const struct object_id *oid, void *data) +{ + puts(oid_to_hex(oid)); + return CB_CONTINUE; +} + +int cmd__oidtree(int argc, const char **argv) +{ + struct oidtree ot; + struct strbuf line = STRBUF_INIT; + int nongit_ok; + int algo = GIT_HASH_UNKNOWN; + + oidtree_init(&ot); + setup_git_directory_gently(&nongit_ok); + + while (strbuf_getline(&line, stdin) != EOF) { + const char *arg; + struct object_id oid; + + if (skip_prefix(line.buf, "insert ", &arg)) { + if (get_oid_hex_any(arg, &oid) == GIT_HASH_UNKNOWN) + die("insert not a hexadecimal oid: %s", arg); + algo = oid.algo; + oidtree_insert(&ot, &oid); + } else if (skip_prefix(line.buf, "contains ", &arg)) { + if (get_oid_hex(arg, &oid)) + die("contains not a hexadecimal oid: %s", arg); + printf("%d\n", oidtree_contains(&ot, &oid)); + } else if (skip_prefix(line.buf, "each ", &arg)) { + char buf[GIT_MAX_HEXSZ + 1] = { '0' }; + memset(&oid, 0, sizeof(oid)); + memcpy(buf, arg, strlen(arg)); + buf[hash_algos[algo].hexsz] = '\0'; + get_oid_hex_any(buf, &oid); + oid.algo = algo; + oidtree_each(&ot, &oid, strlen(arg), print_oid, NULL); + } else if (!strcmp(line.buf, "clear")) { + oidtree_clear(&ot); + } else { + die("unknown command: %s", line.buf); + } + } + return 0; +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index c5bd0c6d4c..9d37debf28 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -43,6 +43,7 @@ static struct test_cmd cmds[] = { { "mktemp", cmd__mktemp }, { "oid-array", cmd__oid_array }, { "oidmap", cmd__oidmap }, + { "oidtree", cmd__oidtree }, { "online-cpus", cmd__online_cpus }, { "parse-options", cmd__parse_options }, { "parse-pathspec-file", cmd__parse_pathspec_file }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index e8069a3b22..f683a2f59c 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -32,6 +32,7 @@ int cmd__match_trees(int argc, const char **argv); int cmd__mergesort(int argc, const char **argv); int cmd__mktemp(int argc, const char **argv); int cmd__oidmap(int argc, const char **argv); +int cmd__oidtree(int argc, const char **argv); int cmd__online_cpus(int argc, const char **argv); int cmd__parse_options(int argc, const char **argv); int cmd__parse_pathspec_file(int argc, const char** argv); diff --git a/t/t0069-oidtree.sh b/t/t0069-oidtree.sh new file mode 100755 index 0000000000..bfb1397d7b --- /dev/null +++ b/t/t0069-oidtree.sh @@ -0,0 +1,49 @@ +#!/bin/sh + +test_description='basic tests for the oidtree implementation' +. ./test-lib.sh + +maxhexsz=$(test_oid hexsz) +echoid () { + prefix="${1:+$1 }" + shift + while test $# -gt 0 + do + shortoid="$1" + shift + difference=$(($maxhexsz - ${#shortoid})) + printf "%s%s%0${difference}d\\n" "$prefix" "$shortoid" "0" + done +} + +test_expect_success 'oidtree insert and contains' ' + cat >expect <<-\EOF && + 0 + 0 + 0 + 1 + 1 + 0 + EOF + { + echoid insert 444 1 2 3 4 5 a b c d e && + echoid contains 44 441 440 444 4440 4444 + echo clear + } | test-tool oidtree >actual && + test_cmp expect actual +' + +test_expect_success 'oidtree each' ' + echoid "" 123 321 321 >expect && + { + echoid insert f 9 8 123 321 a b c d e + echo each 12300 + echo each 3211 + echo each 3210 + echo each 32100 + echo clear + } | test-tool oidtree >actual && + test_cmp expect actual +' + +test_done