From 0fac15652322e607ef3cb9f59e46ca2b168e933a Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 1 Feb 2021 12:47:23 +0000 Subject: [PATCH 1/5] commit-reach: reduce requirements for remove_redundant() Remove a comment at the beggining of remove_redundant() that mentions a reordering of the input array to have the initial segment be the independent commits and the final segment be the redundant commits. While this behavior is followed in remove_redundant(), no callers rely on that behavior. Remove the final loop that copies this final segment and update the comment to match the new behavior. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-reach.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/commit-reach.c b/commit-reach.c index e38771ca5a..9af51fe7e0 100644 --- a/commit-reach.c +++ b/commit-reach.c @@ -160,9 +160,10 @@ static int remove_redundant(struct repository *r, struct commit **array, int cnt { /* * Some commit in the array may be an ancestor of - * another commit. Move such commit to the end of - * the array, and return the number of commits that - * are independent from each other. + * another commit. Move the independent commits to the + * beginning of 'array' and return their number. Callers + * should not rely upon the contents of 'array' after + * that number. */ struct commit **work; unsigned char *redundant; @@ -209,9 +210,6 @@ static int remove_redundant(struct repository *r, struct commit **array, int cnt for (i = filled = 0; i < cnt; i++) if (!redundant[i]) array[filled++] = work[i]; - for (j = filled, i = 0; i < cnt; i++) - if (redundant[i]) - array[j++] = work[i]; free(work); free(redundant); free(filled_index); From fbc21e3fbba982c50a3c2a273038770a249ed510 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 19 Feb 2021 12:34:07 +0000 Subject: [PATCH 2/5] commit-reach: use one walk in remove_redundant() The current implementation of remove_redundant() uses several calls to paint_down_to_common() to determine that commits are independent of each other. This leads to quadratic behavior when many inputs are passed to commands such as 'git merge-base'. For example, in the Linux kernel repository, I tested the performance by passing all tags: git merge-base --independent $(git for-each-ref refs/tags --format="$(refname)") (Note: I had to delete the tags v2.6.11-tree and v2.6.11 as they do not point to commits.) Here is the performance improvement introduced by this change: Before: 16.4s After: 1.1s This performance improvement requires the commit-graph file to be present. We keep the old algorithm around as remove_redundant_no_gen() and use it when generation_numbers_enabled() is false. This is similar to other algorithms within commit-reach.c. The new algorithm is implemented in remove_redundant_with_gen(). The basic approach is to do one commit walk instead of many. First, scan all commits in the list and mark their _parents_ with the STALE flag. This flag will indicate commits that are reachable from one of the inputs, except not including themselves. Then, walk commits until covering all commits up to the minimum generation number pushing the STALE flag throughout. At the end, we need to clear the STALE bit from all of the commits we walked. We move the non-stale commits in 'array' to the beginning of the list, and this might overwrite stale commits. However, we store an array of commits that started the walk, and use clear_commit_marks() on each of those starting commits. That method will walk the reachable commits with the STALE bit and clear them all. This makes the algorithm safe for re-entry or for other uses of those commits after this walk. This logic is covered by tests in t6600-test-reach.sh, so the behavior does not change. This is tested both in the case with a commit-graph and without. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-reach.c | 104 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 8 deletions(-) diff --git a/commit-reach.c b/commit-reach.c index 9af51fe7e0..7a3a1eb1a2 100644 --- a/commit-reach.c +++ b/commit-reach.c @@ -156,15 +156,9 @@ struct commit_list *get_octopus_merge_bases(struct commit_list *in) return ret; } -static int remove_redundant(struct repository *r, struct commit **array, int cnt) +static int remove_redundant_no_gen(struct repository *r, + struct commit **array, int cnt) { - /* - * Some commit in the array may be an ancestor of - * another commit. Move the independent commits to the - * beginning of 'array' and return their number. Callers - * should not rely upon the contents of 'array' after - * that number. - */ struct commit **work; unsigned char *redundant; int *filled_index; @@ -216,6 +210,100 @@ static int remove_redundant(struct repository *r, struct commit **array, int cnt return filled; } +static int remove_redundant_with_gen(struct repository *r, + struct commit **array, int cnt) +{ + int i, count_non_stale = 0; + timestamp_t min_generation = GENERATION_NUMBER_INFINITY; + struct commit **walk_start; + size_t walk_start_nr = 0, walk_start_alloc = cnt; + struct prio_queue queue = { compare_commits_by_gen_then_commit_date }; + + ALLOC_ARRAY(walk_start, walk_start_alloc); + + /* Mark all parents of the input as STALE */ + for (i = 0; i < cnt; i++) { + struct commit_list *parents; + timestamp_t generation; + + repo_parse_commit(r, array[i]); + parents = array[i]->parents; + + while (parents) { + repo_parse_commit(r, parents->item); + if (!(parents->item->object.flags & STALE)) { + parents->item->object.flags |= STALE; + ALLOC_GROW(walk_start, walk_start_nr + 1, walk_start_alloc); + walk_start[walk_start_nr++] = parents->item; + prio_queue_put(&queue, parents->item); + } + parents = parents->next; + } + + generation = commit_graph_generation(array[i]); + + if (generation < min_generation) + min_generation = generation; + } + + /* push the STALE bits up to min generation */ + while (queue.nr) { + struct commit_list *parents; + struct commit *c = prio_queue_get(&queue); + + repo_parse_commit(r, c); + + if (commit_graph_generation(c) < min_generation) + continue; + + parents = c->parents; + while (parents) { + if (!(parents->item->object.flags & STALE)) { + parents->item->object.flags |= STALE; + prio_queue_put(&queue, parents->item); + } + parents = parents->next; + } + } + + /* rearrange array */ + for (i = count_non_stale = 0; i < cnt; i++) { + if (!(array[i]->object.flags & STALE)) + array[count_non_stale++] = array[i]; + } + + /* clear marks */ + clear_commit_marks_many(walk_start_nr, walk_start, STALE); + free(walk_start); + + return count_non_stale; +} + +static int remove_redundant(struct repository *r, struct commit **array, int cnt) +{ + /* + * Some commit in the array may be an ancestor of + * another commit. Move the independent commits to the + * beginning of 'array' and return their number. Callers + * should not rely upon the contents of 'array' after + * that number. + */ + if (generation_numbers_enabled(r)) { + int i; + + /* + * If we have a single commit with finite generation + * number, then the _with_gen algorithm is preferred. + */ + for (i = 0; i < cnt; i++) { + if (commit_graph_generation(array[i]) < GENERATION_NUMBER_INFINITY) + return remove_redundant_with_gen(r, array, cnt); + } + } + + return remove_redundant_no_gen(r, array, cnt); +} + static struct commit_list *get_merge_bases_many_0(struct repository *r, struct commit *one, int n, From c8d693e1e6d3bd883224559cf4c1b07d9d58e0cf Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 19 Feb 2021 12:34:08 +0000 Subject: [PATCH 3/5] commit-reach: move compare_commits_by_gen Move this earlier in the file so it can be used by more methods. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-reach.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/commit-reach.c b/commit-reach.c index 7a3a1eb1a2..a34c5ba09e 100644 --- a/commit-reach.c +++ b/commit-reach.c @@ -17,6 +17,21 @@ static const unsigned all_flags = (PARENT1 | PARENT2 | STALE | RESULT); +static int compare_commits_by_gen(const void *_a, const void *_b) +{ + const struct commit *a = *(const struct commit * const *)_a; + const struct commit *b = *(const struct commit * const *)_b; + + timestamp_t generation_a = commit_graph_generation(a); + timestamp_t generation_b = commit_graph_generation(b); + + if (generation_a < generation_b) + return -1; + if (generation_a > generation_b) + return 1; + return 0; +} + static int queue_has_nonstale(struct prio_queue *queue) { int i; @@ -647,21 +662,6 @@ int commit_contains(struct ref_filter *filter, struct commit *commit, return repo_is_descendant_of(the_repository, commit, list); } -static int compare_commits_by_gen(const void *_a, const void *_b) -{ - const struct commit *a = *(const struct commit * const *)_a; - const struct commit *b = *(const struct commit * const *)_b; - - timestamp_t generation_a = commit_graph_generation(a); - timestamp_t generation_b = commit_graph_generation(b); - - if (generation_a < generation_b) - return -1; - if (generation_a > generation_b) - return 1; - return 0; -} - int can_all_from_reach_with_flag(struct object_array *from, unsigned int with_flag, unsigned int assign_flag, From 36777733713afeee31e7cf2dbb6d6a3dac186a51 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 19 Feb 2021 12:34:09 +0000 Subject: [PATCH 4/5] commit-reach: use heuristic in remove_redundant() Reachability algorithms in commit-reach.c frequently benefit from using the first-parent history as a heuristic for satisfying reachability queries. The most obvious example was implemented in 4fbcca4e (commit-reach: make can_all_from_reach... linear, 2018-07-20). Update the walk in remove_redundant() to use this same heuristic. Here, we are walking starting at the parents of the input commits. Sort those parents and walk from the highest generation to lower. Each time, use the heuristic of searching the first parent history before continuing to expand the walk. The order in which we explore the commits matters, so update compare_commits_by_gen to break generation number ties with commit date. This has no effect when the commits are in a commit-graph file with corrected commit dates computed, but it will assist when the commits are in the region "above" the commit-graph with "infinite" generation number. Note that we cannot shift to use compare_commits_by_gen_then_commit_date as the method prototype is different. We use compare_commits_by_gen for QSORT() as opposed to as a priority function. The important piece is to ensure we short-circuit the walk when we find that there is a single non-redundant commit. This happens frequently when looking for merge-bases or comparing several tags with 'git merge-base --independent'. Use a new count 'count_still_independent' and if that hits 1 we can stop walking. To update 'count_still_independent' properly, we add use of the RESULT flag on the input commits. Then we can detect when we reach one of these commits and decrease the count. We need to remove the RESULT flag at that moment because we might re-visit that commit when popping the stack. We use the STALE flag to mark parents that have been added to the new walk_start list, but we need to clear that flag before we start walking so those flags don't halt our depth-first-search walk. On my copy of the Linux kernel repository, the performance of 'git merge-base --independent ' goes from 1.1 seconds to 0.11 seconds. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-reach.c | 72 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/commit-reach.c b/commit-reach.c index a34c5ba09e..399f2a8673 100644 --- a/commit-reach.c +++ b/commit-reach.c @@ -29,6 +29,10 @@ static int compare_commits_by_gen(const void *_a, const void *_b) return -1; if (generation_a > generation_b) return 1; + if (a->date < b->date) + return -1; + if (a->date > b->date) + return 1; return 0; } @@ -228,11 +232,10 @@ static int remove_redundant_no_gen(struct repository *r, static int remove_redundant_with_gen(struct repository *r, struct commit **array, int cnt) { - int i, count_non_stale = 0; + int i, count_non_stale = 0, count_still_independent = cnt; timestamp_t min_generation = GENERATION_NUMBER_INFINITY; struct commit **walk_start; size_t walk_start_nr = 0, walk_start_alloc = cnt; - struct prio_queue queue = { compare_commits_by_gen_then_commit_date }; ALLOC_ARRAY(walk_start, walk_start_alloc); @@ -242,6 +245,7 @@ static int remove_redundant_with_gen(struct repository *r, timestamp_t generation; repo_parse_commit(r, array[i]); + array[i]->object.flags |= RESULT; parents = array[i]->parents; while (parents) { @@ -250,7 +254,6 @@ static int remove_redundant_with_gen(struct repository *r, parents->item->object.flags |= STALE; ALLOC_GROW(walk_start, walk_start_nr + 1, walk_start_alloc); walk_start[walk_start_nr++] = parents->item; - prio_queue_put(&queue, parents->item); } parents = parents->next; } @@ -261,26 +264,63 @@ static int remove_redundant_with_gen(struct repository *r, min_generation = generation; } - /* push the STALE bits up to min generation */ - while (queue.nr) { - struct commit_list *parents; - struct commit *c = prio_queue_get(&queue); + QSORT(walk_start, walk_start_nr, compare_commits_by_gen); - repo_parse_commit(r, c); + /* remove STALE bit for now to allow walking through parents */ + for (i = 0; i < walk_start_nr; i++) + walk_start[i]->object.flags &= ~STALE; - if (commit_graph_generation(c) < min_generation) - continue; + /* + * Start walking from the highest generation. Hopefully, it will + * find all other items during the first-parent walk, and we can + * terminate early. Otherwise, we will do the same amount of work + * as before. + */ + for (i = walk_start_nr - 1; i >= 0 && count_still_independent > 1; i--) { + /* push the STALE bits up to min generation */ + struct commit_list *stack = NULL; - parents = c->parents; - while (parents) { - if (!(parents->item->object.flags & STALE)) { - parents->item->object.flags |= STALE; - prio_queue_put(&queue, parents->item); + commit_list_insert(walk_start[i], &stack); + walk_start[i]->object.flags |= STALE; + + while (stack) { + struct commit_list *parents; + struct commit *c = stack->item; + + repo_parse_commit(r, c); + + if (c->object.flags & RESULT) { + c->object.flags &= ~RESULT; + if (--count_still_independent <= 1) + break; } - parents = parents->next; + + if (commit_graph_generation(c) < min_generation) { + pop_commit(&stack); + continue; + } + + parents = c->parents; + while (parents) { + if (!(parents->item->object.flags & STALE)) { + parents->item->object.flags |= STALE; + commit_list_insert(parents->item, &stack); + break; + } + parents = parents->next; + } + + /* pop if all parents have been visited already */ + if (!parents) + pop_commit(&stack); } + free_commit_list(stack); } + /* clear result */ + for (i = 0; i < cnt; i++) + array[i]->object.flags &= ~RESULT; + /* rearrange array */ for (i = count_non_stale = 0; i < cnt; i++) { if (!(array[i]->object.flags & STALE)) From 41f3c9949fc384f6a122e7d23bc36b7d9bb96ce2 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 19 Feb 2021 12:34:10 +0000 Subject: [PATCH 5/5] commit-reach: stale commits may prune generation further The remove_redundant_with_gen() algorithm performs a depth-first-search to find commits in the 'array' list, starting at the parents of each commit in 'array'. The result is that commits in 'array' are marked STALE when they are reachable from another commit in 'array'. This depth-first-search is fast when commits lie on or near the first-parent history of the higher commits. The search terminates early if all but one commit becomes marked STALE. However, it is possible that there are two independent commits with high generation number. In that case, the depth-first-search might languish by searching in lower generations due to the fixed min_generation used throughout the method. With the expectation that commits with lower generation are expected to become STALE more often, we can optimize further by increasing that min_generation boundary upon discovery of the commit with minimum generation. We must first sort the commits in 'array' by generation. We cannot sort 'array' itself since it must preserve relative order among the returned results (see revision.c:mark_redundant_parents() for an example). This simplifies the initialization of min_generation, but it also allows us to increase the new min_generation when we find the commit with smallest generation remaining. This requires more than two commits in order to test, so I used the Linux kernel repository with a few commits that are slightly off of the first-parent history. I timed the following command: git merge-base --independent 2ecedd756908 d2360a398f0b \ 1253935ad801 160bab43419e 0e2209629fec 1d0e16ac1a9e The first two commits have similar generation and are near the v5.10 tag. Commit 160bab43419e is off of the first-parent history behind v5.5, while the others are scattered somewhere reachable from v5.9. This is designed to demonstrate the optimization, as that commit within v5.5 would normally cause a lot of extra commit walking. Since remove_redundant_with_alg() is called only when at least one of the input commits has a finite generation number, this algorithm is tested with a commit-graph generated starting at a number of different tags, the earliest being v5.5. commit-graph at v5.5: | Method | Time | |-----------------------+-------| | *_no_gen() | 864ms | | *_with_gen() (before) | 858ms | | *_with_gen() (after) | 810ms | commit-graph at v5.7: | Method | Time | |-----------------------+-------| | *_no_gen() | 625ms | | *_with_gen() (before) | 572ms | | *_with_gen() (after) | 517ms | commit-graph at v5.9: | Method | Time | |-----------------------+-------| | *_no_gen() | 268ms | | *_with_gen() (before) | 224ms | | *_with_gen() (after) | 202ms | commit-graph at v5.10: | Method | Time | |-----------------------+-------| | *_no_gen() | 72ms | | *_with_gen() (before) | 37ms | | *_with_gen() (after) | 9ms | Note that these are only modest improvements for the case where the two independent commits are not in the commit-graph (not until v5.10). All algorithms get faster as more commits are indexed, which is not a surprise. However, the cost of walking extra commits is more and more prevalent in relative terms as more commits are indexed. Finally, the last case allows us to jump to the minimum generation between the last two commits (that are actually independent) so we greatly reduce the cost in that case. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-reach.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/commit-reach.c b/commit-reach.c index 399f2a8673..2ea84d3dc0 100644 --- a/commit-reach.c +++ b/commit-reach.c @@ -234,15 +234,27 @@ static int remove_redundant_with_gen(struct repository *r, { int i, count_non_stale = 0, count_still_independent = cnt; timestamp_t min_generation = GENERATION_NUMBER_INFINITY; - struct commit **walk_start; + struct commit **walk_start, **sorted; size_t walk_start_nr = 0, walk_start_alloc = cnt; + int min_gen_pos = 0; + + /* + * Sort the input by generation number, ascending. This allows + * us to increase the "min_generation" limit when we discover + * the commit with lowest generation is STALE. The index + * min_gen_pos points to the current position within 'array' + * that is not yet known to be STALE. + */ + ALLOC_ARRAY(sorted, cnt); + COPY_ARRAY(sorted, array, cnt); + QSORT(sorted, cnt, compare_commits_by_gen); + min_generation = commit_graph_generation(sorted[0]); ALLOC_ARRAY(walk_start, walk_start_alloc); /* Mark all parents of the input as STALE */ for (i = 0; i < cnt; i++) { struct commit_list *parents; - timestamp_t generation; repo_parse_commit(r, array[i]); array[i]->object.flags |= RESULT; @@ -257,11 +269,6 @@ static int remove_redundant_with_gen(struct repository *r, } parents = parents->next; } - - generation = commit_graph_generation(array[i]); - - if (generation < min_generation) - min_generation = generation; } QSORT(walk_start, walk_start_nr, compare_commits_by_gen); @@ -293,6 +300,12 @@ static int remove_redundant_with_gen(struct repository *r, c->object.flags &= ~RESULT; if (--count_still_independent <= 1) break; + if (oideq(&c->object.oid, &sorted[min_gen_pos]->object.oid)) { + while (min_gen_pos < cnt - 1 && + (sorted[min_gen_pos]->object.flags & STALE)) + min_gen_pos++; + min_generation = commit_graph_generation(sorted[min_gen_pos]); + } } if (commit_graph_generation(c) < min_generation) { @@ -316,6 +329,7 @@ static int remove_redundant_with_gen(struct repository *r, } free_commit_list(stack); } + free(sorted); /* clear result */ for (i = 0; i < cnt; i++)