From 2dd804cd12143741ea4188346fba250e821609b5 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 12 Jul 2022 19:10:28 -0400 Subject: [PATCH 1/3] t5318: demonstrate commit-graph generation v2 corruption When upgrading a commit-graph using generation v1 to one using generation v2, it is possible to force Git into a corrupt state where it (incorrectly) believes that a GDO2 chunk is necessary, *after* deciding not to write one. This makes subsequent reads using the commit-graph produce the following error message: fatal: commit-graph requires overflow generation data but has none Demonstrate this bug by increasing our test coverage to include a minimal example of upgrading a commit-graph from generation v1 to v2. The only notable components of this test are: - The committer date of the commit is chosen carefully so that the offset underflows when computed using a v1 generation number, but would not overflow when using v2 generation numbers. - The upgrade to generation number v2 must read in the v1 generation numbers, which we can do by passing `--changed-paths`, which will force the commit-graph internals to call `fill_commit_graph_info()`. A future patch will squash this bug. Reported-by: Jeff King Reproduced-by: Will Chandler Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/t5318-commit-graph.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index fbf0d64578..b85d07f60a 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -811,4 +811,31 @@ test_expect_success 'set up and verify repo with generation data overflow chunk' graph_git_behavior 'generation data overflow chunk repo' repo left right +test_expect_failure 'overflow during generation version upgrade' ' + git init overflow-v2-upgrade && + ( + cd overflow-v2-upgrade && + + # This commit will have a date at two seconds past the Epoch, + # and a (v1) generation number of 1, since it is a root commit. 
+ # + # The offset will then be computed as 1-2, which will underflow + # to 2^31, which is greater than the v2 offset small limit of + # 2^31-1. + # + # This is sufficient to need a large offset table for the v2 + # generation numbers. + test_commit --date "@2 +0000" base && + git repack -d && + + # Test that upgrading from generation v1 to v2 correctly + # produces the overflow table. + git -c commitGraph.generationVersion=1 commit-graph write && + git -c commitGraph.generationVersion=2 commit-graph write \ + --changed-paths && + + git rev-list --all + ) +' + test_done From 7805360b7a3be02057385bc9d17aa493120b9538 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 12 Jul 2022 19:10:31 -0400 Subject: [PATCH 2/3] commit-graph: introduce `repo_find_commit_pos_in_graph()` Low-level callers in systems that are adjacent to the commit-graph (like the changed-path Bloom filter code) could benefit from being able to call a function like `parse_commit_in_graph()` without modifying the corresponding commit slab data. This is useful in contexts where that slab data is being used to prepare for an upcoming commit-graph write, where Git must be careful to avoid clobbering any of that data during a read operation. Introduce a low-level variant of `parse_commit_in_graph()` which returns the graph position of a given commit only, without modifying any of the slab data. 
Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- commit-graph.c | 12 +++++++++--- commit-graph.h | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 441b36016b..7f69d8c2ec 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -898,6 +898,14 @@ static int find_commit_pos_in_graph(struct commit *item, struct commit_graph *g, } } +int repo_find_commit_pos_in_graph(struct repository *r, struct commit *c, + uint32_t *pos) +{ + if (!prepare_commit_graph(r)) + return 0; + return find_commit_pos_in_graph(c, r->objects->commit_graph, pos); +} + struct commit *lookup_commit_in_graph(struct repository *repo, const struct object_id *id) { struct commit *commit; @@ -955,9 +963,7 @@ int parse_commit_in_graph(struct repository *r, struct commit *item) void load_commit_graph_info(struct repository *r, struct commit *item) { uint32_t pos; - if (!prepare_commit_graph(r)) - return; - if (find_commit_pos_in_graph(item, r->objects->commit_graph, &pos)) + if (repo_find_commit_pos_in_graph(r, item, &pos)) fill_commit_graph_info(item, r->objects->commit_graph, pos); } diff --git a/commit-graph.h b/commit-graph.h index 2e3ac35237..f23b9e9026 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -40,6 +40,21 @@ int open_commit_graph(const char *graph_file, int *fd, struct stat *st); */ int parse_commit_in_graph(struct repository *r, struct commit *item); +/* + * Fills `*pos` with the graph position of `c`, and returns 1 if `c` is + * found in the commit-graph belonging to `r`, or 0 otherwise. + * Initializes the commit-graph belonging to `r` if it hasn't been + * already. + * + * Note: this is a low-level helper that does not alter any slab data + * associated with `c`. Useful in circumstances where the slab data is + * already being modified (e.g., writing the commit-graph itself). + * + * In most cases, callers should use `parse_commit_in_graph()` instead. 
+ */ +int repo_find_commit_pos_in_graph(struct repository *r, struct commit *c, + uint32_t *pos); + /* * Look up the given commit ID in the commit-graph. This will only return a * commit if the ID exists both in the graph and in the object database such From 9550f6c16a8be18bd4868909d4d5e29d05bd9733 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 12 Jul 2022 19:10:33 -0400 Subject: [PATCH 3/3] commit-graph: fix corrupt upgrade from generation v1 to v2 The previous commit demonstrates a bug where a commit-graph using generation v2 could enter a state where one of the GDA2 values has its most-significant bit set (indicating that its value should be read from the extended offset table in the GDO2 chunk) without having a GDO2 chunk to read from. This results in the following error message being displayed to the caller: fatal: commit-graph requires overflow generation data but has none This bug arises in the following scenario: - We decide to write a commit-graph using generation number v2, and decide (correctly) that no GDO2 chunk is necessary (e.g., because all of the committer date offsets are no larger than 2^31-1). - The v2 generation numbers are stored in the `->generation` member of the commit slab holding `struct commit_graph_data`'s. - Later on, `load_commit_graph_info()` is called, overwriting the v2 generation data in the aforementioned slab with any existing v1 generation data. Then, when the commit-graph code goes to write the GDA2 chunk via `write_graph_chunk_generation_data()`, we use the overwritten generation v1 data in a place where we expect to use a v2 generation number: offset = commit_graph_data_at(c)->generation - c->date; ...because `commit_graph_data_at(c)->generation` used to hold the v2 generation data, but it was overwritten to contain the v1 generation number via `load_commit_graph_info()`. 
If the `offset` computation above overflows the v2 generation number max, then `write_graph_chunk_generation_data()` will update its count of large offsets and write the marker accordingly: if (offset > GENERATION_NUMBER_V2_OFFSET_MAX) { offset = CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW | num_generation_data_overflows; num_generation_data_overflows++; } and reads will look for the GDO2 chunk containing the overflowing v2 generation number, *after* the commit-graph code decided that no such chunk was necessary. The main problem is that the slab containing `struct commit_graph_data` has a dual purpose. It is used to hold data that we are about to write to disk while generating a commit-graph, as well as hold data that was read from an existing commit-graph. When the two mix, namely when the result of reading the commit-graph has a side-effect that mixes poorly with an in-progress commit-graph write, we end up with corrupt data. A complete fix might be to introduce a new slab that is used exclusively for writing, and gate access between the two slabs based on context provided by the caller (e.g., whether this computation is part of a "read" or "write" operation). But a more minimal fix addresses the only known path which overwrites the slab data, which is `compute_bloom_filters()` -> `get_or_compute_bloom_filter()` -> `load_commit_graph_info()` -> `fill_commit_graph_info()` by avoiding the last call which clobbers the data altogether. This path only needs to learn the graph position of a given commit so that it can be used in `load_bloom_filter_from_graph()`. By replacing the last steps of the above with one that records the graph position into a temporary variable which is then used to load the existing Bloom data, we eliminate the clobbering, removing the corruption. 
Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- bloom.c | 10 +++++----- t/t5318-commit-graph.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bloom.c b/bloom.c index 5e297038bb..816f063dca 100644 --- a/bloom.c +++ b/bloom.c @@ -30,10 +30,9 @@ static inline unsigned char get_bitmask(uint32_t pos) static int load_bloom_filter_from_graph(struct commit_graph *g, struct bloom_filter *filter, - struct commit *c) + uint32_t graph_pos) { uint32_t lex_pos, start_index, end_index; - uint32_t graph_pos = commit_graph_position(c); while (graph_pos < g->num_commits_in_base) g = g->base_graph; @@ -203,9 +202,10 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r, filter = bloom_filter_slab_at(&bloom_filters, c); if (!filter->data) { - load_commit_graph_info(r, c); - if (commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH) - load_bloom_filter_from_graph(r->objects->commit_graph, filter, c); + uint32_t graph_pos; + if (repo_find_commit_pos_in_graph(r, c, &graph_pos)) + load_bloom_filter_from_graph(r->objects->commit_graph, + filter, graph_pos); } if (filter->data && filter->len) diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index b85d07f60a..db89542dfb 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -811,7 +811,7 @@ test_expect_success 'set up and verify repo with generation data overflow chunk' graph_git_behavior 'generation data overflow chunk repo' repo left right -test_expect_failure 'overflow during generation version upgrade' ' +test_expect_success 'overflow during generation version upgrade' ' git init overflow-v2-upgrade && ( cd overflow-v2-upgrade &&