2018-04-02 23:34:19 +03:00
|
|
|
#ifndef COMMIT_GRAPH_H
|
|
|
|
#define COMMIT_GRAPH_H
|
|
|
|
|
2018-04-10 15:56:02 +03:00
|
|
|
#include "git-compat-util.h"
|
2020-02-04 08:51:50 +03:00
|
|
|
#include "object-store.h"
|
2020-04-14 07:04:25 +03:00
|
|
|
#include "oidset.h"
|
2018-04-10 15:56:02 +03:00
|
|
|
|
2018-08-29 15:49:04 +03:00
|
|
|
#define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH"
|
2020-06-23 20:47:01 +03:00
|
|
|
#define GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE "GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE"
|
2020-04-06 19:59:55 +03:00
|
|
|
#define GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS "GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS"
|
2018-08-29 15:49:04 +03:00
|
|
|
|
2020-04-16 23:14:03 +03:00
|
|
|
/*
|
|
|
|
* This method is only used to enhance coverage of the commit-graph
|
|
|
|
* feature in the test suite with the GIT_TEST_COMMIT_GRAPH and
|
|
|
|
* GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS environment variables. Do not
|
|
|
|
* call this method outside of a builtin, and only if you know what
|
|
|
|
* you are doing!
|
|
|
|
*/
|
|
|
|
void git_test_write_commit_graph_or_die(void);
|
|
|
|
|
2018-07-12 01:42:39 +03:00
|
|
|
struct commit;
|
2020-04-06 19:59:49 +03:00
|
|
|
struct bloom_filter_settings;
|
2020-06-05 16:00:28 +03:00
|
|
|
struct repository;
|
|
|
|
struct raw_object_store;
|
|
|
|
struct string_list;
|
2018-07-12 01:42:39 +03:00
|
|
|
|
commit-graph.c: remove path normalization, comparison
As of the previous patch, all calls to 'commit-graph.c' functions which
perform path normalization (for e.g., 'get_commit_graph_filename()') are
of the form 'ctx->odb->path', which is always in normalized form.
Now that there are no callers passing non-normalized paths to these
functions, ensure that future callers are bound by the same restrictions
by making these functions take a 'struct object_directory *' instead of
a 'const char *'. To match, replace all calls with arguments of the form
'ctx->odb->path' with 'ctx->odb'. To recover the path, functions that
perform path manipulation simply use 'odb->path'.
Further, avoid string comparisons with arguments of the form
'odb->path', and instead prefer raw pointer comparisons, which
accomplish the same effect, but are far less brittle.
This has a pleasant side-effect of making these functions much more
robust to paths that cannot be normalized by 'normalize_path_copy()',
i.e., because they are outside of the current working directory.
For example, prior to this patch, Valgrind reports the following
uninitialized memory read [1]:
$ ( cd t && GIT_DIR=../.git valgrind git rev-parse HEAD^ )
because 'normalize_path_copy()' can't normalize '../.git' (since it's
relative to, but above, the current working directory) [2].
By using a 'struct object_directory *' directly,
'get_commit_graph_filename()' does not need to normalize, because all
paths are relative to the current working directory since they are
always read from the '->path' of an object directory.
[1]: https://lore.kernel.org/git/20191027042116.GA5801@sigill.intra.peff.net.
[2]: The bug here is that 'get_commit_graph_filename()' returns the
result of 'normalize_path_copy()' without checking the return
value.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-02-04 00:18:02 +03:00
|
|
|
/*
 * Takes the object directory itself rather than a path string: the path
 * is read from 'odb->path', and no path normalization is performed.
 *
 * NOTE(review): the result is a non-const 'char *', so presumably the
 * caller owns it and must free it -- confirm against commit-graph.c.
 */
char *get_commit_graph_filename(struct object_directory *odb);
|
2020-09-17 21:11:46 +03:00
|
|
|
char *get_commit_graph_chain_filename(struct object_directory *odb);
|
2019-03-25 15:08:30 +03:00
|
|
|
int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
|
2018-04-10 15:56:02 +03:00
|
|
|
|
2018-04-10 15:56:05 +03:00
|
|
|
/*
|
|
|
|
* Given a commit struct, try to fill the commit struct info, including:
|
|
|
|
* 1. tree object
|
|
|
|
* 2. date
|
|
|
|
* 3. parents.
|
|
|
|
*
|
|
|
|
* Returns 1 if and only if the commit was found in the packed graph.
|
|
|
|
*
|
|
|
|
* See parse_commit_buffer() for the fallback after this call.
|
|
|
|
*/
|
2018-07-12 01:42:42 +03:00
|
|
|
int parse_commit_in_graph(struct repository *r, struct commit *item);
|
2018-04-10 15:56:05 +03:00
|
|
|
|
2018-05-01 15:47:13 +03:00
|
|
|
/*
|
|
|
|
* It is possible that we loaded commit contents from the commit buffer,
|
|
|
|
* but we also want to ensure the commit-graph content is correctly
|
|
|
|
* checked and filled. Fill the graph_pos and generation members of
|
|
|
|
* the given commit.
|
|
|
|
*/
|
2018-07-12 01:42:42 +03:00
|
|
|
void load_commit_graph_info(struct repository *r, struct commit *item);
|
2018-05-01 15:47:13 +03:00
|
|
|
|
2018-07-12 01:42:42 +03:00
|
|
|
struct tree *get_commit_tree_in_graph(struct repository *r,
|
|
|
|
const struct commit *c);
|
2018-04-06 22:09:46 +03:00
|
|
|
|
2018-04-10 15:56:02 +03:00
|
|
|
struct commit_graph {
|
|
|
|
const unsigned char *data;
|
|
|
|
size_t data_len;
|
|
|
|
|
|
|
|
unsigned char hash_len;
|
|
|
|
unsigned char num_chunks;
|
|
|
|
uint32_t num_commits;
|
|
|
|
struct object_id oid;
|
2019-06-18 21:14:27 +03:00
|
|
|
char *filename;
|
2020-02-04 00:18:00 +03:00
|
|
|
struct object_directory *odb;
|
2018-04-10 15:56:02 +03:00
|
|
|
|
2019-06-18 21:14:24 +03:00
|
|
|
uint32_t num_commits_in_base;
|
commit-graph: use generation v2 only if entire chain does
Since there are released versions of Git that understand generation
numbers in the commit-graph's CDAT chunk but do not understand the GDAT
chunk, the following scenario is possible:
1. "New" Git writes a commit-graph with the GDAT chunk.
2. "Old" Git writes a split commit-graph on top without a GDAT chunk.
If each layer of split commit-graph is treated independently, as it was
the case before this commit, with Git inspecting only the current layer
for chunk_generation_data pointer, commits in the lower layer (one with
GDAT) would have corrected commit date as their generation number,
while commits in the upper layer would have topological levels as their
generation. Corrected commit dates usually have much larger values than
topological levels. This means that if we take two commits, one from the
upper layer, and one reachable from it in the lower layer, then the
expectation that the generation of a parent is smaller than the
generation of a child would be violated.
It is difficult to expose this issue in a test. Since we _start_ with
artificially low generation numbers, any commit walk that prioritizes
generation numbers will walk all of the commits with high generation
number before walking the commits with low generation number. In all the
cases I tried, the commit-graph layers themselves "protect" any
incorrect behavior since none of the commits in the lower layer can
reach the commits in the upper layer.
This issue would manifest itself as a performance problem in this case,
especially with something like "git log --graph" since the low
generation numbers would cause the in-degree queue to walk all of the
commits in the lower layer before allowing the topo-order queue to write
anything to output (depending on the size of the upper layer).
Therefore, when writing the new layer in split commit-graph, we write a
GDAT chunk only if the topmost layer has a GDAT chunk. This guarantees
that if a layer has GDAT chunk, all lower layers must have a GDAT chunk
as well.
Rewriting layers follows similar approach: if the topmost layer below
the set of layers being rewritten (in the split commit-graph chain)
exists, and it does not contain GDAT chunk, then the result of rewrite
does not have GDAT chunks either.
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Abhishek Kumar <abhishekkumar8222@gmail.com>
Reviewed-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-16 21:11:16 +03:00
|
|
|
unsigned int read_generation_data;
|
2019-06-18 21:14:24 +03:00
|
|
|
struct commit_graph *base_graph;
|
|
|
|
|
2018-04-10 15:56:02 +03:00
|
|
|
const uint32_t *chunk_oid_fanout;
|
|
|
|
const unsigned char *chunk_oid_lookup;
|
|
|
|
const unsigned char *chunk_commit_data;
|
commit-graph: implement generation data chunk
As discovered by Ævar, we cannot increment graph version to
distinguish between generation numbers v1 and v2 [1]. Thus, one of
the prerequisites before implementing generation number v2 was to
distinguish between graph versions in a backwards compatible manner.
We are going to introduce a new chunk called Generation DATa chunk (or
GDAT). GDAT will store corrected committer date offsets whereas CDAT
will still store topological level.
Old Git does not understand GDAT chunk and would ignore it, reading
topological levels from CDAT. New Git can parse GDAT and take advantage
of newer generation numbers, falling back to topological levels when
GDAT chunk is missing (as it would happen with a commit-graph written
by old Git).
We introduce a test environment variable 'GIT_TEST_COMMIT_GRAPH_NO_GDAT'
which forces commit-graph file to be written without generation data
chunk to emulate a commit-graph file written by old Git.
To minimize the space required to store corrected commit date, Git
stores corrected commit date offsets into the commit-graph file, instead
of corrected commit dates. This saves us 4 bytes per commit, decreasing
the GDAT chunk size by half, but it's possible for the offset to
overflow the 4-bytes allocated for storage. As such overflows are and
should be exceedingly rare, we use the following overflow management
scheme:
We introduce a new commit-graph chunk, Generation Data OVerflow ('GDOV')
to store corrected commit dates for commits with offsets greater than
GENERATION_NUMBER_V2_OFFSET_MAX.
If the offset is greater than GENERATION_NUMBER_V2_OFFSET_MAX, we set
the MSB of the offset and the other bits store the position of corrected
commit date in GDOV chunk, similar to how Extra Edge List is maintained.
We test the overflow-related code with the following repo history:
F - N - U
/ \
U - N - U N
\ /
N - F - N
Where the commits denoted by U have committer date of zero seconds
since Unix epoch, the commits denoted by N have committer date of
1112354055 (default committer date for the test suite) seconds since
Unix epoch and the commits denoted by F have committer date of
(2 ^ 31 - 2) seconds since Unix epoch.
The largest offset observed is 2 ^ 31, just large enough to overflow.
[1]: https://lore.kernel.org/git/87a7gdspo4.fsf@evledraar.gmail.com/
Signed-off-by: Abhishek Kumar <abhishekkumar8222@gmail.com>
Reviewed-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-16 21:11:15 +03:00
|
|
|
const unsigned char *chunk_generation_data;
|
|
|
|
const unsigned char *chunk_generation_data_overflow;
|
commit-graph: rename "large edges" to "extra edges"
The optional 'Large Edge List' chunk of the commit graph file stores
parent information for commits with more than two parents, and the
names of most of the macros, variables, struct fields, and functions
related to this chunk contain the term "large edges", e.g.
write_graph_chunk_large_edges(). However, it's not a really great
term, as the edges to the second and subsequent parents stored in this
chunk are not any larger than the edges to the first and second
parents stored in the "main" 'Commit Data' chunk. It's the number of
edges, IOW number of parents, that is larger compared to non-merge and
"regular" two-parent merge commits. And indeed, two functions in
'commit-graph.c' have a local variable called 'num_extra_edges' that
refer to the same thing, and this "extra edges" term is much better at
describing these edges.
So let's rename all these references to "large edges" in macro,
variable, function, etc. names to "extra edges". There is a
GRAPH_OCTOPUS_EDGES_NEEDED macro as well; for the sake of consistency
rename it to GRAPH_EXTRA_EDGES_NEEDED.
We can do so safely without causing any incompatibility issues,
because the term "large edges" doesn't come up in the file format
itself in any form (the chunk's magic is {'E', 'D', 'G', 'E'}, there
is no 'L' in there), but only in the specification text. The string
"large edges", however, does come up in the output of 'git
commit-graph read' and in tests looking at its input, but that command
is explicitly documented as debugging aid, so we can change its output
and the affected tests safely.
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-01-19 23:21:13 +03:00
|
|
|
const unsigned char *chunk_extra_edges;
|
2019-06-18 21:14:26 +03:00
|
|
|
const unsigned char *chunk_base_graphs;
|
2020-04-06 19:59:49 +03:00
|
|
|
const unsigned char *chunk_bloom_indexes;
|
|
|
|
const unsigned char *chunk_bloom_data;
|
|
|
|
|
2021-01-16 21:11:12 +03:00
|
|
|
struct topo_level_slab *topo_levels;
|
2020-04-06 19:59:49 +03:00
|
|
|
struct bloom_filter_settings *bloom_filter_settings;
|
2018-04-10 15:56:02 +03:00
|
|
|
};
|
|
|
|
|
2020-09-09 18:22:56 +03:00
|
|
|
struct commit_graph *load_commit_graph_one_fd_st(struct repository *r,
|
|
|
|
int fd, struct stat *st,
|
2020-02-04 00:18:04 +03:00
|
|
|
struct object_directory *odb);
|
2020-02-04 00:18:00 +03:00
|
|
|
struct commit_graph *read_commit_graph_one(struct repository *r,
|
|
|
|
struct object_directory *odb);
|
2020-09-09 18:22:56 +03:00
|
|
|
struct commit_graph *parse_commit_graph(struct repository *r,
|
|
|
|
void *graph_map, size_t graph_size);
|
2019-01-16 01:25:50 +03:00
|
|
|
|
commit-reach: use can_all_from_reach
The is_descendant_of method previously used in_merge_bases() to check if
the commit can reach any of the commits in the provided list. This had
two performance problems:
1. The performance is quadratic in worst-case.
2. A single in_merge_bases() call requires walking beyond the target
commit in order to find the full set of boundary commits that may be
merge-bases.
The can_all_from_reach method avoids this quadratic behavior and can
limit the search beyond the target commits using generation numbers. It
requires a small prototype adjustment to stop using commit-date as a
cutoff, as that optimization is no longer appropriate here.
Since in_merge_bases() uses paint_down_to_common(), is_descendant_of()
naturally found cutoffs to avoid walking the entire commit graph. Since
we want to always return the correct result, we cannot use the
min_commit_date cutoff in can_all_from_reach. We then rely on generation
numbers to provide the cutoff.
Since not all repos will have a commit-graph file, nor will we always
have generation numbers computed for a commit-graph file, create a new
method, generation_numbers_enabled(), that checks for a commit-graph
file and sees if the first commit in the file has a non-zero generation
number. In the case that we do not have generation numbers, use the old
logic for is_descendant_of().
Performance was measured on a copy of the Linux repository using the
'test-tool reach is_descendant_of' command using this input:
A:v4.9
X:v4.10
X:v4.11
X:v4.12
X:v4.13
X:v4.14
X:v4.15
X:v4.16
X:v4.17
X:v3.0
Note that this input is tailored to demonstrate the quadratic nature of
the previous method, as it will compute merge-bases for v4.9 versus all
of the later versions before checking against v4.1.
Before: 0.26 s
After: 0.21 s
Since we previously used the is_descendant_of method in the ref_newer
method, we also measured performance there using
'test-tool reach ref_newer' with this input:
A:v4.9
B:v3.19
Before: 0.10 s
After: 0.08 s
By adding a new commit with parent v3.19, we test the non-reachable case
of ref_newer:
Before: 0.09 s
After: 0.08 s
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-20 19:33:30 +03:00
|
|
|
/*
|
|
|
|
* Return 1 if and only if the repository has a commit-graph
|
|
|
|
* file and generation numbers are computed in that file.
|
|
|
|
*/
|
|
|
|
int generation_numbers_enabled(struct repository *r);
|
|
|
|
|
commit-reach: use corrected commit dates in paint_down_to_common()
091f4cf (commit: don't use generation numbers if not needed,
2018-08-30) changed paint_down_to_common() to use commit dates instead
of generation numbers v1 (topological levels) as the performance
regressed on certain topologies. With generation number v2 (corrected
commit dates) implemented, we no longer have to rely on commit dates and
can use generation numbers.
For example, the command `git merge-base v4.8 v4.9` on the Linux
repository walks 167468 commits, taking 0.135s for committer date and
167496 commits, taking 0.157s for corrected committer date respectively.
While using corrected commit dates, Git walks nearly the same number of
commits as commit date, the process is slower as for each comparison we
have to access a commit-slab (for corrected committer date) instead of
accessing struct member (for committer date).
This change incidentally broke the fragile t6404-recursive-merge test.
t6404-recursive-merge sets up a unique repository where all commits have
the same committer date without a well-defined merge-base.
While running tests with GIT_TEST_COMMIT_GRAPH unset, we use committer
date as a heuristic in paint_down_to_common(). 6404.1 'combined merge
conflicts' merges commits in the order:
- Merge C with B to form an intermediate commit.
- Merge the intermediate commit with A.
With GIT_TEST_COMMIT_GRAPH=1, we write a commit-graph and subsequently
use the corrected committer date, which changes the order in which
commits are merged:
- Merge A with B to form an intermediate commit.
- Merge the intermediate commit with C.
While resulting repositories are equivalent, 6404.4 'virtual trees were
processed' fails with GIT_TEST_COMMIT_GRAPH=1 as we are selecting
different merge-bases and thus have different object ids for the
intermediate commits.
As this has already caused problems (as noted in 859fdc0 (commit-graph:
define GIT_TEST_COMMIT_GRAPH, 2018-08-29)), we disable commit graph
within t6404-recursive-merge.
Signed-off-by: Abhishek Kumar <abhishekkumar8222@gmail.com>
Reviewed-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-01-16 21:11:17 +03:00
|
|
|
/*
|
|
|
|
* Return 1 if and only if the repository has a commit-graph
|
|
|
|
* file and generation data chunk has been written for the file.
|
|
|
|
*/
|
|
|
|
int corrected_commit_dates_enabled(struct repository *r);
|
|
|
|
|
commit-graph: introduce 'get_bloom_filter_settings()'
Many places in the code often need a pointer to the commit-graph's
'struct bloom_filter_settings', in which case they often take the value
from the top-most commit-graph.
In the non-split case, this works as expected. In the split case,
however, things get a little tricky. Not all layers in a chain of
incremental commit-graphs are required to themselves have Bloom data,
and so whether or not some part of the code uses Bloom filters depends
entirely on whether or not the top-most level of the commit-graph chain
has Bloom filters.
This has been the behavior since Bloom filters were introduced, and has
been codified into the tests since a759bfa9ee (t4216: add end to end
tests for git log with Bloom filters, 2020-04-06). In fact, t4216.130
requires that Bloom filters are not used in exactly the case described
earlier.
There is no reason that this needs to be the case, since it is perfectly
valid for commits in an earlier layer to have Bloom filters when commits
in a newer layer do not.
Since Bloom settings are guaranteed in practice to be the same for any
layer in a chain that has Bloom data, it is sufficient to traverse the
'->base_graph' pointer until either (1) a non-null 'struct
bloom_filter_settings *' is found, or (2) until we are at the root of
the commit-graph chain.
Introduce a 'get_bloom_filter_settings()' function that does just this,
and use it instead of purely dereferencing the top-most graph's
'->bloom_filter_settings' pointer.
While we're at it, add an additional test in t5324 to guard against code
in the commit-graph writing machinery that doesn't correctly handle a
NULL 'struct bloom_filter *'.
Co-authored-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-09 18:22:44 +03:00
|
|
|
/*
 * Look up the Bloom filter settings for the repository's commit-graph
 * chain. Instead of reading only the top-most layer, this follows the
 * '->base_graph' pointers until either a non-NULL
 * 'struct bloom_filter_settings *' is found or the root of the chain
 * is reached.
 *
 * NOTE(review): presumably returns NULL when no layer in the chain has
 * Bloom data -- confirm against commit-graph.c.
 */
struct bloom_filter_settings *get_bloom_filter_settings(struct repository *r);
|
|
|
|
|
2019-08-05 11:02:39 +03:00
|
|
|
enum commit_graph_write_flags {
|
|
|
|
COMMIT_GRAPH_WRITE_APPEND = (1 << 0),
|
|
|
|
COMMIT_GRAPH_WRITE_PROGRESS = (1 << 1),
|
2019-08-05 11:02:40 +03:00
|
|
|
COMMIT_GRAPH_WRITE_SPLIT = (1 << 2),
|
commit-graph: drop COMMIT_GRAPH_WRITE_CHECK_OIDS flag
Since 7c5c9b9c57 (commit-graph: error out on invalid commit oids in
'write --stdin-commits', 2019-08-05), the commit-graph builtin dies on
receiving non-commit OIDs as input to '--stdin-commits'.
This behavior can be cumbersome to work around in, say, the case of
piping 'git for-each-ref' to 'git commit-graph write --stdin-commits' if
the caller does not want to cull out non-commits themselves. In this
situation, it would be ideal if 'git commit-graph write' wrote the graph
containing the inputs that did pertain to commits, and silently ignored
the remainder of the input.
Some options have been proposed to the effect of '--[no-]check-oids'
which would allow callers to have the commit-graph builtin do just that.
After some discussion, it is difficult to imagine a caller who wouldn't
want to pass '--no-check-oids', suggesting that we should get rid of the
behavior of complaining about non-commit inputs altogether.
If callers do wish to retain this behavior, they can easily work around
this change by doing the following:
git for-each-ref --format='%(objectname) %(objecttype) %(*objecttype)' |
awk '
!/commit/ { print "not-a-commit:"$1 }
/commit/ { print $1 }
' |
git commit-graph write --stdin-commits
To make it so that valid OIDs that refer to non-existent objects are
indeed an error after loosening the error handling, perform an extra
lookup to make sure that object indeed exists before sending it to the
commit-graph internals.
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-05-14 00:59:55 +03:00
|
|
|
COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 3),
|
2020-07-30 23:20:31 +03:00
|
|
|
COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS = (1 << 4),
|
2019-08-05 11:02:39 +03:00
|
|
|
};
|
2019-06-12 16:29:38 +03:00
|
|
|
|
2020-04-14 07:04:08 +03:00
|
|
|
enum commit_graph_split_flags {
|
builtin/commit-graph.c: introduce split strategy 'no-merge'
In the previous commit, we laid the groundwork for supporting different
splitting strategies. In this commit, we introduce the first splitting
strategy: 'no-merge'.
Passing '--split=no-merge' is useful for callers which wish to write a
new incremental commit-graph, but do not want to spend effort condensing
the incremental chain [1]. Previously, this was possible by passing
'--size-multiple=0', but this is no longer the case following 63020f175f
(commit-graph: prefer default size_mult when given zero, 2020-01-02).
When '--split=no-merge' is given, the commit-graph machinery will never
condense an existing chain, and it will always write a new incremental.
[1]: This might occur when, for example, a server administrator running
some program after each push may want to ensure that each job runs
proportional in time to the size of the push, and does not "jump" when
the commit-graph machinery decides to trigger a merge.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-14 07:04:12 +03:00
|
|
|
COMMIT_GRAPH_SPLIT_UNSPECIFIED = 0,
|
builtin/commit-graph.c: introduce split strategy 'replace'
When using split commit-graphs, it is sometimes useful to completely
replace the commit-graph chain with a new base.
For example, consider a scenario in which a repository builds a new
commit-graph incremental for each push. Occasionally (say, after some
fixed number of pushes), they may wish to rebuild the commit-graph chain
with all reachable commits.
They can do so with
$ git commit-graph write --reachable
but this removes the chain entirely and replaces it with a single
commit-graph in 'objects/info/commit-graph'. Unfortunately, this means
that the next push will have to move this commit-graph into the first
layer of a new chain, and then write its new commits on top.
Avoid such copying entirely by allowing the caller to specify that they
wish to replace the entirety of their commit-graph chain, while also
specifying that the new commit-graph should become the basis of a fresh,
length-one chain.
This addresses the above situation by making it possible for the caller
to instead write:
$ git commit-graph write --reachable --split=replace
which writes a new length-one chain to 'objects/info/commit-graphs',
making the commit-graph incremental generated by the subsequent push
relatively cheap by avoiding the aforementioned copy.
In order to do this, remove an assumption in 'write_commit_graph_file'
that chains are always at least two incrementals long.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-14 07:04:17 +03:00
|
|
|
COMMIT_GRAPH_SPLIT_MERGE_PROHIBITED = 1,
|
|
|
|
COMMIT_GRAPH_SPLIT_REPLACE = 2
|
2020-04-14 07:04:08 +03:00
|
|
|
};
|
|
|
|
|
2020-09-18 05:59:49 +03:00
|
|
|
struct commit_graph_opts {
|
2019-06-18 21:14:32 +03:00
|
|
|
int size_multiple;
|
|
|
|
int max_commits;
|
|
|
|
timestamp_t expire_time;
|
2020-09-18 05:59:49 +03:00
|
|
|
enum commit_graph_split_flags split_flags;
|
2020-09-18 16:27:27 +03:00
|
|
|
int max_new_filters;
|
2019-06-18 21:14:32 +03:00
|
|
|
};
|
|
|
|
|
2019-06-12 16:29:37 +03:00
|
|
|
/*
|
|
|
|
* The write_commit_graph* methods return zero on success
|
|
|
|
* and a negative value on failure. Note that if the repository
|
|
|
|
* is not compatible with the commit-graph feature, then the
|
|
|
|
* methods will return 0 without writing a commit-graph.
|
|
|
|
*/
|
2020-02-04 08:51:50 +03:00
|
|
|
int write_commit_graph_reachable(struct object_directory *odb,
|
2019-08-05 11:02:39 +03:00
|
|
|
enum commit_graph_write_flags flags,
|
2020-09-18 05:59:49 +03:00
|
|
|
const struct commit_graph_opts *opts);
|
2020-02-04 08:51:50 +03:00
|
|
|
int write_commit_graph(struct object_directory *odb,
|
2019-06-12 16:29:37 +03:00
|
|
|
struct string_list *pack_indexes,
|
2020-04-14 07:04:25 +03:00
|
|
|
struct oidset *commits,
|
2019-08-05 11:02:39 +03:00
|
|
|
enum commit_graph_write_flags flags,
|
2020-09-18 05:59:49 +03:00
|
|
|
const struct commit_graph_opts *opts);
|
2018-04-02 23:34:19 +03:00
|
|
|
|
2019-06-18 21:14:32 +03:00
|
|
|
#define COMMIT_GRAPH_VERIFY_SHALLOW (1 << 0)
|
|
|
|
|
|
|
|
int verify_commit_graph(struct repository *r, struct commit_graph *g, int flags);
|
2018-06-27 16:24:32 +03:00
|
|
|
|
2019-05-17 21:41:47 +03:00
|
|
|
void close_commit_graph(struct raw_object_store *);
|
2018-07-12 01:42:40 +03:00
|
|
|
void free_commit_graph(struct commit_graph *);
|
|
|
|
|
upload-pack: disable commit graph more gently for shallow traversal
When the client has asked for certain shallow options like
"deepen-since", we do a custom rev-list walk that pretends to be
shallow. Before doing so, we have to disable the commit-graph, since it
is not compatible with the shallow view of the repository. That's
handled by 829a321569 (commit-graph: close_commit_graph before shallow
walk, 2018-08-20). That commit literally closes and frees our
repo->objects->commit_graph struct.
That creates an interesting problem for commits that have _already_ been
parsed using the commit graph. Their commit->object.parsed flag is set,
their commit->graph_pos is set, but their commit->maybe_tree may still
be NULL. When somebody later calls repo_get_commit_tree(), we see that
we haven't loaded the tree oid yet and try to get it from the commit
graph. But since it has been freed, we segfault!
So the root of the issue is a data dependency between the commit's
lazy-load of the tree oid and the fact that the commit graph can go
away mid-process. How can we resolve it?
There are a couple of general approaches:
1. The obvious answer is to avoid loading the tree from the graph when
we see that it's NULL. But then what do we return for the tree oid?
If we return NULL, our caller in do_traverse() will rightly
complain that we have no tree. We'd have to fallback to loading the
actual commit object and re-parsing it. That requires teaching
parse_commit_buffer() to understand re-parsing (i.e., not starting
from a clean slate and not leaking any allocated bits like parent
list pointers).
2. When we close the commit graph, walk through the set of in-memory
objects and clear any graph_pos pointers. But this means we also
have to "unparse" any such commits so that we know they still need
to open the commit object to fill in their trees. So it's no less
complicated than (1), and is more expensive (since we clear objects
we might not later need).
3. Stop freeing the commit-graph struct. Continue to let it be used
for lazy-loads of tree oids, but let upload-pack specify that it
shouldn't be used for further commit parsing.
4. Push the whole shallow rev-list out to its own sub-process, with
the commit-graph disabled from the start, giving it a clean memory
space to work from.
I've chosen (3) here. Options (1) and (2) would work, but are
non-trivial to implement. Option (4) is more expensive, and I'm not sure
how complicated it is (shelling out for the actual rev-list part is
easy, but we do then parse the resulting commits internally, and I'm not
clear which parts need to be handling shallow-ness).
The new test in t5500 triggers this segfault, but see the comments there
for how horribly intimate it has to be with how both upload-pack and
commit graphs work.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-09-12 17:44:45 +03:00
|
|
|
/*
|
|
|
|
* Disable further use of the commit graph in this process when parsing a
|
|
|
|
* "struct commit".
|
|
|
|
*/
|
|
|
|
void disable_commit_graph(struct repository *r);
|
|
|
|
|
2020-06-17 12:14:09 +03:00
|
|
|
struct commit_graph_data {
|
|
|
|
uint32_t graph_pos;
|
2021-01-16 21:11:13 +03:00
|
|
|
timestamp_t generation;
|
2020-06-17 12:14:09 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Commits should be parsed before accessing generation, graph positions.
|
|
|
|
*/
|
2021-01-16 21:11:13 +03:00
|
|
|
timestamp_t commit_graph_generation(const struct commit *);
|
2020-06-17 12:14:09 +03:00
|
|
|
uint32_t commit_graph_position(const struct commit *);
|
2018-04-02 23:34:19 +03:00
|
|
|
#endif
|