2023-03-21 09:25:56 +03:00
|
|
|
#include "git-compat-util.h"
|
2023-03-21 09:25:54 +03:00
|
|
|
#include "gettext.h"
|
2023-02-24 03:09:27 +03:00
|
|
|
#include "hex.h"
|
2023-05-16 09:34:06 +03:00
|
|
|
#include "object-store-ll.h"
|
2011-09-03 03:33:22 +04:00
|
|
|
#include "run-command.h"
|
|
|
|
#include "sigchain.h"
|
|
|
|
#include "connected.h"
|
2013-05-26 05:16:17 +04:00
|
|
|
#include "transport.h"
|
2017-08-19 01:20:26 +03:00
|
|
|
#include "packfile.h"
|
2019-06-25 16:40:31 +03:00
|
|
|
#include "promisor-remote.h"
|
2011-09-03 03:33:22 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we feed all the commits we want to verify to this command
|
|
|
|
*
|
2012-03-16 01:57:02 +04:00
|
|
|
* $ git rev-list --objects --stdin --not --all
|
2011-09-03 03:33:22 +04:00
|
|
|
*
|
|
|
|
* and if it does not error out, that means everything reachable from
|
2012-03-16 01:57:02 +04:00
|
|
|
* these commits locally exists and is connected to our existing refs.
|
|
|
|
* Note that this does _not_ validate the individual objects.
|
2011-09-03 03:33:22 +04:00
|
|
|
*
|
|
|
|
* Returns 0 if everything is connected, non-zero otherwise.
|
|
|
|
*/
|
2017-10-16 01:06:54 +03:00
|
|
|
int check_connected(oid_iterate_fn fn, void *cb_data,
|
check_everything_connected: use a struct with named options
The number of variants of check_everything_connected has
grown over the years, so that the "real" function takes
several possibly-zero, possibly-NULL arguments. We hid the
complexity behind some wrapper functions, but this doesn't
scale well when we want to add new options.
If we add more wrapper variants to handle the new options,
then we can get a combinatorial explosion when those options
might be used together (right now nobody wants to use both
"shallow" and "transport" together, so we get by with just a
few wrappers).
If instead we add new parameters to each function, each of
which can have a default value, then callers who want the
defaults end up with confusing invocations like:
check_everything_connected(fn, 0, data, -1, 0, NULL);
where it is unclear which parameter is which (and every
caller needs updated when we add new options).
Instead, let's add a struct to hold all of the optional
parameters. This is a little more verbose for the callers
(who have to declare the struct and fill it in), but it
makes their code much easier to follow, because every option
is named as it is set (and unused options do not have to be
mentioned at all).
Note that we could also stick the iteration function and its
callback data into the option struct, too. But since those
are required for each call, by avoiding doing so, we can let
very simple callers just pass "NULL" for the options and not
worry about the struct at all.
While we're touching each site, let's also rename the
function to check_connected(). The existing name was quite
long, and not all of the wrappers even used the full name.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-07-15 13:30:40 +03:00
|
|
|
struct check_connected_options *opt)
|
2011-09-03 03:33:22 +04:00
|
|
|
{
|
2014-08-19 23:09:35 +04:00
|
|
|
struct child_process rev_list = CHILD_PROCESS_INIT;
|
2020-08-12 19:52:49 +03:00
|
|
|
FILE *rev_list_in;
|
check_everything_connected: use a struct with named options
The number of variants of check_everything_connected has
grown over the years, so that the "real" function takes
several possibly-zero, possibly-NULL arguments. We hid the
complexity behind some wrapper functions, but this doesn't
scale well when we want to add new options.
If we add more wrapper variants to handle the new options,
then we can get a combinatorial explosion when those options
might be used together (right now nobody wants to use both
"shallow" and "transport" together, so we get by with just a
few wrappers).
If instead we add new parameters to each function, each of
which can have a default value, then callers who want the
defaults end up with confusing invocations like:
check_everything_connected(fn, 0, data, -1, 0, NULL);
where it is unclear which parameter is which (and every
caller needs updated when we add new options).
Instead, let's add a struct to hold all of the optional
parameters. This is a little more verbose for the callers
(who have to declare the struct and fill it in), but it
makes their code much easier to follow, because every option
is named as it is set (and unused options do not have to be
mentioned at all).
Note that we could also stick the iteration function and its
callback data into the option struct, too. But since those
are required for each call, by avoiding doing so, we can let
very simple callers just pass "NULL" for the options and not
worry about the struct at all.
While we're touching each site, let's also rename the
function to check_connected(). The existing name was quite
long, and not all of the wrappers even used the full name.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-07-15 13:30:40 +03:00
|
|
|
struct check_connected_options defaults = CHECK_CONNECTED_INIT;
|
2021-09-01 16:09:50 +03:00
|
|
|
const struct object_id *oid;
|
2016-07-15 13:28:32 +03:00
|
|
|
int err = 0;
|
2013-05-26 05:16:17 +04:00
|
|
|
struct packed_git *new_pack = NULL;
|
check_everything_connected: use a struct with named options
The number of variants of check_everything_connected has
grown over the years, so that the "real" function takes
several possibly-zero, possibly-NULL arguments. We hid the
complexity behind some wrapper functions, but this doesn't
scale well when we want to add new options.
If we add more wrapper variants to handle the new options,
then we can get a combinatorial explosion when those options
might be used together (right now nobody wants to use both
"shallow" and "transport" together, so we get by with just a
few wrappers).
If instead we add new parameters to each function, each of
which can have a default value, then callers who want the
defaults end up with confusing invocations like:
check_everything_connected(fn, 0, data, -1, 0, NULL);
where it is unclear which parameter is which (and every
caller needs updated when we add new options).
Instead, let's add a struct to hold all of the optional
parameters. This is a little more verbose for the callers
(who have to declare the struct and fill it in), but it
makes their code much easier to follow, because every option
is named as it is set (and unused options do not have to be
mentioned at all).
Note that we could also stick the iteration function and its
callback data into the option struct, too. But since those
are required for each call, by avoiding doing so, we can let
very simple callers just pass "NULL" for the options and not
worry about the struct at all.
While we're touching each site, let's also rename the
function to check_connected(). The existing name was quite
long, and not all of the wrappers even used the full name.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-07-15 13:30:40 +03:00
|
|
|
struct transport *transport;
|
2014-06-30 20:58:51 +04:00
|
|
|
size_t base_len;
|
2011-09-03 03:33:22 +04:00
|
|
|
|
check_everything_connected: use a struct with named options
The number of variants of check_everything_connected has
grown over the years, so that the "real" function takes
several possibly-zero, possibly-NULL arguments. We hid the
complexity behind some wrapper functions, but this doesn't
scale well when we want to add new options.
If we add more wrapper variants to handle the new options,
then we can get a combinatorial explosion when those options
might be used together (right now nobody wants to use both
"shallow" and "transport" together, so we get by with just a
few wrappers).
If instead we add new parameters to each function, each of
which can have a default value, then callers who want the
defaults end up with confusing invocations like:
check_everything_connected(fn, 0, data, -1, 0, NULL);
where it is unclear which parameter is which (and every
caller needs updated when we add new options).
Instead, let's add a struct to hold all of the optional
parameters. This is a little more verbose for the callers
(who have to declare the struct and fill it in), but it
makes their code much easier to follow, because every option
is named as it is set (and unused options do not have to be
mentioned at all).
Note that we could also stick the iteration function and its
callback data into the option struct, too. But since those
are required for each call, by avoiding doing so, we can let
very simple callers just pass "NULL" for the options and not
worry about the struct at all.
While we're touching each site, let's also rename the
function to check_connected(). The existing name was quite
long, and not all of the wrappers even used the full name.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-07-15 13:30:40 +03:00
|
|
|
if (!opt)
|
|
|
|
opt = &defaults;
|
|
|
|
transport = opt->transport;
|
|
|
|
|
2021-09-01 16:09:50 +03:00
|
|
|
oid = fn(cb_data);
|
|
|
|
if (!oid) {
|
2016-07-15 13:32:03 +03:00
|
|
|
if (opt->err_fd)
|
|
|
|
close(opt->err_fd);
|
2011-09-03 03:33:22 +04:00
|
|
|
return err;
|
2016-07-15 13:32:03 +03:00
|
|
|
}
|
2011-09-03 03:33:22 +04:00
|
|
|
|
2013-05-26 05:16:17 +04:00
|
|
|
if (transport && transport->smart_options &&
|
|
|
|
transport->smart_options->self_contained_and_connected &&
|
fetch-pack: support more than one pack lockfile
Whenever a fetch results in a packfile being downloaded, a .keep file is
generated, so that the packfile can be preserved (from, say, a running
"git repack") until refs are written referring to the contents of the
packfile.
In a subsequent patch, a successful fetch using protocol v2 may result
in more than one .keep file being generated. Therefore, teach
fetch_pack() and the transport mechanism to support multiple .keep
files.
Implementation notes:
- builtin/fetch-pack.c normally does not generate .keep files, and thus
is unaffected by this or future changes. However, it has an
undocumented "--lock-pack" feature, used by remote-curl.c when
implementing the "fetch" remote helper command. In keeping with the
remote helper protocol, only one "lock" line will ever be written;
the rest will result in warnings to stderr. However, in practice,
warnings will never be written because the remote-curl.c "fetch" is
only used for protocol v0/v1 (which will not generate multiple .keep
files). (Protocol v2 uses the "stateless-connect" command, not the
"fetch" command.)
- connected.c has an optimization in that connectivity checks on a ref
need not be done if the target object is in a pack known to be
self-contained and connected. If there are multiple packfiles, this
optimization can no longer be done.
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-06-10 23:57:22 +03:00
|
|
|
transport->pack_lockfiles.nr == 1 &&
|
|
|
|
strip_suffix(transport->pack_lockfiles.items[0].string,
|
|
|
|
".keep", &base_len)) {
|
2013-05-26 05:16:17 +04:00
|
|
|
struct strbuf idx_file = STRBUF_INIT;
|
fetch-pack: support more than one pack lockfile
Whenever a fetch results in a packfile being downloaded, a .keep file is
generated, so that the packfile can be preserved (from, say, a running
"git repack") until refs are written referring to the contents of the
packfile.
In a subsequent patch, a successful fetch using protocol v2 may result
in more than one .keep file being generated. Therefore, teach
fetch_pack() and the transport mechanism to support multiple .keep
files.
Implementation notes:
- builtin/fetch-pack.c normally does not generate .keep files, and thus
is unaffected by this or future changes. However, it has an
undocumented "--lock-pack" feature, used by remote-curl.c when
implementing the "fetch" remote helper command. In keeping with the
remote helper protocol, only one "lock" line will ever be written;
the rest will result in warnings to stderr. However, in practice,
warnings will never be written because the remote-curl.c "fetch" is
only used for protocol v0/v1 (which will not generate multiple .keep
files). (Protocol v2 uses the "stateless-connect" command, not the
"fetch" command.)
- connected.c has an optimization in that connectivity checks on a ref
need not be done if the target object is in a pack known to be
self-contained and connected. If there are multiple packfiles, this
optimization can no longer be done.
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-06-10 23:57:22 +03:00
|
|
|
strbuf_add(&idx_file, transport->pack_lockfiles.items[0].string,
|
|
|
|
base_len);
|
2013-05-26 05:16:17 +04:00
|
|
|
strbuf_addstr(&idx_file, ".idx");
|
|
|
|
new_pack = add_packed_git(idx_file.buf, idx_file.len, 1);
|
|
|
|
strbuf_release(&idx_file);
|
|
|
|
}
|
|
|
|
|
2023-03-28 16:58:53 +03:00
|
|
|
if (repo_has_promisor_remote(the_repository)) {
|
2019-04-20 00:00:13 +03:00
|
|
|
/*
|
|
|
|
* For partial clones, we don't want to have to do a regular
|
|
|
|
* connectivity check because we have to enumerate and exclude
|
|
|
|
* all promisor objects (slow), and then the connectivity check
|
|
|
|
* itself becomes a no-op because in a partial clone every
|
|
|
|
* object is a promisor object. Instead, just make sure we
|
2020-01-12 07:15:24 +03:00
|
|
|
* received, in a promisor packfile, the objects pointed to by
|
|
|
|
* each wanted ref.
|
connected.c: reprepare packs for corner cases
While updating the microsoft/git fork on top of v2.26.0-rc0 and
consuming that build into Scalar, I noticed a corner case bug around
partial clone.
The "scalar clone" command can create a Git repository with the
proper config for using partial clone with the "blob:none" filter.
Instead of calling "git clone", it runs "git init" then sets a few
more config values before running "git fetch".
In our builds on v2.26.0-rc0, we noticed that our "git fetch"
command was failing with
error: https://github.com/microsoft/scalar did not send all necessary objects
This does not happen if you copy the config file from a repository
created by "git clone --filter=blob:none <url>", but it does happen
when adding the config option "core.logAllRefUpdates = true".
By debugging, I was able to see that the loop inside
check_connected() that checks if all refs are contained in
promisor packs actually did not have any packfiles in the packed_git
list.
I'm not sure what corner-case issues caused this config option to
prevent the reprepare_packed_git() from being called at the proper
spot during the fetch operation. This approach requires a situation
where we use the remote helper process, which makes it difficult to
test.
It is possible to place a reprepare_packed_git() call in the fetch code
closer to where we receive a pack, but that leaves an opening for a
later change to re-introduce this problem. Further, a concurrent repack
operation could replace the pack-file list we already loaded into
memory, causing this issue in an even harder to reproduce scenario.
It is really the responsibility of anyone looping through the list of
pack-files for a certain object to fall back to reprepare_packed_git()
on a fail-to-find. The loop in check_connected() does not have this
fallback, leading to this bug.
We _could_ try looping through the packs and only reprepare the packs
after a miss, but that change is more involved and has little value.
Since this case is isolated to the case when
opt->check_refs_are_promisor_objects_only is true, we are confident that
we are verifying the refs after downloading new data. This implies that
calling reprepare_packed_git() in advance is not a huge cost compared to
the rest of the operations already made.
Helped-by: Jeff King <peff@peff.net>
Helped-by: Junio Hamano <gitster@pobox.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-03-14 00:11:55 +03:00
|
|
|
*
|
|
|
|
* Before checking for promisor packs, be sure we have the
|
|
|
|
* latest pack-files loaded into memory.
|
2019-04-20 00:00:13 +03:00
|
|
|
*/
|
connected.c: reprepare packs for corner cases
While updating the microsoft/git fork on top of v2.26.0-rc0 and
consuming that build into Scalar, I noticed a corner case bug around
partial clone.
The "scalar clone" command can create a Git repository with the
proper config for using partial clone with the "blob:none" filter.
Instead of calling "git clone", it runs "git init" then sets a few
more config values before running "git fetch".
In our builds on v2.26.0-rc0, we noticed that our "git fetch"
command was failing with
error: https://github.com/microsoft/scalar did not send all necessary objects
This does not happen if you copy the config file from a repository
created by "git clone --filter=blob:none <url>", but it does happen
when adding the config option "core.logAllRefUpdates = true".
By debugging, I was able to see that the loop inside
check_connected() that checks if all refs are contained in
promisor packs actually did not have any packfiles in the packed_git
list.
I'm not sure what corner-case issues caused this config option to
prevent the reprepare_packed_git() from being called at the proper
spot during the fetch operation. This approach requires a situation
where we use the remote helper process, which makes it difficult to
test.
It is possible to place a reprepare_packed_git() call in the fetch code
closer to where we receive a pack, but that leaves an opening for a
later change to re-introduce this problem. Further, a concurrent repack
operation could replace the pack-file list we already loaded into
memory, causing this issue in an even harder to reproduce scenario.
It is really the responsibility of anyone looping through the list of
pack-files for a certain object to fall back to reprepare_packed_git()
on a fail-to-find. The loop in check_connected() does not have this
fallback, leading to this bug.
We _could_ try looping through the packs and only reprepare the packs
after a miss, but that change is more involved and has little value.
Since this case is isolated to the case when
opt->check_refs_are_promisor_objects_only is true, we are confident that
we are verifying the refs after downloading new data. This implies that
calling reprepare_packed_git() in advance is not a huge cost compared to
the rest of the operations already made.
Helped-by: Jeff King <peff@peff.net>
Helped-by: Junio Hamano <gitster@pobox.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-03-14 00:11:55 +03:00
|
|
|
reprepare_packed_git(the_repository);
|
2019-04-20 00:00:13 +03:00
|
|
|
do {
|
2020-01-12 07:15:24 +03:00
|
|
|
struct packed_git *p;
|
|
|
|
|
|
|
|
for (p = get_all_packs(the_repository); p; p = p->next) {
|
|
|
|
if (!p->pack_promisor)
|
|
|
|
continue;
|
2021-09-01 16:09:50 +03:00
|
|
|
if (find_pack_entry_one(oid->hash, p))
|
2020-01-12 07:15:24 +03:00
|
|
|
goto promisor_pack_found;
|
|
|
|
}
|
connected: always use partial clone optimization
With 50033772d5 ("connected: verify promisor-ness of partial clone",
2020-01-30), the fast path (checking promisor packs) in
check_connected() now passes a subset of the slow path (rev-list) - if
all objects to be checked are found in promisor packs, both the fast
path and the slow path will pass; otherwise, the fast path will
definitely not pass. This means that we can always attempt the fast path
whenever we need to do the slow path.
The fast path is currently guarded by a flag; therefore, remove that
flag. Also, make the fast path fallback to the slow path - if the fast
path fails, the failing OID and all remaining OIDs will be passed to
rev-list.
The main user-visible benefit is the performance of fetch from a partial
clone - specifically, the speedup of the connectivity check done before
the fetch. In particular, a no-op fetch into a partial clone on my
computer was sped up from 7 seconds to 0.01 seconds. This is a
complement to the work in 2df1aa239c ("fetch: forgo full
connectivity check if --filter", 2020-01-30), which is the child of the
aforementioned 50033772d5. In that commit, the connectivity check
*after* the fetch was sped up.
The addition of the fast path might cause performance reductions in
these cases:
- If a partial clone or a fetch into a partial clone fails, Git will
fruitlessly run rev-list (it is expected that everything fetched
would go into promisor packs, so if that didn't happen, it is most
likely that rev-list will fail too).
- Any connectivity checks done by receive-pack, in the (in my opinion,
unlikely) event that a partial clone serves receive-pack.
I think that these cases are rare enough, and the performance reduction
in this case minor enough (additional object DB access), that the
benefit of avoiding a flag outweighs these.
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Reviewed-by: Josh Steadmon <steadmon@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-03-21 01:00:45 +03:00
|
|
|
/*
|
|
|
|
* Fall back to rev-list with oid and the rest of the
|
|
|
|
* object IDs provided by fn.
|
|
|
|
*/
|
|
|
|
goto no_promisor_pack_found;
|
2020-01-12 07:15:24 +03:00
|
|
|
promisor_pack_found:
|
|
|
|
;
|
2021-09-01 16:09:50 +03:00
|
|
|
} while ((oid = fn(cb_data)) != NULL);
|
2022-11-08 21:17:47 +03:00
|
|
|
free(new_pack);
|
2019-04-20 00:00:13 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
connected: always use partial clone optimization
With 50033772d5 ("connected: verify promisor-ness of partial clone",
2020-01-30), the fast path (checking promisor packs) in
check_connected() now passes a subset of the slow path (rev-list) - if
all objects to be checked are found in promisor packs, both the fast
path and the slow path will pass; otherwise, the fast path will
definitely not pass. This means that we can always attempt the fast path
whenever we need to do the slow path.
The fast path is currently guarded by a flag; therefore, remove that
flag. Also, make the fast path fallback to the slow path - if the fast
path fails, the failing OID and all remaining OIDs will be passed to
rev-list.
The main user-visible benefit is the performance of fetch from a partial
clone - specifically, the speedup of the connectivity check done before
the fetch. In particular, a no-op fetch into a partial clone on my
computer was sped up from 7 seconds to 0.01 seconds. This is a
complement to the work in 2df1aa239c ("fetch: forgo full
connectivity check if --filter", 2020-01-30), which is the child of the
aforementioned 50033772d5. In that commit, the connectivity check
*after* the fetch was sped up.
The addition of the fast path might cause performance reductions in
these cases:
- If a partial clone or a fetch into a partial clone fails, Git will
fruitlessly run rev-list (it is expected that everything fetched
would go into promisor packs, so if that didn't happen, it is most
likely that rev-list will fail too).
- Any connectivity checks done by receive-pack, in the (in my opinion,
unlikely) event that a partial clone serves receive-pack.
I think that these cases are rare enough, and the performance reduction
in this case minor enough (additional object DB access), that the
benefit of avoiding a flag outweighs these.
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Reviewed-by: Josh Steadmon <steadmon@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-03-21 01:00:45 +03:00
|
|
|
no_promisor_pack_found:
|
check_everything_connected: use a struct with named options
The number of variants of check_everything_connected has
grown over the years, so that the "real" function takes
several possibly-zero, possibly-NULL arguments. We hid the
complexity behind some wrapper functions, but this doesn't
scale well when we want to add new options.
If we add more wrapper variants to handle the new options,
then we can get a combinatorial explosion when those options
might be used together (right now nobody wants to use both
"shallow" and "transport" together, so we get by with just a
few wrappers).
If instead we add new parameters to each function, each of
which can have a default value, then callers who want the
defaults end up with confusing invocations like:
check_everything_connected(fn, 0, data, -1, 0, NULL);
where it is unclear which parameter is which (and every
caller needs updated when we add new options).
Instead, let's add a struct to hold all of the optional
parameters. This is a little more verbose for the callers
(who have to declare the struct and fill it in), but it
makes their code much easier to follow, because every option
is named as it is set (and unused options do not have to be
mentioned at all).
Note that we could also stick the iteration function and its
callback data into the option struct, too. But since those
are required for each call, by avoiding doing so, we can let
very simple callers just pass "NULL" for the options and not
worry about the struct at all.
While we're touching each site, let's also rename the
function to check_connected(). The existing name was quite
long, and not all of the wrappers even used the full name.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-07-15 13:30:40 +03:00
|
|
|
if (opt->shallow_file) {
|
2020-07-28 23:24:53 +03:00
|
|
|
strvec_push(&rev_list.args, "--shallow-file");
|
|
|
|
strvec_push(&rev_list.args, opt->shallow_file);
|
2013-12-05 17:02:46 +04:00
|
|
|
}
|
2020-07-28 23:24:53 +03:00
|
|
|
strvec_push(&rev_list.args,"rev-list");
|
|
|
|
strvec_push(&rev_list.args, "--objects");
|
|
|
|
strvec_push(&rev_list.args, "--stdin");
|
2023-03-28 16:58:53 +03:00
|
|
|
if (repo_has_promisor_remote(the_repository))
|
2020-07-28 23:24:53 +03:00
|
|
|
strvec_push(&rev_list.args, "--exclude-promisor-objects");
|
fetch-pack: write shallow, then check connectivity
When fetching, connectivity is checked after the shallow file is
updated. There are 2 issues with this: (1) the connectivity check is
only performed up to ancestors of existing refs (which is not thorough
enough if we were deepening an existing ref in the first place), and (2)
there is no rollback of the shallow file if the connectivity check
fails.
To solve (1), update the connectivity check to check the ancestry chain
completely in the case of a deepening fetch by refraining from passing
"--not --all" when invoking rev-list in connected.c.
To solve (2), have fetch_pack() perform its own connectivity check
before updating the shallow file. To support existing use cases in which
"git fetch-pack" is used to download objects without much regard as to
the connectivity of the resulting objects with respect to the existing
repository, the connectivity check is only done if necessary (that is,
the fetch is not a clone, and the fetch involves shallow/deepen
functionality). "git fetch" still performs its own connectivity check,
preserving correctness but sometimes performing redundant work. This
redundancy is mitigated by the fact that fetch_pack() reports if it has
performed a connectivity check itself, and if the transport supports
connect or stateless-connect, it will bubble up that report so that "git
fetch" knows not to perform the connectivity check in such a case.
This was noticed when a user tried to deepen an existing repository by
fetching with --no-shallow from a server that did not send all necessary
objects - the connectivity check as run by "git fetch" succeeded, but a
subsequent "git fsck" failed.
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-03 01:08:43 +03:00
|
|
|
if (!opt->is_deepening_fetch) {
|
2020-07-28 23:24:53 +03:00
|
|
|
strvec_push(&rev_list.args, "--not");
|
receive-pack: only use visible refs for connectivity check
When serving a push, git-receive-pack(1) needs to verify that the
packfile sent by the client contains all objects that are required by
the updated references. This connectivity check works by marking all
preexisting references as uninteresting and using the new reference tips
as starting point for a graph walk.
Marking all preexisting references as uninteresting can be a problem
when it comes to performance. Git forges tend to do internal bookkeeping
to keep alive sets of objects for internal use or make them easy to find
via certain references. These references are typically hidden away from
the user so that they are neither advertised nor writeable. At GitLab,
we have one particular repository that contains a total of 7 million
references, of which 6.8 million are indeed internal references. With
the current connectivity check we are forced to load all these
references in order to mark them as uninteresting, and this alone takes
around 15 seconds to compute.
We can optimize this by only taking into account the set of visible refs
when marking objects as uninteresting. This means that we may now walk
more objects until we hit any object that is marked as uninteresting.
But it is rather unlikely that clients send objects that make large
parts of objects reachable that have previously only ever been hidden,
whereas the common case is to push incremental changes that build on top
of the visible object graph.
This provides a huge boost to performance in the mentioned repository,
where the vast majority of its refs are hidden. Pushing a new commit into
this repo with `transfer.hideRefs` set up to hide 6.8 million of 7 million refs
as it is configured in Gitaly leads to a 4.5-fold speedup:
Benchmark 1: main
Time (mean ± σ): 30.977 s ± 0.157 s [User: 30.226 s, System: 1.083 s]
Range (min … max): 30.796 s … 31.071 s 3 runs
Benchmark 2: pks-connectivity-check-hide-refs
Time (mean ± σ): 6.799 s ± 0.063 s [User: 6.803 s, System: 0.354 s]
Range (min … max): 6.729 s … 6.850 s 3 runs
Summary
'pks-connectivity-check-hide-refs' ran
4.56 ± 0.05 times faster than 'main'
As we mostly go through the same codepaths even in the case where there
are no hidden refs at all compared to the code before, there is no change
in performance when no refs are hidden:
Benchmark 1: main
Time (mean ± σ): 48.188 s ± 0.432 s [User: 49.326 s, System: 5.009 s]
Range (min … max): 47.706 s … 48.539 s 3 runs
Benchmark 2: pks-connectivity-check-hide-refs
Time (mean ± σ): 48.027 s ± 0.500 s [User: 48.934 s, System: 5.025 s]
Range (min … max): 47.504 s … 48.500 s 3 runs
Summary
'pks-connectivity-check-hide-refs' ran
1.00 ± 0.01 times faster than 'main'
Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
2022-11-17 08:47:04 +03:00
|
|
|
if (opt->exclude_hidden_refs_section)
|
|
|
|
strvec_pushf(&rev_list.args, "--exclude-hidden=%s",
|
|
|
|
opt->exclude_hidden_refs_section);
|
2020-07-28 23:24:53 +03:00
|
|
|
strvec_push(&rev_list.args, "--all");
|
fetch-pack: write shallow, then check connectivity
When fetching, connectivity is checked after the shallow file is
updated. There are 2 issues with this: (1) the connectivity check is
only performed up to ancestors of existing refs (which is not thorough
enough if we were deepening an existing ref in the first place), and (2)
there is no rollback of the shallow file if the connectivity check
fails.
To solve (1), update the connectivity check to check the ancestry chain
completely in the case of a deepening fetch by refraining from passing
"--not --all" when invoking rev-list in connected.c.
To solve (2), have fetch_pack() perform its own connectivity check
before updating the shallow file. To support existing use cases in which
"git fetch-pack" is used to download objects without much regard as to
the connectivity of the resulting objects with respect to the existing
repository, the connectivity check is only done if necessary (that is,
the fetch is not a clone, and the fetch involves shallow/deepen
functionality). "git fetch" still performs its own connectivity check,
preserving correctness but sometimes performing redundant work. This
redundancy is mitigated by the fact that fetch_pack() reports if it has
performed a connectivity check itself, and if the transport supports
connect or stateless-connect, it will bubble up that report so that "git
fetch" knows not to perform the connectivity check in such a case.
This was noticed when a user tried to deepen an existing repository by
fetching with --no-shallow from a server that did not send all necessary
objects - the connectivity check as run by "git fetch" succeeded, but a
subsequent "git fsck" failed.
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-07-03 01:08:43 +03:00
|
|
|
}
|
2020-07-28 23:24:53 +03:00
|
|
|
strvec_push(&rev_list.args, "--quiet");
|
|
|
|
strvec_push(&rev_list.args, "--alternate-refs");
|
2016-07-15 13:32:28 +03:00
|
|
|
if (opt->progress)
|
2020-07-28 23:24:53 +03:00
|
|
|
strvec_pushf(&rev_list.args, "--progress=%s",
|
strvec: fix indentation in renamed calls
Code which split an argv_array call across multiple lines, like:
argv_array_pushl(&args, "one argument",
"another argument", "and more",
NULL);
was recently mechanically renamed to use strvec, which results in
mis-matched indentation like:
strvec_pushl(&args, "one argument",
"another argument", "and more",
NULL);
Let's fix these up to align the arguments with the opening paren. I did
this manually by sifting through the results of:
git jump grep 'strvec_.*,$'
and liberally applying my editor's auto-format. Most of the changes are
of the form shown above, though I also normalized a few that had
originally used a single-tab indentation (rather than our usual style of
aligning with the open paren). I also rewrapped a couple of obvious
cases (e.g., where previously too-long lines became short enough to fit
on one), but I wasn't aggressive about it. In cases broken to three or
more lines, the grouping of arguments is sometimes meaningful, and it
wasn't worth my time or reviewer time to ponder each case individually.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-07-28 23:26:31 +03:00
|
|
|
_("Checking connectivity"));
|
2011-09-03 03:33:22 +04:00
|
|
|
|
|
|
|
rev_list.git_cmd = 1;
|
2021-11-26 01:52:24 +03:00
|
|
|
if (opt->env)
|
2022-06-02 12:09:50 +03:00
|
|
|
strvec_pushv(&rev_list.env, opt->env);
|
2011-09-03 03:33:22 +04:00
|
|
|
rev_list.in = -1;
|
|
|
|
rev_list.no_stdout = 1;
|
2016-07-15 13:32:03 +03:00
|
|
|
if (opt->err_fd)
|
|
|
|
rev_list.err = opt->err_fd;
|
|
|
|
else
|
|
|
|
rev_list.no_stderr = opt->quiet;
|
|
|
|
|
2022-11-08 21:17:47 +03:00
|
|
|
if (start_command(&rev_list)) {
|
|
|
|
free(new_pack);
|
2011-09-03 03:33:22 +04:00
|
|
|
return error(_("Could not run 'git rev-list'"));
|
2022-11-08 21:17:47 +03:00
|
|
|
}
|
2011-09-03 03:33:22 +04:00
|
|
|
|
|
|
|
sigchain_push(SIGPIPE, SIG_IGN);
|
|
|
|
|
2020-08-12 19:52:49 +03:00
|
|
|
rev_list_in = xfdopen(rev_list.in, "w");
|
|
|
|
|
2011-09-03 03:33:22 +04:00
|
|
|
do {
|
2013-05-26 05:16:17 +04:00
|
|
|
/*
|
|
|
|
* If index-pack already checked that:
|
|
|
|
* - there are no dangling pointers in the new pack
|
|
|
|
* - the pack is self contained
|
|
|
|
* Then if the updated ref is in the new pack, then we
|
|
|
|
* are sure the ref is good and not sending it to
|
|
|
|
* rev-list for verification.
|
|
|
|
*/
|
2021-09-01 16:09:50 +03:00
|
|
|
if (new_pack && find_pack_entry_one(oid->hash, new_pack))
|
2013-05-26 05:16:17 +04:00
|
|
|
continue;
|
|
|
|
|
2021-09-01 16:09:50 +03:00
|
|
|
if (fprintf(rev_list_in, "%s\n", oid_to_hex(oid)) < 0)
|
2011-09-03 03:33:22 +04:00
|
|
|
break;
|
2021-09-01 16:09:50 +03:00
|
|
|
} while ((oid = fn(cb_data)) != NULL);
|
2011-09-03 03:33:22 +04:00
|
|
|
|
2020-08-12 19:52:49 +03:00
|
|
|
if (ferror(rev_list_in) || fflush(rev_list_in)) {
|
|
|
|
if (errno != EPIPE && errno != EINVAL)
|
|
|
|
error_errno(_("failed write to rev-list"));
|
|
|
|
err = -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fclose(rev_list_in))
|
2016-05-08 12:47:39 +03:00
|
|
|
err = error_errno(_("failed to close rev-list's stdin"));
|
2011-09-03 03:33:22 +04:00
|
|
|
|
|
|
|
sigchain_pop(SIGPIPE);
|
2022-11-08 21:17:47 +03:00
|
|
|
free(new_pack);
|
2011-09-03 03:33:22 +04:00
|
|
|
return finish_command(&rev_list) || err;
|
|
|
|
}
|