From b5624a44744d6e5f9c749e4df02932c724e2097e Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 9 Aug 2022 13:11:39 +0000 Subject: [PATCH 1/6] remote-curl: add 'get' capability A future change will want a way to download a file over HTTP(S) using the simplest of download mechanisms. We do not want to assume that the server on the other side understands anything about the Git protocol but could be a simple static web server. Create the new 'get' capability for the remote helpers which advertises that the 'get' command is avalable. A caller can send a line containing 'get ' to download the file at into the file at . Reviewed-by: Josh Steadmon Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/gitremote-helpers.txt | 9 +++++++ remote-curl.c | 28 ++++++++++++++++++++++ t/t5557-http-get.sh | 37 +++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100755 t/t5557-http-get.sh diff --git a/Documentation/gitremote-helpers.txt b/Documentation/gitremote-helpers.txt index 6f1e269ae4..ed8da428c9 100644 --- a/Documentation/gitremote-helpers.txt +++ b/Documentation/gitremote-helpers.txt @@ -168,6 +168,9 @@ Supported commands: 'list', 'import'. Can guarantee that when a clone is requested, the received pack is self contained and is connected. +'get':: + Can use the 'get' command to download a file from a given URI. + If a helper advertises 'connect', Git will use it if possible and fall back to another capability if the helper requests so when connecting (see the 'connect' command under COMMANDS). @@ -418,6 +421,12 @@ Supported if the helper has the "connect" capability. + Supported if the helper has the "stateless-connect" capability. +'get' :: + Downloads the file from the given `` to the given ``. If + `.temp` exists, then Git assumes that the `.temp` file is a + partial download from a previous attempt and will resume the + download from that position. + If a fatal error occurs, the program writes the error message to stderr and exits. The caller should expect that a suitable error message has been printed if the child closes the connection without diff --git a/remote-curl.c b/remote-curl.c index b8758757ec..72dfb8fb86 100644 --- a/remote-curl.c +++ b/remote-curl.c @@ -1286,6 +1286,29 @@ static void parse_fetch(struct strbuf *buf) strbuf_reset(buf); } +static void parse_get(const char *arg) +{ + struct strbuf url = STRBUF_INIT; + struct strbuf path = STRBUF_INIT; + const char *space; + + space = strchr(arg, ' '); + + if (!space) + die(_("protocol error: expected ' ', missing space")); + + strbuf_add(&url, arg, space - arg); + strbuf_addstr(&path, space + 1); + + if (http_get_file(url.buf, path.buf, NULL)) + die(_("failed to download file at URL '%s'"), url.buf); + + strbuf_release(&url); + strbuf_release(&path); + printf("\n"); + fflush(stdout); +} + static int push_dav(int nr_spec, const char **specs) { struct child_process child = CHILD_PROCESS_INIT; @@ -1564,9 +1587,14 @@ int cmd_main(int argc, const char **argv) printf("unsupported\n"); fflush(stdout); + } else if (skip_prefix(buf.buf, "get ", &arg)) { + parse_get(arg); + fflush(stdout); + } else if (!strcmp(buf.buf, "capabilities")) { printf("stateless-connect\n"); printf("fetch\n"); + printf("get\n"); printf("option\n"); printf("push\n"); printf("check-connectivity\n"); diff --git a/t/t5557-http-get.sh b/t/t5557-http-get.sh new file mode 100755 index 0000000000..09fb0bd9a8 --- /dev/null +++ b/t/t5557-http-get.sh @@ -0,0 +1,37 @@ +#!/bin/sh + +test_description='test downloading a file by URL' + +. ./test-lib.sh + +. "$TEST_DIRECTORY"/lib-httpd.sh +start_httpd + +test_expect_success 'get by URL: 404' ' + test_when_finished "rm -f file.temp" && + url="$HTTPD_URL/none.txt" && + cat >input <<-EOF && + capabilities + get $url file1 + EOF + + test_must_fail git remote-http $url err && + test_path_is_missing file1 && + grep "failed to download file at URL" err +' + +test_expect_success 'get by URL: 200' ' + echo data >"$HTTPD_DOCUMENT_ROOT_PATH/exists.txt" && + + url="$HTTPD_URL/exists.txt" && + cat >input <<-EOF && + capabilities + get $url file2 + + EOF + + git remote-http $url Date: Tue, 9 Aug 2022 13:11:40 +0000 Subject: [PATCH 2/6] bundle-uri: create basic file-copy logic Before implementing a way to fetch bundles into a repository, create the basic logic. Assume that the URI is actually a file path. Future logic will make this more careful to other protocols. For now, we also only succeed if the content at the URI is a bundle file, not a bundle list. Bundle lists will be implemented in a future change. Note that the discovery of a temporary filename is slightly racy because the odb_mkstemp() relies on the temporary file not existing. With the current implementation being limited to file copies, we could replace the copy_file() with copy_fd(). The tricky part comes in future changes that send the filename to 'git remote-https' and its 'get' capability. At that point, we need the file descriptor closed _and_ the file unlinked. If we were to keep the file descriptor open for the sake of normal file copies, then we would pollute the rest of the code for little benefit. This is especially the case because we expect that most bundle URI use will be based on HTTPS instead of file copies. Reviewed-by: Josh Steadmon Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Makefile | 1 + bundle-uri.c | 104 ++++++++++++++++++++++++++++++++++++++++++++ bundle-uri.h | 14 ++++++ t/t5557-http-get.sh | 2 + 4 files changed, 121 insertions(+) create mode 100644 bundle-uri.c create mode 100644 bundle-uri.h diff --git a/Makefile b/Makefile index 1624471bad..7d5f48069e 100644 --- a/Makefile +++ b/Makefile @@ -906,6 +906,7 @@ LIB_OBJS += blob.o LIB_OBJS += bloom.o LIB_OBJS += branch.o LIB_OBJS += bulk-checkin.o +LIB_OBJS += bundle-uri.o LIB_OBJS += bundle.o LIB_OBJS += cache-tree.o LIB_OBJS += cbtree.o diff --git a/bundle-uri.c b/bundle-uri.c new file mode 100644 index 0000000000..b35babc36a --- /dev/null +++ b/bundle-uri.c @@ -0,0 +1,104 @@ +#include "cache.h" +#include "bundle-uri.h" +#include "bundle.h" +#include "object-store.h" +#include "refs.h" +#include "run-command.h" + +static int find_temp_filename(struct strbuf *name) +{ + int fd; + /* + * Find a temporary filename that is available. This is briefly + * racy, but unlikely to collide. + */ + fd = odb_mkstemp(name, "bundles/tmp_uri_XXXXXX"); + if (fd < 0) { + warning(_("failed to create temporary file")); + return -1; + } + + close(fd); + unlink(name->buf); + return 0; +} + +static int copy_uri_to_file(const char *file, const char *uri) +{ + /* File-based URIs only for now. */ + return copy_file(file, uri, 0); +} + +static int unbundle_from_file(struct repository *r, const char *file) +{ + int result = 0; + int bundle_fd; + struct bundle_header header = BUNDLE_HEADER_INIT; + struct string_list_item *refname; + struct strbuf bundle_ref = STRBUF_INIT; + size_t bundle_prefix_len; + + if ((bundle_fd = read_bundle_header(file, &header)) < 0) + return 1; + + if ((result = unbundle(r, &header, bundle_fd, NULL))) + return 1; + + /* + * Convert all refs/heads/ from the bundle into refs/bundles/ + * in the local repository. + */ + strbuf_addstr(&bundle_ref, "refs/bundles/"); + bundle_prefix_len = bundle_ref.len; + + for_each_string_list_item(refname, &header.references) { + struct object_id *oid = refname->util; + struct object_id old_oid; + const char *branch_name; + int has_old; + + if (!skip_prefix(refname->string, "refs/heads/", &branch_name)) + continue; + + strbuf_setlen(&bundle_ref, bundle_prefix_len); + strbuf_addstr(&bundle_ref, branch_name); + + has_old = !read_ref(bundle_ref.buf, &old_oid); + update_ref("fetched bundle", bundle_ref.buf, oid, + has_old ? &old_oid : NULL, + REF_SKIP_OID_VERIFICATION, + UPDATE_REFS_MSG_ON_ERR); + } + + bundle_header_release(&header); + return result; +} + +int fetch_bundle_uri(struct repository *r, const char *uri) +{ + int result = 0; + struct strbuf filename = STRBUF_INIT; + + if ((result = find_temp_filename(&filename))) + goto cleanup; + + if ((result = copy_uri_to_file(filename.buf, uri))) { + warning(_("failed to download bundle from URI '%s'"), uri); + goto cleanup; + } + + if ((result = !is_bundle(filename.buf, 0))) { + warning(_("file at URI '%s' is not a bundle"), uri); + goto cleanup; + } + + if ((result = unbundle_from_file(r, filename.buf))) { + warning(_("failed to unbundle bundle from URI '%s'"), uri); + goto cleanup; + } + +cleanup: + unlink(filename.buf); + strbuf_release(&filename); + return result; +} diff --git a/bundle-uri.h b/bundle-uri.h new file mode 100644 index 0000000000..8a152f1ef1 --- /dev/null +++ b/bundle-uri.h @@ -0,0 +1,14 @@ +#ifndef BUNDLE_URI_H +#define BUNDLE_URI_H + +struct repository; + +/** + * Fetch data from the given 'uri' and unbundle the bundle data found + * based on that information. + * + * Returns non-zero if no bundle information is found at the given 'uri'. + */ +int fetch_bundle_uri(struct repository *r, const char *uri); + +#endif diff --git a/t/t5557-http-get.sh b/t/t5557-http-get.sh index 09fb0bd9a8..76a4bbd16a 100755 --- a/t/t5557-http-get.sh +++ b/t/t5557-http-get.sh @@ -2,6 +2,8 @@ test_description='test downloading a file by URL' +TEST_PASSES_SANITIZE_LEAK=true + . ./test-lib.sh . "$TEST_DIRECTORY"/lib-httpd.sh From 55568919616429fbc209880cf189a3adaceb6093 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 9 Aug 2022 13:11:41 +0000 Subject: [PATCH 3/6] clone: add --bundle-uri option Cloning a remote repository is one of the most expensive operations in Git. The server can spend a lot of CPU time generating a pack-file for the client's request. The amount of data can clog the network for a long time, and the Git protocol is not resumable. For users with poor network connections or are located far away from the origin server, this can be especially painful. Add a new '--bundle-uri' option to 'git clone' to bootstrap a clone from a bundle. If the user is aware of a bundle server, then they can tell Git to bootstrap the new repository with these bundles before fetching the remaining objects from the origin server. Reviewed-by: Josh Steadmon Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/git-clone.txt | 6 ++++++ builtin/clone.c | 15 +++++++++++++++ t/t5558-clone-bundle-uri.sh | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100755 t/t5558-clone-bundle-uri.sh diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt index 632bd1348e..60fedf7eb5 100644 --- a/Documentation/git-clone.txt +++ b/Documentation/git-clone.txt @@ -323,6 +323,12 @@ or `--mirror` is given) for `host.xz:foo/.git`). Cloning into an existing directory is only allowed if the directory is empty. +--bundle-uri=:: + Before fetching from the remote, fetch a bundle from the given + `` and unbundle the data into the local repository. The refs + in the bundle will be stored under the hidden `refs/bundle/*` + namespace. + :git-clone: 1 include::urls.txt[] diff --git a/builtin/clone.c b/builtin/clone.c index c4ff4643ec..4224d56275 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -34,6 +34,7 @@ #include "list-objects-filter-options.h" #include "hook.h" #include "bundle.h" +#include "bundle-uri.h" /* * Overall FIXMEs: @@ -77,6 +78,7 @@ static int option_filter_submodules = -1; /* unspecified */ static int config_filter_submodules = -1; /* unspecified */ static struct string_list server_options = STRING_LIST_INIT_NODUP; static int option_remote_submodules; +static const char *bundle_uri; static int recurse_submodules_cb(const struct option *opt, const char *arg, int unset) @@ -160,6 +162,8 @@ static struct option builtin_clone_options[] = { N_("any cloned submodules will use their remote-tracking branch")), OPT_BOOL(0, "sparse", &option_sparse_checkout, N_("initialize sparse-checkout file to include only files at root")), + OPT_STRING(0, "bundle-uri", &bundle_uri, + N_("uri"), N_("a URI for downloading bundles before fetching from origin remote")), OPT_END() }; @@ -1232,6 +1236,17 @@ int cmd_clone(int argc, const char **argv, const char *prefix) if (transport->smart_options && !deepen && !filter_options.choice) transport->smart_options->check_self_contained_and_connected = 1; + /* + * Before fetching from the remote, download and install bundle + * data from the --bundle-uri option. + */ + if (bundle_uri) { + /* At this point, we need the_repository to match the cloned repo. */ + repo_init(the_repository, git_dir, work_tree); + if (fetch_bundle_uri(the_repository, bundle_uri)) + warning(_("failed to fetch objects from bundle URI '%s'"), + bundle_uri); + } strvec_push(&transport_ls_refs_options.ref_prefixes, "HEAD"); refspec_ref_prefixes(&remote->fetch, diff --git a/t/t5558-clone-bundle-uri.sh b/t/t5558-clone-bundle-uri.sh new file mode 100755 index 0000000000..f709bcb729 --- /dev/null +++ b/t/t5558-clone-bundle-uri.sh @@ -0,0 +1,36 @@ +#!/bin/sh + +test_description='test fetching bundles with --bundle-uri' + +. ./test-lib.sh + +test_expect_success 'fail to clone from non-existent file' ' + test_when_finished rm -rf test && + git clone --bundle-uri="$(pwd)/does-not-exist" . test 2>err && + grep "failed to download bundle from URI" err +' + +test_expect_success 'fail to clone from non-bundle file' ' + test_when_finished rm -rf test && + echo bogus >bogus && + git clone --bundle-uri="$(pwd)/bogus" . test 2>err && + grep "is not a bundle" err +' + +test_expect_success 'create bundle' ' + git init clone-from && + git -C clone-from checkout -b topic && + test_commit -C clone-from A && + test_commit -C clone-from B && + git -C clone-from bundle create B.bundle topic +' + +test_expect_success 'clone with path bundle' ' + git clone --bundle-uri="clone-from/B.bundle" \ + clone-from clone-path && + git -C clone-path rev-parse refs/bundles/topic >actual && + git -C clone-from rev-parse topic >expect && + test_cmp expect actual +' + +test_done From 59c1752ab6768cb9c380f0a7c9d06af79d183f67 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 9 Aug 2022 13:11:42 +0000 Subject: [PATCH 4/6] bundle-uri: add support for http(s):// and file:// The previous change created the 'git clone --bundle-uri=' option. Currently, must be a filename. Update copy_uri_to_file() to first inspect the URI for an HTTP(S) prefix and use git-remote-https as the way to download the data at that URI. Otherwise, check to see if file:// is present and modify the prefix accordingly. Reviewed-by: Josh Steadmon Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- bundle-uri.c | 70 +++++++++++++++++++++++++++++++++++-- t/t5558-clone-bundle-uri.sh | 45 ++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 3 deletions(-) diff --git a/bundle-uri.c b/bundle-uri.c index b35babc36a..4a8cc74ed0 100644 --- a/bundle-uri.c +++ b/bundle-uri.c @@ -23,10 +23,74 @@ static int find_temp_filename(struct strbuf *name) return 0; } -static int copy_uri_to_file(const char *file, const char *uri) +static int download_https_uri_to_file(const char *file, const char *uri) { - /* File-based URIs only for now. */ - return copy_file(file, uri, 0); + int result = 0; + struct child_process cp = CHILD_PROCESS_INIT; + FILE *child_in = NULL, *child_out = NULL; + struct strbuf line = STRBUF_INIT; + int found_get = 0; + + strvec_pushl(&cp.args, "git-remote-https", uri, NULL); + cp.in = -1; + cp.out = -1; + + if (start_command(&cp)) + return 1; + + child_in = fdopen(cp.in, "w"); + if (!child_in) { + result = 1; + goto cleanup; + } + + child_out = fdopen(cp.out, "r"); + if (!child_out) { + result = 1; + goto cleanup; + } + + fprintf(child_in, "capabilities\n"); + fflush(child_in); + + while (!strbuf_getline(&line, child_out)) { + if (!line.len) + break; + if (!strcmp(line.buf, "get")) + found_get = 1; + } + strbuf_release(&line); + + if (!found_get) { + result = error(_("insufficient capabilities")); + goto cleanup; + } + + fprintf(child_in, "get %s %s\n\n", uri, file); + +cleanup: + if (child_in) + fclose(child_in); + if (finish_command(&cp)) + return 1; + if (child_out) + fclose(child_out); + return result; +} + +static int copy_uri_to_file(const char *filename, const char *uri) +{ + const char *out; + + if (starts_with(uri, "https:") || + starts_with(uri, "http:")) + return download_https_uri_to_file(filename, uri); + + if (skip_prefix(uri, "file://", &out)) + uri = out; + + /* Copy as a file */ + return copy_file(filename, uri, 0); } static int unbundle_from_file(struct repository *r, const char *file) diff --git a/t/t5558-clone-bundle-uri.sh b/t/t5558-clone-bundle-uri.sh index f709bcb729..ad666a2d28 100755 --- a/t/t5558-clone-bundle-uri.sh +++ b/t/t5558-clone-bundle-uri.sh @@ -33,4 +33,49 @@ test_expect_success 'clone with path bundle' ' test_cmp expect actual ' +test_expect_success 'clone with file:// bundle' ' + git clone --bundle-uri="file://$(pwd)/clone-from/B.bundle" \ + clone-from clone-file && + git -C clone-file rev-parse refs/bundles/topic >actual && + git -C clone-from rev-parse topic >expect && + test_cmp expect actual +' + +######################################################################### +# HTTP tests begin here + +. "$TEST_DIRECTORY"/lib-httpd.sh +start_httpd + +test_expect_success 'fail to fetch from non-existent HTTP URL' ' + test_when_finished rm -rf test && + git clone --bundle-uri="$HTTPD_URL/does-not-exist" . test 2>err && + grep "failed to download bundle from URI" err +' + +test_expect_success 'fail to fetch from non-bundle HTTP URL' ' + test_when_finished rm -rf test && + echo bogus >"$HTTPD_DOCUMENT_ROOT_PATH/bogus" && + git clone --bundle-uri="$HTTPD_URL/bogus" . test 2>err && + grep "is not a bundle" err +' + +test_expect_success 'clone HTTP bundle' ' + cp clone-from/B.bundle "$HTTPD_DOCUMENT_ROOT_PATH/B.bundle" && + + git clone --no-local --mirror clone-from \ + "$HTTPD_DOCUMENT_ROOT_PATH/fetch.git" && + + git clone --bundle-uri="$HTTPD_URL/B.bundle" \ + "$HTTPD_URL/smart/fetch.git" clone-http && + git -C clone-http rev-parse refs/bundles/topic >actual && + git -C clone-from rev-parse topic >expect && + test_cmp expect actual && + + test_config -C clone-http log.excludedecoration refs/bundle/ +' + +# Do not add tests here unless they use the HTTP server, as they will +# not run unless the HTTP dependencies exist. + test_done From e21e663cd1942df29979d3e01f7eacb532727bb7 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 9 Aug 2022 13:11:43 +0000 Subject: [PATCH 5/6] clone: --bundle-uri cannot be combined with --depth A previous change added the '--bundle-uri' option, but did not check if the --depth parameter was included. Since bundles are not compatible with shallow clones, provide an error message to the user who is attempting this combination. I am leaving this as its own change, separate from the one that implements '--bundle-uri', because this is more of an advisory for the user. There is nothing wrong with bootstrapping with bundles and then fetching a shallow clone. However, that is likely going to involve too much work for the client _and_ the server. The client will download all of this bundle information containing the full history of the repository only to ignore most of it. The server will get a shallow fetch request, but with a list of haves that might cause a more painful computation of that shallow pack-file. Reviewed-by: Josh Steadmon Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/git-clone.txt | 3 ++- builtin/clone.c | 3 +++ t/t5606-clone-options.sh | 8 ++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt index 60fedf7eb5..d032d971dd 100644 --- a/Documentation/git-clone.txt +++ b/Documentation/git-clone.txt @@ -327,7 +327,8 @@ or `--mirror` is given) Before fetching from the remote, fetch a bundle from the given `` and unbundle the data into the local repository. The refs in the bundle will be stored under the hidden `refs/bundle/*` - namespace. + namespace. This option is incompatible with `--depth`, + `--shallow-since`, and `--shallow-exclude`. :git-clone: 1 include::urls.txt[] diff --git a/builtin/clone.c b/builtin/clone.c index 4224d56275..4463789680 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -937,6 +937,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix) option_no_checkout = 1; } + if (bundle_uri && deepen) + die(_("--bundle-uri is incompatible with --depth, --shallow-since, and --shallow-exclude")); + repo_name = argv[0]; path = get_repo_path(repo_name, &is_bundle); diff --git a/t/t5606-clone-options.sh b/t/t5606-clone-options.sh index 8f676d6b0c..f6bb02ab94 100755 --- a/t/t5606-clone-options.sh +++ b/t/t5606-clone-options.sh @@ -58,6 +58,14 @@ test_expect_success 'disallows --bare with --separate-git-dir' ' ' +test_expect_success 'disallows --bundle-uri with shallow options' ' + for option in --depth=1 --shallow-since=01-01-2000 --shallow-exclude=HEAD + do + test_must_fail git clone --bundle-uri=bundle $option from to 2>err && + grep "bundle-uri is incompatible" err || return 1 + done +' + test_expect_success 'reject cloning shallow repository' ' test_when_finished "rm -rf repo" && test_must_fail git clone --reject-shallow shallow-repo out 2>err && From 65da93891680edc0d1471d436d92d4da7d0b4465 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 23 Aug 2022 10:05:13 -0400 Subject: [PATCH 6/6] clone: warn on failure to repo_init() The --bundle-uri option was added in 55568919616 (clone: add --bundle-uri option, 2022-08-09), but this also introduced a call to repo_init() whose return value was ignored. Fix that ignored value by warning that the bundle URI process could not continue if it failed. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- builtin/clone.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/builtin/clone.c b/builtin/clone.c index 4463789680..e21d42dfee 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -1245,8 +1245,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix) */ if (bundle_uri) { /* At this point, we need the_repository to match the cloned repo. */ - repo_init(the_repository, git_dir, work_tree); - if (fetch_bundle_uri(the_repository, bundle_uri)) + if (repo_init(the_repository, git_dir, work_tree)) + warning(_("failed to initialize the repo, skipping bundle URI")); + else if (fetch_bundle_uri(the_repository, bundle_uri)) warning(_("failed to fetch objects from bundle URI '%s'"), bundle_uri); }