From e9e33ab0fba44c2680f5461cdc774c88b3bf4ac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 15 Apr 2018 17:36:12 +0200 Subject: [PATCH 1/7] t7700: have closing quote of a test at the beginning of line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The closing quote of a test body by convention is always at the start of line. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- t/t7700-repack.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh index 6061a04147..38247afbec 100755 --- a/t/t7700-repack.sh +++ b/t/t7700-repack.sh @@ -194,7 +194,7 @@ test_expect_success 'objects made unreachable by grafts only are kept' ' git reflog expire --expire=$test_tick --expire-unreachable=$test_tick --all && git repack -a -d && git cat-file -t $H1 - ' +' test_done From ed7e5fc3a2562e3d4fb4fe25e8dca0e9daa0ed13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 15 Apr 2018 17:36:13 +0200 Subject: [PATCH 2/7] repack: add --keep-pack option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We allow to keep existing packs by having companion .keep files. This is helpful when a pack is permanently kept. In the next patch, git-gc just wants to keep a pack temporarily, for one pack-objects run. git-gc can use --keep-pack for this use case. A note about why the pack_keep field cannot be reused and pack_keep_in_core has to be added. This is about the case when --keep-pack is specified together with either --keep-unreachable or --unpack-unreachable, but --honor-pack-keep is NOT specified. In this case, we want to exclude objects from the packs specified on command line, not from ones with .keep files. If only one bit flag is used, we have to clear pack_keep on pack files with the .keep file. But we can't make any assumption about unreachable objects in .keep packs. If "pack_keep" field is false for .keep packs, we could potentially pull lots of unreachable objects into the new pack, or unpack them loose. The safer approach is ignore all packs with either .keep file or --keep-pack. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- Documentation/git-pack-objects.txt | 9 ++++- Documentation/git-repack.txt | 9 ++++- builtin/pack-objects.c | 63 ++++++++++++++++++++++++------ builtin/repack.c | 21 ++++++++-- object-store.h | 1 + t/t7700-repack.sh | 25 ++++++++++++ 6 files changed, 110 insertions(+), 18 deletions(-) diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index 81bc490ac5..403524652a 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -12,7 +12,7 @@ SYNOPSIS 'git pack-objects' [-q | --progress | --all-progress] [--all-progress-implied] [--no-reuse-delta] [--delta-base-offset] [--non-empty] [--local] [--incremental] [--window=] [--depth=] - [--revs [--unpacked | --all]] + [--revs [--unpacked | --all]] [--keep-pack=] [--stdout [--filter=] | base-name] [--shallow] [--keep-true-parents] < object-list @@ -126,6 +126,13 @@ base-name:: has a .keep file to be ignored, even if it would have otherwise been packed. +--keep-pack=:: + This flag causes an object already in the given pack to be + ignored, even if it would have otherwise been + packed. `` is the the pack file name without + leading directory (e.g. `pack-123.pack`). The option could be + specified multiple times to keep multiple packs. + --incremental:: This flag causes an object already in a pack to be ignored even if it would have otherwise been packed. diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt index ae750e9e11..ce497d9d12 100644 --- a/Documentation/git-repack.txt +++ b/Documentation/git-repack.txt @@ -9,7 +9,7 @@ git-repack - Pack unpacked objects in a repository SYNOPSIS -------- [verse] -'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [--window=] [--depth=] [--threads=] +'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [--window=] [--depth=] [--threads=] [--keep-pack=] DESCRIPTION ----------- @@ -133,6 +133,13 @@ other objects in that pack they already have locally. with `-b` or `repack.writeBitmaps`, as it ensures that the bitmapped packfile has the necessary objects. +--keep-pack=:: + Exclude the given pack from repacking. This is the equivalent + of having `.keep` file on the pack. `` is the the + pack file name without leading directory (e.g. `pack-123.pack`). + The option could be specified multiple times to keep multiple + packs. + --unpack-unreachable=:: When loosening unreachable objects, do not bother loosening any objects older than ``. This can be used to optimize out diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 4bdae5a1d8..9b9a6d6268 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -30,6 +30,7 @@ #include "list.h" #include "packfile.h" #include "object-store.h" +#include "dir.h" static const char *pack_usage[] = { N_("git pack-objects --stdout [...] [< | < ]"), @@ -55,7 +56,8 @@ static int pack_loose_unreachable; static int local; static int have_non_local_packs; static int incremental; -static int ignore_packed_keep; +static int ignore_packed_keep_on_disk; +static int ignore_packed_keep_in_core; static int allow_ofs_delta; static struct pack_idx_option pack_idx_opts; static const char *base_name; @@ -982,13 +984,16 @@ static int want_found_object(int exclude, struct packed_git *p) * Otherwise, we signal "-1" at the end to tell the caller that we do * not know either way, and it needs to check more packs. */ - if (!ignore_packed_keep && + if (!ignore_packed_keep_on_disk && + !ignore_packed_keep_in_core && (!local || !have_non_local_packs)) return 1; if (local && !p->pack_local) return 0; - if (ignore_packed_keep && p->pack_local && p->pack_keep) + if (p->pack_local && + ((ignore_packed_keep_on_disk && p->pack_keep) || + (ignore_packed_keep_in_core && p->pack_keep_in_core))) return 0; /* we don't know yet; keep looking for more packs */ @@ -2675,7 +2680,7 @@ static void add_objects_in_unpacked_packs(struct rev_info *revs) struct object_id oid; struct object *o; - if (!p->pack_local || p->pack_keep) + if (!p->pack_local || p->pack_keep || p->pack_keep_in_core) continue; if (open_pack_index(p)) die("cannot open pack index"); @@ -2738,7 +2743,8 @@ static int has_sha1_pack_kept_or_nonlocal(const struct object_id *oid) get_packed_git(the_repository); while (p) { - if ((!p->pack_local || p->pack_keep) && + if ((!p->pack_local || p->pack_keep || + p->pack_keep_in_core) && find_pack_entry_one(oid->hash, p)) { last_found = p; return 1; @@ -2781,7 +2787,7 @@ static void loosen_unused_packed_objects(struct rev_info *revs) struct object_id oid; for (p = get_packed_git(the_repository); p; p = p->next) { - if (!p->pack_local || p->pack_keep) + if (!p->pack_local || p->pack_keep || p->pack_keep_in_core) continue; if (open_pack_index(p)) @@ -2807,7 +2813,8 @@ static int pack_options_allow_reuse(void) { return pack_to_stdout && allow_ofs_delta && - !ignore_packed_keep && + !ignore_packed_keep_on_disk && + !ignore_packed_keep_in_core && (!local || !have_non_local_packs) && !incremental; } @@ -2916,6 +2923,32 @@ static void get_object_list(int ac, const char **av) oid_array_clear(&recent_objects); } +static void add_extra_kept_packs(const struct string_list *names) +{ + struct packed_git *p; + + if (!names->nr) + return; + + for (p = get_packed_git(the_repository); p; p = p->next) { + const char *name = basename(p->pack_name); + int i; + + if (!p->pack_local) + continue; + + for (i = 0; i < names->nr; i++) + if (!fspathcmp(name, names->items[i].string)) + break; + + if (i < names->nr) { + p->pack_keep_in_core = 1; + ignore_packed_keep_in_core = 1; + continue; + } + } +} + static int option_parse_index_version(const struct option *opt, const char *arg, int unset) { @@ -2955,6 +2988,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) struct argv_array rp = ARGV_ARRAY_INIT; int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0; int rev_list_index = 0; + struct string_list keep_pack_list = STRING_LIST_INIT_NODUP; struct option pack_objects_options[] = { OPT_SET_INT('q', "quiet", &progress, N_("do not show progress meter"), 0), @@ -3019,8 +3053,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) N_("create thin packs")), OPT_BOOL(0, "shallow", &shallow, N_("create packs suitable for shallow fetches")), - OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep, + OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk, N_("ignore packs that have companion .keep file")), + OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"), + N_("ignore this pack")), OPT_INTEGER(0, "compression", &pack_compression_level, N_("pack compression level")), OPT_SET_INT(0, "keep-true-parents", &grafts_replace_parents, @@ -3148,19 +3184,20 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (progress && all_progress_implied) progress = 2; - if (ignore_packed_keep) { + add_extra_kept_packs(&keep_pack_list); + if (ignore_packed_keep_on_disk) { struct packed_git *p; for (p = get_packed_git(the_repository); p; p = p->next) if (p->pack_local && p->pack_keep) break; if (!p) /* no keep-able packs found */ - ignore_packed_keep = 0; + ignore_packed_keep_on_disk = 0; } if (local) { /* - * unlike ignore_packed_keep above, we do not want to - * unset "local" based on looking at packs, as it - * also covers non-local objects + * unlike ignore_packed_keep_on_disk above, we do not + * want to unset "local" based on looking at packs, as + * it also covers non-local objects */ struct packed_git *p; for (p = get_packed_git(the_repository); p; p = p->next) { diff --git a/builtin/repack.c b/builtin/repack.c index 7bdb40142f..6c636e159e 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -86,7 +86,8 @@ static void remove_pack_on_signal(int signo) * have a corresponding .keep or .promisor file. These packs are not to * be kept if we are going to pack everything into one file. */ -static void get_non_kept_pack_filenames(struct string_list *fname_list) +static void get_non_kept_pack_filenames(struct string_list *fname_list, + const struct string_list *extra_keep) { DIR *dir; struct dirent *e; @@ -97,6 +98,14 @@ static void get_non_kept_pack_filenames(struct string_list *fname_list) while ((e = readdir(dir)) != NULL) { size_t len; + int i; + + for (i = 0; i < extra_keep->nr; i++) + if (!fspathcmp(e->d_name, extra_keep->items[i].string)) + break; + if (extra_keep->nr > 0 && i < extra_keep->nr) + continue; + if (!strip_suffix(e->d_name, ".pack", &len)) continue; @@ -148,7 +157,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) struct string_list rollback = STRING_LIST_INIT_NODUP; struct string_list existing_packs = STRING_LIST_INIT_DUP; struct strbuf line = STRBUF_INIT; - int ext, ret, failed; + int i, ext, ret, failed; FILE *out; /* variables to be filled by option parsing */ @@ -160,6 +169,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) const char *depth = NULL; const char *threads = NULL; const char *max_pack_size = NULL; + struct string_list keep_pack_list = STRING_LIST_INIT_NODUP; int no_reuse_delta = 0, no_reuse_object = 0; int no_update_server_info = 0; int quiet = 0; @@ -200,6 +210,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix) N_("maximum size of each packfile")), OPT_BOOL(0, "pack-kept-objects", &pack_kept_objects, N_("repack objects in packs marked with .keep")), + OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"), + N_("do not repack this pack")), OPT_END() }; @@ -230,6 +242,9 @@ int cmd_repack(int argc, const char **argv, const char *prefix) argv_array_push(&cmd.args, "--keep-true-parents"); if (!pack_kept_objects) argv_array_push(&cmd.args, "--honor-pack-keep"); + for (i = 0; i < keep_pack_list.nr; i++) + argv_array_pushf(&cmd.args, "--keep-pack=%s", + keep_pack_list.items[i].string); argv_array_push(&cmd.args, "--non-empty"); argv_array_push(&cmd.args, "--all"); argv_array_push(&cmd.args, "--reflog"); @@ -254,7 +269,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) argv_array_push(&cmd.args, "--write-bitmap-index"); if (pack_everything & ALL_INTO_ONE) { - get_non_kept_pack_filenames(&existing_packs); + get_non_kept_pack_filenames(&existing_packs, &keep_pack_list); if (existing_packs.nr && delete_redundant) { if (unpack_unreachable) { diff --git a/object-store.h b/object-store.h index fef33f345f..0a4dbb74d3 100644 --- a/object-store.h +++ b/object-store.h @@ -71,6 +71,7 @@ struct packed_git { int pack_fd; unsigned pack_local:1, pack_keep:1, + pack_keep_in_core:1, freshened:1, do_not_close:1, pack_promisor:1; diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh index 38247afbec..6162e2a8e6 100755 --- a/t/t7700-repack.sh +++ b/t/t7700-repack.sh @@ -4,6 +4,12 @@ test_description='git repack works correctly' . ./test-lib.sh +commit_and_pack() { + test_commit "$@" >/dev/null && + SHA1=$(git pack-objects --all --unpacked --incremental .git/objects/pack/pack file1 && echo content2 > file2 && @@ -196,5 +202,24 @@ test_expect_success 'objects made unreachable by grafts only are kept' ' git cat-file -t $H1 ' +test_expect_success 'repack --keep-pack' ' + test_create_repo keep-pack && + ( + cd keep-pack && + P1=$(commit_and_pack 1) && + P2=$(commit_and_pack 2) && + P3=$(commit_and_pack 3) && + P4=$(commit_and_pack 4) && + ls .git/objects/pack/*.pack >old-counts && + test_line_count = 4 old-counts && + git repack -a -d --keep-pack $P1 --keep-pack $P4 && + ls .git/objects/pack/*.pack >new-counts && + grep -q $P1 new-counts && + grep -q $P4 new-counts && + test_line_count = 3 new-counts && + git fsck + ) +' + test_done From ae4e89e549b76b561a1c384dd7314c9b671c22bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 15 Apr 2018 17:36:14 +0200 Subject: [PATCH 3/7] gc: add --keep-largest-pack option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a new repack mode that combines everything into a secondary pack, leaving the largest pack alone. This could help reduce memory pressure. On linux-2.6.git, valgrind massif reports 1.6GB heap in "pack all" case, and 535MB in "pack all except the base pack" case. We save roughly 1GB memory by excluding the base pack. This should also lower I/O because we don't have to rewrite a giant pack every time (e.g. for linux-2.6.git that's a 1.4GB pack file).. PS. The use of string_list here seems overkill, but we'll need it in the next patch... Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- Documentation/git-gc.txt | 6 +++++- builtin/gc.c | 45 ++++++++++++++++++++++++++++++++++++---- t/t6500-gc.sh | 25 ++++++++++++++++++++++ 3 files changed, 71 insertions(+), 5 deletions(-) diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt index 3126e0dd00..8f903231da 100644 --- a/Documentation/git-gc.txt +++ b/Documentation/git-gc.txt @@ -9,7 +9,7 @@ git-gc - Cleanup unnecessary files and optimize the local repository SYNOPSIS -------- [verse] -'git gc' [--aggressive] [--auto] [--quiet] [--prune= | --no-prune] [--force] +'git gc' [--aggressive] [--auto] [--quiet] [--prune= | --no-prune] [--force] [--keep-largest-pack] DESCRIPTION ----------- @@ -84,6 +84,10 @@ be performed as well. Force `git gc` to run even if there may be another `git gc` instance running on this repository. +--keep-largest-pack:: + All packs except the largest pack and those marked with a + `.keep` files are consolidated into a single pack. + Configuration ------------- diff --git a/builtin/gc.c b/builtin/gc.c index 3e67124eaa..f251662a8f 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -166,6 +166,22 @@ static int too_many_loose_objects(void) return needed; } +static void find_base_packs(struct string_list *packs) +{ + struct packed_git *p, *base = NULL; + + for (p = get_packed_git(the_repository); p; p = p->next) { + if (!p->pack_local) + continue; + if (!base || base->pack_size < p->pack_size) { + base = p; + } + } + + if (base) + string_list_append(packs, base->pack_name); +} + static int too_many_packs(void) { struct packed_git *p; @@ -188,7 +204,13 @@ static int too_many_packs(void) return gc_auto_pack_limit < cnt; } -static void add_repack_all_option(void) +static int keep_one_pack(struct string_list_item *item, void *data) +{ + argv_array_pushf(&repack, "--keep-pack=%s", basename(item->string)); + return 0; +} + +static void add_repack_all_option(struct string_list *keep_pack) { if (prune_expire && !strcmp(prune_expire, "now")) argv_array_push(&repack, "-a"); @@ -197,6 +219,9 @@ static void add_repack_all_option(void) if (prune_expire) argv_array_pushf(&repack, "--unpack-unreachable=%s", prune_expire); } + + if (keep_pack) + for_each_string_list(keep_pack, keep_one_pack, NULL); } static void add_repack_incremental_option(void) @@ -220,7 +245,7 @@ static int need_to_gc(void) * there is no need. */ if (too_many_packs()) - add_repack_all_option(); + add_repack_all_option(NULL); else if (too_many_loose_objects()) add_repack_incremental_option(); else @@ -354,6 +379,7 @@ int cmd_gc(int argc, const char **argv, const char *prefix) const char *name; pid_t pid; int daemonized = 0; + int keep_base_pack = -1; struct option builtin_gc_options[] = { OPT__QUIET(&quiet, N_("suppress progress reporting")), @@ -366,6 +392,8 @@ int cmd_gc(int argc, const char **argv, const char *prefix) OPT_BOOL_F(0, "force", &force, N_("force running gc even if there may be another gc running"), PARSE_OPT_NOCOMPLETE), + OPT_BOOL(0, "keep-largest-pack", &keep_base_pack, + N_("repack all other packs except the largest pack")), OPT_END() }; @@ -431,8 +459,17 @@ int cmd_gc(int argc, const char **argv, const char *prefix) */ daemonized = !daemonize(); } - } else - add_repack_all_option(); + } else { + struct string_list keep_pack = STRING_LIST_INIT_NODUP; + + if (keep_base_pack != -1) { + if (keep_base_pack) + find_base_packs(&keep_pack); + } + + add_repack_all_option(&keep_pack); + string_list_clear(&keep_pack, 0); + } name = lock_repo_for_gc(force, &pid); if (name) { diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh index d5255dd576..c42f60bc5b 100755 --- a/t/t6500-gc.sh +++ b/t/t6500-gc.sh @@ -43,6 +43,31 @@ test_expect_success 'gc is not aborted due to a stale symref' ' ) ' +test_expect_success 'gc --keep-largest-pack' ' + test_create_repo keep-pack && + ( + cd keep-pack && + test_commit one && + test_commit two && + test_commit three && + git gc && + ( cd .git/objects/pack && ls *.pack ) >pack-list && + test_line_count = 1 pack-list && + BASE_PACK=.git/objects/pack/pack-*.pack && + test_commit four && + git repack -d && + test_commit five && + git repack -d && + ( cd .git/objects/pack && ls *.pack ) >pack-list && + test_line_count = 3 pack-list && + git gc --keep-largest-pack && + ( cd .git/objects/pack && ls *.pack ) >pack-list && + test_line_count = 2 pack-list && + test_path_is_file $BASE_PACK && + git fsck + ) +' + test_expect_success 'auto gc with too many loose objects does not attempt to create bitmaps' ' test_config gc.auto 3 && test_config gc.autodetach false && From 55dfe13df9bb38809bc45b8d6d5c7f5bf0470c11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 15 Apr 2018 17:36:15 +0200 Subject: [PATCH 4/7] gc: add gc.bigPackThreshold config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The --keep-largest-pack option is not very convenient to use because you need to tell gc to do this explicitly (and probably on just a few large repos). Add a config key that enables this mode when packs larger than a limit are found. Note that there's a slight behavior difference compared to --keep-largest-pack: all packs larger than the threshold are kept, not just the largest one. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- Documentation/config.txt | 7 +++++++ Documentation/git-gc.txt | 6 ++++-- builtin/gc.c | 26 ++++++++++++++++++++------ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/Documentation/config.txt b/Documentation/config.txt index 2659153cb3..728ae902e6 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -1558,6 +1558,13 @@ gc.autoDetach:: Make `git gc --auto` return immediately and run in background if the system supports it. Default is true. +gc.bigPackThreshold:: + If non-zero, all packs larger than this limit are kept when + `git gc` is run. This is very similar to `--keep-base-pack` + except that all packs that meet the threshold are kept, not + just the base pack. Defaults to zero. Common unit suffixes of + 'k', 'm', or 'g' are supported. + gc.logExpiry:: If the file gc.log exists, then `git gc --auto` won't run unless that file is more than 'gc.logExpiry' old. Default is diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt index 8f903231da..649faddfa6 100644 --- a/Documentation/git-gc.txt +++ b/Documentation/git-gc.txt @@ -56,7 +56,8 @@ single pack using `git repack -d -l`. Setting the value of `gc.auto` to 0 disables automatic packing of loose objects. + If the number of packs exceeds the value of `gc.autoPackLimit`, -then existing packs (except those marked with a `.keep` file) +then existing packs (except those marked with a `.keep` file +or over `gc.bigPackThreshold` limit) are consolidated into a single pack by using the `-A` option of 'git repack'. Setting `gc.autoPackLimit` to 0 disables automatic consolidation of packs. @@ -86,7 +87,8 @@ be performed as well. --keep-largest-pack:: All packs except the largest pack and those marked with a - `.keep` files are consolidated into a single pack. + `.keep` files are consolidated into a single pack. When this + option is used, `gc.bigPackThreshold` is ignored. Configuration ------------- diff --git a/builtin/gc.c b/builtin/gc.c index f251662a8f..81ecdc5ffa 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -41,6 +41,7 @@ static timestamp_t gc_log_expire_time; static const char *gc_log_expire = "1.day.ago"; static const char *prune_expire = "2.weeks.ago"; static const char *prune_worktrees_expire = "3.months.ago"; +static unsigned long big_pack_threshold; static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT; static struct argv_array reflog = ARGV_ARRAY_INIT; @@ -128,6 +129,8 @@ static void gc_config(void) git_config_get_expiry("gc.worktreepruneexpire", &prune_worktrees_expire); git_config_get_expiry("gc.logexpiry", &gc_log_expire); + git_config_get_ulong("gc.bigpackthreshold", &big_pack_threshold); + git_config(git_default_config, NULL); } @@ -166,14 +169,17 @@ static int too_many_loose_objects(void) return needed; } -static void find_base_packs(struct string_list *packs) +static void find_base_packs(struct string_list *packs, unsigned long limit) { struct packed_git *p, *base = NULL; for (p = get_packed_git(the_repository); p; p = p->next) { if (!p->pack_local) continue; - if (!base || base->pack_size < p->pack_size) { + if (limit) { + if (p->pack_size >= limit) + string_list_append(packs, p->pack_name); + } else if (!base || base->pack_size < p->pack_size) { base = p; } } @@ -244,9 +250,15 @@ static int need_to_gc(void) * we run "repack -A -d -l". Otherwise we tell the caller * there is no need. */ - if (too_many_packs()) - add_repack_all_option(NULL); - else if (too_many_loose_objects()) + if (too_many_packs()) { + struct string_list keep_pack = STRING_LIST_INIT_NODUP; + + if (big_pack_threshold) + find_base_packs(&keep_pack, big_pack_threshold); + + add_repack_all_option(&keep_pack); + string_list_clear(&keep_pack, 0); + } else if (too_many_loose_objects()) add_repack_incremental_option(); else return 0; @@ -464,7 +476,9 @@ int cmd_gc(int argc, const char **argv, const char *prefix) if (keep_base_pack != -1) { if (keep_base_pack) - find_base_packs(&keep_pack); + find_base_packs(&keep_pack, 0); + } else if (big_pack_threshold) { + find_base_packs(&keep_pack, big_pack_threshold); } add_repack_all_option(&keep_pack); From 8fc67762471c60ee644e6e100a3a85cf5a8631a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 15 Apr 2018 17:36:16 +0200 Subject: [PATCH 5/7] gc: handle a corner case in gc.bigPackThreshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This config allows us to keep packs back if their size is larger than a limit. But if this N >= gc.autoPackLimit, we may have a problem. We are supposed to reduce the number of packs after a threshold because it affects performance. We could tell the user that they have incompatible gc.bigPackThreshold and gc.autoPackLimit, but it's kinda hard when 'git gc --auto' runs in background. Instead let's fall back to the next best stategy: try to reduce the number of packs anyway, but keep the base pack out. This reduces the number of packs to two and hopefully won't take up too much resources to repack (the assumption still is the base pack takes most resources to handle). Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- Documentation/config.txt | 5 +++++ builtin/gc.c | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/config.txt b/Documentation/config.txt index 728ae902e6..4c3f1d7651 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -1564,6 +1564,11 @@ gc.bigPackThreshold:: except that all packs that meet the threshold are kept, not just the base pack. Defaults to zero. Common unit suffixes of 'k', 'm', or 'g' are supported. ++ +Note that if the number of kept packs is more than gc.autoPackLimit, +this configuration variable is ignored, all packs except the base pack +will be repacked. After this the number of packs should go below +gc.autoPackLimit and gc.bigPackThreshold should be respected again. gc.logExpiry:: If the file gc.log exists, then `git gc --auto` won't run diff --git a/builtin/gc.c b/builtin/gc.c index 81ecdc5ffa..23c17ba7e9 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -253,8 +253,14 @@ static int need_to_gc(void) if (too_many_packs()) { struct string_list keep_pack = STRING_LIST_INIT_NODUP; - if (big_pack_threshold) + if (big_pack_threshold) { find_base_packs(&keep_pack, big_pack_threshold); + if (keep_pack.nr >= gc_auto_pack_limit) { + big_pack_threshold = 0; + string_list_clear(&keep_pack, 0); + find_base_packs(&keep_pack, 0); + } + } add_repack_all_option(&keep_pack); string_list_clear(&keep_pack, 0); From 9806f5a7bf3d02247c2c500ef74f56213cd7b07a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 15 Apr 2018 17:36:17 +0200 Subject: [PATCH 6/7] gc --auto: exclude base pack if not enough mem to "repack -ad" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pack-objects could be a big memory hog especially on large repos, everybody knows that. The suggestion to stick a .keep file on the giant base pack to avoid this problem is also known for a long time. Recent patches add an option to do just this, but it has to be either configured or activated manually. This patch lets `git gc --auto` activate this mode automatically when it thinks `repack -ad` will use a lot of memory and start affecting the system due to swapping or flushing OS cache. gc --auto decides to do this based on an estimation of pack-objects memory usage, which is quite accurate at least for the heap part, and whether that fits in half of system memory (the assumption here is for desktop environment where there are many other applications running). This mechanism only kicks in if gc.bigBasePackThreshold is not configured. If it is, it is assumed that the user already knows what they want. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- Documentation/git-gc.txt | 9 +++- builtin/gc.c | 98 +++++++++++++++++++++++++++++++++++++++- builtin/pack-objects.c | 2 +- config.mak.uname | 1 + git-compat-util.h | 4 ++ pack-objects.h | 2 + t/t6500-gc.sh | 7 +++ 7 files changed, 119 insertions(+), 4 deletions(-) diff --git a/Documentation/git-gc.txt b/Documentation/git-gc.txt index 649faddfa6..0fb1d43b2c 100644 --- a/Documentation/git-gc.txt +++ b/Documentation/git-gc.txt @@ -59,8 +59,13 @@ If the number of packs exceeds the value of `gc.autoPackLimit`, then existing packs (except those marked with a `.keep` file or over `gc.bigPackThreshold` limit) are consolidated into a single pack by using the `-A` option of -'git repack'. Setting `gc.autoPackLimit` to 0 disables -automatic consolidation of packs. +'git repack'. +If the amount of memory is estimated not enough for `git repack` to +run smoothly and `gc.bigPackThreshold` is not set, the largest +pack will also be excluded (this is the equivalent of running `git gc` +with `--keep-base-pack`). +Setting `gc.autoPackLimit` to 0 disables automatic consolidation of +packs. + If houskeeping is required due to many loose objects or packs, all other housekeeping tasks (e.g. rerere, working trees, reflog...) will diff --git a/builtin/gc.c b/builtin/gc.c index 23c17ba7e9..3c7c93e961 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -22,6 +22,10 @@ #include "commit.h" #include "packfile.h" #include "object-store.h" +#include "pack.h" +#include "pack-objects.h" +#include "blob.h" +#include "tree.h" #define FAILED_RUN "failed to run %s" @@ -42,6 +46,7 @@ static const char *gc_log_expire = "1.day.ago"; static const char *prune_expire = "2.weeks.ago"; static const char *prune_worktrees_expire = "3.months.ago"; static unsigned long big_pack_threshold; +static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE; static struct argv_array pack_refs_cmd = ARGV_ARRAY_INIT; static struct argv_array reflog = ARGV_ARRAY_INIT; @@ -130,6 +135,7 @@ static void gc_config(void) git_config_get_expiry("gc.logexpiry", &gc_log_expire); git_config_get_ulong("gc.bigpackthreshold", &big_pack_threshold); + git_config_get_ulong("pack.deltacachesize", &max_delta_cache_size); git_config(git_default_config, NULL); } @@ -169,7 +175,8 @@ static int too_many_loose_objects(void) return needed; } -static void find_base_packs(struct string_list *packs, unsigned long limit) +static struct packed_git *find_base_packs(struct string_list *packs, + unsigned long limit) { struct packed_git *p, *base = NULL; @@ -186,6 +193,8 @@ static void find_base_packs(struct string_list *packs, unsigned long limit) if (base) string_list_append(packs, base->pack_name); + + return base; } static int too_many_packs(void) @@ -210,6 +219,79 @@ static int too_many_packs(void) return gc_auto_pack_limit < cnt; } +static uint64_t total_ram(void) +{ +#if defined(HAVE_SYSINFO) + struct sysinfo si; + + if (!sysinfo(&si)) + return si.totalram; +#elif defined(HAVE_BSD_SYSCTL) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM)) + int64_t physical_memory; + int mib[2]; + size_t length; + + mib[0] = CTL_HW; +# if defined(HW_MEMSIZE) + mib[1] = HW_MEMSIZE; +# else + mib[1] = HW_PHYSMEM; +# endif + length = sizeof(int64_t); + if (!sysctl(mib, 2, &physical_memory, &length, NULL, 0)) + return physical_memory; +#elif defined(GIT_WINDOWS_NATIVE) + MEMORYSTATUSEX memInfo; + + memInfo.dwLength = sizeof(MEMORYSTATUSEX); + if (GlobalMemoryStatusEx(&memInfo)) + return memInfo.ullTotalPhys; +#endif + return 0; +} + +static uint64_t estimate_repack_memory(struct packed_git *pack) +{ + unsigned long nr_objects = approximate_object_count(); + size_t os_cache, heap; + + if (!pack || !nr_objects) + return 0; + + /* + * First we have to scan through at least one pack. + * Assume enough room in OS file cache to keep the entire pack + * or we may accidentally evict data of other processes from + * the cache. + */ + os_cache = pack->pack_size + pack->index_size; + /* then pack-objects needs lots more for book keeping */ + heap = sizeof(struct object_entry) * nr_objects; + /* + * internal rev-list --all --objects takes up some memory too, + * let's say half of it is for blobs + */ + heap += sizeof(struct blob) * nr_objects / 2; + /* + * and the other half is for trees (commits and tags are + * usually insignificant) + */ + heap += sizeof(struct tree) * nr_objects / 2; + /* and then obj_hash[], underestimated in fact */ + heap += sizeof(struct object *) * nr_objects; + /* revindex is used also */ + heap += sizeof(struct revindex_entry) * nr_objects; + /* + * read_sha1_file() (either at delta calculation phase, or + * writing phase) also fills up the delta base cache + */ + heap += delta_base_cache_limit; + /* and of course pack-objects has its own delta cache */ + heap += max_delta_cache_size; + + return os_cache + heap; +} + static int keep_one_pack(struct string_list_item *item, void *data) { argv_array_pushf(&repack, "--keep-pack=%s", basename(item->string)); @@ -260,6 +342,20 @@ static int need_to_gc(void) string_list_clear(&keep_pack, 0); find_base_packs(&keep_pack, 0); } + } else { + struct packed_git *p = find_base_packs(&keep_pack, 0); + uint64_t mem_have, mem_want; + + mem_have = total_ram(); + mem_want = estimate_repack_memory(p); + + /* + * Only allow 1/2 of memory for pack-objects, leave + * the rest for the OS and other processes in the + * system. + */ + if (!mem_have || mem_want < mem_have / 2) + string_list_clear(&keep_pack, 0); } add_repack_all_option(&keep_pack); diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 9b9a6d6268..c77bea404d 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -82,7 +82,7 @@ static uint16_t write_bitmap_options; static int exclude_promisor_objects; static unsigned long delta_cache_size = 0; -static unsigned long max_delta_cache_size = 256 * 1024 * 1024; +static unsigned long max_delta_cache_size = DEFAULT_DELTA_CACHE_SIZE; static unsigned long cache_max_small_delta_size = 1000; static unsigned long window_memory_limit = 0; diff --git a/config.mak.uname b/config.mak.uname index 6a1d0de0cc..ae9cbccec1 100644 --- a/config.mak.uname +++ b/config.mak.uname @@ -37,6 +37,7 @@ ifeq ($(uname_S),Linux) HAVE_GETDELIM = YesPlease SANE_TEXT_GREP=-a FREAD_READS_DIRECTORIES = UnfortunatelyYes + BASIC_CFLAGS += -DHAVE_SYSINFO endif ifeq ($(uname_S),GNU/kFreeBSD) HAVE_ALLOCA_H = YesPlease diff --git a/git-compat-util.h b/git-compat-util.h index 07e383257b..e373af48b8 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -284,6 +284,10 @@ extern char *gitdirname(char *); #include #endif +#ifdef HAVE_SYSINFO +# include +#endif + /* On most systems would have given us this, but * not on some systems (e.g. z/OS). */ diff --git a/pack-objects.h b/pack-objects.h index 03f1191659..af4f46c026 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -1,6 +1,8 @@ #ifndef PACK_OBJECTS_H #define PACK_OBJECTS_H +#define DEFAULT_DELTA_CACHE_SIZE (256 * 1024 * 1024) + struct object_entry { struct pack_idx_entry idx; unsigned long size; /* uncompressed size */ diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh index c42f60bc5b..818435f04e 100755 --- a/t/t6500-gc.sh +++ b/t/t6500-gc.sh @@ -5,6 +5,13 @@ test_description='basic git gc tests . ./test-lib.sh +test_expect_success 'setup' ' + # do not let the amount of physical memory affects gc + # behavior, make sure we always pack everything to one pack by + # default + git config gc.bigPackThreshold 2g +' + test_expect_success 'gc empty repository' ' git gc ' From 5af050437af1c061804b4317154085c65490bbac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sun, 15 Apr 2018 17:36:18 +0200 Subject: [PATCH 7/7] pack-objects: show some progress when counting kept objects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We only show progress when there are new objects to be packed. But when --keep-pack is specified on the base pack, we will exclude most of objects. This makes 'pack-objects' stay silent for a long time while the counting phase is going. Let's show some progress whenever we visit an object instead. The old "Counting objects" is renamed to "Enumerating objects" and a new progress "Counting objects" line is added. This new "Counting objects" line should progress pretty quick when the system is beefy. But when the system is under pressure, the reading object header done in this phase could be slow and showing progress is an improvement over staying silent in the current code. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index c77bea404d..6a1346c41f 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -46,7 +46,7 @@ static const char *pack_usage[] = { static struct packing_data to_pack; static struct pack_idx_entry **written_list; -static uint32_t nr_result, nr_written; +static uint32_t nr_result, nr_written, nr_seen; static int non_empty; static int reuse_delta = 1, reuse_object = 1; @@ -1096,6 +1096,8 @@ static int add_object_entry(const struct object_id *oid, enum object_type type, off_t found_offset = 0; uint32_t index_pos; + display_progress(progress_state, ++nr_seen); + if (have_duplicate_entry(oid, exclude, &index_pos)) return 0; @@ -1111,8 +1113,6 @@ static int add_object_entry(const struct object_id *oid, enum object_type type, create_object_entry(oid, type, pack_name_hash(name), exclude, name && no_try_delta(name), index_pos, found_pack, found_offset); - - display_progress(progress_state, nr_result); return 1; } @@ -1123,6 +1123,8 @@ static int add_object_entry_from_bitmap(const struct object_id *oid, { uint32_t index_pos; + display_progress(progress_state, ++nr_seen); + if (have_duplicate_entry(oid, 0, &index_pos)) return 0; @@ -1130,8 +1132,6 @@ static int add_object_entry_from_bitmap(const struct object_id *oid, return 0; create_object_entry(oid, type, name_hash, 0, 0, index_pos, pack, offset); - - display_progress(progress_state, nr_result); return 1; } @@ -1716,6 +1716,10 @@ static void get_object_details(void) uint32_t i; struct object_entry **sorted_by_offset; + if (progress) + progress_state = start_progress(_("Counting objects"), + to_pack.nr_objects); + sorted_by_offset = xcalloc(to_pack.nr_objects, sizeof(struct object_entry *)); for (i = 0; i < to_pack.nr_objects; i++) sorted_by_offset[i] = to_pack.objects + i; @@ -1726,7 +1730,9 @@ static void get_object_details(void) check_object(entry); if (big_file_threshold < entry->size) entry->no_try_delta = 1; + display_progress(progress_state, i + 1); } + stop_progress(&progress_state); /* * This must happen in a second pass, since we rely on the delta @@ -3209,7 +3215,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) } if (progress) - progress_state = start_progress(_("Counting objects"), 0); + progress_state = start_progress(_("Enumerating objects"), 0); if (!use_internal_rev_list) read_object_list_from_stdin(); else {