зеркало из https://github.com/microsoft/git.git
Merge branch 'jk/pack-objects-optim-mru'
"git pack-objects" in a repository with many packfiles used to spend a lot of time looking for/at objects in them; the accesses to the packfiles are now optimized by checking the most-recently-used packfile first. * jk/pack-objects-optim-mru: pack-objects: use mru list when iterating over packs pack-objects: break delta cycles before delta-search phase sha1_file: make packed_object_info public provide an initializer for "struct object_info"
This commit is contained in:
Коммит
e6e24c94df
|
@ -53,7 +53,7 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name,
|
|||
char *buf;
|
||||
unsigned long size;
|
||||
struct object_context obj_context;
|
||||
struct object_info oi = {NULL};
|
||||
struct object_info oi = OBJECT_INFO_INIT;
|
||||
struct strbuf sb = STRBUF_INIT;
|
||||
unsigned flags = LOOKUP_REPLACE_OBJECT;
|
||||
const char *path = force_path;
|
||||
|
@ -449,8 +449,7 @@ static int batch_objects(struct batch_options *opt)
|
|||
data.split_on_whitespace = 1;
|
||||
|
||||
if (opt->all_objects) {
|
||||
struct object_info empty;
|
||||
memset(&empty, 0, sizeof(empty));
|
||||
struct object_info empty = OBJECT_INFO_INIT;
|
||||
if (!memcmp(&data.info, &empty, sizeof(empty)))
|
||||
data.skip_object_info = 1;
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "reachable.h"
|
||||
#include "sha1-array.h"
|
||||
#include "argv-array.h"
|
||||
#include "mru.h"
|
||||
|
||||
static const char *pack_usage[] = {
|
||||
N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
|
||||
|
@ -994,7 +995,7 @@ static int want_object_in_pack(const unsigned char *sha1,
|
|||
struct packed_git **found_pack,
|
||||
off_t *found_offset)
|
||||
{
|
||||
struct packed_git *p;
|
||||
struct mru_entry *entry;
|
||||
int want;
|
||||
|
||||
if (!exclude && local && has_loose_object_nonlocal(sha1))
|
||||
|
@ -1011,7 +1012,8 @@ static int want_object_in_pack(const unsigned char *sha1,
|
|||
return want;
|
||||
}
|
||||
|
||||
for (p = packed_git; p; p = p->next) {
|
||||
for (entry = packed_git_mru->head; entry; entry = entry->next) {
|
||||
struct packed_git *p = entry->item;
|
||||
off_t offset;
|
||||
|
||||
if (p == *found_pack)
|
||||
|
@ -1027,6 +1029,8 @@ static int want_object_in_pack(const unsigned char *sha1,
|
|||
*found_pack = p;
|
||||
}
|
||||
want = want_found_object(exclude, p);
|
||||
if (!exclude && want > 0)
|
||||
mru_mark(packed_git_mru, entry);
|
||||
if (want != -1)
|
||||
return want;
|
||||
}
|
||||
|
@ -1527,6 +1531,83 @@ static int pack_offset_sort(const void *_a, const void *_b)
|
|||
(a->in_pack_offset > b->in_pack_offset);
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop an on-disk delta we were planning to reuse. Naively, this would
|
||||
* just involve blanking out the "delta" field, but we have to deal
|
||||
* with some extra book-keeping:
|
||||
*
|
||||
* 1. Removing ourselves from the delta_sibling linked list.
|
||||
*
|
||||
* 2. Updating our size/type to the non-delta representation. These were
|
||||
* either not recorded initially (size) or overwritten with the delta type
|
||||
* (type) when check_object() decided to reuse the delta.
|
||||
*/
|
||||
static void drop_reused_delta(struct object_entry *entry)
|
||||
{
|
||||
struct object_entry **p = &entry->delta->delta_child;
|
||||
struct object_info oi = OBJECT_INFO_INIT;
|
||||
|
||||
while (*p) {
|
||||
if (*p == entry)
|
||||
*p = (*p)->delta_sibling;
|
||||
else
|
||||
p = &(*p)->delta_sibling;
|
||||
}
|
||||
entry->delta = NULL;
|
||||
|
||||
oi.sizep = &entry->size;
|
||||
oi.typep = &entry->type;
|
||||
if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
|
||||
/*
|
||||
* We failed to get the info from this pack for some reason;
|
||||
* fall back to sha1_object_info, which may find another copy.
|
||||
* And if that fails, the error will be recorded in entry->type
|
||||
* and dealt with in prepare_pack().
|
||||
*/
|
||||
entry->type = sha1_object_info(entry->idx.sha1, &entry->size);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Follow the chain of deltas from this entry onward, throwing away any links
|
||||
* that cause us to hit a cycle (as determined by the DFS state flags in
|
||||
* the entries).
|
||||
*/
|
||||
static void break_delta_chains(struct object_entry *entry)
|
||||
{
|
||||
/* If it's not a delta, it can't be part of a cycle. */
|
||||
if (!entry->delta) {
|
||||
entry->dfs_state = DFS_DONE;
|
||||
return;
|
||||
}
|
||||
|
||||
switch (entry->dfs_state) {
|
||||
case DFS_NONE:
|
||||
/*
|
||||
* This is the first time we've seen the object. We mark it as
|
||||
* part of the active potential cycle and recurse.
|
||||
*/
|
||||
entry->dfs_state = DFS_ACTIVE;
|
||||
break_delta_chains(entry->delta);
|
||||
entry->dfs_state = DFS_DONE;
|
||||
break;
|
||||
|
||||
case DFS_DONE:
|
||||
/* object already examined, and not part of a cycle */
|
||||
break;
|
||||
|
||||
case DFS_ACTIVE:
|
||||
/*
|
||||
* We found a cycle that needs broken. It would be correct to
|
||||
* break any link in the chain, but it's convenient to
|
||||
* break this one.
|
||||
*/
|
||||
drop_reused_delta(entry);
|
||||
entry->dfs_state = DFS_DONE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void get_object_details(void)
|
||||
{
|
||||
uint32_t i;
|
||||
|
@ -1544,6 +1625,13 @@ static void get_object_details(void)
|
|||
entry->no_try_delta = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* This must happen in a second pass, since we rely on the delta
|
||||
* information for the whole list being completed.
|
||||
*/
|
||||
for (i = 0; i < to_pack.nr_objects; i++)
|
||||
break_delta_chains(&to_pack.objects[i]);
|
||||
|
||||
free(sorted_by_offset);
|
||||
}
|
||||
|
||||
|
|
8
cache.h
8
cache.h
|
@ -1602,7 +1602,15 @@ struct object_info {
|
|||
} packed;
|
||||
} u;
|
||||
};
|
||||
|
||||
/*
|
||||
* Initializer for a "struct object_info" that wants no items. You may
|
||||
* also memset() the memory to all-zeroes.
|
||||
*/
|
||||
#define OBJECT_INFO_INIT {NULL}
|
||||
|
||||
extern int sha1_object_info_extended(const unsigned char *, struct object_info *, unsigned flags);
|
||||
extern int packed_object_info(struct packed_git *pack, off_t offset, struct object_info *);
|
||||
|
||||
/* Dumb servers support */
|
||||
extern int update_server_info(int);
|
||||
|
|
|
@ -27,6 +27,15 @@ struct object_entry {
|
|||
unsigned no_try_delta:1;
|
||||
unsigned tagged:1; /* near the very tip of refs */
|
||||
unsigned filled:1; /* assigned write-order */
|
||||
|
||||
/*
|
||||
* State flags for depth-first search used for analyzing delta cycles.
|
||||
*/
|
||||
enum {
|
||||
DFS_NONE = 0,
|
||||
DFS_ACTIVE,
|
||||
DFS_DONE
|
||||
} dfs_state;
|
||||
};
|
||||
|
||||
struct packing_data {
|
||||
|
|
|
@ -1826,11 +1826,9 @@ static int parse_sha1_header_extended(const char *hdr, struct object_info *oi,
|
|||
|
||||
int parse_sha1_header(const char *hdr, unsigned long *sizep)
|
||||
{
|
||||
struct object_info oi;
|
||||
struct object_info oi = OBJECT_INFO_INIT;
|
||||
|
||||
oi.sizep = sizep;
|
||||
oi.typename = NULL;
|
||||
oi.typep = NULL;
|
||||
return parse_sha1_header_extended(hdr, &oi, LOOKUP_REPLACE_OBJECT);
|
||||
}
|
||||
|
||||
|
@ -2068,7 +2066,7 @@ unwind:
|
|||
goto out;
|
||||
}
|
||||
|
||||
static int packed_object_info(struct packed_git *p, off_t obj_offset,
|
||||
int packed_object_info(struct packed_git *p, off_t obj_offset,
|
||||
struct object_info *oi)
|
||||
{
|
||||
struct pack_window *w_curs = NULL;
|
||||
|
@ -2840,7 +2838,7 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi,
|
|||
int sha1_object_info(const unsigned char *sha1, unsigned long *sizep)
|
||||
{
|
||||
enum object_type type;
|
||||
struct object_info oi = {NULL};
|
||||
struct object_info oi = OBJECT_INFO_INIT;
|
||||
|
||||
oi.typep = &type;
|
||||
oi.sizep = sizep;
|
||||
|
|
|
@ -135,7 +135,7 @@ struct git_istream *open_istream(const unsigned char *sha1,
|
|||
struct stream_filter *filter)
|
||||
{
|
||||
struct git_istream *st;
|
||||
struct object_info oi = {NULL};
|
||||
struct object_info oi = OBJECT_INFO_INIT;
|
||||
const unsigned char *real = lookup_replace_object(sha1);
|
||||
enum input_source src = istream_source(real, type, &oi);
|
||||
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
#!/bin/sh
|
||||
|
||||
test_description='test handling of inter-pack delta cycles during repack
|
||||
|
||||
The goal here is to create a situation where we have two blobs, A and B, with A
|
||||
as a delta against B in one pack, and vice versa in the other. Then if we can
|
||||
persuade a full repack to find A from one pack and B from the other, that will
|
||||
give us a cycle when we attempt to reuse those deltas.
|
||||
|
||||
The trick is in the "persuade" step, as it depends on the internals of how
|
||||
pack-objects picks which pack to reuse the deltas from. But we can assume
|
||||
that it does so in one of two general strategies:
|
||||
|
||||
1. Using a static ordering of packs. In this case, no inter-pack cycles can
|
||||
happen. Any objects with a delta relationship must be present in the same
|
||||
pack (i.e., no "--thin" packs on disk), so we will find all related objects
|
||||
from that pack. So assuming there are no cycles within a single pack (and
|
||||
we avoid generating them via pack-objects or importing them via
|
||||
index-pack), then our result will have no cycles.
|
||||
|
||||
So this case should pass the tests no matter how we arrange things.
|
||||
|
||||
2. Picking the next pack to examine based on locality (i.e., where we found
|
||||
something else recently).
|
||||
|
||||
In this case, we want to make sure that we find the delta versions of A and
|
||||
B and not their base versions. We can do this by putting two blobs in each
|
||||
pack. The first is a "dummy" blob that can only be found in the pack in
|
||||
question. And then the second is the actual delta we want to find.
|
||||
|
||||
The two blobs must be present in the same tree, not present in other trees,
|
||||
and the dummy pathname must sort before the delta path.
|
||||
|
||||
The setup below focuses on case 2. We have two commits HEAD and HEAD^, each
|
||||
which has two files: "dummy" and "file". Then we can make two packs which
|
||||
contain:
|
||||
|
||||
[pack one]
|
||||
HEAD:dummy
|
||||
HEAD:file (as delta against HEAD^:file)
|
||||
HEAD^:file (as base)
|
||||
|
||||
[pack two]
|
||||
HEAD^:dummy
|
||||
HEAD^:file (as delta against HEAD:file)
|
||||
HEAD:file (as base)
|
||||
|
||||
Then no matter which order we start looking at the packs in, we know that we
|
||||
will always find a delta for "file", because its lookup will always come
|
||||
immediately after the lookup for "dummy".
|
||||
'
|
||||
. ./test-lib.sh
|
||||
|
||||
|
||||
|
||||
# Create a pack containing the the tree $1 and blob $1:file, with
|
||||
# the latter stored as a delta against $2:file.
|
||||
#
|
||||
# We convince pack-objects to make the delta in the direction of our choosing
|
||||
# by marking $2 as a preferred-base edge. That results in $1:file as a thin
|
||||
# delta, and index-pack completes it by adding $2:file as a base.
|
||||
#
|
||||
# Note that the two variants of "file" must be similar enough to convince git
|
||||
# to create the delta.
|
||||
make_pack () {
|
||||
{
|
||||
printf '%s\n' "-$(git rev-parse $2)"
|
||||
printf '%s dummy\n' "$(git rev-parse $1:dummy)"
|
||||
printf '%s file\n' "$(git rev-parse $1:file)"
|
||||
} |
|
||||
git pack-objects --stdout |
|
||||
git index-pack --stdin --fix-thin
|
||||
}
|
||||
|
||||
test_expect_success 'setup' '
|
||||
test-genrandom base 4096 >base &&
|
||||
for i in one two
|
||||
do
|
||||
# we want shared content here to encourage deltas...
|
||||
cp base file &&
|
||||
echo $i >>file &&
|
||||
|
||||
# ...whereas dummy should be short, because we do not want
|
||||
# deltas that would create duplicates when we --fix-thin
|
||||
echo $i >dummy &&
|
||||
|
||||
git add file dummy &&
|
||||
test_tick &&
|
||||
git commit -m $i ||
|
||||
return 1
|
||||
done &&
|
||||
|
||||
make_pack HEAD^ HEAD &&
|
||||
make_pack HEAD HEAD^
|
||||
'
|
||||
|
||||
test_expect_success 'repack' '
|
||||
# We first want to check that we do not have any internal errors,
|
||||
# and also that we do not hit the last-ditch cycle-breaking code
|
||||
# in write_object(), which will issue a warning to stderr.
|
||||
>expect &&
|
||||
git repack -ad 2>stderr &&
|
||||
test_cmp expect stderr &&
|
||||
|
||||
# And then double-check that the resulting pack is usable (i.e.,
|
||||
# we did not fail to notice any cycles). We know we are accessing
|
||||
# the objects via the new pack here, because "repack -d" will have
|
||||
# removed the others.
|
||||
git cat-file blob HEAD:file >/dev/null &&
|
||||
git cat-file blob HEAD^:file >/dev/null
|
||||
'
|
||||
|
||||
test_done
|
Загрузка…
Ссылка в новой задаче