Merge branch 'jk/pack-objects-optim-mru'

"git pack-objects" in a repository with many packfiles used to
spend a lot of time looking for/at objects in them; the accesses to
the packfiles are now optimized by checking the most-recently-used
packfile first.

* jk/pack-objects-optim-mru:
  pack-objects: use mru list when iterating over packs
  pack-objects: break delta cycles before delta-search phase
  sha1_file: make packed_object_info public
  provide an initializer for "struct object_info"

Junio C Hamano 2016-10-10 14:03:46 -07:00
Parents: b8688adb12 c9af708b1a
Commit: e6e24c94df
7 changed files with 227 additions and 12 deletions
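
For reference, the mru API this series leans on is a small move-to-front list of void pointers. Below is a sketch of its shape, assuming the mru.h/mru.c introduced alongside this topic (the file itself is not part of this merge diff):

struct mru_entry {
	void *item;
	struct mru_entry *prev, *next;
};

struct mru {
	struct mru_entry *head, *tail;
};

/* Append an item at the tail (the least-recently-used end). */
void mru_append(struct mru *mru, void *item);

/* Move an existing entry to the head of the list. */
void mru_mark(struct mru *mru, struct mru_entry *entry);

packed_git_mru holds one entry per packfile, so a successful lookup in any pack promotes that pack to the head, and the next object lookup tries it first.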

builtin/cat-file.c

@@ -53,7 +53,7 @@ static int cat_one_file(int opt, const char *exp_type, const char *obj_name,
 	char *buf;
 	unsigned long size;
 	struct object_context obj_context;
-	struct object_info oi = {NULL};
+	struct object_info oi = OBJECT_INFO_INIT;
 	struct strbuf sb = STRBUF_INIT;
 	unsigned flags = LOOKUP_REPLACE_OBJECT;
 	const char *path = force_path;
@@ -449,8 +449,7 @@ static int batch_objects(struct batch_options *opt)
 	data.split_on_whitespace = 1;
 
 	if (opt->all_objects) {
-		struct object_info empty;
-		memset(&empty, 0, sizeof(empty));
+		struct object_info empty = OBJECT_INFO_INIT;
 		if (!memcmp(&data.info, &empty, sizeof(empty)))
 			data.skip_object_info = 1;
 	}

builtin/pack-objects.c

@@ -23,6 +23,7 @@
 #include "reachable.h"
 #include "sha1-array.h"
 #include "argv-array.h"
+#include "mru.h"
 
 static const char *pack_usage[] = {
 	N_("git pack-objects --stdout [<options>...] [< <ref-list> | < <object-list>]"),
@@ -994,7 +995,7 @@ static int want_object_in_pack(const unsigned char *sha1,
 			       struct packed_git **found_pack,
 			       off_t *found_offset)
 {
-	struct packed_git *p;
+	struct mru_entry *entry;
 	int want;
 
 	if (!exclude && local && has_loose_object_nonlocal(sha1))
@@ -1011,7 +1012,8 @@ static int want_object_in_pack(const unsigned char *sha1,
 		return want;
 	}
 
-	for (p = packed_git; p; p = p->next) {
+	for (entry = packed_git_mru->head; entry; entry = entry->next) {
+		struct packed_git *p = entry->item;
 		off_t offset;
 
 		if (p == *found_pack)
@@ -1027,6 +1029,8 @@ static int want_object_in_pack(const unsigned char *sha1,
 				*found_pack = p;
 			}
 			want = want_found_object(exclude, p);
+			if (!exclude && want > 0)
+				mru_mark(packed_git_mru, entry);
 			if (want != -1)
 				return want;
 		}
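
The mru_mark() call is where the optimization pays off: a pack that answers positively is moved to the front of packed_git_mru, so the next object's search starts with it. A minimal sketch of that move-to-front step, assuming the doubly-linked layout sketched earlier:

void mru_mark(struct mru *mru, struct mru_entry *entry)
{
	if (mru->head == entry)
		return;			/* already at the front */

	/* unlink from the current position (prev is non-NULL: not the head) */
	entry->prev->next = entry->next;
	if (entry->next)
		entry->next->prev = entry->prev;
	else
		mru->tail = entry->prev;

	/* relink at the head */
	entry->prev = NULL;
	entry->next = mru->head;
	mru->head->prev = entry;
	mru->head = entry;
}

Note the guard in the hunk above: only "!exclude && want > 0" promotes a pack, so probes that reject an object leave the ordering alone.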
@@ -1527,6 +1531,83 @@ static int pack_offset_sort(const void *_a, const void *_b)
 		(a->in_pack_offset > b->in_pack_offset);
 }
 
+/*
+ * Drop an on-disk delta we were planning to reuse. Naively, this would
+ * just involve blanking out the "delta" field, but we have to deal
+ * with some extra book-keeping:
+ *
+ *   1. Removing ourselves from the delta_sibling linked list.
+ *
+ *   2. Updating our size/type to the non-delta representation. These were
+ *      either not recorded initially (size) or overwritten with the delta
+ *      type (type) when check_object() decided to reuse the delta.
+ */
+static void drop_reused_delta(struct object_entry *entry)
+{
+	struct object_entry **p = &entry->delta->delta_child;
+	struct object_info oi = OBJECT_INFO_INIT;
+
+	while (*p) {
+		if (*p == entry)
+			*p = (*p)->delta_sibling;
+		else
+			p = &(*p)->delta_sibling;
+	}
+	entry->delta = NULL;
+
+	oi.sizep = &entry->size;
+	oi.typep = &entry->type;
+	if (packed_object_info(entry->in_pack, entry->in_pack_offset, &oi) < 0) {
+		/*
+		 * We failed to get the info from this pack for some reason;
+		 * fall back to sha1_object_info, which may find another copy.
+		 * And if that fails, the error will be recorded in entry->type
+		 * and dealt with in prepare_pack().
+		 */
+		entry->type = sha1_object_info(entry->idx.sha1, &entry->size);
+	}
+}
+
+/*
+ * Follow the chain of deltas from this entry onward, throwing away any links
+ * that cause us to hit a cycle (as determined by the DFS state flags in
+ * the entries).
+ */
+static void break_delta_chains(struct object_entry *entry)
+{
+	/* If it's not a delta, it can't be part of a cycle. */
+	if (!entry->delta) {
+		entry->dfs_state = DFS_DONE;
+		return;
+	}
+
+	switch (entry->dfs_state) {
+	case DFS_NONE:
+		/*
+		 * This is the first time we've seen the object. We mark it as
+		 * part of the active potential cycle and recurse.
+		 */
+		entry->dfs_state = DFS_ACTIVE;
+		break_delta_chains(entry->delta);
+		entry->dfs_state = DFS_DONE;
+		break;
+
+	case DFS_DONE:
+		/* object already examined, and not part of a cycle */
+		break;
+
+	case DFS_ACTIVE:
+		/*
+		 * We found a cycle that needs to be broken. It would be
+		 * correct to break any link in the chain, but it's convenient
+		 * to break this one.
+		 */
+		drop_reused_delta(entry);
+		entry->dfs_state = DFS_DONE;
+		break;
+	}
+}
+
 static void get_object_details(void)
 {
 	uint32_t i;
@@ -1544,6 +1625,13 @@ static void get_object_details(void)
 			entry->no_try_delta = 1;
 	}
 
+	/*
+	 * This must happen in a second pass, since we rely on the delta
+	 * information for the whole list being completed.
+	 */
+	for (i = 0; i < to_pack.nr_objects; i++)
+		break_delta_chains(&to_pack.objects[i]);
+
 	free(sorted_by_offset);
 }
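
The dfs_state walk added above is the classic three-color DFS: DFS_NONE is white, DFS_ACTIVE is gray (on the current path), DFS_DONE is black. The self-contained toy sketch below runs the same logic on the smallest possible cycle (two objects stored as deltas of each other) to show why dropping a single reused delta per cycle is enough; struct node and break_chains() are hypothetical stand-ins for struct object_entry and break_delta_chains():

#include <stdio.h>

enum state { NONE, ACTIVE, DONE };	/* mirrors DFS_NONE/DFS_ACTIVE/DFS_DONE */

struct node {
	struct node *delta;		/* delta base, or NULL for a full object */
	enum state st;
	const char *name;
};

static void break_chains(struct node *n)
{
	if (!n->delta) {		/* not a delta: cannot be part of a cycle */
		n->st = DONE;
		return;
	}

	switch (n->st) {
	case NONE:
		n->st = ACTIVE;		/* "gray": on the current DFS path */
		break_chains(n->delta);
		n->st = DONE;		/* "black": proven cycle-free */
		break;
	case DONE:
		break;			/* already examined */
	case ACTIVE:			/* back-edge: we closed a cycle */
		printf("dropping delta of %s\n", n->name);
		n->delta = NULL;	/* stands in for drop_reused_delta() */
		n->st = DONE;
		break;
	}
}

int main(void)
{
	struct node a = { NULL, NONE, "A" };
	struct node b = { NULL, NONE, "B" };

	a.delta = &b;			/* A is stored as a delta against B... */
	b.delta = &a;			/* ...and B as a delta against A */

	break_chains(&a);		/* prints: dropping delta of A */
	break_chains(&b);		/* B is already DONE; nothing happens */
	return 0;
}

Only A's on-disk delta is dropped; the chain B -> A that remains is acyclic, and the real drop_reused_delta() recovers A's true type and size via the newly public packed_object_info(), so A can still be written as a full object or re-deltified later.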

cache.h

@@ -1602,7 +1602,15 @@ struct object_info {
 		} packed;
 	} u;
 };
+
+/*
+ * Initializer for a "struct object_info" that wants no items. You may
+ * also memset() the memory to all-zeroes.
+ */
+#define OBJECT_INFO_INIT {NULL}
+
 extern int sha1_object_info_extended(const unsigned char *, struct object_info *, unsigned flags);
+extern int packed_object_info(struct packed_git *pack, off_t offset, struct object_info *);
 
 /* Dumb servers support */
 extern int update_server_info(int);
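
The new initializer pairs with the "fill only what is requested" contract of sha1_object_info_extended(): members left NULL are simply not computed. A minimal sketch of a typical caller, assuming a caller-provided sha1 buffer (the local variable names are illustrative, not from this diff):

struct object_info oi = OBJECT_INFO_INIT;
enum object_type type;
unsigned long size;

oi.typep = &type;	/* request the object's type... */
oi.sizep = &size;	/* ...and its uncompressed size; everything else stays off */

if (sha1_object_info_extended(sha1, &oi, LOOKUP_REPLACE_OBJECT) < 0)
	return error("unable to get object info");

This is the same shape sha1_object_info() itself takes on in the sha1_file.c hunk below.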

pack-objects.h

@@ -27,6 +27,15 @@ struct object_entry {
 	unsigned no_try_delta:1;
 	unsigned tagged:1; /* near the very tip of refs */
 	unsigned filled:1; /* assigned write-order */
+
+	/*
+	 * State flags for depth-first search used for analyzing delta cycles.
+	 */
+	enum {
+		DFS_NONE = 0,
+		DFS_ACTIVE,
+		DFS_DONE
+	} dfs_state;
 };
 
 struct packing_data {

sha1_file.c

@@ -1826,11 +1826,9 @@ static int parse_sha1_header_extended(const char *hdr, struct object_info *oi,
 
 int parse_sha1_header(const char *hdr, unsigned long *sizep)
 {
-	struct object_info oi;
+	struct object_info oi = OBJECT_INFO_INIT;
 
 	oi.sizep = sizep;
-	oi.typename = NULL;
-	oi.typep = NULL;
 	return parse_sha1_header_extended(hdr, &oi, LOOKUP_REPLACE_OBJECT);
 }
@@ -2068,7 +2066,7 @@ unwind:
 	goto out;
 }
 
-static int packed_object_info(struct packed_git *p, off_t obj_offset,
+int packed_object_info(struct packed_git *p, off_t obj_offset,
 			      struct object_info *oi)
 {
 	struct pack_window *w_curs = NULL;
@@ -2840,7 +2838,7 @@ int sha1_object_info_extended(const unsigned char *sha1, struct object_info *oi,
 int sha1_object_info(const unsigned char *sha1, unsigned long *sizep)
 {
 	enum object_type type;
-	struct object_info oi = {NULL};
+	struct object_info oi = OBJECT_INFO_INIT;
 
 	oi.typep = &type;
 	oi.sizep = sizep;

streaming.c

@@ -135,7 +135,7 @@ struct git_istream *open_istream(const unsigned char *sha1,
 				 struct stream_filter *filter)
 {
 	struct git_istream *st;
-	struct object_info oi = {NULL};
+	struct object_info oi = OBJECT_INFO_INIT;
 	const unsigned char *real = lookup_replace_object(sha1);
 	enum input_source src = istream_source(real, type, &oi);

t/t5314-pack-cycle-detection.sh (new executable file, 113 lines)

@@ -0,0 +1,113 @@
+#!/bin/sh
+
+test_description='test handling of inter-pack delta cycles during repack
+
+The goal here is to create a situation where we have two blobs, A and B, with A
+as a delta against B in one pack, and vice versa in the other. Then if we can
+persuade a full repack to find A from one pack and B from the other, that will
+give us a cycle when we attempt to reuse those deltas.
+
+The trick is in the "persuade" step, as it depends on the internals of how
+pack-objects picks which pack to reuse the deltas from. But we can assume
+that it does so in one of two general strategies:
+
+ 1. Using a static ordering of packs. In this case, no inter-pack cycles can
+    happen. Any objects with a delta relationship must be present in the same
+    pack (i.e., no "--thin" packs on disk), so we will find all related objects
+    from that pack. So assuming there are no cycles within a single pack (and
+    we avoid generating them via pack-objects or importing them via
+    index-pack), then our result will have no cycles.
+
+    So this case should pass the tests no matter how we arrange things.
+
+ 2. Picking the next pack to examine based on locality (i.e., where we found
+    something else recently).
+
+    In this case, we want to make sure that we find the delta versions of A and
+    B and not their base versions. We can do this by putting two blobs in each
+    pack. The first is a "dummy" blob that can only be found in the pack in
+    question. And then the second is the actual delta we want to find.
+
+    The two blobs must be present in the same tree, not present in other trees,
+    and the dummy pathname must sort before the delta path.
+
+The setup below focuses on case 2. We have two commits HEAD and HEAD^, each of
+which has two files: "dummy" and "file". Then we can make two packs which
+contain:
+
+  [pack one]
+  HEAD:dummy
+  HEAD:file  (as delta against HEAD^:file)
+  HEAD^:file (as base)
+
+  [pack two]
+  HEAD^:dummy
+  HEAD^:file (as delta against HEAD:file)
+  HEAD:file  (as base)
+
+Then no matter which order we start looking at the packs in, we know that we
+will always find a delta for "file", because its lookup will always come
+immediately after the lookup for "dummy".
+'
+
+. ./test-lib.sh
+
+# Create a pack containing the tree $1 and blob $1:file, with
+# the latter stored as a delta against $2:file.
+#
+# We convince pack-objects to make the delta in the direction of our choosing
+# by marking $2 as a preferred-base edge. That results in $1:file as a thin
+# delta, and index-pack completes it by adding $2:file as a base.
+#
+# Note that the two variants of "file" must be similar enough to convince git
+# to create the delta.
+make_pack () {
+	{
+		printf '%s\n' "-$(git rev-parse $2)"
+		printf '%s dummy\n' "$(git rev-parse $1:dummy)"
+		printf '%s file\n' "$(git rev-parse $1:file)"
+	} |
+	git pack-objects --stdout |
+	git index-pack --stdin --fix-thin
+}
+
+test_expect_success 'setup' '
+	test-genrandom base 4096 >base &&
+	for i in one two
+	do
+		# we want shared content here to encourage deltas...
+		cp base file &&
+		echo $i >>file &&
+
+		# ...whereas dummy should be short, because we do not want
+		# deltas that would create duplicates when we --fix-thin
+		echo $i >dummy &&
+
+		git add file dummy &&
+		test_tick &&
+		git commit -m $i ||
+		return 1
+	done &&
+
+	make_pack HEAD^ HEAD &&
+	make_pack HEAD HEAD^
+'
+
+test_expect_success 'repack' '
+	# We first want to check that we do not have any internal errors,
+	# and also that we do not hit the last-ditch cycle-breaking code
+	# in write_object(), which will issue a warning to stderr.
+	>expect &&
+	git repack -ad 2>stderr &&
+	test_cmp expect stderr &&
+
+	# And then double-check that the resulting pack is usable (i.e.,
+	# we did not fail to notice any cycles). We know we are accessing
+	# the objects via the new pack here, because "repack -d" will have
+	# removed the others.
+	git cat-file blob HEAD:file >/dev/null &&
+	git cat-file blob HEAD^:file >/dev/null
+'
+
+test_done