Trivially shrinks the on-disk size of the index file to save both I/O and
checksum overhead.

The topic should give a solid base to build on further updates, with the
code refactoring in its earlier parts, and the backward compatibility
mechanism in its later parts.

* jc/index-v4:
  index-v4: document the entry format
  unpack-trees: preserve the index file version of original
  update-index: upgrade/downgrade on-disk index version
  read-cache.c: write prefix-compressed names in the index
  read-cache.c: read prefix-compressed names in index on-disk version v4
  read-cache.c: move code to copy incore to ondisk cache to a helper function
  read-cache.c: move code to copy ondisk to incore cache to a helper function
  read-cache.c: report the header version we do not understand
  read-cache.c: make create_from_disk() report number of bytes it consumed
  read-cache.c: allow unaligned mapping of the index file
  cache.h: hide on-disk index details
  varint: make it available outside the context of pack
This commit is contained in:
Junio C Hamano 2012-05-02 13:51:13 -07:00
Родитель 242cab3972 afd7bd2220
Коммит d4a5d872c0
9 изменённых файлов: 298 добавлений и 89 удалений

Просмотреть файл

@ -19,7 +19,7 @@ SYNOPSIS
[--ignore-submodules]
[--really-refresh] [--unresolve] [--again | -g]
[--info-only] [--index-info]
[-z] [--stdin]
[-z] [--stdin] [--index-version <n>]
[--verbose]
[--] [<file>...]
@ -143,6 +143,10 @@ you will need to handle the situation manually.
--verbose::
Report what is being added and removed from index.
--index-version <n>::
Write the resulting index out in the named on-disk format version.
The current default version is 2.
-z::
Only meaningful with `--stdin` or `--index-info`; paths are
separated with NUL character instead of LF.

Просмотреть файл

@ -113,9 +113,22 @@ GIT index format
are encoded in 7-bit ASCII and the encoding cannot contain a NUL
byte (iow, this is a UNIX pathname).
(Version 4) In version 4, the entry path name is prefix-compressed
relative to the path name for the previous entry (the very first
entry is encoded as if the path name for the previous entry is an
empty string). At the beginning of an entry, an integer N in the
variable width encoding (the same encoding as the offset is encoded
for OFS_DELTA pack entries; see pack-format.txt) is stored, followed
by a NUL-terminated string S. Removing N bytes from the end of the
path name for the previous entry, and replacing it with the string S
yields the path name for this entry.
1-8 nul bytes as necessary to pad the entry to a multiple of eight bytes
while keeping the name NUL-terminated.
(Version 4) In version 4, the padding after the pathname does not
exist.
== Extensions
=== Cached tree

Просмотреть файл

@ -631,6 +631,7 @@ LIB_H += tree-walk.h
LIB_H += unpack-trees.h
LIB_H += userdiff.h
LIB_H += utf8.h
LIB_H += varint.h
LIB_H += xdiff-interface.h
LIB_H += xdiff/xdiff.h
@ -757,6 +758,7 @@ LIB_OBJS += url.o
LIB_OBJS += usage.o
LIB_OBJS += userdiff.o
LIB_OBJS += utf8.o
LIB_OBJS += varint.o
LIB_OBJS += walker.o
LIB_OBJS += wrapper.o
LIB_OBJS += write_or_die.o

Просмотреть файл

@ -708,6 +708,7 @@ int cmd_update_index(int argc, const char **argv, const char *prefix)
int newfd, entries, has_errors = 0, line_termination = '\n';
int read_from_stdin = 0;
int prefix_length = prefix ? strlen(prefix) : 0;
int preferred_index_format = 0;
char set_executable_bit = 0;
struct refresh_params refresh_args = {0, &has_errors};
int lock_error = 0;
@ -791,6 +792,8 @@ int cmd_update_index(int argc, const char **argv, const char *prefix)
"(for porcelains) forget saved unresolved conflicts",
PARSE_OPT_NOARG | PARSE_OPT_NONEG,
resolve_undo_clear_callback},
OPT_INTEGER(0, "index-version", &preferred_index_format,
"write index in this format"),
OPT_END()
};
@ -851,6 +854,17 @@ int cmd_update_index(int argc, const char **argv, const char *prefix)
}
}
argc = parse_options_end(&ctx);
if (preferred_index_format) {
if (preferred_index_format < INDEX_FORMAT_LB ||
INDEX_FORMAT_UB < preferred_index_format)
die("index-version %d not in range: %d..%d",
preferred_index_format,
INDEX_FORMAT_LB, INDEX_FORMAT_UB);
if (the_index.version != preferred_index_format)
active_cache_changed = 1;
the_index.version = preferred_index_format;
}
if (read_from_stdin) {
struct strbuf buf = STRBUF_INIT, nbuf = STRBUF_INIT;

52
cache.h
Просмотреть файл

@ -105,6 +105,9 @@ struct cache_header {
unsigned int hdr_entries;
};
#define INDEX_FORMAT_LB 2
#define INDEX_FORMAT_UB 4
/*
* The "cache_time" is just the low 32 bits of the
* time. It doesn't matter if it overflows - we only
@ -115,48 +118,6 @@ struct cache_time {
unsigned int nsec;
};
/*
* dev/ino/uid/gid/size are also just tracked to the low 32 bits
* Again - this is just a (very strong in practice) heuristic that
* the inode hasn't changed.
*
* We save the fields in big-endian order to allow using the
* index file over NFS transparently.
*/
struct ondisk_cache_entry {
struct cache_time ctime;
struct cache_time mtime;
unsigned int dev;
unsigned int ino;
unsigned int mode;
unsigned int uid;
unsigned int gid;
unsigned int size;
unsigned char sha1[20];
unsigned short flags;
char name[FLEX_ARRAY]; /* more */
};
/*
* This struct is used when CE_EXTENDED bit is 1
* The struct must match ondisk_cache_entry exactly from
* ctime till flags
*/
struct ondisk_cache_entry_extended {
struct cache_time ctime;
struct cache_time mtime;
unsigned int dev;
unsigned int ino;
unsigned int mode;
unsigned int uid;
unsigned int gid;
unsigned int size;
unsigned char sha1[20];
unsigned short flags;
unsigned short flags2;
char name[FLEX_ARRAY]; /* more */
};
struct cache_entry {
struct cache_time ce_ctime;
struct cache_time ce_mtime;
@ -253,9 +214,6 @@ static inline size_t ce_namelen(const struct cache_entry *ce)
}
#define ce_size(ce) cache_entry_size(ce_namelen(ce))
#define ondisk_ce_size(ce) (((ce)->ce_flags & CE_EXTENDED) ? \
ondisk_cache_entry_extended_size(ce_namelen(ce)) : \
ondisk_cache_entry_size(ce_namelen(ce)))
#define ce_stage(ce) ((CE_STAGEMASK & (ce)->ce_flags) >> CE_STAGESHIFT)
#define ce_uptodate(ce) ((ce)->ce_flags & CE_UPTODATE)
#define ce_skip_worktree(ce) ((ce)->ce_flags & CE_SKIP_WORKTREE)
@ -306,13 +264,11 @@ static inline unsigned int canon_mode(unsigned int mode)
return S_IFGITLINK;
}
#define flexible_size(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 8) & ~7)
#define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1)
#define ondisk_cache_entry_size(len) flexible_size(ondisk_cache_entry,len)
#define ondisk_cache_entry_extended_size(len) flexible_size(ondisk_cache_entry_extended,len)
struct index_state {
struct cache_entry **cache;
unsigned int version;
unsigned int cache_nr, cache_alloc, cache_changed;
struct string_list *resolve_undo;
struct cache_tree *cache_tree;

Просмотреть файл

@ -12,6 +12,8 @@
#include "commit.h"
#include "blob.h"
#include "resolve-undo.h"
#include "strbuf.h"
#include "varint.h"
static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int really);
@ -1179,15 +1181,74 @@ static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int reall
return refresh_cache_ent(&the_index, ce, really, NULL, NULL);
}
/*****************************************************************
* Index File I/O
*****************************************************************/
#define INDEX_FORMAT_DEFAULT 3
/*
* dev/ino/uid/gid/size are also just tracked to the low 32 bits
* Again - this is just a (very strong in practice) heuristic that
* the inode hasn't changed.
*
* We save the fields in big-endian order to allow using the
* index file over NFS transparently.
*/
struct ondisk_cache_entry {
struct cache_time ctime;
struct cache_time mtime;
unsigned int dev;
unsigned int ino;
unsigned int mode;
unsigned int uid;
unsigned int gid;
unsigned int size;
unsigned char sha1[20];
unsigned short flags;
char name[FLEX_ARRAY]; /* more */
};
/*
* This struct is used when CE_EXTENDED bit is 1
* The struct must match ondisk_cache_entry exactly from
* ctime till flags
*/
struct ondisk_cache_entry_extended {
struct cache_time ctime;
struct cache_time mtime;
unsigned int dev;
unsigned int ino;
unsigned int mode;
unsigned int uid;
unsigned int gid;
unsigned int size;
unsigned char sha1[20];
unsigned short flags;
unsigned short flags2;
char name[FLEX_ARRAY]; /* more */
};
/* These are only used for v3 or lower */
#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 8) & ~7)
#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len)
#define ondisk_cache_entry_extended_size(len) align_flex_name(ondisk_cache_entry_extended,len)
#define ondisk_ce_size(ce) (((ce)->ce_flags & CE_EXTENDED) ? \
ondisk_cache_entry_extended_size(ce_namelen(ce)) : \
ondisk_cache_entry_size(ce_namelen(ce)))
static int verify_hdr(struct cache_header *hdr, unsigned long size)
{
git_SHA_CTX c;
unsigned char sha1[20];
int hdr_version;
if (hdr->hdr_signature != htonl(CACHE_SIGNATURE))
return error("bad signature");
if (hdr->hdr_version != htonl(2) && hdr->hdr_version != htonl(3))
return error("bad index version");
hdr_version = ntohl(hdr->hdr_version);
if (hdr_version < 2 || 4 < hdr_version)
return error("bad index version %d", hdr_version);
git_SHA1_Init(&c);
git_SHA1_Update(&c, hdr, size - 20);
git_SHA1_Final(sha1, &c);
@ -1221,7 +1282,74 @@ int read_index(struct index_state *istate)
return read_index_from(istate, get_index_file());
}
static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk)
#ifndef NEEDS_ALIGNED_ACCESS
#define ntoh_s(var) ntohs(var)
#define ntoh_l(var) ntohl(var)
#else
static inline uint16_t ntoh_s_force_align(void *p)
{
uint16_t x;
memcpy(&x, p, sizeof(x));
return ntohs(x);
}
static inline uint32_t ntoh_l_force_align(void *p)
{
uint32_t x;
memcpy(&x, p, sizeof(x));
return ntohl(x);
}
#define ntoh_s(var) ntoh_s_force_align(&(var))
#define ntoh_l(var) ntoh_l_force_align(&(var))
#endif
static struct cache_entry *cache_entry_from_ondisk(struct ondisk_cache_entry *ondisk,
unsigned int flags,
const char *name,
size_t len)
{
struct cache_entry *ce = xmalloc(cache_entry_size(len));
ce->ce_ctime.sec = ntoh_l(ondisk->ctime.sec);
ce->ce_mtime.sec = ntoh_l(ondisk->mtime.sec);
ce->ce_ctime.nsec = ntoh_l(ondisk->ctime.nsec);
ce->ce_mtime.nsec = ntoh_l(ondisk->mtime.nsec);
ce->ce_dev = ntoh_l(ondisk->dev);
ce->ce_ino = ntoh_l(ondisk->ino);
ce->ce_mode = ntoh_l(ondisk->mode);
ce->ce_uid = ntoh_l(ondisk->uid);
ce->ce_gid = ntoh_l(ondisk->gid);
ce->ce_size = ntoh_l(ondisk->size);
ce->ce_flags = flags;
hashcpy(ce->sha1, ondisk->sha1);
memcpy(ce->name, name, len);
ce->name[len] = '\0';
return ce;
}
/*
* Adjacent cache entries tend to share the leading paths, so it makes
* sense to only store the differences in later entries. In the v4
* on-disk format of the index, each on-disk cache entry stores the
* number of bytes to be stripped from the end of the previous name,
* and the bytes to append to the result, to come up with its name.
*/
static unsigned long expand_name_field(struct strbuf *name, const char *cp_)
{
const unsigned char *ep, *cp = (const unsigned char *)cp_;
size_t len = decode_varint(&cp);
if (name->len < len)
die("malformed name field in the index");
strbuf_remove(name, name->len - len, len);
for (ep = cp; *ep; ep++)
; /* find the end */
strbuf_add(name, cp, ep - cp);
return (const char *)ep + 1 - cp_;
}
static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk,
unsigned long *ent_size,
struct strbuf *previous_name)
{
struct cache_entry *ce;
size_t len;
@ -1229,14 +1357,14 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk)
unsigned int flags;
/* On-disk flags are just 16 bits */
flags = ntohs(ondisk->flags);
flags = ntoh_s(ondisk->flags);
len = flags & CE_NAMEMASK;
if (flags & CE_EXTENDED) {
struct ondisk_cache_entry_extended *ondisk2;
int extended_flags;
ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
extended_flags = ntohs(ondisk2->flags2) << 16;
extended_flags = ntoh_s(ondisk2->flags2) << 16;
/* We do not yet understand any bit out of CE_EXTENDED_FLAGS */
if (extended_flags & ~CE_EXTENDED_FLAGS)
die("Unknown index entry format %08x", extended_flags);
@ -1246,27 +1374,22 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk)
else
name = ondisk->name;
if (len == CE_NAMEMASK)
len = strlen(name);
if (!previous_name) {
/* v3 and earlier */
if (len == CE_NAMEMASK)
len = strlen(name);
ce = cache_entry_from_ondisk(ondisk, flags, name, len);
ce = xmalloc(cache_entry_size(len));
*ent_size = ondisk_ce_size(ce);
} else {
unsigned long consumed;
consumed = expand_name_field(previous_name, name);
ce = cache_entry_from_ondisk(ondisk, flags,
previous_name->buf,
previous_name->len);
ce->ce_ctime.sec = ntohl(ondisk->ctime.sec);
ce->ce_mtime.sec = ntohl(ondisk->mtime.sec);
ce->ce_ctime.nsec = ntohl(ondisk->ctime.nsec);
ce->ce_mtime.nsec = ntohl(ondisk->mtime.nsec);
ce->ce_dev = ntohl(ondisk->dev);
ce->ce_ino = ntohl(ondisk->ino);
ce->ce_mode = ntohl(ondisk->mode);
ce->ce_uid = ntohl(ondisk->uid);
ce->ce_gid = ntohl(ondisk->gid);
ce->ce_size = ntohl(ondisk->size);
ce->ce_flags = flags;
hashcpy(ce->sha1, ondisk->sha1);
memcpy(ce->name, name, len);
ce->name[len] = '\0';
*ent_size = (name - ((char *)ondisk)) + consumed;
}
return ce;
}
@ -1279,6 +1402,7 @@ int read_index_from(struct index_state *istate, const char *path)
struct cache_header *hdr;
void *mmap;
size_t mmap_size;
struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
errno = EBUSY;
if (istate->initialized)
@ -1311,22 +1435,30 @@ int read_index_from(struct index_state *istate, const char *path)
if (verify_hdr(hdr, mmap_size) < 0)
goto unmap;
istate->version = ntohl(hdr->hdr_version);
istate->cache_nr = ntohl(hdr->hdr_entries);
istate->cache_alloc = alloc_nr(istate->cache_nr);
istate->cache = xcalloc(istate->cache_alloc, sizeof(struct cache_entry *));
istate->initialized = 1;
if (istate->version == 4)
previous_name = &previous_name_buf;
else
previous_name = NULL;
src_offset = sizeof(*hdr);
for (i = 0; i < istate->cache_nr; i++) {
struct ondisk_cache_entry *disk_ce;
struct cache_entry *ce;
unsigned long consumed;
disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset);
ce = create_from_disk(disk_ce);
ce = create_from_disk(disk_ce, &consumed, previous_name);
set_index_entry(istate, i, ce);
src_offset += ondisk_ce_size(ce);
src_offset += consumed;
}
strbuf_release(&previous_name_buf);
istate->timestamp.sec = st.st_mtime;
istate->timestamp.nsec = ST_MTIME_NSEC(st);
@ -1510,13 +1642,10 @@ static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
}
}
static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce)
/* Copy miscellaneous fields but not the name */
static char *copy_cache_entry_to_ondisk(struct ondisk_cache_entry *ondisk,
struct cache_entry *ce)
{
int size = ondisk_ce_size(ce);
struct ondisk_cache_entry *ondisk = xcalloc(1, size);
char *name;
int result;
ondisk->ctime.sec = htonl(ce->ce_ctime.sec);
ondisk->mtime.sec = htonl(ce->ce_mtime.sec);
ondisk->ctime.nsec = htonl(ce->ce_ctime.nsec);
@ -1533,11 +1662,52 @@ static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce)
struct ondisk_cache_entry_extended *ondisk2;
ondisk2 = (struct ondisk_cache_entry_extended *)ondisk;
ondisk2->flags2 = htons((ce->ce_flags & CE_EXTENDED_FLAGS) >> 16);
name = ondisk2->name;
return ondisk2->name;
}
else {
return ondisk->name;
}
}
static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce,
struct strbuf *previous_name)
{
int size;
struct ondisk_cache_entry *ondisk;
char *name;
int result;
if (!previous_name) {
size = ondisk_ce_size(ce);
ondisk = xcalloc(1, size);
name = copy_cache_entry_to_ondisk(ondisk, ce);
memcpy(name, ce->name, ce_namelen(ce));
} else {
int common, to_remove, prefix_size;
unsigned char to_remove_vi[16];
for (common = 0;
(ce->name[common] &&
common < previous_name->len &&
ce->name[common] == previous_name->buf[common]);
common++)
; /* still matching */
to_remove = previous_name->len - common;
prefix_size = encode_varint(to_remove, to_remove_vi);
if (ce->ce_flags & CE_EXTENDED)
size = offsetof(struct ondisk_cache_entry_extended, name);
else
size = offsetof(struct ondisk_cache_entry, name);
size += prefix_size + (ce_namelen(ce) - common + 1);
ondisk = xcalloc(1, size);
name = copy_cache_entry_to_ondisk(ondisk, ce);
memcpy(name, to_remove_vi, prefix_size);
memcpy(name + prefix_size, ce->name + common, ce_namelen(ce) - common);
strbuf_splice(previous_name, common, to_remove,
ce->name + common, ce_namelen(ce) - common);
}
else
name = ondisk->name;
memcpy(name, ce->name, ce_namelen(ce));
result = ce_write(c, fd, ondisk, size);
free(ondisk);
@ -1573,10 +1743,11 @@ int write_index(struct index_state *istate, int newfd)
{
git_SHA_CTX c;
struct cache_header hdr;
int i, err, removed, extended;
int i, err, removed, extended, hdr_version;
struct cache_entry **cache = istate->cache;
int entries = istate->cache_nr;
struct stat st;
struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
for (i = removed = extended = 0; i < entries; i++) {
if (cache[i]->ce_flags & CE_REMOVE)
@ -1590,24 +1761,34 @@ int write_index(struct index_state *istate, int newfd)
}
}
if (!istate->version)
istate->version = INDEX_FORMAT_DEFAULT;
/* demote version 3 to version 2 when the latter suffices */
if (istate->version == 3 || istate->version == 2)
istate->version = extended ? 3 : 2;
hdr_version = istate->version;
hdr.hdr_signature = htonl(CACHE_SIGNATURE);
/* for extended format, increase version so older git won't try to read it */
hdr.hdr_version = htonl(extended ? 3 : 2);
hdr.hdr_version = htonl(hdr_version);
hdr.hdr_entries = htonl(entries - removed);
git_SHA1_Init(&c);
if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
return -1;
previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
for (i = 0; i < entries; i++) {
struct cache_entry *ce = cache[i];
if (ce->ce_flags & CE_REMOVE)
continue;
if (!ce_uptodate(ce) && is_racy_timestamp(istate, ce))
ce_smudge_racily_clean_entry(ce);
if (ce_write_entry(&c, newfd, ce) < 0)
if (ce_write_entry(&c, newfd, ce, previous_name) < 0)
return -1;
}
strbuf_release(&previous_name_buf);
/* Write extension data here */
if (istate->cache_tree) {

Просмотреть файл

@ -1027,6 +1027,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
o->result.initialized = 1;
o->result.timestamp.sec = o->src_index->timestamp.sec;
o->result.timestamp.nsec = o->src_index->timestamp.nsec;
o->result.version = o->src_index->version;
o->merge_size = len;
mark_all_ce_unused(o->src_index);

29
varint.c Normal file
Просмотреть файл

@ -0,0 +1,29 @@
#include "varint.h"
uintmax_t decode_varint(const unsigned char **bufp)
{
const unsigned char *buf = *bufp;
unsigned char c = *buf++;
uintmax_t val = c & 127;
while (c & 128) {
val += 1;
if (!val || MSB(val, 7))
return 0; /* overflow */
c = *buf++;
val = (val << 7) + (c & 127);
}
*bufp = buf;
return val;
}
int encode_varint(uintmax_t value, unsigned char *buf)
{
unsigned char varint[16];
unsigned pos = sizeof(varint) - 1;
varint[pos] = value & 127;
while (value >>= 7)
varint[--pos] = 128 | (--value & 127);
if (buf)
memcpy(buf, varint + pos, sizeof(varint) - pos);
return sizeof(varint) - pos;
}

9
varint.h Normal file
Просмотреть файл

@ -0,0 +1,9 @@
#ifndef VARINT_H
#define VARINT_H
#include "git-compat-util.h"
extern int encode_varint(uintmax_t, unsigned char *);
extern uintmax_t decode_varint(const unsigned char **);
#endif /* VARINT_H */